# Learn practical skills, build real-world projects, and advance your career
# Created 4 years ago
'''Import basic modules'''
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression, SGDClassifier
'''import visualization'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
%matplotlib inline
import matplotlib.gridspec as gridspec
'''Plotly visualization .'''
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)
'''Display markdown formatted output like bold, italic bold etc.'''
from IPython.display import Markdown
def bold(string):
    """Render *string* as Markdown in the notebook output area.

    Args:
        string: Markdown source text, e.g. ``'**bold**'``. Inside this
            function the parameter name hides the ``string`` module that
            the file imports; the module is not needed here.

    Returns:
        None. The rendered text is emitted through IPython's display
        machinery.
    """
    # Import display explicitly instead of relying on the name IPython
    # injects into interactive namespaces, so the helper does not raise
    # NameError when it is used outside a notebook cell.
    from IPython.display import display

    display(Markdown(string))
# Silence deprecation, future and user warnings for the rest of the session.
import warnings as wrn

# Filters are installed in this order; each call inserts at the front of
# the filter list, exactly as three separate calls would.
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    wrn.filterwarnings('ignore', category=_category)
# Load data
# The path is relative to the current working directory, so the notebook
# must be started from a directory where ../Desktop/ resolves correctly.
train = pd.read_csv('../Desktop/android_bids_us.csv')
# Report the (rows, columns) shape of the loaded frame.
print(train.shape)
# Output: (3148828, 12)
# Build a pandas-profiling report of the raw training frame.
from pandas_profiling import ProfileReport
# NOTE: gridspec is already imported earlier in the file; this repeat is
# redundant but harmless.
import matplotlib.gridspec as gridspec
# minimal=True selects pandas-profiling's minimal configuration, which per
# its documentation switches off the most expensive computations.
profile = ProfileReport(train, minimal=True)
# Bare expression: presumably the last line of a notebook cell, so that the
# report is rendered inline -- it has no effect when run as a plain script.
profile
#Variable Description
def description(df):
    """Build a per-column summary table for *df*.

    Prints the frame's shape, then returns a new DataFrame with one row per
    column of *df* and these columns:

    * ``Name``         - the column label
    * ``dtypes``       - the column dtype
    * ``Missing``      - number of null entries (``isnull().sum()``)
    * ``Uniques``      - number of distinct non-null values (``nunique()``)
    * ``First Value``, ``Second Value``, ``Third Value`` - the values found
      in the first three rows (by position) of *df*

    Args:
        df: The pandas DataFrame to summarize. It is not modified.

    Returns:
        pandas.DataFrame: the summary described above. When *df* has fewer
        than three rows, the sample columns that have no corresponding row
        are filled with NaN instead of raising ``IndexError``.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the table directly from the column labels rather than through
    # reset_index(), which would not produce an 'index' column when the
    # columns index carries a name.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    row_count = len(df)
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < row_count:
            summary[label] = df.iloc[position].values
        else:
            # Not enough rows to sample from: keep the column so the output
            # schema is stable, but mark it as missing.
            summary[label] = float('nan')

    return summary
# Emit a Markdown heading, then compute the per-column summary of `train`.
bold('**Variable Description of train Data:**')
# Bare call: the returned summary is only displayed when this is the last
# expression of a notebook cell (presumably the case here).
description(train)
# Output: Dataset Shape: (3148828, 12)
# Replace missing values with the most frequent value (the mode) of each column
def replace_nan(data):
    """Fill missing values in every column of *data* with that column's mode.

    The frame is modified in place: each column that contains at least one
    missing value is reassigned with its gaps filled by the most frequent
    non-missing value. When several values tie for most frequent, the first
    one returned by ``Series.mode()`` is used.

    Columns without missing values are left untouched. Columns that are
    entirely missing have no mode and are also left untouched, rather than
    raising on the empty mode result.

    Args:
        data: The pandas DataFrame to impute. Mutated in place.

    Returns:
        None.
    """
    for column in data.columns:
        values = data[column]
        if not values.isna().any():
            continue

        # mode() ignores missing entries, so an all-missing column yields
        # an empty result and there is nothing to fill with.
        modes = values.mode()
        if modes.empty:
            continue

        data[column] = values.fillna(modes.iloc[0])
# Impute missing values in `train` in place; replace_nan assigns the filled
# columns back into the frame it receives and returns nothing.
replace_nan(train)