Learn practical skills, build real-world projects, and advance your career
Created 4 years ago
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# IPython magic, not plain Python: this line is only valid when the file is
# executed inside a notebook / IPython session.
%matplotlib inline
# Seaborn "ticks" style, with major tick marks of size 8 on both axes.
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
# The matplotlib 'ggplot' style sheet is applied after the seaborn style above.
plt.style.use('ggplot')
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory.
import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.
# Output: ['items.csv', 'shops.csv', 'sales_train.csv', 'test.csv', 'sample_submission.csv', 'item_categories.csv']
# Load the input CSV files into module-level DataFrames.
# Lookup tables (items, their categories, shops) are read together first.
items, item_category, shops = (
    pd.read_csv('../input/items.csv'),
    pd.read_csv('../input/item_categories.csv'),
    pd.read_csv('../input/shops.csv'),
)
# Then the training and test tables.
train = pd.read_csv('../input/sales_train.csv')
test = pd.read_csv('../input/test.csv')
# ---------- Functions ----------
def eda(data):
    """Print a quick exploratory summary of a DataFrame to standard output.

    The sections are printed in this order: the first five rows, the
    ``DataFrame.info()`` report, the column dtypes, the per-column counts
    from ``isnull()``, the per-column counts from ``isna()``, and the shape.

    Args:
        data: The pandas DataFrame to summarise.

    Returns:
        None. All output is written to standard output.
    """
    print("----------Top-5- Record----------")
    print(data.head(5))
    print("-----------Information-----------")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    # isna() is an alias of isnull(); this section repeats the counts above
    # and is kept so the printed layout stays the same.
    print("----------Null value-----------")
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)
def graph_insight(data):
    """Print the distinct column dtypes and draw histograms of numeric columns.

    Only columns whose dtype is ``float64`` or ``int64`` are selected for
    plotting; one histogram is drawn per selected column.

    Args:
        data: The pandas DataFrame to inspect.
    """
    distinct_dtypes = set(data.dtypes.tolist())
    print(distinct_dtypes)

    numeric_only = data.select_dtypes(include=['float64', 'int64'])
    numeric_only.hist(
        figsize=(16, 16),
        bins=50,
        xlabelsize=8,
        ylabelsize=8,
    )
def drop_duplicate(data, subset):
    """Drop duplicate rows from ``data`` in place and report how many went.

    Rows are compared on the columns named in ``subset``; the first row of
    each duplicate group is kept. After dropping, the index is reset to a
    fresh 0..n-1 range. The shape before and after, and the number of rows
    removed, are printed.

    Args:
        data: The pandas DataFrame to de-duplicate; it is modified in place.
        subset: List of column names used for the duplicate check.

    Returns:
        None. ``data`` itself is mutated.
    """
    rows_before = len(data)
    print('Before drop shape:', data.shape)

    data.drop_duplicates(subset, keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)

    rows_after = len(data)
    print('After drop shape:', data.shape)
    print('Total Duplicate:', rows_before - rows_after)