Learn practical skills, build real-world projects, and advance your career
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # plotting libraries
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

#set style of sns
sns.set(style='whitegrid', color_codes=True)
sns.set(rc={'figure.figsize':(12,9)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/creditcardfraud/creditcard.csv
#load in the data and check out the first few rows
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
df.head()

Explore the data to determine relationships between variables

#first, get some info about the data - note, first run appears to be a complete dataset with no null values
print(df.info())
print(df.describe())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null int64 dtypes: float64(30), int64(1) memory usage: 67.4 MB None Time V1 V2 V3 V4 \ count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 mean 94813.859575 3.919560e-15 5.688174e-16 -8.769071e-15 2.782312e-15 std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 V5 V6 V7 V8 V9 \ count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 mean -1.552563e-15 2.010663e-15 -1.694249e-15 -1.927028e-16 -3.137024e-15 std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 ... V21 V22 V23 V24 \ count ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 mean ... 1.537294e-16 7.959909e-16 5.367590e-16 4.458112e-15 std ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 min ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 25% ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 50% ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 75% ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 max ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 V25 V26 V27 V28 Amount \ count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 mean 1.453003e-15 1.699104e-15 -3.660161e-16 -1.206049e-16 88.349619 std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 Class count 284807.000000 mean 0.001727 std 0.041527 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 [8 rows x 31 columns]
#Class = 1 when there is a fraud detected; so lets investigate the proportion of the total that is fraudulent, and also correlations
prop_fraud = df['Class'].sum()/len(df['Class'])
prop_fraud
0.001727485630620034