XGBoost Model

Data Collection and Preprocessing

# All Imports Required Go Here

import requests
from datetime import datetime
from datetime import date
import os
import pandas as pd
import math
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

# Data from the John Hopkins University Dataset on GitHub
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

# Defining the variables required
filenames = ['time_series_covid19_confirmed_global.csv',
             'time_series_covid19_deaths_global.csv',
             'time_series_covid19_recovered_global.csv']

url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

# Making the main dataframes required for the analysis
confirmed_global = pd.read_csv(url + filenames[0])
deaths_global = pd.read_csv(url + filenames[1])
recovered_global = pd.read_csv(url + filenames[2])
country_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')

# Simple Data Cleaning - Removing and renaming the Columns

# Removing the Province/State column, as it is pretty much not of any use
confirmed_global.drop(columns = ['Province/State', 'Lat', 'Long'], inplace = True)
deaths_global.drop(columns = ['Province/State', 'Lat', 'Long'], inplace = True)
recovered_global.drop(columns = ['Province/State', 'Lat', 'Long'], inplace = True)

# Renaming the columns for easier access
confirmed_global.rename(columns = {"Country/Region": "country"}, inplace = True)
deaths_global.rename(columns = {"Country/Region": "country"}, inplace = True)
recovered_global.rename(columns = {"Country/Region": "country"}, inplace = True)

country_cases.rename(columns = {
    "Country_Region" : "country",
    "Last_Update": "last",
    "Confirmed": "confirmed",
    "Deaths": "deaths",
    "Recovered" : "recovered",
    "Active" : "active",
    "Mortality_Rate": "mortality"
}, inplace = True)

# Removing some duplicate values from the table
confirmed_global = confirmed_global.groupby(['country'], as_index = False).sum()
deaths_global = deaths_global.groupby(['country'], as_index = False).sum()
recovered_global = recovered_global.groupby(['country'], as_index = False).sum()

# This value is being changed as there was an error in the original dataset that had to be modified
confirmed_global.at[178, '5/20/20'] = 251667

# Making a dataframe with the country data in sorted order
country_cases_sorted = country_cases.sort_values('confirmed', ascending = False)
country_cases_sorted.index = [x for x in range(len(country_cases_sorted))]

Making the model

# Importing the necessary libraries
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error