Learn practical skills, build real-world projects, and advance your career
Updated 4 years ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the raw automobile price data; the relative path is resolved against
# the current working directory.
auto_prices = pd.read_csv('Automobile price data _Raw_.csv')
def clean_auto_data(auto_prices):
    """Clean the raw automobile price DataFrame and return it.

    The frame is modified in place and the very same object is returned.
    The steps, in order:

    1. Every '-' in a column name is replaced with '_'.
    2. In the columns ``price``, ``bore``, ``stroke``, ``horsepower`` and
       ``peak_rpm`` the placeholder string '?' is replaced with NaN.
    3. Every row that contains a NaN in *any* column is dropped.
    4. The five columns above are converted with ``pd.to_numeric``.

    Parameters
    ----------
    auto_prices : pandas.DataFrame
        The raw data. After the column renaming it must contain the five
        columns listed above.

    Returns
    -------
    pandas.DataFrame
        The object that was passed in, after cleaning.

    Raises
    ------
    KeyError
        If one of the five expected columns is missing.
    ValueError
        If a value other than '?' in those columns cannot be parsed as a
        number by ``pd.to_numeric``.
    """
    # Relies on the file-level ``import pandas as pd`` / ``import numpy as np``.
    numeric_columns = ['price', 'bore', 'stroke', 'horsepower', 'peak_rpm']

    ## Recode names
    ## fix column names so the '-' character becomes '_'
    auto_prices.columns = [name.replace('-', '_') for name in auto_prices.columns]

    ## Treat missing values
    ## Missing values are coded as '?'; turn them into NaN so dropna sees them.
    for column in numeric_columns:
        is_missing = auto_prices[column] == '?'
        # Skip the assignment when nothing matches so that a column that is
        # already numeric is left untouched.
        if is_missing.any():
            auto_prices.loc[is_missing, column] = np.nan

    ## Remove rows with missing values. Rows must be dropped before the
    ## numeric conversion so no NaN remains in the converted columns.
    auto_prices.dropna(axis=0, inplace=True)

    ## Transform column data type
    ## Convert the cleaned columns to numeric values
    for column in numeric_columns:
        auto_prices[column] = pd.to_numeric(auto_prices[column])

    return auto_prices
# Clean the raw frame (rename columns, turn '?' placeholders into NaN, drop
# incomplete rows, convert types) and rebind the name to the returned frame.
auto_prices = clean_auto_data(auto_prices)
# Show the column names after cleaning.
print(auto_prices.columns)
Index(['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type',
'num_of_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke',
'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
'highway_mpg', 'price'],
dtype='object')
# Preview the first rows of the cleaned data.
auto_prices.head()
# List the dtype of every column.
auto_prices.dtypes
symboling int64
normalized_losses object
make object
fuel_type object
aspiration object
num_of_doors object
body_style object
drive_wheels object
engine_location object
wheel_base float64
length float64
width float64
height float64
curb_weight int64
engine_type object
num_of_cylinders object
engine_size int64
fuel_system object
bore float64
stroke float64
compression_ratio float64
horsepower int64
peak_rpm int64
city_mpg int64
highway_mpg int64
price int64
dtype: object
# Compute summary statistics for the cleaned data.
auto_prices.describe()