# Predict and evaluate a multivariate linear regression model

# Category:

# -*- coding: utf-8 -*-
"""
Created on Tue Jan 15 11:37:53 2019
https://www.ritchieng.com/machine-learning-evaluate-linear-regression-mo...
@author: K
"""

# imports
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same
# function has lived in `sklearn.model_selection` since 0.18.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases only
    from sklearn.cross_validation import train_test_split

# Allow plots to appear directly in the notebook (equivalent of the
# `%matplotlib inline` magic).
# The magic only exists inside an IPython/Jupyter session: `get_ipython()`
# returns None in a plain Python interpreter, and IPython may not be installed
# at all, so both cases are skipped instead of crashing the script.
try:
    from IPython import get_ipython
except ImportError:
    ipython_shell = None
else:
    ipython_shell = get_ipython()
if ipython_shell is not None:
    ipython_shell.run_line_magic('matplotlib', 'inline')

# Load the Advertising data set; the first CSV column is used as the row index.
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# Preview the first rows. A bare `data.head()` expression is evaluated and
# thrown away when it is not the last line of a notebook cell (or when the
# file runs as a script), so the preview is printed explicitly.
print(data.head())
# visualize the relationship between the features and the response using scatterplots
#sns.pairplot(data, x_vars=['TV','radio','newspaper'], y_vars='sales', height=7, aspect=0.7)


### STATSMODELS ###
# Specify an ordinary-least-squares model of `sales` on `TV` alone, then fit it.
tv_only_spec = smf.ols(formula='sales ~ TV', data=data)
lm1 = tv_only_spec.fit()
# Uncomment to inspect the fitted coefficients:
#print("STATSMODELS params: ",lm1.params)
# Statsmodels prediction
# The formula interface expects a DataFrame, so the single new observation
# (TV = 50) is wrapped in a one-row, one-column frame.
X_new = pd.DataFrame([50], columns=['TV'])
# Predict the response for that new observation and report it.
tv_50_prediction = lm1.predict(X_new)
print("Statsmodels Prediction for TV 50: ",tv_50_prediction)


### SCIKIT-LEARN ###
# Build the feature matrix X from the columns named in `feature_cols` and take
# the `sales` column as the response y.
feature_cols = ['TV','radio', 'newspaper']
X = data[feature_cols]
y = data.sales
# Instantiate the estimator and fit it on the full data set.
lm2 = LinearRegression()
lm2.fit(X, y)
# Uncomment to inspect the fitted intercept and coefficients:
#print("SCIKIT-LEARN params: ",lm2.intercept_,lm2.coef_)
# Scikit-learn prediction
# The model was fitted on a DataFrame, so the new observation is passed as a
# DataFrame with the same column names and order. A bare nested list makes
# recent scikit-learn versions warn that feature names are missing, and it
# hides which value belongs to which feature.
X_new_sk = pd.DataFrame([[44, 40, 45]], columns=feature_cols)
print("SCIKIT-LEARN Prediction for TV 44, radio 40, newspaper 45: ", lm2.predict(X_new_sk))

# pairwise visualization of the data
#sns.pairplot(data, x_vars=['TV','radio','newspaper'], y_vars='sales', height=7, aspect=0.7, kind='reg')

### SCIKIT-LEARN Split data ###
# Hold out part of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Create a fresh model and fit it on the training portion only
# (fit() returns the estimator itself, so the two steps chain).
lm2 = LinearRegression().fit(X_train, y_train)
# Predict the response for the held-out rows.
y_pred = lm2.predict(X_test)
# Root-mean-squared error between the held-out responses and the predictions.
test_mse = metrics.mean_squared_error(y_test, y_pred)
print("RMSE = ", np.sqrt(test_mse))