"""Predict and evaluate a multivariate linear regression model."""


# imports
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module is kept only as a fallback for
# very old installations where model_selection does not exist.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Allow plots to appear directly in the notebook (script form of the
# `%matplotlib inline` magic).
from IPython import get_ipython

# get_ipython() returns None when no interactive IPython shell is active
# (e.g. when this file is run as a plain script), so only issue the magic
# when a shell actually exists instead of crashing with AttributeError.
_ipython_shell = get_ipython()
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')

# Load the dataset, using the first CSV column as the row index.
# NOTE(review): the CSV path is an empty string in this file -- supply the
# dataset location before running.
data = pd.read_csv('', index_col=0)

# Optional: scatterplots of each feature against the response.
#sns.pairplot(data, x_vars=['TV','radio','newspaper'], y_vars='sales', height=7, aspect=0.7)

# Describe the single-feature model (sales explained by TV), then fit it.
tv_only_model = smf.ols(formula='sales ~ TV', data=data)
lm1 = tv_only_model.fit()

# Optional: inspect the fitted coefficients.
#print("STATSMODELS params: ",lm1.params)

# The new observation is wrapped in a DataFrame whose column name matches
# the variable used in the formula.
X_new = pd.DataFrame({'TV': [50]})

# Predict the response for the new observation and report it.
tv_prediction = lm1.predict(X_new)
print("Statsmodels Prediction for TV 50: ", tv_prediction)

# Build the feature matrix X (three advertising channels) and the response y.
feature_cols = ['TV', 'radio', 'newspaper']
X = data[feature_cols]
y = data.sales

# Instantiate the estimator, then fit it on the full dataset. The model must
# be fitted before predict() can be called.
lm2 = LinearRegression()
lm2.fit(X, y)

# Optional: inspect the fitted intercept and coefficients.
#print("SCIKIT-LEARN params: ",lm2.intercept_,lm2.coef_)

# Predict for a new observation. The values are wrapped in a DataFrame with
# the same column names (and order) used for fitting, so the estimator sees
# consistent feature names.
new_observation = pd.DataFrame([[44, 40, 45]], columns=feature_cols)
print("SCIKIT-LEARN Prediction for TV 44, 40, 45: : ", lm2.predict(new_observation))

# pairwise visualization of the data, with a regression line on each plot
#sns.pairplot(data, x_vars=['TV','radio','newspaper'], y_vars='sales', height=7, aspect=0.7, kind='reg')

### SCIKIT-LEARN Split data ###
# Split the data into training and test sets; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Instantiate a fresh model so the evaluation does not reuse the estimator
# that was fitted on the full dataset.
lm2 = LinearRegression()

# Fit on the training portion only; without this call predict() would run on
# an unfitted estimator and fail.
lm2.fit(X_train, y_train)

# Predict on the held-out test set and report the root mean squared error.
y_pred = lm2.predict(X_test)
print("RMSE = ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))