sklearn linear_model LinearRegression on Salaries

Category:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
#from IPython import get_ipython
#ipy = get_ipython()
#if ipy is not None:
#    ipy.run_line_magic('matplotlib', 'inline')
# Apply seaborn's default theme to every matplotlib figure created below.
sns.set()

# --- Load the data -----------------------------------------------------------
# The script reads the "YearsExperience" and "Salary" columns from this CSV.
df = pd.read_csv("./SalaryData1.csv")
print(df.shape)
# Prints True when at least one cell in the frame is missing.
print(df.isnull().values.any())

# --- Hold out 20% of the rows for evaluation ---------------------------------
# A fixed random_state keeps the split reproducible between runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Visual sanity check of the relationship on the training rows only.
train_set.plot.scatter(x='YearsExperience', y='Salary')

# --- Build feature / target frames -------------------------------------------
# .to_frame() keeps both X and y two-dimensional, which is the layout
# scikit-learn estimators expect for their inputs.
Xtrain = train_set["YearsExperience"].to_frame()
yTrain = train_set["Salary"].to_frame()
Xtest = test_set["YearsExperience"].to_frame()
ytest = test_set["Salary"].to_frame()

# Plain ndarray copies of the training data. They are not used in the visible
# part of this script; kept in case code further down relies on them.
X = np.array(Xtrain)
y = np.array(yTrain)

# --- Fit ordinary least squares ----------------------------------------------
lin_reg = LinearRegression()
lin_reg.fit(Xtrain, yTrain)

# --- Predict the salary for 10 years of experience ---------------------------
# predict() requires a 2-D (n_samples, n_features) input; passing the bare
# scalar 10 raises "ValueError: Expected 2D array, got scalar array instead".
# A one-row DataFrame with the training column name also avoids the
# feature-name mismatch warning for a model fitted on a DataFrame.
years_to_predict = 10
salary_pred = lin_reg.predict(
    pd.DataFrame({"YearsExperience": [years_to_predict]})
)
print("salary_pred: ", salary_pred)

# --- Evaluate on the held-out rows -------------------------------------------
# For regressors, score() returns the coefficient of determination (R^2).
score = lin_reg.score(Xtest, ytest)
print("score: ", score)