In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the study-hours dataset from a remote CSV (served behind a short link).
data_url = "http://bit.ly/w-data"
s_data = pd.read_csv(data_url)

# Fail fast if the link did not deliver the expected table: the rest of the
# notebook refers to the 'Hours' and 'Scores' columns by name.
missing_columns = {"Hours", "Scores"} - set(s_data.columns)
if missing_columns:
    raise ValueError(
        "Dataset is missing expected columns: {}".format(sorted(missing_columns))
    )
print("Data imported successfully")
s_data.head()
Data imported successfully
Out[2]:
Hours Scores
0 2.5 21
1 5.1 47
2 3.2 27
3 8.5 75
4 3.5 30
In [3]:
# Scatter the raw observations to eyeball the hours/score relationship.
ax = s_data.plot(x='Hours', y='Scores', style='o')
ax.set(
    title='Hours Vs Percentage',
    xlabel='Hours Studied',
    ylabel='Percentage Score',
)
plt.show()
In [4]:
# Select features and target by column name rather than by position, so the
# two selections cannot disagree if the column layout ever changes.
# Double brackets keep X two-dimensional (n_samples, 1); Y is one-dimensional.
X = s_data[['Hours']].to_numpy()
Y = s_data['Scores'].to_numpy()
In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
In [6]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(x_train, y_train)
print("Training Complete.")
Training Complete.
In [7]:
# Regression line: fitted slope and intercept evaluated at every value in X
line = regressor.coef_ * X + regressor.intercept_

# Overlay the fitted line on the X/Y observations. The line gets its own
# colour and a legend entry so it is distinguishable from the points, and the
# axes are labelled so the figure stands on its own.
plt.scatter(X, Y, label='Observed')
plt.plot(X, line, color='red', label='Fitted line')
plt.title("Comparison")
plt.xlabel('Hours Studied')
plt.ylabel('Percentage Score')
plt.legend()
plt.show()
In [8]:
# Making Predictions
y_pred = regressor.predict(x_test)  # model output for each held-out input row
print(x_test)
[[1.5]
 [3.2]
 [7.4]
 [2.5]
 [5.9]]
In [9]:
# Side-by-side table of the true test targets and the model's predictions
df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df
Out[9]:
Actual Predicted
0 20 16.884145
1 27 33.732261
2 69 75.357018
3 30 26.794801
4 62 60.491033
In [10]:
# Predict the score for one study duration. The query value lives in a single
# variable so the model input and the printed message cannot drift apart.
hours = 7

# The estimator expects a 2-D (n_samples, n_features) array, hence the nested
# list. Note that `score_pred` holds the *input* hours despite its name.
score_pred = np.array([[hours]])
predict = regressor.predict(score_pred)
print("No of hours = {}".format(hours))
print("Predicted Score = {}".format(predict[0]))
No of hours = 7
Predicted Score = 71.39275540593033
In [11]:
# Evaluating the model
from sklearn import metrics

# Average absolute gap between the true and predicted test scores
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)
Mean Absolute Error:  4.183859899002982