import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Remote CSV holding the study-hours dataset; the preview below shows the
# two columns it provides, "Hours" and "Scores".
data_url = "http://bit.ly/w-data"
s_data = pd.read_csv(filepath_or_buffer=data_url)
print("Data imported successfully")
# Display the first five rows as a quick sanity check of the import.
s_data.head(n=5)
Data imported successfully
| | Hours | Scores |
|---|---|---|
| 0 | 2.5 | 21 |
| 1 | 5.1 | 47 |
| 2 | 3.2 | 27 |
| 3 | 8.5 | 75 |
| 4 | 3.5 | 30 |
# Scatter the raw observations to eyeball the relationship between the
# hours studied and the score obtained.
s_data.plot(
    x='Hours',
    y='Scores',
    style='o',  # markers only, no connecting line
)
# Axis labels and title are independent of each other, so order is free.
plt.xlabel('Hours Studied')
plt.ylabel('Percentage Score')
plt.title('Hours Vs Percentage')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Feature matrix: every column except the last one (stays 2-D).
X = s_data.iloc[:, :-1].values
# Target vector: the column at position 1.
Y = s_data.iloc[:, 1].values

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# fit() returns the estimator itself, so building and training are chained.
regressor = LinearRegression().fit(x_train, y_train)
print("Training Complete.")
Training Complete.
# Evaluate the fitted straight line (slope * x + intercept) at every
# observed input value.
slope = regressor.coef_
intercept = regressor.intercept_
line = slope * X + intercept

# Overlay that line on a scatter of the complete dataset (all of X and Y,
# not only the held-out test rows).
plt.scatter(X, Y)
plt.plot(X, line)
plt.title("Comparison")
plt.show()
# Echo the held-out inputs, then ask the trained model to score them.
print(x_test)
y_pred = regressor.predict(X=x_test)
[[1.5]
 [3.2]
 [7.4]
 [2.5]
 [5.9]]
# Put the true test scores next to the model's predictions, one row per
# test sample, so they can be compared at a glance.
df = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
df
| | Actual | Predicted |
|---|---|---|
| 0 | 20 | 16.884145 |
| 1 | 27 | 33.732261 |
| 2 | 69 | 75.357018 |
| 3 | 30 | 26.794801 |
| 4 | 62 | 60.491033 |
# Predict the score for one chosen number of study hours.
# The value lives in a single variable so the hours printed below can never
# drift away from the input that is actually fed to the model.
hours = 7
# Despite its name, score_pred holds the *input* hours; the name is kept so
# any later code referring to it keeps working. reshape(-1, 1) turns the
# 1-element vector into the 2-D (samples, features) layout predict() is
# given elsewhere in this script.
score_pred = np.array([hours]).reshape(-1, 1)
predict = regressor.predict(score_pred)
print("No of hours = {}".format(hours))
# predict is an array with one entry per input row; show the only one.
print("Predicted Score = {}".format(predict[0]))
No of hours = 7
Predicted Score = 71.39275540593033
# Evaluating the model
from sklearn import metrics

# Mean absolute error between the actual test scores and the predictions.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error: ', mae)
Mean Absolute Error: 4.183859899002982