import sqlite3
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')
!pip install fastai==0.7.0
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from matplotlib import rcParams
from matplotlib.cm import rainbow
import warnings
warnings.filterwarnings('ignore')
# Create your connection.
cnx = sqlite3.connect('/content/drive/My Drive/CSV files/database.sqlite')
df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)
df.head()
print(df.shape)
df.isna().sum()
print(df.shape)
print(df.dtypes)
df.columns
features = [
'potential', 'crossing', 'finishing', 'heading_accuracy',
'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
'gk_reflexes']
Specifying the Prediction Target
target = ['overall_rating']
Cleaning the Data
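Before dropping rows, it helps to see how much is actually missing per column. A small sketch I am adding here (assuming the df loaded above):
# Sketch (assumes df from above): percentage of missing values per column,
# largest first, to judge whether dropping incomplete rows is acceptable.
print((df.isna().mean() * 100).sort_values(ascending=False).head(10))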
df = df.dropna()
print(df.shape)
df.head()
X = df[features]
y = df[target]
print(X.shape)
print(y.shape)
X.head()
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(1,figsize=(24,15))
sns.heatmap(X.corr(),annot=True,cmap="YlGnBu")
plt.show()
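The heatmap shows feature-to-feature correlations; as a complementary sketch (my addition, reusing df, features and target from above), we can also rank the features by their absolute correlation with overall_rating:
# Sketch (assumes df, features, target from above): features most strongly
# correlated with the target, by absolute Pearson correlation.
corr_with_target = df[features + target].corr()[target[0]].drop(target[0])
print(corr_with_target.abs().sort_values(ascending=False).head(10))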
Let us look at a typical row from our features:
X.iloc[2]
y.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324,shuffle=True)
y_test.mean()
model1= LinearRegression(n_jobs=-1)
model1.fit(X_train, y_train)
y_pred1=model1.predict(X_test)
RMSE1 = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred1))
print(y_test.mean())
print(RMSE1)
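To put this RMSE in context, here is a minimal baseline sketch (my addition, assuming the same split as above) that always predicts the training-set mean:
# Sketch of a naive baseline (assumes X_train, y_train, X_test, y_test from above):
# predict the training mean for every player and report the resulting RMSE.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
baseline_rmse = sqrt(mean_squared_error(y_test, baseline.predict(X_test)))
print(baseline_rmse)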
len(X.columns)
score=[]
for i in range(2, len(X.columns) + 1):
    model2 = DecisionTreeRegressor(max_depth=10+i, max_features=i)
    model2.fit(X_train, y_train)
    y_pred2 = model2.predict(X_test)
    RMSE2 = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred2))
    score.append(RMSE2)
RMSE2
plt.figure(1,figsize=(24,8))
list1=list(range(2,len(X.columns)+1))
sns.lineplot(x=list1,y=score)
sns.scatterplot(x=list1,y=score,color='red',legend='brief')
for i in range(len(list1)):
    plt.text(x=list1[i]+0.05, y=score[i]+0.05, s=round(score[i], 3), size=10)
plt.xticks([i for i in range(2, len(X.columns) + 1)])
plt.xlabel('Max features')
plt.xlim(1,35)
plt.ylim(1.2,2.8)
plt.ylabel('Scores')
plt.title('Decision Tree Regressor RMSE scores for different numbers of maximum features')
plt.show()
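The loop above tunes max_features against a single train/test split; a cross-validated alternative would look roughly like the sketch below (assuming scikit-learn 0.22+ for the 'neg_root_mean_squared_error' scorer; the grid values and max_depth here are illustrative, not the original settings):
# Sketch (assumes X_train, y_train from above): cross-validated search over
# max_features instead of scoring each value on the test set.
from sklearn.model_selection import GridSearchCV
param_grid = {'max_features': list(range(2, len(X.columns) + 1, 4))}
grid = GridSearchCV(DecisionTreeRegressor(max_depth=20), param_grid,
                    scoring='neg_root_mean_squared_error', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, -grid.best_score_)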
What is the mean of the target value in the test set?
score1=[]
estimators=[1,2,4,6,8,10,12,14,18,20]
for i in estimators:
    model3 = RandomForestRegressor(n_jobs=-1, n_estimators=i, max_depth=5+i)
    model3.fit(X_train, y_train.values.ravel())
    y_pred3 = model3.predict(X_test)
    RMSE3 = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred3))
    #print(y_test.mean())
    score1.append(RMSE3)
sns.set_style('whitegrid')
plt.figure(1,figsize=(18,8))
sns.lineplot(x=estimators,y=score1,color='yellow')
sns.scatterplot(x=estimators,y=score1,color='blue')
for i in range(len(estimators)):
    plt.text(x=estimators[i]+0.05, y=score1[i]+0.05, s=round(score1[i], 3), size=12)
plt.xticks(estimators)
plt.ylim(0.8,3.1)
#plt.xlim(1,20)
plt.xlabel('n_estimators')
plt.ylabel('Scores')
plt.title('Random Forest Regressor RMSE scores for different numbers of estimators')
plt.show()
print("Computed Random Forest Scores By tuning")
score1
m_fea=['auto','sqrt','log2',28,34]
score_fea=[]
for i in m_fea:
    model3_rf = RandomForestRegressor(n_jobs=-1, n_estimators=20, max_depth=25, max_features=i)
    model3_rf.fit(X_train, y_train.values.ravel())
    y_pred3_rf = model3_rf.predict(X_test)
    RMSE3_rf = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred3_rf))
    #print(y_test.mean())
    score_fea.append(RMSE3_rf)
    print('max_features: ' + str(i))
    print(RMSE3_rf)
max_feat=pd.DataFrame({'Name':m_fea,'RMSE_Score':score_fea})
max_feat
plt.figure(1,figsize=(14,8))
sns.barplot(x='Name',y='RMSE_Score',data=max_feat)
for i in range(5):
    plt.text(x=i-0.1, y=score_fea[i]+0.002, s=round(score_fea[i], 3), size=14)
plt.ylim(0.9,1.05)
plt.ylabel('RMSE Score')
plt.xlabel('max_features method')
plt.title('Comparing max_features values on model performance: lower RMSE is better')
plt.show()
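Since model3_rf (the last forest fitted in the loop above) is still in scope, a short sketch of its learned feature importances:
# Sketch (assumes model3_rf and features from above): the ten attributes the
# fitted random forest relies on most.
importances = pd.Series(model3_rf.feature_importances_, index=features)
print(importances.sort_values(ascending=False).head(10))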
reg_rf= RandomForestRegressor(n_jobs=-1,n_estimators=1,max_depth=3,bootstrap=False)
reg_rf.fit(X_train, y_train.values.ravel())
draw_tree(reg_rf.estimators_[0], X_train, precision=5)
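draw_tree comes from the old fastai 0.7 structured module; if that import is unavailable, scikit-learn's own plot_tree gives a rough equivalent (my substitution, not the original call):
# Sketch of an alternative to fastai's draw_tree (assumes reg_rf and features
# from above): render the same single shallow tree with scikit-learn.
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(reg_rf.estimators_[0], feature_names=features, filled=True, precision=5)
plt.show()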
print(RMSE1)
print(RMSE2)
print(RMSE3)
model5= KNeighborsRegressor()
model5.fit(X_train, y_train.values.ravel())
y_pred5=model5.predict(X_test)
RMSE5 = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred5))
print(y_test.mean())
print(RMSE5)
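The KNN regressor above uses the default n_neighbors=5; a quick sketch (my addition, same split as above) of how sensitive it is to that choice:
# Sketch (assumes X_train, y_train, X_test, y_test from above): vary n_neighbors
# and report the resulting RMSE for each value.
for k in [3, 5, 9, 15]:
    knn = KNeighborsRegressor(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train, y_train.values.ravel())
    print(k, round(sqrt(mean_squared_error(y_test, knn.predict(X_test))), 3))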
estimators_gb=[18,20,24,28,30,32,36,40,60,80,100,120]
score3=[]
for i in estimators_gb:
    model6 = GradientBoostingRegressor(n_estimators=i)
    model6.fit(X_train, y_train.values.ravel())
    y_pred6 = model6.predict(X_test)
    RMSE6 = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred6))
    score3.append(RMSE6)
    print("estimators: " + str(i))
    print(RMSE6)
sns.set_style('whitegrid')
len(score3)==len(estimators_gb)
plt.figure(1,figsize=(18,8))
sns.lineplot(x=estimators_gb,y=score3,color='orange')
sns.scatterplot(x=estimators_gb,y=score3,color='red')
for i in range(len(estimators_gb)):
    plt.text(x=estimators_gb[i]+0.05, y=score3[i]+0.05, s=round(score3[i], 3), size=12)
#plt.xticks(estimators)
plt.ylim(1.5,3.5)
plt.xlim(1,140)
plt.xlabel('n_estimators')
plt.ylabel('Scores')
plt.title('Gradient Boosting Regressor RMSE scores for different numbers of estimators')
plt.show()
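Refitting for every n_estimators value is expensive; since model6 (the last fit, with 120 estimators) is still in scope, staged_predict can trace the same RMSE curve from a single model, as sketched below:
# Sketch (assumes model6 and the test split from above): RMSE after each
# boosting stage, taken from one fitted model via staged_predict.
stage_rmse = [sqrt(mean_squared_error(y_test, pred))
              for pred in model6.staged_predict(X_test)]
print(stage_rmse[::20])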
regressor = pd.DataFrame({'Linear Regression': [RMSE1], 'Decision Tree Regressor': [RMSE2], 'Random Forest Regressor': [RMSE3_rf], 'Gradient Boosting Regressor': [RMSE6], 'K-Neighbors Regressor': [RMSE5]})
regressor
# For comparison: mean of the target value across the full dataset
y.mean()
prediction=pd.DataFrame({'Test':y_test.overall_rating,'DecisionTree':y_pred2,'RandomForest':y_pred3_rf,'GradientBoosting':y_pred6})
prediction=prediction.reset_index(drop=True)
print(prediction.shape)
prediction.head(15)
sns.set_style('darkgrid')
plt.figure(1,figsize=(20,8))
sns.lineplot(x=X_test.potential,y=y_test['overall_rating'],color='red',label='Test data')
sns.lineplot(x=X_test.potential,y=y_pred3,color='green',label='Random Forest Regressor')
sns.lineplot(x=X_test.potential,y=y_pred2,color='blue',label='Decision Tree Regressor predicted data')
plt.title("Predicted data vs. test data to see the deviation")
plt.legend()
plt.show()
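As a final sanity check, a sketch (my addition, using y_pred3 from the random forest above) of predicted versus actual ratings; points near the diagonal indicate small errors:
# Sketch (assumes y_test and y_pred3 from above): predicted vs. actual ratings.
plt.figure(figsize=(8, 8))
sns.scatterplot(x=y_test['overall_rating'], y=y_pred3, alpha=0.3)
lims = [y_test['overall_rating'].min(), y_test['overall_rating'].max()]
plt.plot(lims, lims, color='black', linestyle='--')
plt.xlabel('Actual overall_rating')
plt.ylabel('Predicted overall_rating')
plt.show()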