import numpy as np # linear algebra
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pandas_profiling
from IPython.display import HTML
# Notebook display call carried over from the original notebook; the HTML
# payload is an (almost) empty string.
HTML('''
''')

# The Colab import must be its own statement: in the exported file it was
# glued onto the closing parenthesis of the HTML(...) call, which is a
# syntax error.
from google.colab import drive

# Mount Google Drive so the training CSV can be read from it.
drive.mount('/content/drive')
nfl = pd.read_csv('/content/drive/My Drive/CSV files/nfl_train.csv')
nfl.head()

# Build a profiling report for the whole frame and save it as HTML.
profile = pandas_profiling.ProfileReport(nfl)
profile.to_file('nfl_overview.html')

# Quick structural overview of the loaded frame.
nfl.shape
nfl.info()
nfl.columns
# Bar chart of the number of rows for each value of the 'Team' column.
plt.figure(num=1, figsize=(8, 6))
sns.countplot(data=nfl, x="Team")
plt.title("Count plot of home and away teams:")
plt.show()
sns.set_style("darkgrid")

# The six rows with the largest 'S' values, plotted per player name.
fastest_rows = nfl.sort_values(by="S", ascending=False).head(6)
plt.figure(num=1, figsize=(12, 6))
sns.barplot(data=fastest_rows, x="S", y="DisplayName")
plt.title("Nfl top 6 fastest speed observed of players")
plt.xlim(8.5, 9.5)  # zoom in on the range the bars are drawn over
plt.xlabel("speed")
plt.ylabel("Players Name")
plt.show()

# The fifteen rows with the largest 'A' values, plotted per player name.
quickest_rows = nfl.sort_values(by="A", ascending=False).head(15)
plt.figure(num=1, figsize=(16, 10))
sns.barplot(data=quickest_rows, x="A", y="DisplayName")
plt.title("Nfl top 15 max acceleration observed of players")
plt.xlim(8, 15)
plt.xlabel("Acceleration")
plt.ylabel("Players Name")
plt.show()
# First row whose PlayerWeight equals the column maximum.
# Series.max() skips missing values and runs vectorised; the builtin max()
# iterates in Python and returns an order-dependent result when NaN is present.
nfl.loc[nfl.PlayerWeight == nfl.PlayerWeight.max(), ['DisplayName', 'PlayerWeight']].head(1)
# One horizontal count plot per categorical column, restricted to its most
# frequent values. Each entry is:
# (column, number of top values, palette, figure size, x-axis limits, title)
count_plot_specs = [
    ("PlayerCollegeName", 10, "Set1", (16, 8), (7500, 16500),
     'College with maximum players'),
    ("Stadium", 20, "Set1", (20, 12), (13200, 25000),
     'Top 20 Stadiums which hosted the maximum games: '),
    ("Location", 20, "Set3", (20, 12), (12200, 20000),
     'Top 20 Cities which hosted the maximum games: '),
    ("StadiumType", 20, "Set3", (20, 12), (0, 20000),
     'Top 20 types of Stadiums which hosted the maximum games: '),
]

for column, top_n, palette, fig_size, x_limits, title in count_plot_specs:
    plt.figure(figsize=fig_size)
    # value_counts() sorts by frequency, so the leading labels are the most common.
    most_common = nfl[column].value_counts().index[:top_n]
    plot = sns.countplot(y=column, data=nfl, order=most_common, palette=palette)
    plt.xlim(*x_limits)
    plt.title(title)
    plt.show()

# Ad-hoc inspections kept from the notebook (results are not stored).
nfl["Stadium"].unique()
nfl[nfl["StadiumType"] == 'Outdoor'].shape
# Temperature distribution for four selected 'Turf' values.
selected_turfs = ['Natural Grass', 'Artificial', 'Field Turf', 'UBU Sports Speed S5-M']
turf_rows = nfl[nfl["Turf"].isin(selected_turfs)]
plt.figure(figsize=(20, 10))
sns.violinplot(data=turf_rows, x="Turf", y="Temperature")
plt.title('Distribution of temperature when games played on different Turf.')
plt.show()
nfl["Turf"].unique()

# The same five 'StadiumType' values are used for both plots below.
selected_stadium_types = ['Outdoors', 'Indoors', 'Dome', 'Retractable Roof', 'Open']
stadium_rows = nfl[nfl["StadiumType"].isin(selected_stadium_types)]

# Temperature distribution per selected stadium type.
plt.figure(figsize=(20, 8))
sns.violinplot(data=stadium_rows, x="StadiumType", y="Temperature")
plt.title('Distribution of temperature when games played on different Stadium Types.')
plt.show()
nfl["StadiumType"].unique()

# Humidity distribution per selected stadium type.
plt.figure(figsize=(20, 8))
sns.violinplot(data=stadium_rows, x="StadiumType", y="Humidity")
plt.title('Distribution of Humidity when games played on different Stadium Types.')
plt.show()
sns.set_style("darkgrid")

# Shaded kernel-density estimate of the 'Yards' column, viewed on [-10, 25].
plt.figure(figsize=(20, 8))
sns.kdeplot(nfl["Yards"], shade=True)
plt.xlim(-10, 25)
plt.title('Distribution of Yards gained')
plt.show()

# Line plot of DefendersInTheBox against Yards using only the first 1000
# rows, with a 50% confidence band.
leading_rows = nfl.head(1000)
plt.figure(num=1, figsize=(18, 8))
sns.lineplot(
    data=leading_rows,
    x="Yards",
    y="DefendersInTheBox",
    ci=50,
    label='Defenders in the box',
)
plt.xlim(-5, 15)
plt.ylim(0, 15)
plt.title('lineplot showing relation between defenders in the box vs yards gained')
plt.xlabel('yards gained')
plt.ylabel('Defenders in The Box')
plt.show()
nfl.columns

# Columns used for the correlation matrix and for the regression models
# further down the script ('Yards' is the regression target).
features = [
    'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
    'Dir', 'YardLine',
    'Quarter', 'Down', 'Distance',
    'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'OffenseFormation',
    'DefendersInTheBox', 'Yards', 'PlayerWeight', 'Position', 'Week',
    'Temperature', 'Humidity',
]
# Members of `features` that are one-hot encoded with pd.get_dummies later on.
fea_dum = ['OffenseFormation', 'Position']

# Drop every row that has a missing value in any column of the frame; the
# shapes before and after show how many rows are lost.
nfl.shape
nfl = nfl.dropna()
nfl.shape

# Pairwise correlation of the numeric feature columns only. `features` also
# holds the columns listed in `fea_dum`; calling corr() on non-numeric
# columns raises in pandas >= 2.0, while older versions silently skipped
# them. Selecting numeric dtypes first gives the same matrix on both.
nfl_c = nfl[features].select_dtypes(include='number').corr()
nfl_c
sns.set_style("whitegrid")
plt.figure(num=1, figsize=(20, 10))

# Mask with ones on and above the main diagonal, so the heatmap only draws
# the strictly lower triangle of the correlation matrix.
mask = np.triu(np.ones_like(nfl_c))

sns.heatmap(nfl_c, mask=mask, annot=True, cmap="YlGnBu")
plt.title('Heatmap of features of Nfl dataset')
plt.show()
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# Work on a copy so the modelling steps never touch `nfl` itself.
sample_data = nfl.copy()
features

# Predictor matrix: the selected feature columns without the target.
X = sample_data[features].copy()
X.head()
X = X.drop(columns=['Yards'])
X.dtypes

# Target kept as a single-column DataFrame.
y = sample_data[['Yards']].copy()
y.shape
X.shape

# One-hot encode the columns listed in `fea_dum`.
fea_dum
X = pd.get_dummies(X, columns=fea_dum)
X.head()
nfl["Position"].unique()
X.dtypes

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=324,
)
X_train.shape
X_test.shape
y_test.shape
# Linear regression: fit on the training split, predict the test split and
# report the root-mean-squared error.
regressor = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
y_prediction = regressor.predict(X_test)
y_prediction
y_test.describe()
mse_linear = mean_squared_error(y_test, y_prediction)
RMSE_L = sqrt(mse_linear)
print(RMSE_L)

# Decision-tree regression with default settings, scored the same way.
regressor_d = DecisionTreeRegressor().fit(X_train, y_train)
y_prediction_d = regressor_d.predict(X_test)
y_prediction_d
mse_tree = mean_squared_error(y_test, y_prediction_d)
RMSE_D = sqrt(mse_tree)
print(RMSE_D)
# Wrap the raw prediction arrays in DataFrames that carry X_test's index.
# Without an explicit index the frames get a fresh 0..n-1 RangeIndex, while
# X_test / y_test keep the shuffled row labels from train_test_split; any
# label-aligned use (such as plotting a prediction column against an X_test
# column) would then pair each prediction with the wrong row.
y_predict_lin_data = pd.DataFrame(
    y_prediction,
    index=X_test.index,
    columns=['predict_linear_regressor'],
)
y_predict_dec_data = pd.DataFrame(
    y_prediction_d,
    index=X_test.index,
    columns=['predict_decision_tree_regressor'],
)
# Line plot of DefendersInTheBox against Yards over the first 1000 rows of
# `nfl`, drawn in red with a 50% confidence band.
leading_rows = nfl.head(1000)
plt.figure(num=1, figsize=(18, 8))
sns.lineplot(
    data=leading_rows,
    x="Yards",
    y="DefendersInTheBox",
    ci=50,
    color="red",
    label='Defenders in the box',
)
plt.ylim(0, 15)
plt.title('lineplot showing relation between defenders in the box vs yards gained')
plt.xlabel('yards gained')
plt.ylabel('Defenders in The Box')
plt.show()
sns.set_style("darkgrid")
plt.figure(num=1, figsize=(18, 8))

# Both curves share the same y values (defenders in the box for the test
# rows); they differ in x: observed yards versus decision-tree predictions.
# NOTE(review): every input here is a pandas Series, and Series passed
# together are matched on index labels - confirm the prediction frame uses
# the same index as X_test.
defenders_test = X_test["DefendersInTheBox"]
sns.lineplot(x=y_test["Yards"], y=defenders_test, color="red", label='test data')
sns.lineplot(
    x=y_predict_dec_data["predict_decision_tree_regressor"],
    y=defenders_test,
    color="blue",
    label='decision tree regressor predicted data',
)
plt.xlim(-8, 20)
plt.ylim(0, 14)
plt.title('lineplot showing relation between defenders in the box vs yards gained')
plt.xlabel('yards gained')
plt.ylabel('Defenders in The Box')
plt.show()