import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols as sm_ols
from statsmodels.iolib.summary2 import summary_col # nicer tables
import matplotlib.pyplot as plt
# Load the housing training data that every plot and regression below reads.
train_csv_path = "./input_data2/housing_train.csv"
housing_df = pd.read_csv(train_csv_path)
Insert cells below as needed to write a short exploratory data analysis (EDA) section that summarizes the dataset for a reader who has never opened it before.
# Sale price by neighborhood, drawn twice: a box plot of the distribution and
# then seaborn's bar plot of the per-neighborhood aggregate. The neighborhood
# labels are rotated so they do not overlap on the x-axis.
for draw_plot in (sns.boxplot, sns.barplot):
    draw_plot(data=housing_df, x='v_Neighborhood', y='v_SalePrice')
    plt.xticks(rotation=75, ha='right')
    plt.tight_layout()
    plt.show()

# Sale price distribution at each level of the two overall rating columns.
for rating_col in ('v_Overall_Qual', 'v_Overall_Cond'):
    sns.boxplot(data=housing_df, x=rating_col, y='v_SalePrice')
    plt.show()
# Sale price against lot area in all four level/log combinations, so the
# functional forms can be compared side by side.
reg1 = sm_ols('v_SalePrice ~ v_Lot_Area', data=housing_df).fit()
reg2 = sm_ols('v_SalePrice ~ np.log(v_Lot_Area)', data=housing_df).fit()
reg3 = sm_ols('np.log(v_SalePrice) ~ v_Lot_Area', data=housing_df).fit()
reg4 = sm_ols('np.log(v_SalePrice) ~ np.log(v_Lot_Area)', data=housing_df).fit()

# Log sale price against year sold: first as a single numeric regressor, then
# as two indicator terms (sold in 2007, sold in 2008); every other year falls
# into the intercept.
reg5 = sm_ols('np.log(v_SalePrice) ~ v_Yr_Sold', data=housing_df).fit()
reg6 = sm_ols(
    'np.log(v_SalePrice) ~ v_Yr_Sold==2007 + v_Yr_Sold==2008',
    data=housing_df,
).fit()

# Multivariate model of log sale price: garage capacity, living area, overall
# quality, plus indicators for two building types and three neighborhoods.
# The neighborhood code must be spelled "StoneBr": comparing against "StoneBR"
# matched no rows, which produced an all-zero regressor (a -0.00 coefficient
# with a 0.00 standard error in the results table).
# NOTE(review): confirm the spelling with housing_df['v_Neighborhood'].unique().
reg7_formula = (
    'np.log(v_SalePrice) ~ v_Garage_Cars + v_Gr_Liv_Area + v_Overall_Qual'
    ' + v_Bldg_Type=="1Fam" + v_Bldg_Type=="TwnhsE"'
    ' + v_Neighborhood=="NoRidge" + v_Neighborhood=="NridgHt"'
    ' + v_Neighborhood=="StoneBr"'
)
reg7 = sm_ols(reg7_formula, data=housing_df).fit()
# Extra footer rows for the comparison table. summary_col already prints
# R-squared and adjusted R-squared for every model by default, so only the
# observation count is added here; re-adding the R-squared entries made those
# rows appear twice in the printed table.
info_dict = {
    'No. observations': lambda x: f"{int(x.nobs):d}",
}

# Column labels, in the same order as the fitted models passed below.
# These are placeholder names; usually the dependent variable name is used.
model_labels = ['Reg 1', 'Reg 2', 'Reg 3', 'Reg 4', 'Reg 5', 'Reg 6', 'Reg 7']

regression_table = summary_col(
    results=[reg1, reg2, reg3, reg4, reg5, reg6, reg7],
    float_format='%0.2f',
    stars=True,  # significance stars make significant coefficients easy to spot
    model_names=model_labels,
    info_dict=info_dict,
)
print(regression_table)
=========================================================================================================
Reg 1 Reg 2 Reg 3 Reg 4 Reg 5 Reg 6 Reg 7
---------------------------------------------------------------------------------------------------------
Intercept 154789.55*** -327915.80*** 11.89*** 9.41*** 22.29 12.02*** 10.51***
(2911.59) (30221.35) (0.01) (0.15) (22.94) (0.02) (0.02)
R-squared 0.07 0.13 0.06 0.13 0.00 0.00 0.80
R-squared Adj. 0.07 0.13 0.06 0.13 -0.00 0.00 0.80
np.log(v_Lot_Area) 56028.17*** 0.29***
(3315.14) (0.02)
v_Bldg_Type == "1Fam"[T.True] 0.12***
(0.01)
v_Bldg_Type == "TwnhsE"[T.True] 0.11***
(0.02)
v_Garage_Cars 0.11***
(0.01)
v_Gr_Liv_Area 0.00***
(0.00)
v_Lot_Area 2.65*** 0.00***
(0.23) (0.00)
v_Neighborhood == "NoRidge"[T.True] 0.06**
(0.03)
v_Neighborhood == "NridgHt"[T.True] 0.10***
(0.02)
v_Neighborhood == "StoneBR"[T.True] -0.00***
(0.00)
v_Overall_Qual 0.14***
(0.00)
v_Yr_Sold -0.01
(0.01)
v_Yr_Sold == 2007[T.True] 0.03
(0.02)
v_Yr_Sold == 2008[T.True] -0.01
(0.02)
R-squared 0.07 0.13 0.06 0.13 0.00 0.00 0.80
Adj R-squared 0.07 0.13 0.06 0.13 -0.00 0.00 0.80
No. observations 1941 1941 1941 1941 1941 1941 1940
=========================================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01