# How to remove columns showing multicollinearity in a regression model?

463    Asked by ranjan_6399 in Data Science , Asked on Jan 15, 2020

# Multilinear Regression

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Load the cars data set into a DataFrame.
# The path is written as a raw string so the Windows backslashes are taken
# literally: in a normal string "\a" (as in "\all_py") is the BEL escape and
# silently corrupts the path, and the other "\X" pairs are invalid escapes.
cars = pd.read_csv(r"E:\Bokey\Excelr Data\Python Codes\all_py\Multilinear Regression\cars.csv")

# to get top 6 rows
cars.head(6)

# Correlation matrix
cars.corr()

# we see there exists high collinearity between input variables, especially
# between [HP & SP] and [VOL & WT], so there is a collinearity problem

# Scatter plot between the variables along with histograms
import seaborn as sns

sns.pairplot(cars)

# column names
cars.columns

# pd.plotting.scatter_matrix(cars) -> can also be used for plotting all pairs in
# one figure (older pandas releases exposed it as pd.tools.plotting.scatter_matrix)

# Fit the first model using all four predictors of MPG
import statsmodels.formula.api as smf  # formula interface for regression models

ml1 = smf.ols(formula='MPG~WT+VOL+SP+HP', data=cars).fit()

# Coefficients of the fitted model
ml1.params

# Full regression summary
ml1.summary()

# p-values for WT, VOL are more than 0.05 and we also know that [WT, VOL]
# have a high correlation value

# NOTE(review): the numbers noted beside the summary() calls below are
# presumably adjusted R-squared values (0.264 for WT+VOL is lower than 0.271
# for VOL alone, which plain R-squared on nested models would not allow)
# -- confirm by re-running.

# Model based only on volume
ml_v = smf.ols(formula='MPG~VOL', data=cars).fit()
ml_v.summary()  # 0.271
# p-value (see the summary output)

# Model based only on weight
ml_w = smf.ols(formula='MPG~WT', data=cars).fit()
ml_w.summary()  # 0.268

# Model based only on WT & VOL
ml_wv = smf.ols(formula='MPG~WT+VOL', data=cars).fit()
ml_wv.summary()  # 0.264

# Both coefficients' p-values became insignificant...
# So there may be a case for keeping only one of VOL & WT

# Check whether the data has any influential observations
# (influence plot of the full model ml1)
import statsmodels.api as sm

sm.graphics.influence_plot(ml1)

# Rows showing high influence are excluded entirely.
# NOTE(review): this comment originally named indices 76 AND 78, but the
# statement below drops the rows at positions 76 and 70 -- confirm against the
# influence plot which pair is intended.

# Studentized Residuals = Residual/standard deviation of residuals
cars_new = cars.drop(index=cars.index[[76, 70]])

# Scratch notes on DataFrame.drop usage:
#cars.drop(["MPG"],axis=1)
# X => A B C D
# X.drop(["A","B"],axis=1) # Dropping columns
# X.drop(X.index[[5,9,19]],axis=0)
#X.drop(["X1","X2"],axis=1)
#X.drop(X.index[[0,2,3]],axis=0)

# Refit the four-predictor model on the data with the dropped rows removed
ml_new = smf.ols(formula='MPG~WT+VOL+HP+SP', data=cars_new).fit()

# Coefficients of the refitted model
ml_new.params

# Regression summary
ml_new.summary()  # 0.806

# Confidence intervals for the coefficients; alpha=0.01 -> 99% confidence level
print(ml_new.conf_int(alpha=0.01))

# Predicted values of MPG on the same rows the model was fitted on
mpg_pred = ml_new.predict(exog=cars_new[['WT', 'VOL', 'HP', 'SP']])

# Calculating VIF values of the independent variables.
# VIF = 1 / (1 - R^2), where R^2 comes from regressing one predictor on the
# remaining predictors.


def _vif(formula, data):
    """Fit the auxiliary regression and return its R-squared and VIF.

    Args:
        formula: formula string handed to ``smf.ols`` (one predictor regressed
            on the others).
        data: data frame the regression is fitted on.

    Returns:
        Tuple ``(rsquared, 1 / (1 - rsquared))``.
    """
    rsq = smf.ols(formula, data=data).fit().rsquared
    return rsq, 1 / (1 - rsq)


rsq_hp, vif_hp = _vif('HP~WT+VOL+SP', cars_new)  # 16.33
rsq_wt, vif_wt = _vif('WT~HP+VOL+SP', cars_new)  # 564.98
rsq_vol, vif_vol = _vif('VOL~WT+SP+HP', cars_new)  # 564.84
rsq_sp, vif_sp = _vif('SP~WT+VOL+HP', cars_new)  # 16.35

# Storing VIF values in a data frame; labels match the column names of the
# data (the first label was previously spelled 'Hp').
d1 = {'Variables': ['HP', 'WT', 'VOL', 'SP'], 'VIF': [vif_hp, vif_wt, vif_vol, vif_sp]}

Vif_frame = pd.DataFrame(d1)

Vif_frame

# As weight (WT) has the highest VIF value, we are not going to include it in the prediction model

# Partial regression (added variable) plots for every predictor in ml_new
sm.graphics.plot_partregress_grid(ml_new)

# added variable plot for weight is not showing any significance

# Final model: same data, WT left out of the predictors
final_ml = smf.ols(formula='MPG~VOL+SP+HP', data=cars_new).fit()

# Coefficients of the final model
final_ml.params

# Regression summary
final_ml.summary()  # 0.809

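# As we can see, the value noted beside the summaries has increased from 0.806
# (ml_new, with WT) to 0.809 (final_ml, without WT).
# NOTE(review): this comment originally quoted 0.810 -> 0.812, which does not
# match the values noted beside the summary() calls. Plain R-squared cannot
# increase when a predictor is dropped from a model fitted on the same rows, so
# these are presumably adjusted R-squared values -- confirm by re-running.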