# Linear-Regression/code at main · if-8/Linear-Regression · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

# Q1: load the commute data, print summary statistics and plot the
# distribution of the Age column.
csl = pd.read_csv('CommuteStLouis.csv')
summary = csl.describe()
print(summary)

# Work on the current Axes object directly rather than through the pyplot
# state-machine helpers; the resulting figure is the same.
age_axes = plt.gca()
age_axes.hist(csl['Age'])
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Freq')
age_axes.set_title('Histogram of Age')

plt.show()

# Q2: relationships between the attributes of the commute data.
import seaborn as sb

# Set the string-valued columns aside so that only the remaining columns
# enter the correlation matrix and the pair plot.
cls2 = csl.drop(columns=['City', 'Sex'])

# Pairwise correlation matrix.
correlations = cls2.corr()
print(correlations)
# Finding recorded by the original analysis: Distance and Time are the most
# strongly correlated pair (coefficient: 0.830241).

sb.pairplot(cls2)
plt.show()
# Reading the pair plot (notes from the original analysis):
# - The panels on the main diagonal (top left to bottom right) show the
#   frequency distributions of Age, Distance and Time. They are skewed to
#   the right, i.e. the mean is greater than the median.
# - The off-diagonal panels show how each pair of attributes is correlated.
# - Any suitable answer with a valid explanation is accepted.

# Distribution of commute Distance for each value of Sex.
sb.boxplot(data=csl, x='Sex', y='Distance')
plt.show()

# Q3: simple linear regression of commute Time on commute Distance, drawn
# on top of a scatter plot of the raw observations.
from sklearn.linear_model import LinearRegression

df = pd.read_csv('CommuteStLouis.csv')

x = df['Distance']
y = df['Time']
# The estimator needs a 2-D feature matrix of shape (n_samples, 1).
# Convert the Series to a NumPy array *before* adding the new axis:
# multi-dimensional indexing of a pandas Series (x[:, np.newaxis]) was
# deprecated in pandas 1.0 and raises an error in pandas 2.0 and later.
X = x.to_numpy()[:, np.newaxis]

plt.scatter(X, y)
plt.xlabel('Distance')
plt.ylabel('Time')

# Ordinary least squares fit with an intercept term.
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
y_predicted = model.predict(X)

# Overlay the fitted line on the scatter plot.
plt.plot(X, y_predicted)
plt.title('Scatterplot and Linear Regression of Time vs Distance')
plt.show()

# Q4: residual plot for the Time ~ Distance regression.
from yellowbrick.regressor import ResidualsPlot

# Hand a fresh, unfitted estimator to the visualizer; it is fitted on the
# X and y prepared in Q3 by the fit() call below.
model = LinearRegression(fit_intercept=True)
visualizer = ResidualsPlot(model)
visualizer.fit(X, y)
# show() is the replacement for poof(), which was deprecated in
# Yellowbrick 1.0; it finalizes the figure and renders it.
visualizer.show()