-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode
More file actions
61 lines (47 loc) · 1.42 KB
/
code
File metadata and controls
61 lines (47 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
# Q1
# Load CommuteStLouis.csv, print the summary statistics of its columns,
# and draw a histogram of the 'Age' column.
csl = pd.read_csv('CommuteStLouis.csv')
summary = csl.describe()
print(summary)
# Same figure as the implicit pyplot calls, built through an explicit Axes.
fig, ax = plt.subplots()
ax.hist(csl['Age'])
ax.set_xlabel('Age')
ax.set_ylabel('Freq')
ax.set_title('Histogram of Age')
plt.show()
# Q2
import seaborn as sb
# Drop the string columns so only the remaining variables are correlated
numeric_csl = csl.drop(columns=['City', 'Sex'])
# Correlation matrix
corr_matrix = numeric_csl.corr()
print(corr_matrix)
# Author's observation: Distance and Time are the most strongly correlated
# pair, with a coefficient of 0.830241.
sb.pairplot(numeric_csl)
plt.show()
# Author's observations on the pairplot:
# - The diagonal panels (top left to bottom right) show the frequency
#   distributions of Age, Distance and Time. The attributes are skewed
#   right, meaning the mean is greater than the median.
# - The off-diagonal panels show how the different attributes are correlated.
# - Any suitable answer with a valid explanation is accepted.
# Boxplot of Distance grouped by Sex
sb.boxplot(x='Sex', y='Distance', data=csl)
plt.show()
# Q3
# Fit a simple linear regression of Time on Distance and draw the fitted
# line on top of a scatterplot of the data.
from sklearn.linear_model import LinearRegression
df = pd.read_csv('CommuteStLouis.csv')
x = df['Distance']
y = df['Time']
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
# Indexing a pandas Series with [:, np.newaxis] was deprecated in pandas 1.x
# and is rejected by pandas 2.x, so convert to a NumPy array first and add
# the column axis there.
X = x.to_numpy()[:, np.newaxis]
plt.scatter(X, y)
plt.xlabel('Distance')
plt.ylabel('Time')
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
# Predictions on the training inputs give the points of the fitted line.
y_predicted = model.predict(X)
plt.plot(X, y_predicted)
plt.title('Scatterplot and Linear Regression of Time vs Distance')
plt.show()
# Q4
# Residuals plot for the linear regression of Time on Distance.
# Relies on X, y and LinearRegression defined in the Q3 section.
from yellowbrick.regressor import ResidualsPlot
model = LinearRegression(fit_intercept=True)
visualizer = ResidualsPlot(model)
# Fitting the visualizer fits the wrapped (so far unfitted) estimator.
visualizer.fit(X, y)
# Yellowbrick 1.0 deprecated poof() in favour of show(). Prefer show() and
# fall back to poof() so the script still runs on older releases.
if hasattr(visualizer, 'show'):
    visualizer.show()
else:
    visualizer.poof()