In the previous article, I analyzed the bike sharing dataset at the daily level. In this article, I will follow a similar procedure at the hourly level.
Source data: hour.csv
Source python file: Bike Sharing Dataset Analytics -- Hourly Data.py
Exploring the Data
import pandas as pd
# Read the hourly records into a DataFrame, then preview its first five rows.
hour = pd.read_csv(filepath_or_buffer="hour.csv")
hour.head(n=5)
|
instant |
dteday |
season |
yr |
mnth |
hr |
holiday |
weekday |
workingday |
weathersit |
temp |
atemp |
hum |
windspeed |
casual |
registered |
cnt |
| 0 |
1 |
2011-01-01 |
1 |
0 |
1 |
0 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.81 |
0.0 |
3 |
13 |
16 |
| 1 |
2 |
2011-01-01 |
1 |
0 |
1 |
1 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
8 |
32 |
40 |
| 2 |
3 |
2011-01-01 |
1 |
0 |
1 |
2 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
5 |
27 |
32 |
| 3 |
4 |
2011-01-01 |
1 |
0 |
1 |
3 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
3 |
10 |
13 |
| 4 |
5 |
2011-01-01 |
1 |
0 |
1 |
4 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
0 |
1 |
1 |
Visualization
import matplotlib.pyplot as plt
import seaborn as sns  # `sns` is used below but was never imported

# Top panel data: mean number of riders for each hour of the day.
result = hour[["hr", "casual", "registered", "cnt"]].groupby(["hr"]).mean()
# Reshape to long format (one row per hour/category pair). After stack() and
# reset_index() the former column names live in "level_1" and the values in
# column 0, so both are renamed to something readable.
result = (
    result.stack()
    .reset_index()
    .set_index("hr")
    .rename(columns={"level_1": "cat", 0: "people per hour"})
)

# Two stacked panels with independent y-axes.
f, axes = plt.subplots(2, sharey=False, figsize=(25, 12))
sns.barplot(x=result.index, y="people per hour", hue="cat", data=result, ax=axes[0])

# Bottom panel data: every raw hourly observation (no aggregation), reshaped
# to the same long format so the violin plot can show the full distribution.
result = hour[["casual", "registered", "cnt"]].set_index(hour["hr"])
result = (
    result.stack()
    .reset_index()
    .set_index("hr")
    .rename(columns={"level_1": "cat", 0: "people"})
)
# cut=0 stops each violin at the observed data range.
sns.violinplot(x=result.index, y="people", hue="cat", data=result, cut=0, ax=axes[1])
Linear Regression
Split Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Categorical predictors: one-hot encode them, dropping the first level of
# each so the dummy columns are not redundant with the intercept.
cat = ["season", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit"]
hour[cat] = hour[cat].astype("category")
# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (newer releases default to boolean columns).
dummies = pd.get_dummies(hour[cat], drop_first=True, dtype=int)
print(dummies.shape)

# Continuous predictors, used as-is here and standardised after the split.
conti_predictors = ["temp", "atemp", "hum", "windspeed"]
print(hour[conti_predictors].shape)

X = pd.concat([hour[conti_predictors], dummies], axis=1)
X.head()

# Select the three targets by name instead of by column position, so the
# code keeps working if the column order of the CSV ever changes.
y = hour[["casual", "registered", "cnt"]]
y.head()

# A fixed random_state makes the split, and every score computed from it,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Work on explicit copies: assigning into the frames returned by
# train_test_split otherwise triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows to avoid leaking test statistics.
sc = StandardScaler()
X_train[conti_predictors] = sc.fit_transform(X_train[conti_predictors])
X_test[conti_predictors] = sc.transform(X_test[conti_predictors])
(17379, 48)
(17379, 4)
|
temp |
atemp |
hum |
windspeed |
season_2 |
season_3 |
season_4 |
mnth_2 |
mnth_3 |
mnth_4 |
mnth_5 |
mnth_6 |
mnth_7 |
mnth_8 |
mnth_9 |
mnth_10 |
mnth_11 |
mnth_12 |
hr_1 |
hr_2 |
hr_3 |
hr_4 |
hr_5 |
hr_6 |
hr_7 |
hr_8 |
hr_9 |
hr_10 |
hr_11 |
hr_12 |
hr_13 |
hr_14 |
hr_15 |
hr_16 |
hr_17 |
hr_18 |
hr_19 |
hr_20 |
hr_21 |
hr_22 |
hr_23 |
holiday_1 |
weekday_1 |
weekday_2 |
weekday_3 |
weekday_4 |
weekday_5 |
weekday_6 |
workingday_1 |
weathersit_2 |
weathersit_3 |
weathersit_4 |
| 0 |
0.24 |
0.2879 |
0.81 |
0.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 1 |
0.22 |
0.2727 |
0.80 |
0.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 2 |
0.22 |
0.2727 |
0.80 |
0.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 3 |
0.24 |
0.2879 |
0.75 |
0.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 4 |
0.24 |
0.2879 |
0.75 |
0.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
|
casual |
registered |
cnt |
| 0 |
3 |
13 |
16 |
| 1 |
8 |
32 |
40 |
| 2 |
5 |
27 |
32 |
| 3 |
3 |
10 |
13 |
| 4 |
0 |
1 |
1 |
Train Model
from sklearn.linear_model import LinearRegression
from dmba import adjusted_r2_score

# Fit one linear model that predicts all three targets at once.
lr = LinearRegression()
lr.fit(X_train, y_train)

# predict() returns a plain array, so wrap it in a DataFrame with the target
# names. The frame gets a fresh 0..n-1 index, i.e. it lines up with y_test by
# position, not by label.
y_pred = lr.predict(X_test)
# Round to the nearest whole rider before casting: a bare astype(int)
# truncates toward zero, which systematically under-counts positive
# predictions and can leave cnt off by one from casual + registered.
y_pred = (
    pd.DataFrame(y_pred, columns=["casual", "registered", "cnt"])
    .round()
    .astype(int)
)
print(y_pred)
# NOTE(review): lr is fitted on three targets, so lr.coef_ is 2-D
# (targets x features). Confirm how adjusted_r2_score derives the number of
# predictors from the model in that case.
print(adjusted_r2_score(y_test, y_pred, lr))
|
casual |
registered |
cnt |
| 0 |
37 |
249 |
286 |
| 1 |
42 |
344 |
386 |
| 2 |
48 |
259 |
307 |
| 3 |
19 |
360 |
380 |
| 4 |
-9 |
-88 |
-98 |
| ... |
... |
... |
... |
| 4340 |
6 |
205 |
211 |
| 4341 |
6 |
109 |
115 |
| 4342 |
32 |
226 |
259 |
| 4343 |
52 |
93 |
146 |
| 4344 |
-5 |
142 |
137 |
0.5979873068757086
Because the number of people cannot be negative, we set any negative prediction to zero and recompute the total so that the results make sense.
# A rider count cannot be negative, so floor the two component predictions at
# zero. This is done column-wise in one step instead of looping over rows with
# chained `y_pred[col].iloc[i] = ...` assignments, which pandas may apply to a
# temporary copy (and which do nothing under Copy-on-Write).
y_pred[["casual", "registered"]] = y_pred[["casual", "registered"]].clip(lower=0)
# Rebuild the total for every row so it always equals casual + registered.
y_pred["cnt"] = y_pred["casual"] + y_pred["registered"]
print(y_pred)
print(adjusted_r2_score(y_test, y_pred, lr))
|
casual |
registered |
cnt |
| 0 |
37 |
249 |
286 |
| 1 |
42 |
344 |
386 |
| 2 |
48 |
259 |
307 |
| 3 |
19 |
360 |
380 |
| 4 |
0 |
0 |
0 |
| ... |
... |
... |
... |
| 4340 |
6 |
205 |
211 |
| 4341 |
6 |
109 |
115 |
| 4342 |
32 |
226 |
259 |
| 4343 |
52 |
93 |
146 |
| 4344 |
0 |
142 |
142 |
0.6153434162490663