This article aims to predict the count of casual users (feature casual), the count of registered users (feature registered), and the total count of both casual and registered users (feature cnt) using a multiple regression model.
You can find the source data, day.csv, on this webpage, and the accompanying script, Bike Sharing Dataset Analytics -- Daily Data.py, here.

Exploring the data

import pandas as pd
# Load the daily bike-sharing records from day.csv (path is relative to the
# current working directory).
day = pd.read_csv("day.csv")
# Preview the first five rows; the value is only displayed when this runs as
# the last expression of a notebook cell or in a REPL.
day.head()
instant dteday season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 6 0 2 0.344167 0.363625 0.805833 0.160446 331 654 985
1 2 2011-01-02 1 0 1 0 0 0 2 0.363478 0.353739 0.696087 0.248539 131 670 801
2 3 2011-01-03 1 0 1 0 1 1 1 0.196364 0.189405 0.437273 0.248309 120 1229 1349
3 4 2011-01-04 1 0 1 0 2 1 1 0.200000 0.212122 0.590435 0.160296 108 1454 1562
4 5 2011-01-05 1 0 1 0 3 1 1 0.226957 0.229270 0.436957 0.186900 82 1518 1600

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

def _mean_riders_by(key, rounded=False):
    """Return the mean casual/registered/cnt per level of ``key`` in long format.

    The result is indexed by ``key`` and has two columns: ``cat`` (which of
    casual / registered / cnt the row refers to) and ``people per day`` (the
    mean for that group). With ``rounded=True`` the means are rounded to whole
    numbers before reshaping.
    """
    means = day[[key, "casual", "registered", "cnt"]].groupby([key]).mean()
    if rounded:
        means = means.round()
    return (
        means.stack()
        .reset_index()
        .set_index(key)
        .rename(columns={"level_1": "cat", 0: "people per day"})
    )


# Coded (categorical) columns; each one gets its own panel in the figure.
cat = ["season", "yr", "holiday", "weekday", "workingday", "weathersit"]

sns.set(style="ticks", palette="pastel")
f, axes = plt.subplots(3, 3, sharey=False, figsize=(15, 12))

# Merge the top row into a single wide axes for the monthly plot. The three
# original top-row axes are removed explicitly: recent Matplotlib versions no
# longer delete overlapping axes automatically, so they would otherwise remain
# visible underneath the wide panel.
grid = axes[0, 0].get_gridspec()
for covered in axes[0, :]:
    covered.remove()
ax = f.add_subplot(grid[0, :])

result = _mean_riders_by("mnth")
sns.barplot(x=result.index, y="people per day", data=result, hue="cat", ax=ax)

# Rows 1 and 2 of the grid hold one panel per categorical column.
for ind, val in enumerate(cat):
    row, col = divmod(ind, 3)
    result = _mean_riders_by(val, rounded=True)
    sns.barplot(
        x=result.index,
        y="people per day",
        data=result,
        hue="cat",
        ax=axes[row + 1, col],
    )
f.tight_layout(pad=3.0)
plt.show()
def _daily_riders_by(key):
    """Return every daily casual/registered/cnt value in long format, keyed by ``key``.

    The result is indexed by the ``key`` column of ``day`` and has two columns:
    ``cat`` (casual / registered / cnt) and ``people`` (that day's count).
    """
    return (
        day[["casual", "registered", "cnt"]]
        .set_index(day[key])
        .stack()
        .reset_index()
        .set_index(key)
        .rename(columns={"level_1": "cat", 0: "people"})
    )


f, axes = plt.subplots(3, 3, sharey=False, figsize=(20, 12))

# Merge the top row into a single wide axes for the monthly distribution. The
# three original top-row axes are removed explicitly: recent Matplotlib
# versions no longer delete overlapping axes automatically, so they would
# otherwise remain visible underneath the wide panel.
grid = axes[0, 0].get_gridspec()
for covered in axes[0, :]:
    covered.remove()
ax = f.add_subplot(grid[0, :])

result = _daily_riders_by("mnth")
sns.violinplot(x=result.index, y="people", hue="cat", data=result, cut=0, ax=ax)

# Rows 1 and 2 of the grid hold one panel per categorical column in `cat`.
for ind, val in enumerate(cat):
    row, col = divmod(ind, 3)
    result = _daily_riders_by(val)
    sns.violinplot(
        x=result.index,
        y="people",
        hue="cat",
        data=result,
        cut=0,  # do not extend the density beyond the observed range
        ax=axes[row + 1, col],
    )
f.tight_layout(pad=3.0)
plt.show()


The number of people who rent a bike is increasing every year, which indicates that riding a bicycle is becoming more popular. This trend could be captured by fitting a model on the date and year. However, since we do not have enough years of data, we mainly focus on the other features in our analysis.

Linear Regression

Split Data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Treat the coded columns listed in `cat` as categorical and one-hot encode
# them. drop_first=True drops one level per column so the dummies are not
# perfectly collinear with the intercept.
day[cat] = day[cat].astype("category")
dummies = pd.get_dummies(day[cat], drop_first=True)
print(dummies.shape)

conti_predictors = ["temp", "atemp", "hum", "windspeed"]
print(day[conti_predictors].shape)
X = pd.concat([day[conti_predictors], dummies], axis=1)
# Select the three targets by name instead of by column position, so the
# selection does not silently change if columns are added to or reordered in
# `day`.
y = day[["casual", "registered", "cnt"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Take explicit copies before writing the scaled values back; assigning into
# the frames returned by the split can otherwise trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test set leaks into preprocessing.
sc = StandardScaler()
X_train[conti_predictors] = sc.fit_transform(X_train[conti_predictors])
X_test[conti_predictors] = sc.transform(X_test[conti_predictors])
(731, 24)
(731, 4)

Train Model

from sklearn.linear_model import LinearRegression
from dmba import adjusted_r2_score

# Fit a single linear model that predicts all three target columns at once.
lr = LinearRegression().fit(X_train, y_train)

# Wrap the raw predictions in a labelled frame. astype(int) truncates toward
# zero rather than rounding, which is why `cnt` can differ by one from
# `casual + registered` in the printed predictions.
y_pred = pd.DataFrame(
    lr.predict(X_test),
    columns=["casual", "registered", "cnt"],
).astype(int)
print(y_pred)
print(adjusted_r2_score(y_test, y_pred, lr))
casual registered cnt
0 1119 5058 6178
1 886 3975 4861
2 961 5148 6109
3 865 5510 6375
4 -94 2389 2294
... ... ... ...
178 317 2450 2768
179 -5 2653 2647
180 1252 4690 5942
181 465 2813 3278
182 482 3775 4258
0.5664326431246609
Because the number of people cannot be negative, we replace negative predictions with zero so that the results make sense.
# Replace negative predicted rider counts with zero and keep `cnt` consistent.
#
# Rows in which either component was negative are recorded first, so that
# `cnt` is recomputed only for those rows; rows that were already valid keep
# their originally predicted total.
components = ["casual", "registered"]
was_negative = (y_pred[components] < 0).any(axis=1)

# Vectorised clip instead of per-element chained assignment
# (`y_pred[col].iloc[i] = 0`), which pandas warns about and which does not
# modify `y_pred` at all when Copy-on-Write is enabled.
y_pred[components] = y_pred[components].clip(lower=0)

# Recompute the total whenever *either* component was clipped, not only when
# `registered` was negative.
y_pred.loc[was_negative, "cnt"] = y_pred.loc[was_negative, components].sum(axis=1)

print(y_pred)
print(adjusted_r2_score(y_test, y_pred, lr))
casual registered cnt
0 1119 5058 6178
1 886 3975 4861
2 961 5148 6109
3 865 5510 6375
4 0 2389 2389
... ... ... ...
178 317 2450 2768
179 0 2653 2653
180 1252 4690 5942
181 465 2813 3278
182 482 3775 4258
0.5761640858277055