This article aims to predict the count of casual users (feature casual), the count of registered users (feature registered), and the total count of both casual and registered users (feature cnt) using a multiple linear regression model.
The source data, day.csv, is available on this webpage, and the accompanying script, Bike Sharing Dataset Analytics -- Daily Data.py, can be found here.

Exploring the data

import pandas as pd
# Load the daily bike-sharing records and preview the first few rows.
DATA_PATH = "day.csv"
day = pd.read_csv(DATA_PATH)
day.head()

	instant	dteday	season	mnth	weekday	workingday	weathersit	temp	atemp	hum	windspeed	casual	registered	cnt
0	1	2011-01-01	1	1	6	0	2	0.344167	0.363625	0.805833	0.160446	331	654	985
1	2	2011-01-02	1	1	0	0	2	0.363478	0.353739	0.696087	0.248539	131	670	801
2	3	2011-01-03	1	1	1	1	1	0.196364	0.189405	0.437273	0.248309	120	1229	1349
3	4	2011-01-04	1	1	2	1	1	0.200000	0.212122	0.590435	0.160296	108	1454	1562
4	5	2011-01-05	1	1	3	1	1	0.226957	0.229270	0.436957	0.186900	82	1518	1600

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

def _long_means(frame, key, rounded=False):
    """Return the mean casual/registered/cnt per value of `key` in long form.

    The result is indexed by `key` and has two columns: "cat" (which of the
    three counts the row refers to) and "people per day" (the mean value,
    rounded to whole people when `rounded` is true).
    """
    means = frame[[key, "casual", "registered", "cnt"]].groupby([key]).mean()
    if rounded:
        means = round(means)
    return (
        means.stack()
        .reset_index()
        .set_index(key)
        .rename(columns={"level_1": "cat", 0: "people per day"})
    )


# Categorical features; each one gets its own small panel below the
# month-level overview.
cat = ["season", "yr", "holiday", "weekday", "workingday", "weathersit"]

sns.set(style="ticks", palette="pastel")
f, axes = plt.subplots(3, 3, sharey=False, figsize=(15, 12))
# The month overview spans the whole top row.  Remove the three axes that
# plt.subplots created there *before* adding the wide axis, so no stale,
# empty axes (and their tick labels) are left underneath it.
for unused in axes[0]:
    unused.remove()
ax = plt.subplot2grid((3, 3), (0, 0), colspan=3)

result = _long_means(day, "mnth")
sns.barplot(x=result.index, y="people per day", data=result, hue="cat", ax=ax)

# Rows 1 and 2 of the grid hold one bar chart per categorical feature.
for ind, val in enumerate(cat):
    result = _long_means(day, val, rounded=True)
    sns.barplot(
        x=result.index,
        y="people per day",
        data=result,
        hue="cat",
        ax=axes[ind // 3 + 1, ind % 3],
    )
f.tight_layout(pad=3.0)
plt.show()

def _long_counts(frame, key):
    """Return every row's casual/registered/cnt in long form, indexed by `key`.

    The result has two columns: "cat" (which of the three counts the row
    refers to) and "people" (the count itself).
    """
    counts = frame[["casual", "registered", "cnt"]].set_index(frame[key])
    return (
        counts.stack()
        .reset_index()
        .set_index(key)
        .rename(columns={"level_1": "cat", 0: "people"})
    )


f, axes = plt.subplots(3, 3, sharey=False, figsize=(20, 12))
# The month-level violin plot spans the whole top row.  Remove the three
# axes that plt.subplots created there *before* adding the wide axis, so no
# stale, empty axes (and their tick labels) are left underneath it.
for unused in axes[0]:
    unused.remove()
ax = plt.subplot2grid((3, 3), (0, 0), colspan=3)

result = _long_counts(day, "mnth")
sns.violinplot(x=result.index, y="people", hue="cat", data=result, cut=0, ax=ax)

# Rows 1 and 2 of the grid hold one violin plot per categorical feature
# listed in `cat`.
for ind, val in enumerate(cat):
    result = _long_counts(day, val)
    sns.violinplot(
        x=result.index,
        y="people",
        hue="cat",
        data=result,
        cut=0,
        ax=axes[ind // 3 + 1, ind % 3],
    )
f.tight_layout(pad=3.0)
plt.show()

The number of people who rent a bike is increasing every year, which indicates that cycling is becoming more popular. This trend could be captured by fitting a model on date and year. However, since we do not have enough years of data, we mainly focus on the other features in our analysis.

Linear Regression

Split Data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical features; drop_first avoids a redundant
# (perfectly collinear) dummy column per feature.
day[cat] = day[cat].astype("category")
dummies = pd.get_dummies(day[cat], drop_first=True)
print(dummies.shape)

conti_predictors = ["temp", "atemp", "hum", "windspeed"]
print(day[conti_predictors].shape)
X = pd.concat([day[conti_predictors], dummies], axis=1)
# Select the three targets by name rather than by column position, so the
# selection does not silently change if the CSV's column layout does.
y = day[["casual", "registered", "cnt"]]
# A fixed random_state makes the split (and every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Work on explicit copies so the in-place scaling below never writes into
# (or warns about writing into) a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
# Fit the scaler on the training rows only and reuse it for the test rows,
# so no information from the test set leaks into the preprocessing.
sc = StandardScaler()
X_train[conti_predictors] = sc.fit_transform(X_train[conti_predictors])
X_test[conti_predictors] = sc.transform(X_test[conti_predictors])

(731, 24)
(731, 4)

Train Model

from sklearn.linear_model import LinearRegression
from dmba import adjusted_r2_score

# Fit a single linear model that predicts all three targets at once.
target_names = ["casual", "registered", "cnt"]
lr = LinearRegression().fit(X_train, y_train)

# Wrap the raw predictions in a labelled frame; astype(int) truncates the
# fractional part because the targets are head counts.
raw_predictions = lr.predict(X_test)
y_pred = pd.DataFrame(raw_predictions, columns=target_names).astype(int)
print(y_pred)

# NOTE(review): lr is fitted on three targets, so lr.coef_ is 2-D; confirm
# that adjusted_r2_score counts the predictors as intended in that case.
score = adjusted_r2_score(y_test, y_pred, lr)
print(score)

	casual	registered	cnt
0	1119	5058	6178
1	886	3975	4861
2	961	5148	6109
3	865	5510	6375
4	-94	2389	2294
...	...	...	...
178	317	2450	2768
179	-5	2653	2647
180	1252	4690	5942
181	465	2813	3278
182	482	3775	4258

0.5664326431246609

Because the number of people cannot be negative, we set the negative predictions to zero so that the results make sense.

# A head count cannot be negative: clamp negative casual/registered
# predictions to zero and, for exactly those rows, rebuild the total from
# the two corrected parts.  Rows that needed no correction keep the total
# the model predicted.
component_cols = ["casual", "registered"]
needs_fix = (y_pred[component_cols] < 0).any(axis=1)
y_pred[component_cols] = y_pred[component_cols].clip(lower=0)
# Assign through .loc on the frame itself (not on a column extracted from
# it) so the write is guaranteed to land in y_pred.
y_pred.loc[needs_fix, "cnt"] = y_pred.loc[needs_fix, component_cols].sum(axis=1)
print(y_pred)
print(adjusted_r2_score(y_test, y_pred, lr))

	casual	registered	cnt
0	1119	5058	6178
1	886	3975	4861
2	961	5148	6109
3	865	5510	6375
4	0	2389	2389
...	...	...	...
178	317	2450	2768
179	0	2653	2653
180	1252	4690	5942
181	465	2813	3278
182	482	3775	4258

0.5761640858277055