# Data
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

font_manager.fontManager.addfont("C:\Windows\Fonts\FiraSans-Regular.ttf")
plt.rcParams["font.family"] = "Fira Sans"

np.random.seed(283)

def pad(X):
    return np.append(X, np.ones((X.shape[0], 1)), 1)

def LR_data(n_train=100, n_val=100, p_features=1, noise=.1, w=None):
    if w is None:
        w = np.random.rand(p_features + 1) + .2
    X_train = np.random.rand(n_train, p_features)
    y_train = pad(X_train)@w + noise*np.random.randn(n_train)
    X_val = np.random.rand(n_val, p_features)
    y_val = pad(X_val)@w + noise*np.random.randn(n_val)
    return X_train, y_train, X_val, y_val
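As a quick illustration (our own toy example with hypothetical values, not from the original post), pad() appends a column of ones so that the last entry of the weight vector acts as an intercept:

```python
# Toy check: pad() appends a constant column of ones,
# so the last entry of w acts as the intercept term.
X_toy = np.array([[0.25], [0.75]])
pad(X_toy)
# array([[0.25, 1.  ],
#        [0.75, 1.  ]])
```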
Image credits: Figure 18.3, Sanchez, G., Marzban, E. (2020) All Models Are Wrong: Concepts of Statistical Learning.
Python source: 451-blog/linreg.py at main · doabell/451-blog
Instructions can be found at Implementing Linear Regression.
Algorithm Demo
We implement linear regression in two ways (both are summarized mathematically after this list):
- Using the analytical formula for the optimal weight vector
- Using gradient descent on the gradient formula
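For reference, here is our reading of the two approaches, a sketch of the standard least-squares setup with the padded feature matrix X from pad(); the exact scaling used in linreg.py may differ:

$$\hat{w} = (X^\top X)^{-1} X^\top y \qquad \text{(analytical formula)}$$

$$\nabla L(w) = 2\,X^\top (Xw - y), \qquad w \leftarrow w - \alpha\,\nabla L(w) \qquad \text{(gradient descent with learning rate } \alpha\text{)}$$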
Data
# Visualize
n_train = 100
n_val = 100
p_features = 1
noise = 0.16

# create some data
X_train, y_train, X_val, y_val = LR_data(n_train, n_val, p_features, noise)

# plot it
fig, axarr = plt.subplots(1, 2, sharex=True, sharey=True)
axarr[0].scatter(X_train, y_train, alpha=0.5)
axarr[1].scatter(X_val, y_val, alpha=0.5)
labs = axarr[0].set(title="Training", xlabel="x", ylabel="y")
labs = axarr[1].set(title="Validation", xlabel="x")
plt.tight_layout()
Analytical
We call LR.fit(), which defaults to using the analytical formula:
# Train model
from linreg import LinearRegression

LR = LinearRegression()
LR.fit(X_train, y_train)

fig, axarr = plt.subplots(1, 2, sharex=True, sharey=True)
axarr[0].scatter(X_train, y_train, alpha=0.5)
axarr[0].plot(X_train, pad(X_train)@LR.w, color="black")
axarr[1].scatter(X_val, y_val, alpha=0.5)
axarr[1].plot(X_val, pad(X_val)@LR.w, color="black")
labs = axarr[0].set(title="Training", xlabel="x", ylabel="y")
labs = axarr[1].set(title="Validation", xlabel="x")
plt.tight_layout()
print(f"Training score = {LR.score(X_train, y_train).round(4)}")
print(f"Validation score = {LR.score(X_val, y_val).round(4)}")
Training score = 0.7162
Validation score = 0.7064
Our validation score is similar to the training score.
Visually, the resulting line looks like a good fit for the data.
We can then check our estimated weight vector:
# Estimated weight vector
LR.w
array([0.80273893, 0.49734664])
Gradient descent
We call linear regression with method="gradient":
# Using gradient descent
LR2 = LinearRegression()
LR2.fit(X_train, y_train, method="gradient", alpha=0.001, max_iter=100)

print(f"Training score = {LR2.score(X_train, y_train).round(4)}")
print(f"Validation score = {LR2.score(X_val, y_val).round(4)}")
Training score = 0.6433
Validation score = 0.6966
The scores are similar to those of the analytical method.
We can check how the training score evolved with each iteration:
# score history
plt.plot(LR2.score_history)
labels = plt.gca().set(xlabel="Iteration", ylabel="Score")
We observe that the score improved quickly to around 0.6, and then improved more slowly.
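For intuition, here is a minimal sketch of what such a gradient-descent fit could look like. This is our own illustration, not the blog's linreg.py: it assumes a squared-error loss and that score() reports the coefficient of determination, and the gradient scaling may differ from the actual implementation.

```python
def gradient_fit_sketch(X, y, alpha=0.001, max_iter=100):
    # Pad X with a column of ones so the last weight is the intercept
    Xp = pad(X)
    w = np.random.rand(Xp.shape[1])      # random starting weights
    score_history = []
    denom = ((y - y.mean()) ** 2).sum()  # total sum of squares, used by the score
    for _ in range(max_iter):
        grad = 2 * Xp.T @ (Xp @ w - y)   # gradient of the squared-error loss
        w -= alpha * grad                # one gradient-descent step
        # coefficient of determination after this step
        score_history.append(1 - ((Xp @ w - y) ** 2).sum() / denom)
    return w, score_history

w_sketch, history_sketch = gradient_fit_sketch(X_train, y_train, alpha=0.001, max_iter=100)
```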
Finally, we can plot the resulting regression lines:
fig, axarr = plt.subplots(1, 2, sharex=True, sharey=True)
axarr[0].scatter(X_train, y_train, alpha=0.5)
axarr[0].plot(
    X_train, pad(X_train)@LR.w,
    label="Analytical", alpha=0.7, color="orange"
)
axarr[0].plot(
    X_train, pad(X_train)@LR2.w,
    label="Gradient descent", alpha=0.7, color="green"
)
axarr[1].scatter(X_val, y_val, alpha=0.5)
axarr[1].plot(
    X_val, pad(X_val)@LR.w,
    label="Analytical", alpha=0.7, color="orange"
)
axarr[1].plot(
    X_val, pad(X_val)@LR2.w,
    label="Gradient descent", alpha=0.7, color="green"
)
labs = axarr[0].set(title="Training", xlabel="x", ylabel="y")
labs = axarr[1].set(title="Validation", xlabel="x")
plt.tight_layout()
legend = plt.legend()
Although the two lines are slightly different, both appear to fit the data well.
Experiments
We now experiment with the number of features used in our linear regression model.
More features may lead to better scores, but might also make the model overfit.
Analytical
First, we make some data with 1 to 99 features. Then, we can fit our analytical linear regression on the data as follows:
scores_train_ana = []
scores_val_ana = []

n_train = 100
n_val = 100
noise = 0.2

for p_features in range(1, n_train):
    X_train, y_train, X_val, y_val = LR_data(
        n_train, n_val, p_features, noise
    )
    LR = LinearRegression()
    LR.fit(X_train, y_train)
    scores_train_ana.append(LR.score(X_train, y_train))
    scores_val_ana.append(LR.score(X_val, y_val))

# https://stackoverflow.com/a/67037892
plt.plot(range(1, len(scores_train_ana) + 1),
         scores_train_ana, label="Training")
plt.plot(range(1, len(scores_val_ana) + 1), scores_val_ana, label="Validation")
labels = plt.gca().set(xlabel="Number of features", ylabel="Score")
legend = plt.legend()
As we increase the number of features to fit on, the training score increases because more features are available. The improvement beyond 20 features is very small, though.
The validation score, however, decreases as we fit on more features. It even drops below 0 when the number of features gets close to 100 (the number of training and validation data points)!
Here, we can see overfitting. In particular, when fitting more than 80 features on 100 data points, each data point can have a fairly distinct combination of feature values, which lets the model fit the training points almost individually. This happens even though our data is generated from an underlying distribution (as seen in LR_data()).
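A brief note on the score itself: assuming score() reports the coefficient of determination (the same convention scikit-learn uses), a model that predicts the validation set worse than simply predicting the mean $\bar{y}$ receives a negative score:

$$R^2 = 1 - \frac{\sum_i (\hat{y}_i - y_i)^2}{\sum_i (\bar{y} - y_i)^2}$$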
Gradient descent
We can repeat the same experiment, but for gradient descent.
scores_train_gra = []
scores_val_gra = []

n_train = 100
n_val = 100
noise = 0.2

plt.figure(1)
for p_features in range(1, n_train):
    X_train, y_train, X_val, y_val = LR_data(n_train, n_val, p_features, noise)
    LR = LinearRegression()
    LR.fit(X_train, y_train, method="gradient", alpha=0.0001, max_iter=2000)
    scores_train_gra.append(LR.score(X_train, y_train))
    scores_val_gra.append(LR.score(X_val, y_val))
    # plot score over history
    if p_features in [1, 5, 10, 25, 50, 99]:
        plt.plot(
            LR.score_history,
            label=f"{p_features} features",
            alpha=0.6
        )
labels = plt.gca().set(xlabel="Iteration", ylabel="Score")
legend = plt.legend()
plt.ylim(bottom=0.5, top=1)

# plot score over features
plt.figure(2)
plt.plot(
    range(1, len(scores_train_gra) + 1),
    scores_train_gra, label="Training", alpha=0.7
)
plt.plot(
    range(1, len(scores_val_gra) + 1),
    scores_val_gra, label="Validation", alpha=0.7
)
labels = plt.gca().set(xlabel="Number of features", ylabel="Score")
legend = plt.legend()
For the score as a function of the number of features, we observe a trend similar to that of the analytical formula. There is not much improvement beyond 20 features, and the validation score fluctuates a lot with a higher number of features.
Also of interest is how the training score changes over the iterations for different numbers of features:
- When there is only 1 feature, our training score does not get higher than 0.7.
- The more features we use, the higher the training score we get (as illustrated by the second plot).
- However, with only 10 features, we already achieve a training score higher than 0.9.
- Higher numbers of features yield smaller improvements.
This behavior is consistent with overfitting, where the training score can get really close to 1, but the validation score fluctuates.
LASSO Regularization
One way to deal with this sort of overfitting, or indeed the presence of too many features, is to use a feature selection algorithm.
Here, we use LASSO regularization as implemented in the scikit-learn package. LASSO tends to drive feature weights to exactly zero, which effectively performs feature selection.
LASSO has a hyperparameter, alpha, that controls the strength of regularization; it can be any value in [0, inf).
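For reference, the objective that scikit-learn's Lasso minimizes is documented as follows; the $\ell_1$ penalty term $\alpha \lVert w \rVert_1$ is what pushes individual weights to exactly zero:

$$\min_{w} \; \frac{1}{2\,n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$$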
We repeat the above experiment to see how LASSO regularization can guard against overfitting.
from sklearn.linear_model import Lasso
scores_train_las = []
scores_val_las = []

n_train = 100
n_val = 100
noise = 0.2

for p_features in range(1, n_train):
    X_train, y_train, X_val, y_val = LR_data(
        n_train, n_val, p_features, noise
    )
    L = Lasso(alpha=0.001)
    L.fit(X_train, y_train)
    scores_train_las.append(L.score(X_train, y_train))
    scores_val_las.append(L.score(X_val, y_val))

# Plot
fig, axarr = plt.subplots(1, 2, sharex=True, sharey=True)
axarr[0].plot(
    range(1, len(scores_train_las) + 1), scores_train_las,
    label="Training", alpha=0.7
)
axarr[0].plot(
    range(1, len(scores_val_las) + 1), scores_val_las,
    label="Validation", alpha=0.7
)
axarr[1].plot(
    range(1, len(scores_train_ana) + 1), scores_train_ana,
    label="Training", alpha=0.7
)
axarr[1].plot(
    range(1, len(scores_val_ana) + 1), scores_val_ana,
    label="Validation", alpha=0.7
)
labs = axarr[0].set(title="LASSO", xlabel="Number of features", ylabel="Score")
labs = axarr[1].set(title="Analytical", xlabel="Number of features")
plt.tight_layout()
legend = plt.legend()
With a large number of features, LASSO was successful in reducing the training-validation gap that indicates overfitting.
That said, the validation score still drops noticeably beyond 80 features. Thus, for overparameterized problems like this, where the number of features is close to (or greater than!) the number of data points, it is still more advisable to:
- fit with fewer features, or
- reduce the severity of the overparameterization.
Additionally, LASSO also performs slightly better with very few (around 1-5) features, though this is subject to randomness (as is gradient descent).
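As a hedged illustration of the feature-selection angle (our own follow-up, not part of the original experiment): after fitting a Lasso, one could keep only the features with nonzero coefficients and refit the plain linear regression on that subset.

```python
# Hypothetical follow-up using the Lasso `L` and the data from the last loop iteration:
# keep the features whose LASSO coefficient survived (is nonzero), then refit.
selected = np.flatnonzero(L.coef_)

LR_sel = LinearRegression()
LR_sel.fit(X_train[:, selected], y_train)
print(f"Validation score on selected features = {LR_sel.score(X_val[:, selected], y_val)}")
```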
Data: Bike Sharing
Next, we can test the real-life performance of linear regression, with the analytical formula or with gradient descent.
We will be using this bike-sharing dataset from Washington, DC1 to predict the number of casual bikeshare users given information about that day.
(The setup code below is adapted from the blog post instructions.)
import pandas as pd
from sklearn.model_selection import train_test_split

bikeshare = pd.read_csv(
    "https://philchodrow.github.io/PIC16A/datasets/Bike-Sharing-Dataset/day.csv")

cols = ["casual",
        "mnth",
        "weathersit",
        "workingday",
        "yr",
        "temp",
        "hum",
        "windspeed",
        "holiday",
        "dteday"]

bikeshare_plot = bikeshare[cols]
bikeshare_plot = pd.get_dummies(bikeshare_plot, columns=['mnth'], drop_first="if_binary")

train, test = train_test_split(bikeshare_plot, test_size=.2, shuffle=False)

X_train = train.drop(["casual", "dteday"], axis=1)
y_train = train["casual"]

X_test = test.drop(["casual", "dteday"], axis=1)
y_test = test["casual"]

X_train.head()
|   | weathersit | workingday | yr | temp | hum | windspeed | holiday | mnth_2 | mnth_3 | mnth_4 | mnth_5 | mnth_6 | mnth_7 | mnth_8 | mnth_9 | mnth_10 | mnth_11 | mnth_12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 0 | 0 | 0.344167 | 0.805833 | 0.160446 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2 | 0 | 0 | 0.363478 | 0.696087 | 0.248539 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 1 | 0 | 0.196364 | 0.437273 | 0.248309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 0 | 0.200000 | 0.590435 | 0.160296 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 1 | 0 | 0.226957 | 0.436957 | 0.186900 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
X_train.shape
(584, 18)
Analytical
We first use the analytical method, inspecting the scores and weight vectors:
LR4 = LinearRegression()
LR4.fit(X_train, y_train)
print(f"Training score = {LR4.score(X_train, y_train).round(6)}")
print(f"Test score = {LR4.score(X_test, y_test).round(6)}")
Training score = 0.731836
Test score = 0.696773
pd.Series(LR4.w[:-1], index=X_train.columns).sort_values()
windspeed -1242.800381
workingday -791.690549
hum -490.100340
holiday -235.879349
weathersit -108.371136
mnth_2 -3.354397
mnth_12 90.821460
mnth_7 228.881481
mnth_8 241.316412
mnth_11 252.433004
yr 280.586927
mnth_6 360.807998
mnth_3 369.271956
mnth_9 371.503854
mnth_10 437.600848
mnth_4 518.408753
mnth_5 537.301886
temp 1498.715113
dtype: float64
Gradient Descent
Then, we repeat the same process with gradient descent.
LR5 = LinearRegression()
LR5.fit(X_train, y_train, method="gradient", alpha=0.0001, max_iter=10000)
print(f"Training score = {LR5.score(X_train, y_train).round(6)}")
print(f"Test score = {LR5.score(X_test, y_test).round(6)}")
Training score = 0.731833
Test score = 0.696753
pd.Series(LR5.w[:-1], index=X_train.columns).sort_values()
windspeed -1230.491728
workingday -791.472877
hum -484.647522
holiday -235.399837
weathersit -109.260413
mnth_2 -1.748008
mnth_12 93.293066
mnth_7 235.754820
mnth_8 247.603486
mnth_11 255.436488
yr 281.231420
mnth_6 366.826944
mnth_3 371.749383
mnth_9 376.868083
mnth_10 441.334934
mnth_4 521.567646
mnth_5 541.885682
temp 1488.043873
dtype: float64
We achieved nearly identical results with the two methods.
Since we have a reasonable number of data points (584) relative to features (18), we expect the analytical and gradient descent methods to arrive at similarly good solutions.
LASSO Regularization
L2 = Lasso(alpha=0.05)
L2.fit(X_train, y_train)

print(f"Training score = {L2.score(X_train, y_train).round(6)}")
print(f"Test score = {L2.score(X_test, y_test).round(6)}")
Training score = 0.731824
Test score = 0.696659
pd.Series(L2.coef_, index=X_train.columns).sort_values()
windspeed -1233.278124
workingday -791.462521
hum -484.813541
holiday -234.619924
weathersit -108.888907
mnth_2 -8.285024
mnth_12 83.321973
mnth_7 214.900960
mnth_8 226.955076
mnth_11 243.204653
yr 279.195051
mnth_6 348.087677
mnth_9 358.328354
mnth_3 361.587089
mnth_10 427.148731
mnth_4 509.108560
mnth_5 525.695908
temp 1516.891474
dtype: float64
Yet again, we arrive at similar results. This indicates that, unlike in the previous experiment, this dataset is not overparameterized.
Feature weights
So what do these numbers actually mean? Features with positive weights contribute to higher (casual) ridership; features with negative weights contribute to lower ridership.
Reading off the individual labels:
- The higher the average temperature, the more bike riders. This metric might be more helpful if we classified temperatures as “too cold”, “too hot”, and “great for biking” (see the sketch after this list).
- People biked more in March, April, May, and September. This might also have to do with the temperature, or nice weather in general.
- People biked less on a workingday, and more on weekends.
- People biked less when there was a strong wind.
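As a rough sketch of the temperature idea above (our own illustration; the cut points are arbitrary placeholders, and temp in this dataset is normalized to roughly [0, 1]):

```python
# Hypothetical banding of the normalized temperature into ride-friendliness categories
temp_band = pd.cut(
    bikeshare["temp"],
    bins=[0.0, 0.3, 0.7, 1.0],
    labels=["too cold", "great for biking", "too hot"],
    include_lowest=True,
)
# One-hot encode the bands so they could be used as features
temp_dummies = pd.get_dummies(temp_band, prefix="temp")
```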
Predictions
We first plot out the original, complete data for reference:
fig, ax = plt.subplots(1, figsize=(7, 3))
ax.plot(
    pd.to_datetime(bikeshare['dteday']),
    bikeshare['casual'], alpha=0.6
)
ax.set(xlabel="Day", ylabel="# of casual users")
l = plt.tight_layout()
This is important because the next plot is of the test set, and is therefore not continuous in dates.
Next, we can plot our predictions against the actual ridership. Here, LR5, our linear regression model fit with gradient descent, is used as the predictor.
fig, ax = plt.subplots(1, figsize=(8, 3))
ax.plot(
    pd.to_datetime(test['dteday']),
    test['casual'],
    label="Actual",
    alpha=0.6
)
ax.plot(
    pd.to_datetime(test['dteday']),
    LR5.predict(X_test),
    label="Predicted",
    alpha=0.6
)
ax.set(xlabel="Day (from test set)", ylabel="# of casual users")
plt.tight_layout()
legend = plt.legend()
We observe that the predicted ridership is much smoother than the actual ridership.
This may be because our features do not capture enough real-life conditions, and/or because the original data varies a lot from day to day.
However, our model was able to predict the overall “shape” of casual ridership over time, so it captures the general trend.
Footnotes
Fanaee-T, H., Gama, J. Event labeling combining ensemble detectors and background knowledge. Prog Artif Intell 2, 113–127 (2014). https://doi.org/10.1007/s13748-013-0040-3