from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
plt.rcParams['figure.figsize'] = (10,6)
Nov 17, 2021
We'll load data compiled from two data sources: GDP per capita and life satisfaction by country.
# Load the data
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
# Model selection
from sklearn.model_selection import train_test_split
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
First step: set up the test/train split of input data:
# Do a 70/30 train/test split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
As we increase the polynomial degree (add more and more polynomial features), two things happen: the training score keeps improving, while the test score gets worse and worse.
This is the classic case of overfitting — our model does not generalize well at all.
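To see this concretely, here is a minimal sketch (reusing the train/test split defined above) that fits an unregularized linear model with increasing polynomial degree and prints the training and test scores:
# Fit polynomial models of increasing degree and compare scores
for degree in [1, 3, 5, 10]:
    poly_model = make_pipeline(
        StandardScaler(), PolynomialFeatures(degree=degree), LinearRegression()
    )
    poly_model.fit(X_train, y_train)
    # The training score keeps rising, while the test score eventually drops off
    print(f"degree = {degree}")
    print(f"  Training Score = {poly_model.score(X_train, y_train)}")
    print(f"  Test Score = {poly_model.score(X_test, y_test)}")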
Ridge adds regularization to the linear regression least squares model. Remember, regularization penalizes large parameter values and complex fits.
Let's gain some intuition.
Important: the baseline fit uses a plain LinearRegression model and scales the input features with StandardScaler, while the Ridge fits apply both StandardScaler and PolynomialFeatures(degree=3) pre-processing to the features.
Set up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
fig, ax = plt.subplots(figsize=(10, 6))
## Plot the data
ax.scatter(
data["gdp_per_capita"] / 1e5,
data["life_satisfaction"],
label="Data",
s=100,
zorder=10,
color="#666666",
)
## Evaluate the linear fit
print("Linear fit")
training_score = linear.score(scaler.fit_transform(X_train), y_train)
print(f"Training Score = {training_score}")
    test_score = linear.score(scaler.transform(X_test), y_test)
print(f"Test Score = {test_score}")
print()
## Plot the linear fit
ax.plot(
X_pred / 1e5,
        linear.predict(scaler.transform(X_pred)),
color="k",
label="Linear fit",
)
## Ridge regression: linear model with regularization
# Plot the predicted values for each alpha
for alpha in [0, 10, 100, 1e5]:
print(f"alpha = {alpha}")
# Create out Ridge model with this alpha
ridge = Ridge(alpha=alpha)
# Fit the model on the training set
# NOTE: Use the pipeline that includes polynomial features
ridge.fit(pipe.fit_transform(X_train), y_train)
# Evaluate on the training set
training_score = ridge.score(pipe.fit_transform(X_train), y_train)
print(f"Training Score = {training_score}")
# Evaluate on the test set
        test_score = ridge.score(pipe.transform(X_test), y_test)
print(f"Test Score = {test_score}")
# Plot the ridge results
        y_pred = ridge.predict(pipe.transform(X_pred))
ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")
print()
# Plot formatting
ax.legend(ncol=2, loc=0)
ax.set_ylim(4, 8)
ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
ax.set_ylabel("Life Satisfaction")
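To see the "penalizes large parameter values" part directly, here is a minimal sketch (reusing the pipe defined above) that prints the fitted polynomial coefficients for a moderate and a very large alpha:
# Compare the fitted coefficients at moderate and heavy regularization
for alpha in [10, 1e5]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(pipe.fit_transform(X_train), y_train)
    # Larger alpha shrinks the polynomial coefficients toward zero
    print(f"alpha = {alpha}: coefficients = {ridge.coef_}")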
Note that the alpha = 0 case is equivalent to an unregularized LinearRegression() fit with the polynomial features.
More feature engineering!
In this case, I've done the hard work for you and pulled additional country properties from the OECD's website.
data2 = pd.read_csv("./data/gdp_vs_satisfaction_more_features.csv")
data2.head()
We'll move beyond simple linear regression and see if we can get a better fit.
A decision tree learns decision rules from the input features:
For a specific corner of the input feature space, the decision tree predicts an output target value
Decision trees can be very deep with many nodes — this will lead to overfitting your dataset!
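For some quick intuition before we move on to forests, here is a minimal sketch (fit on the GDP-only training data from above) that trains a single shallow tree and prints the decision rules it learned; DecisionTreeRegressor and export_text come from sklearn.tree:
from sklearn.tree import DecisionTreeRegressor, export_text

# Fit a single, shallow decision tree on the GDP-only training data
tree = DecisionTreeRegressor(max_depth=2, random_state=42)
tree.fit(X_train, y_train)

# Print the learned decision rules (splits on GDP per capita)
print(export_text(tree, feature_names=["gdp_per_capita"]))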
A random forest reduces this overfitting by fitting many decision trees on random subsets of the data and averaging their predictions. This is an example of ensemble learning: combining multiple estimators to achieve better overall accuracy than any one model could achieve.
from sklearn.ensemble import RandomForestRegressor
Let's split our data into training and test sets again:
# Split the data 70/30
train_set, test_set = train_test_split(data2, test_size=0.3, random_state=42)
# the target labels
y_train = train_set["life_satisfaction"].values
y_test = test_set["life_satisfaction"].values
# The features
feature_cols = [col for col in data2.columns if col not in ["life_satisfaction", "Country"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
import seaborn as sns
sns.heatmap(
train_set[feature_cols].corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1
)
New: Pipelines support models as the last step!
A Pipeline behaves just like a model, but it runs the transformations beforehand: call the fit() function of the pipeline instead of the model.
Establish a baseline with a linear model:
# Linear model pipeline with two steps
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Fit the pipeline
# NEW: This applies all of the transformations, and then fits the model
print("Linear regression")
linear_pipe.fit(X_train, y_train)
# NEW: Print the training score
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# NEW: Print the test score
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Now fit a random forest:
# Random forest model pipeline with two steps
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)
)
# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)
# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Because random forests are an ensemble method with multiple estimators, the algorithm can learn which features help improve the fit the most.
The importances are stored in the feature_importances_ attribute, which is available after calling fit()!
# What are the named steps?
forest_pipe.named_steps
# Get the forest model
forest_model = forest_pipe['randomforestregressor']
forest_model.feature_importances_
# Make the dataframe
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", title="Does Money Make You Happier?"
)
For more information, see the scikit-learn docs
The cross_val_score() function will automatically partition the training set into k folds, fit the model on the rest of the data for each fold, and return the score evaluated on each held-out partition.
It takes a Pipeline object, the training features, and the training labels as arguments.
from sklearn.model_selection import cross_val_score
cross_val_score?
# Make a linear pipeline
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Run the 3-fold cross validation
scores = cross_val_score(
linear_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
forest_pipe['randomforestregressor'].score?
model = forest_pipe['randomforestregressor']
model.score?
# Make a random forest pipeline
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 3-fold cross validation
scores = cross_val_score(
forest_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
The number of estimators, n_estimators, is a model hyperparameter: unlike the model's internal parameters, hyperparameters are not learned from the data during training (via the fit() method), so we have to choose their values ourselves.
This is when cross validation becomes very important.
from sklearn.model_selection import GridSearchCV
Let's do a search over the n_estimators
parameter and the max_depth
parameter:
# Create our regression pipeline
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
pipe
The syntax for specifying a step's parameters in the search grid is "[step name]__[parameter name]":
model_step = "randomforestregressor"
param_grid = {
f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}
param_grid
# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3)
# Run the search
grid.fit(X_train, y_train)
# The best estimator
grid.best_estimator_
# The best hyper parameters
grid.best_params_
We'll define a helper utility function to calculate the accuracy in terms of the mean absolute percentage error (MAPE):
def evaluate(model, X_test, y_test):
"""
Given a model and test features/targets, print out the
mean absolute error and accuracy
"""
# Make the predictions
predictions = model.predict(X_test)
# Absolute error
errors = abs(predictions - y_test)
avg_error = np.mean(errors)
# Mean absolute percentage error
mape = 100 * np.mean(errors / y_test)
# Accuracy
accuracy = 100 - mape
print("Model Performance")
print(f"Average Absolute Error: {avg_error:0.4f}")
print(f"Accuracy = {accuracy:0.2f}%.")
return accuracy
# Setup the pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(X_train, y_train)
# Evaluate on test set
evaluate(linear, X_test, y_test)
# Initialize the pipeline
base_model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
# Fit the training set
base_model.fit(X_train, y_train)
# Evaluate on the test set
base_accuracy = evaluate(base_model, X_test, y_test)
Small improvement!
# Evaluate the best random forest model
best_random = grid.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
# What's the improvement?
improvement = 100 * (random_accuracy - base_accuracy) / base_accuracy
print(f'Improvement of {improvement:0.4f}%.')
To recap: cross_val_score handles k-fold cross validation, and GridSearchCV optimizes hyperparameters over a grid of values.
In this part, we'll use a random forest model and housing data from the Office of Property Assessment to predict residential sale prices in Philadelphia
Note: We'll focus on the first two components in this analysis (and in assignment #7)
Too often, these models perpetuate inequality: low-value homes are over-assessed and high-value homes are under-assessed
Let's download data for properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' and sale_date < '2020-01-01' and category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
salesRaw.head()
len(salesRaw)
import missingno as msno
# We'll visualize the first half of columns
# and then the second half
ncol = len(salesRaw.columns)
fields1 = salesRaw.columns[:ncol//2]
fields2 = salesRaw.columns[ncol//2:]
# The first half of columns
msno.bar(salesRaw[fields1]);
# The second half of columns
msno.bar(salesRaw[fields2]);
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
# Make a random forest pipeline
forest = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
# Fit on the training data
forest.fit(X_train, y_train)
# What's the test score?
forest.score(X_test, y_test)
Model appears to generalize reasonably well!
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
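As a sketch of what that could look like, we could reuse the GridSearchCV pattern from earlier on the housing pipeline (the parameter values below are illustrative, not tuned recommendations):
# Hypothetical grid search over the housing random forest pipeline
housing_param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}
housing_grid = GridSearchCV(forest, housing_param_grid, cv=3)
housing_grid.fit(X_train, y_train)

# Compare the tuned model to the default fit on the test set
print("Best params = ", housing_grid.best_params_)
print("Test score = ", housing_grid.best_estimator_.score(X_test, y_test))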
# Extract the regressor from the pipeline
regressor = forest["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": regressor.feature_importances_}
)
importance.hvplot.barh(x="Feature", y="Importance")
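One last note: because we trained on the log of sale price, the model's predictions are in log space. Here is a minimal sketch of converting test-set predictions back to dollars and summarizing the error (the median percent error metric here is just illustrative):
# Predictions are log(sale price); exponentiate to get back to dollars
predicted_prices = np.exp(forest.predict(X_test))
actual_prices = test_set["sale_price"].values

# Median absolute percent error in price space
ape = np.abs(predicted_prices - actual_prices) / actual_prices
print(f"Median absolute percent error = {100 * np.median(ape):.1f}%")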