from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (10,6)
Nov 22, 2021
Focus: much more hands-on experience with feature engineering and adding spatially based features
First, let's set up all of the imports we'll need from scikit-learn:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
Let's download data for single-family properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' AND sale_date < '2020-01-01'"
where = where + " AND category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
salesRaw.head()
len(salesRaw)
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
"geometry"
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
# Split the data 70/30
train_set, test_set = train_test_split(sales,
test_size=0.3,
random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
Run a linear regression model as a baseline:
# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())
# Fit on the training data
linear_pipeline.fit(X_train, y_train)
# What's the test score?
linear_pipeline.score(X_test, y_test)
Run cross-validation on a random forest model:
# Make a random forest pipeline
forest_pipeline = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest_pipeline,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
# Fit on the training data
forest_pipeline.fit(X_train, y_train)
# What's the test score?
forest_pipeline.score(X_test, y_test)
The model appears to generalize reasonably well
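Remember that the target is the log of sale price, so the model's predictions are in log space. As a quick sketch (the variable name predicted_prices is just illustrative), we can convert predictions back to dollars with np.exp():
# Predictions are log(sale price), so exponentiate to get dollars
predicted_prices = np.exp(forest_pipeline.predict(X_test))
predicted_prices[:5]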
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
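For example, a minimal grid search sketch over a couple of random forest hyperparameters could look like the following (the parameter values are illustrative, not tuned recommendations):
# A minimal hyperparameter grid search sketch (illustrative values only)
param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}
grid = GridSearchCV(forest_pipeline, param_grid, cv=5)
grid.fit(X_train, y_train)
print("best params = ", grid.best_params_)
print("best CV score = ", grid.best_score_)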
# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance.hvplot.barh(x="Feature", y="Importance")
To include categorical variables like exterior_condition and zip_code in the model, we can use a technique called one-hot encoding
Steps:
The OneHotEncoder object is a preprocessor that will perform the vectorization step.
The ColumnTransformer object will help us apply different transformers to numerical and categorical columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Let's try out the example data of colors:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
colors.shape
colors
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Fit the transformer and then transform the colors
ohe.fit_transform(colors).toarray()
# The corresponding category for each column
ohe.categories_
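As a quick aside (not needed for the pipeline), the fitted encoder can also map a one-hot row back to its original category:
# Going back: each one-hot row maps back to its original category
ohe.inverse_transform([[0.0, 0.0, 1.0]])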
Let's apply separate transformers for our numerical and categorical columns:
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Note: the handle_unknown='ignore' parameter ensures that if categories show up in our test set, but not in our training set, no error will be raised.
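As a quick illustration (reusing the colors example from above), an unseen category is simply encoded as a row of all zeros instead of raising an error:
# With handle_unknown="ignore", an unseen category encodes to all zeros
ohe_ignore = OneHotEncoder(handle_unknown="ignore")
ohe_ignore.fit(colors)
ohe_ignore.transform([["purple"]]).toarray()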
Initialize the pipeline object, using the column transformer and the random forest regressor
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=10,
random_state=42)
)
Now, let's fit the model.
Note: we pass the train_set and test_set DataFrames to the pipeline, rather than the X_train and X_test numpy arrays, because the ColumnTransformer selects its columns by name.
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
$R^2$ of ~0.30 improved to $R^2$ of ~0.56!
Takeaway: neighborhood based effects play a crucial role in determining housing prices.
Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...
This will be part of assignment #7
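For reference, cross-validation also works directly on the full pipeline; a minimal sketch (again passing the DataFrame, since the ColumnTransformer selects columns by name) would be:
# Sketch: 3-fold cross validation of the full pipeline (few folds to keep runtime down)
scores = cross_val_score(pipe, train_set, y_train, cv=3)
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())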
Which features were most important? Before plotting the feature importances, we need to know the column names. The one-hot encoder created a column for each category type...
# The column transformer...
transformer
# The steps in the column transformer
transformer.named_transformers_
# The one-hot step
ohe = transformer.named_transformers_['cat']
ohe
# One column for each category type!
ohe_cols = ohe.get_feature_names()
ohe_cols
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
features
random_forest = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": random_forest.feature_importances_}
)
importance.head(n=20)
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
Garbage in, garbage out
Takeaway: If your input features are poorly designed (for example, completely unrelated to the thing you want to predict), then no matter how good your machine learning model is or how well you "train" it, the model will never be able to do the translation from features to predicted value.
Yes, let's add distance-based features
The strategy
Distance from each sale to nearby features of interest, such as 311 graffiti calls and subway stops. We'll start with graffiti.
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
We'll only pull data from 2019.
# the 311 table
table_name = "public_cases_fc"
# Peek at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)
# Select only graffiti removal requests from 2019
where_2019 = "requested_datetime >= '01-01-2019' and requested_datetime < '01-01-2020'"
where_graffiti = "service_name = 'Graffiti Removal'"
where = f"{where_2019} and {where_graffiti}"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
len(graffiti)
graffiti.head()
We will need to:
# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
x = df.geometry.x
y = df.geometry.y
return np.column_stack((x, y)) # stack as columns
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
salesXY.shape
graffitiXY.shape
For this, we will use the $k$-nearest neighbors algorithm from scikit-learn.
For each sale, we'll find the $k$ nearest graffiti calls and compute the distances to them:
from sklearn.neighbors import NearestNeighbors
# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
Note: I am using k=5 here without any real justification. In practice, you would want to try a few different values of k to identify the best one to use.
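For example, one quick (and purely illustrative) way to see how the choice of k changes the feature is to recompute the average neighbor distance for a few values; in practice, you would judge each k by whether it improves the model's cross-validation score:
# Sketch: compare the average-distance feature for a few k values
for k_try in [3, 5, 10]:
    dists_k, _ = NearestNeighbors(n_neighbors=k_try).fit(graffitiXY).kneighbors(salesXY)
    print(k_try, np.median(dists_k.mean(axis=1)))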
grafDists: for each sale, the distances to the 5 nearest graffiti calls
grafIndices: for each sale, the indices of the 5 nearest graffiti calls in the original dataset
print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)
# The distances from the first sale to the 5 nearest neighbors
grafDists[0]
# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0
# The indices for the 5 nearest graffiti calls
grafIndices[0]
# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors
# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]
# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)
# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5
manual_dists
grafDists[0]
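The two results agree, which we can confirm directly:
# Sanity check: the manual distances match scikit-learn's kneighbors() output
np.allclose(manual_dists, grafDists[0])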
To get a single distance feature per sale, we'll average over the column axis (axis=1):
# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)
sales.head()
# Load the City Limits to plot too
import esri2gpd
# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Use the osmnx package to get subway stops in Philly — we can use the ox.geometries_from_polygon() function.
To select subway stations, we query for the tag station=subway (see the OSM Wiki for details).
import osmnx as ox
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
subway.head()
fig, ax = plt.subplots(figsize=(6,6))
# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')
# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)
ax.set_axis_off()
The stops on the Market-Frankford and Broad St. subway lines!
We'll use $k=1$ to get the distance to the nearest stop.
# STEP 1: x/y coordinates of subway stops (in EPSG:3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Looks like it worked!
Let's have a look at the correlations of numerical columns:
import seaborn as sns
cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
$R^2$ of ~0.58 improved to $R^2$ of ~0.62
def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
"""
Utility function to plot the feature importances from the input
random forest regressor.
Parameters
----------
pipeline :
the pipeline object
num_cols :
list of the numerical columns
transformer :
the transformer preprocessing step
top : optional
the number of importances to plot
**kwargs : optional
extra keywords passed to the hvplot function
"""
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipeline["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:top]
# Plot
return importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, **kwargs
)
plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)
Modify the get_xy_from_geometry() function to use the "centroid" of the geometry column.
Note: you can take the centroid of a Point() or Polygon() object. For a Point(), you just get the x/y coordinates back.
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
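Since the centroid of a Point is the point itself, the updated function reproduces the original sales coordinates; a quick check (assuming the variables from earlier are still defined):
# For Point geometries, centroid.x/centroid.y equal x/y,
# so this should return True for the sales data
np.allclose(get_xy_from_geometry(sales_3857), salesXY)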
New feature: Distance to the nearest university/college
New feature: Distance to the nearest park centroid
Notes
We'll use the x and y coordinates of the park centroids and calculate the distance to these centroids. Use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
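A hypothetical sketch of this feature, assuming a parks GeoDataFrame has already been loaded from your chosen source (the parks variable below is a placeholder):
# Hypothetical sketch: distance from each sale to the nearest park centroid
# NOTE: `parks` is a placeholder GeoDataFrame; load it from your chosen source
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parkDists, parkIndices = nbrs.kneighbors(salesXY)
sales["logDistParks"] = np.log10(parkDists.mean(axis=1))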
New feature: Distance to City Hall.
Notes
New feature: Distance to the 5 nearest new construction permits from 2019
Notes
Select rows where permitdescription equals 'NEW CONSTRUCTION PERMIT'
Use the permitissuedate column to select permits from 2019
New feature: Distance to the 5 nearest aggravated assaults in 2019
Notes
Select rows where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
Use the dispatch_date column to select assaults from 2019
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2019
Notes
Select rows where service_name equals 'Abandoned Vehicle'
Use the requested_datetime column to select calls from 2019
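Putting these notes together, one possible sketch for this feature, following the graffiti example from earlier (treat the exact query strings as assumptions to verify against the data):
# Sketch: distance to the 5 nearest abandoned vehicle 311 calls in 2019
where = "requested_datetime >= '01-01-2019' and requested_datetime < '01-01-2020'"
where = where + " and service_name = 'Abandoned Vehicle'"
vehicles = carto2gpd.get(carto_url, "public_cases_fc", where=where)
# Remove missing geometries and convert to meters
vehicles = vehicles.loc[vehicles.geometry.notnull()].to_crs(epsg=3857)
# Average distance to the 5 nearest calls, then take the log
vehiclesXY = get_xy_from_geometry(vehicles)
nbrs = NearestNeighbors(n_neighbors=5).fit(vehiclesXY)
vehicleDists, _ = nbrs.kneighbors(salesXY)
avgVehicleDist = vehicleDists.mean(axis=1)
avgVehicleDist[avgVehicleDist == 0] = 1e-5  # avoid log(0), as with graffiti above
sales["logDistVehicles"] = np.log10(avgVehicleDist)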