from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
plt.rcParams['figure.figsize'] = (10,6)
Nov 8, 2021
Example: A map of households without internet for US counties
Two ways: folium.Choropleth and folium.GeoJson
import folium
Load the data, remove counties with no households and add our new column:
# Load CSV data from the data/ folder
census_data = pd.read_csv("./data/internet_avail_census.csv", dtype={"geoid": str})
# Remove counties with no households
valid = census_data['universe'] > 0
census_data = census_data.loc[valid]
# Calculate the percent without internet
census_data['percent_no_internet'] = census_data['no_internet'] / census_data['universe']
census_data.head()
Load the counties geometry data too:
# Load counties GeoJSON from the data/ folder
counties = gpd.read_file("./data/us-counties-10m.geojson")
counties.head()
Note: this is different from using folium.Choropleth, where data and features are stored in two separate data frames.
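For reference, here is a minimal sketch of the folium.Choropleth approach, which keeps the counties geometries and the census data in separate data frames. The key_on path is an assumption (it depends on whether the county id ends up as feature.id or feature.properties.id when the GeoJSON is serialized), so treat this as a sketch rather than tested code.
# Sketch: the folium.Choropleth alternative (geometries and data stay separate)
m = folium.Map(location=[40, -98], zoom_start=4)
folium.Choropleth(
    geo_data=counties,                         # the counties GeoDataFrame
    data=census_data,                          # the census DataFrame
    columns=["geoid", "percent_no_internet"],  # [key column, value column] in the data
    key_on="feature.properties.id",            # assumed path to the county id in the GeoJSON
    fill_color="RdPu",                         # same red-purple colorbrewer scheme
    fill_opacity=0.75,
    line_weight=0.5,
    legend_name="Percent Without Internet",
).add_to(m)
m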
# Merge the county geometries with census data
# Left column: "id"
# Right column: "geoid"
census_joined = counties.merge(census_data, left_on="id", right_on="geoid")
census_joined.head()
# Minimum
min_val = census_joined['percent_no_internet'].min()
# Maximum
max_val = census_joined['percent_no_internet'].max()
# Calculate a normalized column
normalized = (census_joined['percent_no_internet'] - min_val) / (max_val - min_val)
# Add to the dataframe
census_joined['percent_no_internet_normalized'] = normalized
Use matplotlib's plt.get_cmap() function to turn normalized data values into colors:
import matplotlib.colors as mcolors
# Use a red-purple colorbrewer color scheme
cmap = plt.get_cmap('RdPu')
# The minimum value of the color map as an RGB tuple
cmap(0)
# The minimum value of the color map as a hex string
mcolors.rgb2hex(cmap(0.0))
# The maximum value of the color map as a hex string
mcolors.rgb2hex(cmap(1.0))
def get_style(feature):
    """
    Given an input GeoJSON feature, return a style dict.

    Notes
    -----
    The color in the style dict is determined by the
    "percent_no_internet_normalized" column in the
    input "feature".
    """
    # Get the data value from the feature
    value = feature['properties']['percent_no_internet_normalized']

    # Evaluate the color map
    # NOTE: value must be between 0 and 1
    rgb_color = cmap(value)  # this is an RGB tuple

    # Convert to hex string
    color = mcolors.rgb2hex(rgb_color)

    # Return the style dictionary
    return {'weight': 0.5, 'color': color, 'fillColor': color, "fillOpacity": 0.75}
def get_highlighted_style(feature):
    """
    Return a style dict to use when the user highlights a
    feature with the mouse.
    """
    return {"weight": 3, "color": "black"}
Use the .to_json() function to convert to a GeoJSON string:
needed_cols = ['NAME', 'percent_no_internet', 'percent_no_internet_normalized', 'geometry']
census_json = census_joined[needed_cols].to_json()
# STEP 1: Initialize the map
m = folium.Map(location=[40, -98], zoom_start=4)
# STEP 2: Add the GeoJson to the map
folium.GeoJson(
    census_json,  # The geometry + data columns in GeoJSON format
    style_function=get_style,  # The style function to color counties differently
    highlight_function=get_highlighted_style,
    tooltip=folium.GeoJsonTooltip(['NAME', 'percent_no_internet'])
).add_to(m)
# avoid a rendering bug by saving as HTML and re-loading
m.save('percent_no_internet.html')
The hard way is harder, but we have a tooltip and highlight interactivity!
from IPython.display import IFrame
IFrame('percent_no_internet.html', width=800, height=500)
Try to replicate the above interactive map exactly (minus the background tiles). This includes the tooltip and the red-purple color scheme.
Hints
Altair's syntax is similar to the folium.Choropleth syntax: you should pass the counties GeoDataFrame to alt.Chart() and then use the transform_lookup() function to merge those geometries to the census data and pull in the census data column we need ("percent_no_internet").
import altair as alt
# Initialize the chart with the counties data
alt.Chart(counties).mark_geoshape(stroke="white", strokeWidth=0.25).encode(
    # Encode the color
    color=alt.Color(
        "percent_no_internet:Q",
        title="Percent Without Internet",
        scale=alt.Scale(scheme="redpurple"),
        legend=alt.Legend(format=".0%")
    ),
    # Tooltip
    tooltip=[
        alt.Tooltip("NAME:N", title="Name"),
        alt.Tooltip("percent_no_internet:Q", title="Percent Without Internet", format=".1%"),
    ],
).transform_lookup(
    lookup="id",  # The column name in the counties data to match on
    from_=alt.LookupData(census_data, "geoid", ["percent_no_internet", "NAME"]),  # Match census data on "geoid"
).project(
    type="albersUsa"
).properties(
    width=700, height=500
)
One of Leaflet's strengths: a rich set of open-source plugins
https://leafletjs.com/plugins.html
Many of these are available in Folium!
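For example, here is a quick sketch importing a couple of other plugins from folium.plugins (only HeatMap is actually used below):
# Sketch: a few other Leaflet plugins exposed through folium.plugins
from folium.plugins import MarkerCluster, MiniMap
m = folium.Map(location=[39.99, -75.13], zoom_start=12)
MiniMap().add_to(m)        # small overview map in the corner
MarkerCluster().add_to(m)  # groups nearby markers as you zoom in/out
m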
from folium.plugins import HeatMap
HeatMap?
New construction in Philadelphia requires a building permit, which we can pull from Open Data Philly.
Use the carto2gpd package to get the data.
The "current_date" variable in SQL databases
You can use the pre-defined "current_date" variable to get the current date. For example, to get the permits from the past 30 days, we could do:
SELECT * FROM permits WHERE permitissuedate >= current_date - 30
Selecting only new construction permits
To select new construction permits, you can use the "permitdescription" field. There are two relevant categories: 'RESIDENTIAL BUILDING PERMIT' and 'COMMERCIAL BUILDING PERMIT'.
We can use the SQL IN command (see the documentation) to easily select rows with either of these categories, as in the query below.
import carto2gpd
# API URL
url = "https://phl.carto.com/api/v2/sql"
# Table name on CARTO
table_name = "permits"
# The where clause, with two parts
DAYS = 30
where = f"permitissuedate >= current_date - {DAYS}"
where += " and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"
where
# Run the query
permits = carto2gpd.get(url, table_name, where=where)
len(permits)
permits.head()
Some permits don't have locations: use the .geometry.notnull() function to trim the data frame to those permits with valid geometries.
permits = permits.loc[permits.geometry.notnull()].copy()
Note: We need an array of (latitude, longitude) pairs. Be careful about the order!
# Extract the lat and longitude from the geometry column
permits['lat'] = permits.geometry.y
permits['lng'] = permits.geometry.x
permits.head()
# Split out the residential and commercial
residential = permits.query("permitdescription == 'RESIDENTIAL BUILDING PERMIT'")
commercial = permits.query("permitdescription == 'COMMERCIAL BUILDING PERMIT'")
# Make a NumPy array (use the "values" attribute)
residential_coords = residential[['lat', 'lng']].values
commercial_coords = commercial[['lat', 'lng']].values
commercial_coords[:5]
The HeatMap takes the list of coordinates: the first column is latitude and the second column is longitude
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)
# Add heat map coordinates
HeatMap(commercial_coords, radius=15).add_to(m)
m
HeatMap?
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)
# Add heat map
HeatMap(residential_coords, radius=15).add_to(m)
m
Commercial construction is concentrated in the greater Center City area, while residential construction is primarily outside of Center City...
Now on to clustering...
https://scikit-learn.org/stable/
Note: We will focus on clustering algorithms today and discuss a few other machine learning techniques in the next two weeks. If there is a specific scikit-learn use case we won't cover, I'm open to ideas for incorporating it as part of the final project.
The goal
Partition a dataset into groups that have a similar set of attributes, or features, within the group and a dissimilar set of features between groups.
Minimize the intra-cluster variance and maximize the inter-cluster variance of features.
https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods
K-means minimizes the intra-cluster variance: it minimizes the sum of the squared distances between all points in a cluster and the cluster centroid.
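Written out, for clusters $C_1, \dots, C_k$ with centroids $\mu_1, \dots, \mu_k$, K-means minimizes
$$\sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2,$$
which is the quantity scikit-learn reports as the inertia_ attribute after fitting.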
import altair as alt
from vega_datasets import data as vega_data
Read the data from a URL:
gapminder = pd.read_csv(vega_data.gapminder_health_income.url)
gapminder.head()
alt.Chart(gapminder).mark_circle().encode(
    alt.X("income:Q", scale=alt.Scale(type="log")),
    alt.Y("health:Q", scale=alt.Scale(zero=False)),
    size='population:Q',
    tooltip=list(gapminder.columns),
).interactive()
from sklearn.cluster import KMeans
Let's start with 5 clusters
KMeans?
kmeans = KMeans(n_clusters=5)
Lots of optional parameters, but n_clusters is the most important:
kmeans
Use the fit() function:
# Perform the fit using only the income column
kmeans.fit(gapminder[['income']])
Use the labels_ attribute:
gapminder['label'] = kmeans.labels_
gapminder.groupby('label').size()
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
gapminder.groupby("label")['income'].mean().sort_values()
Data is nicely partitioned into income levels
# Fit all three columns
kmeans.fit(gapminder[['income', 'health', 'population']])
# Extract the labels
gapminder['label'] = kmeans.labels_
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
What's wrong?
K-means is distance-based, but our features have wildly different distance scales
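A quick check makes the problem concrete (a sketch; the exact numbers depend on the data): income is in the thousands of dollars, health (life expectancy) is in the tens of years, and population is in the millions, so the distance calculation is dominated by population.
# Compare the spread of each feature
gapminder[['income', 'health', 'population']].std()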
Use the StandardScaler class (MinMaxScaler is another common option):
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
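The StandardScaler standardizes each feature by removing its mean and dividing by its standard deviation,
$$z = \frac{x - \mu}{\sigma},$$
so every scaled feature has a mean of zero and a standard deviation of one.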
Use the fit_transform() function to scale your features:
gapminder_scaled = scaler.fit_transform(gapminder[['income', 'health', 'population']])
Important: The fit_transform() function converts the DataFrame to a numpy array:
# fit_transform() converts the data into a numpy array
gapminder_scaled[:5]
# mean of zero
gapminder_scaled.mean(axis=0)
# standard deviation of one
gapminder_scaled.std(axis=0)
# Perform the fit
kmeans.fit(gapminder_scaled)
# Extract the labels
gapminder['label'] = kmeans.labels_
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
# Number of countries per cluster
gapminder.groupby("label").size()
# Average population per cluster
gapminder.groupby("label")['population'].mean().sort_values() / 1e6
# Average life expectancy per cluster
gapminder.groupby("label")['health'].mean().sort_values()
# Average income per cluster
gapminder.groupby("label")['income'].mean().sort_values() / 1e3
gapminder.loc[gapminder['label']==4]
# Total intra-cluster variance: the sum of squared distances from each point to its cluster centroid
kmeans.inertia_
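One common way to use inertia_ is the "elbow" heuristic: re-fit K-means for several values of n_clusters and look for the point where the inertia stops dropping quickly. A minimal sketch (not part of the original analysis):
# Sketch: the "elbow" heuristic for choosing the number of clusters
inertias = []
for k in range(1, 11):
    inertias.append(KMeans(n_clusters=k).fit(gapminder_scaled).inertia_)
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")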
gapminder.loc[gapminder['label']==2]
I've extracted neighborhood Airbnb statistics for Philadelphia neighborhoods from Tom Slee's website.
The data includes average price per person, overall satisfaction, and number of listings.
Original research study: How Airbnb's Data Hid the Facts in New York City