Oct 18, 2021
By example:
# Initial imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
3 new packages this week, starting with Dask:
Out-of-core (larger-than-memory) operations in Python
Extends the maximum dataset size from the size of memory to the size of your hard drive
Dask does not evaluate expressions immediately; instead, it stores a task graph of the necessary calculations.
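To make the task-graph idea concrete, here is a minimal sketch using dask.delayed (the helper functions double and add are made up for illustration):
# A minimal sketch of lazy evaluation with dask.delayed
from dask import delayed

@delayed
def double(x):
    return 2 * x

@delayed
def add(x, y):
    return x + y

# No computation happens here; dask only records a task graph
result = add(double(3), double(4))

# Evaluation is triggered explicitly
print(result.compute())  # 14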
# Create an array of normally-distributed random numbers
a = np.random.randn(1000)
a[:10]
# Multiply this array by a factor
b = a * 4
# Find the minimum value
b_min = b.min()
b_min
Dask arrays mirror the numpy array syntax...but don't perform the calculation right away.
import dask.array as da
# Create a dask array from the above array
a_dask = da.from_array(a, chunks=200)
# Multiply this array by a factor
b_dask = a_dask * 4
# Find the minimum value
b_min_dask = b_dask.min()
print(b_min_dask)
We need to explicitly call the compute() function to evaluate the expression. We get the same result as the eager numpy calculation.
b_min_dask.compute()
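As a quick sanity check, the lazy and eager results agree:
# The dask result should match the numpy result
assert np.isclose(b_min_dask.compute(), b_min)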
Dask DataFrames mirror the pandas syntax but don't need to fit into memory...when evaluating an expression, dask will load the data into memory only when necessary.
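As a minimal sketch (the toy DataFrame below is made up for illustration), a dask DataFrame can also be built by partitioning an in-memory pandas DataFrame:
import dask.dataframe as dd

# Hypothetical toy example: split a small pandas DataFrame into 2 partitions
df_toy = pd.DataFrame({"x": range(10), "y": range(10)})
ddf_toy = dd.from_pandas(df_toy, npartitions=2)

# Operations stay lazy until compute() is called
print(ddf_toy["x"].sum().compute())  # 45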
See datasets.yml in this week's repository.
import intake
datasets = intake.open_catalog("./datasets.yml")
list(datasets)
datasets.osm
Load the data lazily as a dask DataFrame with the to_dask() function:
osm_ddf = datasets.osm.to_dask()
osm_ddf
Only the data necessary to see the head of the file will be loaded.
# we can still get the head of the file quickly
osm_ddf.head(n=10)
All data partitions must be loaded for this calculation...it will take longer!
# getting the length means all of the data must be loaded though
nrows = len(osm_ddf)
print(f"number of rows = {nrows}")
# mean x/y coordinates
mean_x = osm_ddf['x'].mean()
mean_y = osm_ddf['y'].mean()
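# NOTE: these are still lazy dask objects; printing shows the task, not a number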
print(mean_x, mean_y)
# evaluate the expressions
print("mean x = ", mean_x.compute())
# evaluate the expressions
print("mean y = ", mean_y.compute())
Matplotlib struggles with more than a few hundred thousand points.
"Turns even the largest data into images, accurately"
Datashader is a library that produces a "rasterized" image of large datasets, such that the visual color mapping is a fair representation of the underlying data.
Recommended reading: Understanding the datashader algorithm
# Datashader imports
import datashader as ds
import datashader.transfer_functions as tf
# Color-related imports
from datashader.colors import Greys9, viridis, inferno
from colorcet import fire
## SETUP GLOBAL CANVAS
# Web Mercator bounds
bound = 20026376.39
global_x_range = (-bound, bound)
global_y_range = (int(-bound*0.4), int(bound*0.6))
# Default width and height
global_plot_width = 900
global_plot_height = int(global_plot_width*0.5)
# Step 1: Setup the canvas
canvas = ds.Canvas(
plot_width=global_plot_width,
plot_height=global_plot_height,
x_range=global_x_range,
y_range=global_y_range,
)
# Step 2: Aggregate the points into pixels
# NOTE: Use the "count()" function — count number of points per pixel
agg = canvas.points(osm_ddf, "x", "y", agg=ds.count())
agg
# Step 3: Perform the shade operation
img = tf.shade(agg, cmap=fire)
# Format: set the background of the image to black so it looks better
img = tf.set_background(img, "black")
img
Remember: our agg variable is an xarray DataArray. So, we can leverage xarray's built-in where() function to select a subsample of the pixels based on the pixel counts.
selected = agg.where(agg > 15)
selected
Pixels that fail the where condition are masked and set to NaN.
# plot the masked data
tf.set_background(tf.shade(selected, cmap=fire), "black")
We can use datashader to visualize all 300 million census dots produced as part of the Cooper Center racial dot map.
# Load the data
# REMEMBER: this will take some time to download the first time
census_ddf = datasets.census.to_dask()
census_ddf.head()
print("number of rows =", len(census_ddf))
Important: datashader has a utility function to convert from latitude/longitude (EPSG:4326) to Web Mercator (EPSG:3857)
See: lnglat_to_meters()
from datashader.utils import lnglat_to_meters
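For example, a quick sanity check (the coordinates below are approximate values for Philadelphia City Hall, chosen just for illustration):
# Convert a single lng/lat pair to Web Mercator (EPSG:3857) meters
x, y = lnglat_to_meters(-75.1635, 39.9526)
print(x, y)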
# Sensible lat/lng coordinates for U.S. cities
# NOTE: these are in lat/lng so EPSG=4326
USA = ((-124.72, -66.95), (23.55, 50.06))
Chicago = (( -88.29, -87.30), (41.57, 42.00))
NewYorkCity = (( -74.39, -73.44), (40.51, 40.91))
LosAngeles = ((-118.53, -117.81), (33.63, 33.96))
Houston = (( -96.05, -94.68), (29.45, 30.11))
Austin = (( -97.91, -97.52), (30.17, 30.37))
NewOrleans = (( -90.37, -89.89), (29.82, 30.05))
Atlanta = (( -84.88, -84.04), (33.45, 33.84))
Philly = (( -75.28, -74.96), (39.86, 40.14))
# Get USA xlim and ylim in meters (EPSG=3857)
USA_xlim_meters, USA_ylim_meters = [list(r) for r in lnglat_to_meters(USA[0], USA[1])]
# Define a default plot width & height
plot_width = 900
plot_height = int(plot_width * 7.0 / 12)
# Step 1: Setup the canvas
cvs = ds.Canvas(
plot_width, plot_height, x_range=USA_xlim_meters, y_range=USA_ylim_meters
)
# Step 2: Aggregate the x/y points
agg = cvs.points(census_ddf, "easting", "northing")
# Step 3: Shade with a "Grey" colormap and "linear" colormapping
img = tf.shade(agg, cmap=Greys9, how="linear")
# Format: Set the background
tf.set_background(img, "black")
# Step 3: Shade with a "Grey" colormap and "log" colormapping
img = tf.shade(agg, cmap=Greys9, how='log')
# Format: add a black background
img = tf.set_background(img, 'black')
img
"A collection of perceptually accurate colormaps"
## Step 3: Shade with "fire" color scale and "log" colormapping
img = tf.shade(agg, cmap=fire, how='log')
tf.set_background(img, 'black')
# Step 3: Shade with fire colormap and equalized histogram mapping
img = tf.shade(agg, cmap=fire, how='eq_hist')
tf.set_background(img, 'black')
img = tf.shade(agg, cmap=viridis, how='eq_hist')
img = tf.set_background(img, 'black')
img
Use the export_image() function.
from datashader.utils import export_image
export_image(img, 'usa_census_viridis')
Datashader can use different colors for different categories of data.
census_ddf.head()
color_key = {"w": "aqua", "b": "lime", "a": "red", "h": "fuchsia", "o": "yellow"}
def create_census_image(longitude_range, latitude_range, w=plot_width, h=plot_height):
    """
    A function for plotting the Census data, coloring pixels by race values.
    """
    # x and y range from lng/lat ranges
    x_range, y_range = lnglat_to_meters(longitude_range, latitude_range)

    # Step 1: Setup the canvas
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)

    # Step 2: Aggregate, but this time count the "race" category
    # NEW: specify the aggregation method to count the "race" values in each pixel
    agg = canvas.points(census_ddf, "easting", "northing", agg=ds.count_cat("race"))

    # Step 3: Shade, using our custom color map
    img = tf.shade(agg, color_key=color_key, how="eq_hist")

    return tf.set_background(img, "black")
Color pixel values according to the demographics data in each pixel.
create_census_image(USA[0], USA[1])
create_census_image(Philly[0], Philly[1], w=600, h=600)
Hint: use the bounding boxes provided earlier to explore racial patterns across various major cities
create_census_image(NewYorkCity[0], NewYorkCity[1])
create_census_image(Atlanta[0], Atlanta[1])
create_census_image(LosAngeles[0], LosAngeles[1])
create_census_image(Houston[0], Houston[1])
create_census_image(Chicago[0], Chicago[1])
create_census_image(NewOrleans[0], NewOrleans[1])
We can use xarray to slice the array of aggregated pixel values to examine specific aspects of the data.
Use the sel() function of the xarray DataArray.
# Step 1: Setup canvas
cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height)
# Step 2: Aggregate and count race category
aggc = cvs.points(census_ddf, "easting", "northing", agg=ds.count_cat("race"))
# NEW: Select only African Americans (where "race" column is equal to "b")
agg_b = aggc.sel(race="b")
# Step 3: Shade and set background
img = tf.shade(agg_b, cmap=fire, how="eq_hist")
img = tf.set_background(img, "black")
img
Goal: Select pixels where each race has a non-zero count.
bool_sel = aggc.sel(race=['w', 'b', 'a', 'h']) > 0
bool_sel
# Do a "logical and" operation across the "race" dimension
# Pixels will be "True" if the pixel has a positive count for each race
diverse_selection = bool_sel.all(dim='race')
diverse_selection
# Select the pixel values where our diverse selection criteria is True
agg2 = aggc.where(diverse_selection).fillna(0)
# and shade using our color key
img = tf.shade(agg2, color_key=color_key, how='eq_hist')
img = tf.set_background(img,"black")
img
# Select where the "b" race dimension is greater than the "w" race dimension
selection = aggc.sel(race='b') > aggc.sel(race='w')
selection
# Select based on the selection criteria
agg3 = aggc.where(selection).fillna(0)
img = tf.shade(agg3, color_key=color_key, how="eq_hist")
img = tf.set_background(img, "black")
img