Lab 4: Spatial Predictive Analysis

Analyzing 311 Violations

Author

Liz Crouse

Published

April 6, 2026

#| message: false
#| warning: false
#| echo: false


# Load required packages.
# NOTE: MASS is attached *before* tidyverse so that dplyr::select() wins
# the search path. MASS::select() would otherwise mask it, and any
# unqualified select() call would silently hit the wrong function.
library(MASS)           # Negative binomial regression (glm.nb)
library(tidyverse)      # Data manipulation
library(sf)             # Spatial operations
library(here)           # Relative file paths
library(viridis)        # Color scales
library(terra)          # Raster operations (replaces 'raster')
library(spdep)          # Spatial dependence
library(FNN)            # Fast nearest neighbors
library(patchwork)      # Plot composition (replaces grid/gridExtra)
library(knitr)          # Tables
library(kableExtra)     # Table formatting
library(classInt)       # Classification intervals

# Spatstat for analyzing point patterns - split into sub-packages
library(spatstat.geom)    # Spatial geometries
library(spatstat.explore) # Spatial exploration/KDE

# Set options
options(scipen = 999)  # Suppress scientific notation in printed output
set.seed(5080)         # Reproducibility, could be any number

# Shared ggplot theme for every map in this lab: minimal base with a bold
# title, muted subtitle, and axis text/titles suppressed (maps do not need
# coordinate labels).
theme_crime <- function(base_size = 11) {
  map_tweaks <- theme(
    plot.title = element_text(face = "bold", size = base_size + 1),
    plot.subtitle = element_text(color = "gray30", size = base_size - 1),
    legend.position = "right",
    panel.grid.minor = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank()
  )
  theme_minimal(base_size = base_size) + map_tweaks
}

# Register as the session-wide default theme
theme_set(theme_crime())

cat("✓ All packages loaded successfully!\n")
✓ All packages loaded successfully!
cat("✓ Working directory:", getwd(), "\n")
✓ Working directory: C:/Users/lizmc/Desktop/Y2S2/ppa/crouse/labs/lab_4 

Part 1: Data Loading & Exploration

  • Load your 311 data and Chicago spatial boundaries
  • Create visualizations showing the spatial distribution of your violation type
  • Describe patterns you observe
#| message: false

# Police districts: the grouping unit for spatial (LOGO) cross-validation
districts_url <-
  "https://data.cityofchicago.org/api/geospatial/24zt-jpfn?method=export&format=GeoJSON"
policeDistricts <- st_read(districts_url) %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = dist_num)
Reading layer `OGRGeoJSON' from data source 
  `https://data.cityofchicago.org/api/geospatial/24zt-jpfn?method=export&format=GeoJSON' 
  using driver `GeoJSON'
Simple feature collection with 25 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -87.94011 ymin: 41.64455 xmax: -87.52414 ymax: 42.02303
Geodetic CRS:  WGS 84
# Police beats: finer-grained administrative units than districts
beats_url <-
  "https://data.cityofchicago.org/api/geospatial/n9it-hstw?method=export&format=GeoJSON"
policeBeats <- st_read(beats_url) %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(Beat = beat_num)
Reading layer `OGRGeoJSON' from data source 
  `https://data.cityofchicago.org/api/geospatial/n9it-hstw?method=export&format=GeoJSON' 
  using driver `GeoJSON'
Simple feature collection with 277 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -87.94011 ymin: 41.64455 xmax: -87.52414 ymax: 42.02303
Geodetic CRS:  WGS 84
# City boundary polygon (from the Public Policy Analytics book repo)
boundary_url <- paste0(
  "https://raw.githubusercontent.com/urbanSpatial/",
  "Public-Policy-Analytics-Landing/master/DATA/Chapter5/chicagoBoundary.geojson"
)
chicagoBoundary <- st_read(boundary_url) %>%
  st_transform('ESRI:102271')
Reading layer `chicagoBoundary' from data source 
  `https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/Chapter5/chicagoBoundary.geojson' 
  using driver `GeoJSON'
Simple feature collection with 1 feature and 1 field
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -87.8367 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
Geodetic CRS:  WGS 84
cat("✓ Loaded spatial boundaries\n")
✓ Loaded spatial boundaries
cat("  - Police districts:", nrow(policeDistricts), "\n")
  - Police districts: 25 
cat("  - Police beats:", nrow(policeBeats), "\n")
  - Police beats: 277 
  - Number of potholes: 95698 
  - CRS: ESRI:102271 
  - Date range: 01/01/2024 01:10:51 PM to 12/31/2025 07:49:19 AM 
# Map A: raw pothole points over the city boundary
point_map <- ggplot() +
  geom_sf(data = chicagoBoundary, fill = "gray95", color = "gray60") +
  geom_sf(data = potholes, color = "#d62828", size = 0.1, alpha = 0.4) +
  labs(
    title = "Pothole Locations",
    subtitle = paste0("Chicago 2024-2025, n = ", nrow(potholes))
  )

# Map B: filled 2D kernel density contours of the same points
pothole_xy <- data.frame(st_coordinates(potholes))
density_map <- ggplot() +
  geom_sf(data = chicagoBoundary, fill = "gray95", color = "gray60") +
  geom_density_2d_filled(
    data = pothole_xy,
    aes(X, Y),
    alpha = 0.7,
    bins = 8
  ) +
  scale_fill_viridis_d(
    option = "plasma",
    direction = -1,
    guide = "none"
  ) +
  labs(
    title = "Density Surface",
    subtitle = "Kernel density estimation"
  )

# Compose the two panels side by side with patchwork
point_map + density_map +
  plot_annotation(
    title = "Spatial Distribution of Potholes in Chicago",
    tag_levels = 'A'
  )

# Describe patterns you observe
## Potholes are concentrated along the eastern edge of Chicago (shoreline of Lake Michigan), specifically in the northeastern area of the city. Smaller clusters appear in the north and southwest corners.

Part 2: Fishnet Grid Creation

  • Create a 500m x 500m fishnet grid
  • Aggregate your violations to grid cells
  • Visualize the count distribution
# Build a square 500 m x 500 m grid covering the city extent.
# chicagoBoundary is already in ESRI:102271 (units: meters), so
# cellsize = 500 means 500 m; the st_transform() is a belt-and-braces
# no-op kept from the original workflow.
fishnet <- chicagoBoundary %>%
  st_make_grid(cellsize = 500, square = TRUE) %>%
  st_sf() %>%
  st_transform('ESRI:102271') %>%
  mutate(uniqueID = row_number())

# sf spatial subset: keep only cells intersecting the city polygon
fishnet <- fishnet[chicagoBoundary, ]

# View basic info
cat("✓ Created fishnet grid\n")
✓ Created fishnet grid
cat("  - Number of cells:", nrow(fishnet), "\n")
  - Number of cells: 2458 
cat("  - Cell size:", 500, "x", 500, "meters\n")
  - Cell size: 500 x 500 meters
cat("  - Cell area:", round(st_area(fishnet[1,])), "square meters\n")
  - Cell area: 250000 square meters
# Count potholes per grid cell: assign each point to its containing cell
# (st_within), then tally by cell ID. Cells containing zero potholes are
# absent from this result and are filled in downstream.
potholes_fishnet <- potholes %>%
  st_join(fishnet, join = st_within) %>%
  st_drop_geometry() %>%
  count(uniqueID, name = "countPotholes")

# Quick structure check
glimpse(potholes_fishnet)
Rows: 2,247
Columns: 2
$ uniqueID      <int> 37, 38, 44, 47, 48, 49, 50, 90, 91, 92, 93, 94, 96, 97, …
$ countPotholes <int> 7, 8, 2, 9, 3, 1, 9, 1, 5, 17, 15, 3, 1, 1, 6, 16, 48, 2…
# Bring counts back onto the full grid; cells with no potholes were absent
# from potholes_fishnet, so their NA counts become explicit zeros here.
fishnet <- fishnet %>%
  left_join(potholes_fishnet, by = "uniqueID") %>%
  mutate(countPotholes = coalesce(countPotholes, 0L))
# Choropleth of pothole counts per cell.
# BUG FIX: the boundary outline was drawn with color = "NA" (the *string*
# "NA"), which R treats as the transparent color, so the linewidth-1
# outline never appeared. Use "white", consistent with the KDE map below.
ggplot() +
  geom_sf(data = fishnet, aes(fill = countPotholes), color = NA) +
  geom_sf(data = chicagoBoundary, fill = NA, color = "white", linewidth = 1) +
  scale_fill_viridis_c(
    name = "Potholes",
    option = "plasma",
    trans = "sqrt",  # Square-root scale tames the right-skewed counts
    breaks = c(0, 100, 200, 300, 400)
  ) +
  labs(
    title = "Potholes Counts by Grid Cell",
    subtitle = "500m x 500m cells, Chicago 2024-2025"
  ) +
  theme_crime()

Part 3: Spatial Features

  • Calculate k-nearest neighbor features
  • Perform Local Moran’s I analysis
  • Identify hot spots and cold spots
  • Create distance-to-hotspot measures
  • Join any additional contextual data if you are looking for more to do and really get into this (e.g., demographics, land use)
# Convert potholes to ppp (point pattern) format for spatstat.
# NOTE(review): the observation window is the *bounding box* of the city,
# not the city polygon, so density.ppp()'s edge correction applies at the
# rectangle edges rather than the true boundary -- confirm this is
# acceptable if cells on the city edge matter.
potholes_ppp <- as.ppp(
  st_coordinates(potholes),
  W = as.owin(st_bbox(chicagoBoundary))
)

# Calculate KDE with 1km bandwidth
kde_potholes <- density.ppp(
  potholes_ppp,
  sigma = 1000,  # 1km bandwidth (CRS units are meters)
  edge = TRUE    # Edge correction
)

# Convert to terra raster (modern approach, not raster::raster)
kde_raster <- rast(kde_potholes)

# Extract KDE values to fishnet cells: mean of the KDE pixels overlapping
# each polygon. terra::extract() returns a data frame whose first column
# is the polygon ID and whose second column holds the aggregated values.
fishnet <- fishnet %>%
  mutate(
    kde_value = terra::extract(
      kde_raster,
      vect(fishnet),
      fun = mean,
      na.rm = TRUE
    )[, 2]  # Extract just the values column
  )

cat("✓ Calculated KDE baseline\n")
✓ Calculated KDE baseline
# Map the KDE surface after aggregation to the fishnet
ggplot() +
  geom_sf(data = fishnet, aes(fill = kde_value), color = NA) +
  geom_sf(data = chicagoBoundary, fill = NA, color = "white", linewidth = 1) +
  scale_fill_viridis_c(name = "KDE Value", option = "plasma") +
  labs(
    title = "Kernel Density Estimation Baseline",
    subtitle = "Simple spatial smoothing of pothole locations"
  ) +
  theme_crime()

# k-nearest-neighbor feature: mean distance from each cell centroid to its
# 3 closest potholes (small value = deep inside a pothole cluster)
cell_xy <- st_coordinates(st_centroid(fishnet))
point_xy <- st_coordinates(potholes)

knn3 <- get.knnx(point_xy, cell_xy, k = 3)

fishnet <- fishnet %>%
  mutate(potholes.nn = rowMeans(knn3$nn.dist))

cat("✓ Calculated nearest neighbor distances\n")
✓ Calculated nearest neighbor distances
summary(fishnet$potholes.nn)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   1.667   51.885   82.414  137.322  141.881 1450.311 
# Classify cells into LISA (Local Moran's I) cluster categories.
#
# data     : sf polygon object to classify
# variable : string name of the numeric column to test
# k        : number of nearest neighbors for the spatial weights
#
# Returns `data` with local_i, p_value, is_significant, and moran_class
# ("High-High" hot spots, "Low-Low" cold spots, "High-Low"/"Low-High"
# spatial outliers, "Not Significant") appended.
calculate_local_morans <- function(data, variable, k = 5) {
  
  # k-nearest-neighbor spatial weights built on cell centroids
  coords <- st_coordinates(st_centroid(data))
  neighbors <- knn2nb(knearneigh(coords, k = k))
  weights <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
  
  # localmoran(): column 1 = Ii statistic, column 5 = p-value
  local_moran <- localmoran(data[[variable]], weights)
  
  # Global mean splits "high" cells from "low" cells
  mean_val <- mean(data[[variable]], na.rm = TRUE)
  
  data %>%
    mutate(
      local_i = local_moran[, 1],
      p_value = local_moran[, 5],
      # BUG FIX: an NA p-value made `!is_significant` NA, so case_when()
      # skipped the first arm and could label an untestable cell as a
      # cluster via the local_i conditions. Treat NA as not significant.
      is_significant = !is.na(p_value) & p_value < 0.05,
      
      moran_class = case_when(
        !is_significant ~ "Not Significant",
        local_i > 0 & .data[[variable]] > mean_val ~ "High-High",
        local_i > 0 & .data[[variable]] <= mean_val ~ "Low-Low",
        local_i < 0 & .data[[variable]] > mean_val ~ "High-Low",
        local_i < 0 & .data[[variable]] <= mean_val ~ "Low-High",
        TRUE ~ "Not Significant"  # catch-all for NA local_i
      )
    )
}

# Apply to potholes
fishnet <- calculate_local_morans(fishnet, "countPotholes", k = 5)
# Map the LISA cluster categories with a diverging palette
lisa_palette <- c(
  "High-High" = "#d7191c",
  "High-Low" = "#fdae61",
  "Low-High" = "#abd9e9",
  "Low-Low" = "#2c7bb6",
  "Not Significant" = "gray90"
)

ggplot() +
  geom_sf(data = fishnet, aes(fill = moran_class), color = NA) +
  scale_fill_manual(values = lisa_palette, name = "Cluster Type") +
  labs(
    title = "Local Moran's I: Potholes Clusters",
    subtitle = "High-High = Hot spots of disorder"
  ) +
  theme_crime()

# Hot spots = centroids of the significant High-High cells
hotspots <- fishnet %>%
  filter(moran_class == "High-High") %>%
  st_centroid()

# Distance from every cell centroid to the nearest hot spot.
# st_distance() against the union of the hot-spot points returns, for each
# centroid, its distance to the closest member of the union.
if (nrow(hotspots) > 0) {
  fishnet <- fishnet %>%
    mutate(
      dist_to_hotspot = as.numeric(
        st_distance(st_centroid(fishnet), hotspots %>% st_union())
      )
    )
  
  # BUG FIX: message previously said "abandoned car hot spots" -- a
  # leftover from the template lab; this analysis concerns potholes.
  cat("✓ Calculated distance to pothole hot spots\n")
  cat("  - Number of hot spot cells:", nrow(hotspots), "\n")
} else {
  # Degenerate fallback: with no hot spots the feature is constant zero
  fishnet <- fishnet %>%
    mutate(dist_to_hotspot = 0)
  cat("⚠ No significant hot spots found\n")
}
✓ Calculated distance to pothole hot spots
  - Number of hot spot cells: 254 

Part 4: Count Regression Models

  • Fit Poisson regression
  • Fit Negative Binomial regression
  • Compare model fit (AIC)
# Attach each cell's police district via centroid-in-polygon lookup,
# after dropping any District column left over from a previous run.
fishnet <- fishnet %>% dplyr::select(-any_of("District"))

district_lookup <- st_centroid(fishnet) %>%
  st_join(
    dplyr::select(policeDistricts, District),
    join = st_within,
    left = TRUE
  ) %>%
  st_drop_geometry() %>%
  dplyr::select(uniqueID, District)

fishnet <- fishnet %>%
  left_join(district_lookup, by = "uniqueID")

cat("✓ Joined police districts\n")
✓ Joined police districts
cat("  - Districts:", length(unique(fishnet$District)), "\n")
  - Districts: 24 
cat("  - Cells with District:", sum(!is.na(fishnet$District)), "\n")
  - Cells with District: 2260 
# Assemble the modeling frame: outcome, the two spatial features, and the
# district label used for LOGO cross-validation. Geometry is dropped so
# the model data stays lightweight.
model_vars <- c(
  "uniqueID", "District", "countPotholes", "potholes.nn", "dist_to_hotspot"
)

fishnet_model <- fishnet %>%
  st_drop_geometry() %>%
  dplyr::select(all_of(model_vars)) %>%
  na.omit()  # Remove any remaining NAs (e.g. cells without a district)

cat("✓ Prepared modeling data\n")
✓ Prepared modeling data
cat("  - Observations:", nrow(fishnet_model), "\n")
  - Observations: 2260 
cat("  - Variables:", ncol(fishnet_model), "\n")
  - Variables: 5 
# Baseline count model: Poisson GLM (assumes conditional mean == variance,
# which overdispersed pothole counts will likely violate)
model_poisson <- glm(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  family = "poisson",
  data = fishnet_model
)

# Inspect coefficients and deviance
summary(model_poisson)

Call:
glm(formula = countPotholes ~ potholes.nn + dist_to_hotspot, 
    family = "poisson", data = fishnet_model)

Coefficients:
                    Estimate   Std. Error z value            Pr(>|z|)    
(Intercept)      4.700582282  0.006390654   735.5 <0.0000000000000002 ***
potholes.nn     -0.008508284  0.000080251  -106.0 <0.0000000000000002 ***
dist_to_hotspot -0.000120190  0.000001569   -76.6 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 62107  on 2259  degrees of freedom
Residual deviance: 26257  on 2257  degrees of freedom
AIC: 37506

Number of Fisher Scoring iterations: 5
# Negative Binomial model: adds a dispersion parameter (theta) so it can
# accommodate overdispersion that the Poisson model cannot
model_nb <- glm.nb(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  data = fishnet_model
)

# Inspect coefficients, deviance, and theta
summary(model_nb)

Call:
glm.nb(formula = countPotholes ~ potholes.nn + dist_to_hotspot, 
    data = fishnet_model, init.theta = 3.459954798, link = log)

Coefficients:
                    Estimate   Std. Error z value            Pr(>|z|)    
(Intercept)      4.771149805  0.022719667  210.00 <0.0000000000000002 ***
potholes.nn     -0.010296842  0.000211564  -48.67 <0.0000000000000002 ***
dist_to_hotspot -0.000089496  0.000004417  -20.26 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(3.46) family taken to be 1)

    Null deviance: 6734.1  on 2259  degrees of freedom
Residual deviance: 2383.8  on 2257  degrees of freedom
AIC: 18733

Number of Fisher Scoring iterations: 1

              Theta:  3.460 
          Std. Err.:  0.117 

 2 x log-likelihood:  -18724.588 
# Compare AIC (lower is better)
cat("\nModel Comparison:\n")

Model Comparison:
cat("Poisson AIC:", round(AIC(model_poisson), 1), "\n")
Poisson AIC: 37505.9 
cat("Negative Binomial AIC:", round(AIC(model_nb), 1), "\n")
Negative Binomial AIC: 18732.6 

Part 5: Spatial Cross-Validation

  • Implement Leave-One-Group-Out cross-validation on data
  • Calculate and report error metrics (MAE, RMSE)
# Get unique districts
districts <- unique(fishnet_model$District)
cv_results <- tibble()

cat("Running LOGO Cross-Validation...\n")
Running LOGO Cross-Validation...
# Leave-One-Group-Out CV: hold out one police district per fold, train the
# NB model on all other districts, and score the held-out cells.
# FIX: the original grew `cv_results` with bind_rows() inside the loop
# (quadratic copying); build one tibble per fold and bind once instead.
evaluate_fold <- function(i) {
  test_district <- districts[i]
  
  # Split by district
  train_data <- fishnet_model %>% filter(District != test_district)
  test_data <- fishnet_model %>% filter(District == test_district)
  
  # Fit on training folds only
  model_cv <- glm.nb(
    countPotholes ~ potholes.nn + dist_to_hotspot,
    data = train_data
  )
  
  # Predict the held-out district on the response (count) scale
  preds <- predict(model_cv, test_data, type = "response")
  
  # Fold-level error metrics
  mae <- mean(abs(test_data$countPotholes - preds))
  rmse <- sqrt(mean((test_data$countPotholes - preds)^2))
  
  cat("  Fold", i, "/", length(districts), "- District", test_district, 
      "- MAE:", round(mae, 2), "\n")
  
  tibble(
    fold = i,
    test_district = test_district,
    n_test = nrow(test_data),
    mae = mae,
    rmse = rmse
  )
}

# One row per fold, bound in a single pass
cv_results <- map_dfr(seq_along(districts), evaluate_fold)
  Fold 1 / 23 - District 5 - MAE: 8.37 
  Fold 2 / 23 - District 4 - MAE: 7.8 
  Fold 3 / 23 - District 22 - MAE: 13.19 
  Fold 4 / 23 - District 31 - MAE: 5.04 
  Fold 5 / 23 - District 6 - MAE: 14.5 
  Fold 6 / 23 - District 8 - MAE: 16.22 
  Fold 7 / 23 - District 7 - MAE: 17.26 
  Fold 8 / 23 - District 3 - MAE: 18.76 
  Fold 9 / 23 - District 2 - MAE: 15.3 
  Fold 10 / 23 - District 9 - MAE: 16.86 
  Fold 11 / 23 - District 10 - MAE: 16.51 
  Fold 12 / 23 - District 1 - MAE: 25.72 
  Fold 13 / 23 - District 12 - MAE: 23.06 
  Fold 14 / 23 - District 15 - MAE: 10.77 
  Fold 15 / 23 - District 11 - MAE: 17.22 
  Fold 16 / 23 - District 18 - MAE: 34.43 
  Fold 17 / 23 - District 25 - MAE: 18.07 
  Fold 18 / 23 - District 14 - MAE: 29.04 
  Fold 19 / 23 - District 19 - MAE: 28.03 
  Fold 20 / 23 - District 16 - MAE: 19.98 
  Fold 21 / 23 - District 17 - MAE: 25.44 
  Fold 22 / 23 - District 20 - MAE: 21.01 
  Fold 23 / 23 - District 24 - MAE: 20.46 
# Overall results
cat("\n✓ Cross-Validation Complete\n")

✓ Cross-Validation Complete
cat("Mean MAE:", round(mean(cv_results$mae), 2), "\n")
Mean MAE: 18.39 
cat("Mean RMSE:", round(mean(cv_results$rmse), 2), "\n")
Mean RMSE: 26.25 
# Refit the Negative Binomial model on the full dataset for mapping
final_model <- glm.nb(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  data = fishnet_model
)

# Attach model predictions to the fishnet, aligning rows by uniqueID
# (fishnet_model dropped NA rows, so unmatched cells receive NA here)
nb_preds <- predict(final_model, fishnet_model, type = "response")
fishnet <- fishnet %>%
  mutate(
    prediction_nb = nb_preds[match(uniqueID, fishnet_model$uniqueID)]
  )

# KDE baseline "predictions": rescale the KDE surface so its total equals
# the observed pothole total, making it comparable to the model output
kde_sum <- sum(fishnet$kde_value, na.rm = TRUE)
count_sum <- sum(fishnet$countPotholes, na.rm = TRUE)
fishnet <- fishnet %>%
  mutate(prediction_kde = (kde_value / kde_sum) * count_sum)

Part 6: Model Evaluation

  • Compare to KDE baseline
# Three comparable maps sharing one 0-15 fill scale, built by one helper
plot_count_map <- function(fill_col, map_title, legend_title) {
  ggplot() +
    geom_sf(data = fishnet, aes(fill = .data[[fill_col]]), color = NA) +
    scale_fill_viridis_c(
      name = legend_title, option = "plasma", limits = c(0, 15)
    ) +
    labs(title = map_title) +
    theme_crime()
}

p1 <- plot_count_map("countPotholes", "Actual Potholes", "Count")
p2 <- plot_count_map("prediction_nb", "Model Predictions (Neg. Binomial)", "Predicted")
p3 <- plot_count_map("prediction_kde", "KDE Baseline Predictions", "Predicted")

p1 + p2 + p3 +
  plot_annotation(
    title = "Actual vs. Predicted Potholes",
    subtitle = "Does our complex model outperform simple KDE?"
  )

# MAE / RMSE for the NB model vs the KDE baseline, restricted to cells
# where both predictions exist
comparison <- fishnet %>%
  st_drop_geometry() %>%
  filter(!is.na(prediction_nb), !is.na(prediction_kde)) %>%
  summarize(
    model_mae = mean(abs(countPotholes - prediction_nb)),
    model_rmse = sqrt(mean((countPotholes - prediction_nb)^2)),
    kde_mae = mean(abs(countPotholes - prediction_kde)),
    kde_rmse = sqrt(mean((countPotholes - prediction_kde)^2))
  )

# Reshape "approach_metric" column names into an approach x metric table
comparison %>%
  pivot_longer(everything(), names_to = "metric", values_to = "value") %>%
  separate(metric, into = c("approach", "metric"), sep = "_") %>%
  pivot_wider(names_from = metric, values_from = value) %>%
  kable(
    digits = 2,
    caption = "Model Performance Comparison"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Model Performance Comparison
approach mae rmse
model 17.10 26.07
kde 19.06 27.28