Lab 4: Spatial Predictive Analysis

Analyzing 311 Violations

Author

Liz Crouse

Published

April 6, 2026

#| message: false
#| warning: false
#| echo: false


# Load required packages.
# NOTE: MASS is attached *before* tidyverse so that dplyr::select() wins
# the search path. MASS::select() would otherwise mask it, and any
# unqualified select() call would silently hit the wrong function.
library(MASS)           # Negative binomial regression (glm.nb)
library(tidyverse)      # Data manipulation
library(sf)             # Spatial operations
library(here)           # Relative file paths
library(viridis)        # Color scales
library(terra)          # Raster operations (replaces 'raster')
library(spdep)          # Spatial dependence
library(FNN)            # Fast nearest neighbors
library(patchwork)      # Plot composition (replaces grid/gridExtra)
library(knitr)          # Tables
library(kableExtra)     # Table formatting
library(classInt)       # Classification intervals

# Spatstat for analyzing point patterns - split into sub-packages
library(spatstat.geom)    # Spatial geometries
library(spatstat.explore) # Spatial exploration/KDE

# Set options
options(scipen = 999)  # Suppress scientific notation in printed output
set.seed(5080)         # Reproducibility, could be any number

# Shared ggplot theme for every map in this lab: minimal base with a bold
# title, muted subtitle, and axis text/titles suppressed (maps do not need
# coordinate labels).
theme_crime <- function(base_size = 11) {
  map_tweaks <- theme(
    plot.title = element_text(face = "bold", size = base_size + 1),
    plot.subtitle = element_text(color = "gray30", size = base_size - 1),
    legend.position = "right",
    panel.grid.minor = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank()
  )
  theme_minimal(base_size = base_size) + map_tweaks
}

# Register as the session-wide default theme
theme_set(theme_crime())

cat("✓ All packages loaded successfully!\n")
✓ All packages loaded successfully!
cat("✓ Working directory:", getwd(), "\n")
✓ Working directory: C:/Users/lizmc/Desktop/Y2S2/ppa/crouse/labs/lab_4 

Part 1: Data Loading & Exploration

  • Load your 311 data and Chicago spatial boundaries
  • Create visualizations showing the spatial distribution of your violation type
  • Describe patterns you observe
#| message: false

# Police districts: the grouping unit for spatial (LOGO) cross-validation
districts_url <-
  "https://data.cityofchicago.org/api/geospatial/24zt-jpfn?method=export&format=GeoJSON"
policeDistricts <- st_read(districts_url) %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = dist_num)
Reading layer `OGRGeoJSON' from data source 
  `https://data.cityofchicago.org/api/geospatial/24zt-jpfn?method=export&format=GeoJSON' 
  using driver `GeoJSON'
Simple feature collection with 25 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -87.94011 ymin: 41.64455 xmax: -87.52414 ymax: 42.02303
Geodetic CRS:  WGS 84
# Police beats: finer-grained administrative units than districts
beats_url <-
  "https://data.cityofchicago.org/api/geospatial/n9it-hstw?method=export&format=GeoJSON"
policeBeats <- st_read(beats_url) %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(Beat = beat_num)
Reading layer `OGRGeoJSON' from data source 
  `https://data.cityofchicago.org/api/geospatial/n9it-hstw?method=export&format=GeoJSON' 
  using driver `GeoJSON'
Simple feature collection with 277 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -87.94011 ymin: 41.64455 xmax: -87.52414 ymax: 42.02303
Geodetic CRS:  WGS 84
# City boundary polygon (from the Public Policy Analytics book repo)
boundary_url <- paste0(
  "https://raw.githubusercontent.com/urbanSpatial/",
  "Public-Policy-Analytics-Landing/master/DATA/Chapter5/chicagoBoundary.geojson"
)
chicagoBoundary <- st_read(boundary_url) %>%
  st_transform('ESRI:102271')
Reading layer `chicagoBoundary' from data source 
  `https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/Chapter5/chicagoBoundary.geojson' 
  using driver `GeoJSON'
Simple feature collection with 1 feature and 1 field
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -87.8367 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
Geodetic CRS:  WGS 84
cat("✓ Loaded spatial boundaries\n")
✓ Loaded spatial boundaries
cat("  - Police districts:", nrow(policeDistricts), "\n")
  - Police districts: 25 
cat("  - Police beats:", nrow(policeBeats), "\n")
  - Police beats: 277 
  - Number of potholes: 95698 
  - CRS: ESRI:102271 
  - Date range: 01/01/2024 01:10:51 PM to 12/31/2025 07:49:19 AM 
# Map A: raw pothole points over the city boundary
point_map <- ggplot() +
  geom_sf(data = chicagoBoundary, fill = "gray95", color = "gray60") +
  geom_sf(data = potholes, color = "#d62828", size = 0.1, alpha = 0.4) +
  labs(
    title = "Pothole Locations",
    subtitle = paste0("Chicago 2024-2025, n = ", nrow(potholes))
  )

# Map B: filled 2D kernel density contours of the same points
pothole_xy <- data.frame(st_coordinates(potholes))
density_map <- ggplot() +
  geom_sf(data = chicagoBoundary, fill = "gray95", color = "gray60") +
  geom_density_2d_filled(
    data = pothole_xy,
    aes(X, Y),
    alpha = 0.7,
    bins = 8
  ) +
  scale_fill_viridis_d(
    option = "plasma",
    direction = -1,
    guide = "none"
  ) +
  labs(
    title = "Density Surface",
    subtitle = "Kernel density estimation"
  )

# Compose the two panels side by side with patchwork
point_map + density_map +
  plot_annotation(
    title = "Spatial Distribution of Potholes in Chicago",
    tag_levels = 'A'
  )

# Describe patterns you observe
## Potholes are concentrated along the eastern edge of Chicago (shoreline of Lake Michigan), specifically in the northeastern area of the city. Smaller clusters appear in the north and southwest corners.

Part 2: Fishnet Grid Creation

  • Create a 500m x 500m fishnet grid
  • Aggregate your violations to grid cells
  • Visualize the count distribution
# Build a square 500 m x 500 m grid covering the city extent.
# chicagoBoundary is already in ESRI:102271 (units: meters), so
# cellsize = 500 means 500 m; the st_transform() is a belt-and-braces
# no-op kept from the original workflow.
fishnet <- chicagoBoundary %>%
  st_make_grid(cellsize = 500, square = TRUE) %>%
  st_sf() %>%
  st_transform('ESRI:102271') %>%
  mutate(uniqueID = row_number())

# sf spatial subset: keep only cells intersecting the city polygon
fishnet <- fishnet[chicagoBoundary, ]

# View basic info
cat("✓ Created fishnet grid\n")
✓ Created fishnet grid
cat("  - Number of cells:", nrow(fishnet), "\n")
  - Number of cells: 2458 
cat("  - Cell size:", 500, "x", 500, "meters\n")
  - Cell size: 500 x 500 meters
cat("  - Cell area:", round(st_area(fishnet[1,])), "square meters\n")
  - Cell area: 250000 square meters
# Count potholes per grid cell: assign each point to its containing cell
# (st_within), then tally by cell ID. Cells containing zero potholes are
# absent from this result and are filled in downstream.
potholes_fishnet <- potholes %>%
  st_join(fishnet, join = st_within) %>%
  st_drop_geometry() %>%
  count(uniqueID, name = "countPotholes")

# Quick structure check
glimpse(potholes_fishnet)
Rows: 2,247
Columns: 2
$ uniqueID      <int> 37, 38, 44, 47, 48, 49, 50, 90, 91, 92, 93, 94, 96, 97, …
$ countPotholes <int> 7, 8, 2, 9, 3, 1, 9, 1, 5, 17, 15, 3, 1, 1, 6, 16, 48, 2…
# Bring counts back onto the full grid; cells with no potholes were absent
# from potholes_fishnet, so their NA counts become explicit zeros here.
fishnet <- fishnet %>%
  left_join(potholes_fishnet, by = "uniqueID") %>%
  mutate(countPotholes = coalesce(countPotholes, 0L))
# Choropleth of pothole counts per cell.
# BUG FIX: the boundary outline was drawn with color = "NA" (the *string*
# "NA"), which R treats as the transparent color, so the linewidth-1
# outline never appeared. Use "white", consistent with the KDE map below.
ggplot() +
  geom_sf(data = fishnet, aes(fill = countPotholes), color = NA) +
  geom_sf(data = chicagoBoundary, fill = NA, color = "white", linewidth = 1) +
  scale_fill_viridis_c(
    name = "Potholes",
    option = "plasma",
    trans = "sqrt",  # Square-root scale tames the right-skewed counts
    breaks = c(0, 100, 200, 300, 400)
  ) +
  labs(
    title = "Potholes Counts by Grid Cell",
    subtitle = "500m x 500m cells, Chicago 2024-2025"
  ) +
  theme_crime()

Part 3: Spatial Features

  • Calculate k-nearest neighbor features
  • Perform Local Moran’s I analysis
  • Identify hot spots and cold spots
  • Create distance-to-hotspot measures
  • Join any additional contextual data if you are looking for more to do and really get into this (e.g., demographics, land use)
# Convert potholes to ppp (point pattern) format for spatstat.
# NOTE(review): the observation window is the *bounding box* of the city,
# not the city polygon, so density.ppp()'s edge correction applies at the
# rectangle edges rather than the true boundary -- confirm this is
# acceptable if cells on the city edge matter.
potholes_ppp <- as.ppp(
  st_coordinates(potholes),
  W = as.owin(st_bbox(chicagoBoundary))
)

# Calculate KDE with 1km bandwidth
kde_potholes <- density.ppp(
  potholes_ppp,
  sigma = 1000,  # 1km bandwidth (CRS units are meters)
  edge = TRUE    # Edge correction
)

# Convert to terra raster (modern approach, not raster::raster)
kde_raster <- rast(kde_potholes)

# Extract KDE values to fishnet cells: mean of the KDE pixels overlapping
# each polygon. terra::extract() returns a data frame whose first column
# is the polygon ID and whose second column holds the aggregated values.
fishnet <- fishnet %>%
  mutate(
    kde_value = terra::extract(
      kde_raster,
      vect(fishnet),
      fun = mean,
      na.rm = TRUE
    )[, 2]  # Extract just the values column
  )

cat("✓ Calculated KDE baseline\n")
✓ Calculated KDE baseline
# Map the KDE surface after aggregation to the fishnet
ggplot() +
  geom_sf(data = fishnet, aes(fill = kde_value), color = NA) +
  geom_sf(data = chicagoBoundary, fill = NA, color = "white", linewidth = 1) +
  scale_fill_viridis_c(name = "KDE Value", option = "plasma") +
  labs(
    title = "Kernel Density Estimation Baseline",
    subtitle = "Simple spatial smoothing of pothole locations"
  ) +
  theme_crime()

# k-nearest-neighbor feature: mean distance from each cell centroid to its
# 3 closest potholes (small value = deep inside a pothole cluster)
cell_xy <- st_coordinates(st_centroid(fishnet))
point_xy <- st_coordinates(potholes)

knn3 <- get.knnx(point_xy, cell_xy, k = 3)

fishnet <- fishnet %>%
  mutate(potholes.nn = rowMeans(knn3$nn.dist))

cat("✓ Calculated nearest neighbor distances\n")
✓ Calculated nearest neighbor distances
summary(fishnet$potholes.nn)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   1.667   51.885   82.414  137.322  141.881 1450.311 
# Classify cells into LISA (Local Moran's I) cluster categories.
#
# data     : sf polygon object to classify
# variable : string name of the numeric column to test
# k        : number of nearest neighbors for the spatial weights
#
# Returns `data` with local_i, p_value, is_significant, and moran_class
# ("High-High" hot spots, "Low-Low" cold spots, "High-Low"/"Low-High"
# spatial outliers, "Not Significant") appended.
calculate_local_morans <- function(data, variable, k = 5) {
  
  # k-nearest-neighbor spatial weights built on cell centroids
  coords <- st_coordinates(st_centroid(data))
  neighbors <- knn2nb(knearneigh(coords, k = k))
  weights <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
  
  # localmoran(): column 1 = Ii statistic, column 5 = p-value
  local_moran <- localmoran(data[[variable]], weights)
  
  # Global mean splits "high" cells from "low" cells
  mean_val <- mean(data[[variable]], na.rm = TRUE)
  
  data %>%
    mutate(
      local_i = local_moran[, 1],
      p_value = local_moran[, 5],
      # BUG FIX: an NA p-value made `!is_significant` NA, so case_when()
      # skipped the first arm and could label an untestable cell as a
      # cluster via the local_i conditions. Treat NA as not significant.
      is_significant = !is.na(p_value) & p_value < 0.05,
      
      moran_class = case_when(
        !is_significant ~ "Not Significant",
        local_i > 0 & .data[[variable]] > mean_val ~ "High-High",
        local_i > 0 & .data[[variable]] <= mean_val ~ "Low-Low",
        local_i < 0 & .data[[variable]] > mean_val ~ "High-Low",
        local_i < 0 & .data[[variable]] <= mean_val ~ "Low-High",
        TRUE ~ "Not Significant"  # catch-all for NA local_i
      )
    )
}

# Apply to potholes
fishnet <- calculate_local_morans(fishnet, "countPotholes", k = 5)
# Map the LISA cluster categories with a diverging palette
lisa_palette <- c(
  "High-High" = "#d7191c",
  "High-Low" = "#fdae61",
  "Low-High" = "#abd9e9",
  "Low-Low" = "#2c7bb6",
  "Not Significant" = "gray90"
)

ggplot() +
  geom_sf(data = fishnet, aes(fill = moran_class), color = NA) +
  scale_fill_manual(values = lisa_palette, name = "Cluster Type") +
  labs(
    title = "Local Moran's I: Potholes Clusters",
    subtitle = "High-High = Hot spots of disorder"
  ) +
  theme_crime()

# Hot spots = centroids of the significant High-High cells
hotspots <- fishnet %>%
  filter(moran_class == "High-High") %>%
  st_centroid()

# Distance from every cell centroid to the nearest hot spot.
# st_distance() against the union of the hot-spot points returns, for each
# centroid, its distance to the closest member of the union.
if (nrow(hotspots) > 0) {
  fishnet <- fishnet %>%
    mutate(
      dist_to_hotspot = as.numeric(
        st_distance(st_centroid(fishnet), hotspots %>% st_union())
      )
    )
  
  # BUG FIX: message previously said "abandoned car hot spots" -- a
  # leftover from the template lab; this analysis concerns potholes.
  cat("✓ Calculated distance to pothole hot spots\n")
  cat("  - Number of hot spot cells:", nrow(hotspots), "\n")
} else {
  # Degenerate fallback: with no hot spots the feature is constant zero
  fishnet <- fishnet %>%
    mutate(dist_to_hotspot = 0)
  cat("⚠ No significant hot spots found\n")
}
✓ Calculated distance to pothole hot spots
  - Number of hot spot cells: 254 

Part 4: Count Regression Models

  • Fit Poisson regression
  • Fit Negative Binomial regression
  • Compare model fit (AIC)
# Attach each cell's police district via centroid-in-polygon lookup,
# after dropping any District column left over from a previous run.
fishnet <- fishnet %>% dplyr::select(-any_of("District"))

district_lookup <- st_centroid(fishnet) %>%
  st_join(
    dplyr::select(policeDistricts, District),
    join = st_within,
    left = TRUE
  ) %>%
  st_drop_geometry() %>%
  dplyr::select(uniqueID, District)

fishnet <- fishnet %>%
  left_join(district_lookup, by = "uniqueID")

cat("✓ Joined police districts\n")
✓ Joined police districts
cat("  - Districts:", length(unique(fishnet$District)), "\n")
  - Districts: 24 
cat("  - Cells with District:", sum(!is.na(fishnet$District)), "\n")
  - Cells with District: 2260 
# Assemble the modeling frame: outcome, the two spatial features, and the
# district label used for LOGO cross-validation. Geometry is dropped so
# the model data stays lightweight.
model_vars <- c(
  "uniqueID", "District", "countPotholes", "potholes.nn", "dist_to_hotspot"
)

fishnet_model <- fishnet %>%
  st_drop_geometry() %>%
  dplyr::select(all_of(model_vars)) %>%
  na.omit()  # Remove any remaining NAs (e.g. cells without a district)

cat("✓ Prepared modeling data\n")
✓ Prepared modeling data
cat("  - Observations:", nrow(fishnet_model), "\n")
  - Observations: 2260 
cat("  - Variables:", ncol(fishnet_model), "\n")
  - Variables: 5 
# Baseline count model: Poisson GLM (assumes conditional mean == variance,
# which overdispersed pothole counts will likely violate)
model_poisson <- glm(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  family = "poisson",
  data = fishnet_model
)

# Inspect coefficients and deviance
summary(model_poisson)

Call:
glm(formula = countPotholes ~ potholes.nn + dist_to_hotspot, 
    family = "poisson", data = fishnet_model)

Coefficients:
                    Estimate   Std. Error z value            Pr(>|z|)    
(Intercept)      4.700582282  0.006390654   735.5 <0.0000000000000002 ***
potholes.nn     -0.008508284  0.000080251  -106.0 <0.0000000000000002 ***
dist_to_hotspot -0.000120190  0.000001569   -76.6 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 62107  on 2259  degrees of freedom
Residual deviance: 26257  on 2257  degrees of freedom
AIC: 37506

Number of Fisher Scoring iterations: 5
# Negative Binomial model: adds a dispersion parameter (theta) so it can
# accommodate overdispersion that the Poisson model cannot
model_nb <- glm.nb(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  data = fishnet_model
)

# Inspect coefficients, deviance, and theta
summary(model_nb)

Call:
glm.nb(formula = countPotholes ~ potholes.nn + dist_to_hotspot, 
    data = fishnet_model, init.theta = 3.459954798, link = log)

Coefficients:
                    Estimate   Std. Error z value            Pr(>|z|)    
(Intercept)      4.771149805  0.022719667  210.00 <0.0000000000000002 ***
potholes.nn     -0.010296842  0.000211564  -48.67 <0.0000000000000002 ***
dist_to_hotspot -0.000089496  0.000004417  -20.26 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(3.46) family taken to be 1)

    Null deviance: 6734.1  on 2259  degrees of freedom
Residual deviance: 2383.8  on 2257  degrees of freedom
AIC: 18733

Number of Fisher Scoring iterations: 1

              Theta:  3.460 
          Std. Err.:  0.117 

 2 x log-likelihood:  -18724.588 
# Compare AIC (lower is better)
cat("\nModel Comparison:\n")

Model Comparison:
cat("Poisson AIC:", round(AIC(model_poisson), 1), "\n")
Poisson AIC: 37505.9 
cat("Negative Binomial AIC:", round(AIC(model_nb), 1), "\n")
Negative Binomial AIC: 18732.6 

Part 5: Spatial Cross-Validation

  • Implement Leave-One-Group-Out cross-validation on data
  • Calculate and report error metrics (MAE, RMSE)
# Get unique districts
districts <- unique(fishnet_model$District)
cv_results <- tibble()

cat("Running LOGO Cross-Validation...\n")
Running LOGO Cross-Validation...
# Leave-One-Group-Out CV: hold out one police district per fold, train the
# NB model on all other districts, and score the held-out cells.
# FIX: the original grew `cv_results` with bind_rows() inside the loop
# (quadratic copying); build one tibble per fold and bind once instead.
evaluate_fold <- function(i) {
  test_district <- districts[i]
  
  # Split by district
  train_data <- fishnet_model %>% filter(District != test_district)
  test_data <- fishnet_model %>% filter(District == test_district)
  
  # Fit on training folds only
  model_cv <- glm.nb(
    countPotholes ~ potholes.nn + dist_to_hotspot,
    data = train_data
  )
  
  # Predict the held-out district on the response (count) scale
  preds <- predict(model_cv, test_data, type = "response")
  
  # Fold-level error metrics
  mae <- mean(abs(test_data$countPotholes - preds))
  rmse <- sqrt(mean((test_data$countPotholes - preds)^2))
  
  cat("  Fold", i, "/", length(districts), "- District", test_district, 
      "- MAE:", round(mae, 2), "\n")
  
  tibble(
    fold = i,
    test_district = test_district,
    n_test = nrow(test_data),
    mae = mae,
    rmse = rmse
  )
}

# One row per fold, bound in a single pass
cv_results <- map_dfr(seq_along(districts), evaluate_fold)
  Fold 1 / 23 - District 5 - MAE: 8.37 
  Fold 2 / 23 - District 4 - MAE: 7.8 
  Fold 3 / 23 - District 22 - MAE: 13.19 
  Fold 4 / 23 - District 31 - MAE: 5.04 
  Fold 5 / 23 - District 6 - MAE: 14.5 
  Fold 6 / 23 - District 8 - MAE: 16.22 
  Fold 7 / 23 - District 7 - MAE: 17.26 
  Fold 8 / 23 - District 3 - MAE: 18.76 
  Fold 9 / 23 - District 2 - MAE: 15.3 
  Fold 10 / 23 - District 9 - MAE: 16.86 
  Fold 11 / 23 - District 10 - MAE: 16.51 
  Fold 12 / 23 - District 1 - MAE: 25.72 
  Fold 13 / 23 - District 12 - MAE: 23.06 
  Fold 14 / 23 - District 15 - MAE: 10.77 
  Fold 15 / 23 - District 11 - MAE: 17.22 
  Fold 16 / 23 - District 18 - MAE: 34.43 
  Fold 17 / 23 - District 25 - MAE: 18.07 
  Fold 18 / 23 - District 14 - MAE: 29.04 
  Fold 19 / 23 - District 19 - MAE: 28.03 
  Fold 20 / 23 - District 16 - MAE: 19.98 
  Fold 21 / 23 - District 17 - MAE: 25.44 
  Fold 22 / 23 - District 20 - MAE: 21.01 
  Fold 23 / 23 - District 24 - MAE: 20.46 
# Overall results
cat("\n✓ Cross-Validation Complete\n")

✓ Cross-Validation Complete
cat("Mean MAE:", round(mean(cv_results$mae), 2), "\n")
Mean MAE: 18.39 
cat("Mean RMSE:", round(mean(cv_results$rmse), 2), "\n")
Mean RMSE: 26.25 
# Refit the Negative Binomial model on the full dataset for mapping
final_model <- glm.nb(
  countPotholes ~ potholes.nn + dist_to_hotspot,
  data = fishnet_model
)

# Attach model predictions to the fishnet, aligning rows by uniqueID
# (fishnet_model dropped NA rows, so unmatched cells receive NA here)
nb_preds <- predict(final_model, fishnet_model, type = "response")
fishnet <- fishnet %>%
  mutate(
    prediction_nb = nb_preds[match(uniqueID, fishnet_model$uniqueID)]
  )

# KDE baseline "predictions": rescale the KDE surface so its total equals
# the observed pothole total, making it comparable to the model output
kde_sum <- sum(fishnet$kde_value, na.rm = TRUE)
count_sum <- sum(fishnet$countPotholes, na.rm = TRUE)
fishnet <- fishnet %>%
  mutate(prediction_kde = (kde_value / kde_sum) * count_sum)

Part 6: Model Evaluation

  • Compare to KDE baseline
# Three comparable maps sharing one 0-15 fill scale, built by one helper
plot_count_map <- function(fill_col, map_title, legend_title) {
  ggplot() +
    geom_sf(data = fishnet, aes(fill = .data[[fill_col]]), color = NA) +
    scale_fill_viridis_c(
      name = legend_title, option = "plasma", limits = c(0, 15)
    ) +
    labs(title = map_title) +
    theme_crime()
}

p1 <- plot_count_map("countPotholes", "Actual Potholes", "Count")
p2 <- plot_count_map("prediction_nb", "Model Predictions (Neg. Binomial)", "Predicted")
p3 <- plot_count_map("prediction_kde", "KDE Baseline Predictions", "Predicted")

p1 + p2 + p3 +
  plot_annotation(
    title = "Actual vs. Predicted Potholes",
    subtitle = "Does our complex model outperform simple KDE?"
  )

# MAE / RMSE for the NB model vs the KDE baseline, restricted to cells
# where both predictions exist
comparison <- fishnet %>%
  st_drop_geometry() %>%
  filter(!is.na(prediction_nb), !is.na(prediction_kde)) %>%
  summarize(
    model_mae = mean(abs(countPotholes - prediction_nb)),
    model_rmse = sqrt(mean((countPotholes - prediction_nb)^2)),
    kde_mae = mean(abs(countPotholes - prediction_kde)),
    kde_rmse = sqrt(mean((countPotholes - prediction_kde)^2))
  )

# Reshape "approach_metric" column names into an approach x metric table
comparison %>%
  pivot_longer(everything(), names_to = "metric", values_to = "value") %>%
  separate(metric, into = c("approach", "metric"), sep = "_") %>%
  pivot_wider(names_from = metric, values_from = value) %>%
  kable(
    digits = 2,
    caption = "Model Performance Comparison"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Model Performance Comparison
approach mae rmse
model 17.10 26.07
kde 19.06 27.28