Bikeshare_SpaceTime_Prediction

CPLN 5080 - Spring 2026

Author

Liz Crouse

Published

April 19, 2026

Code

##Create plot theme
plotTheme <- theme_minimal() +
  theme(
    plot.title    = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    plot.caption  = element_text(size = 8),
    axis.text.x   = element_text(size = 10, angle = 45, hjust = 1),
    axis.text.y   = element_text(size = 10),
    axis.title    = element_text(size = 11, face = "bold"),
    panel.grid.major = element_line(colour = "#D0D0D0", linewidth = 0.2),
    panel.grid.minor = element_blank(),
    axis.ticks    = element_blank(),
    legend.position = "right"
  )

mapTheme <- theme_void() +
  theme(
    plot.title    = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10),
    plot.caption  = element_text(size = 8),
    legend.position  = "right",
    plot.margin      = margin(1, 1, 1, 1, "cm"),
    legend.key.height = unit(1, "cm"),
    legend.key.width  = unit(0.2, "cm")
  )

palette5 <- c("#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c")

Code

indego_q3 <- read_csv(here("labs/lab_5/data/indego-trips-2025-q3.csv"))
indego_q1 <- read_csv(here("labs/lab_5/data/indego-trips-2025-q1.csv"))

Code

#Initial data exploration and analysis
# How many trips?
cat("Total trips in Q3 2025:", nrow(indego_q3), "\n")

Total trips in Q3 2025: 465464

Code

##465,464

# Date range
cat("Date range:", 
    min(mdy_hm(indego_q3$start_time)), "to", 
    max(mdy_hm(indego_q3$start_time)), "\n")

Date range: 1751328360 to 1759276680

Code

##July 1 to Sept 30


# How many unique stations?
cat("Unique start stations:", length(unique(indego_q3$start_station)), "\n")

Unique start stations: 284

Code

##284 unique stations


# Trip types
table(indego_q3$trip_route_category)


   One Way Round Trip 
    434518      30946

Code

# Passholder types
table(indego_q3$passholder_type)


 Day Pass  Indego30 Indego365      NULL   Walk-up 
    18418    252897    157695         5     36449

Code

# Bike types
table(indego_q3$bike_type)


electric standard 
  299515   165949

Code

#Creating hourlong bins for start time
indego_q3 <- indego_q3 %>%
  mutate(
    # Parse datetime
    start_datetime = mdy_hm(start_time),
    end_datetime = mdy_hm(end_time),
    
    # Create hourly bins
    interval60 = floor_date(start_datetime, unit = "hour"),
    
    # Extract time features
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    
    # Create useful indicators
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )

# Look at temporal features
head(indego_q3 %>% select(start_datetime, interval60, week, dotw, hour, weekend))

# A tibble: 6 × 6
  start_datetime      interval60           week dotw   hour weekend
  <dttm>              <dttm>              <dbl> <ord> <int>   <dbl>
1 2025-07-01 00:06:00 2025-07-01 00:00:00    26 Tue       0       0
2 2025-07-01 00:11:00 2025-07-01 00:00:00    26 Tue       0       0
3 2025-07-01 00:13:00 2025-07-01 00:00:00    26 Tue       0       0
4 2025-07-01 00:16:00 2025-07-01 00:00:00    26 Tue       0       0
5 2025-07-01 00:16:00 2025-07-01 00:00:00    26 Tue       0       0
6 2025-07-01 00:18:00 2025-07-01 00:00:00    26 Tue       0       0

Code

indego_q1 <- indego_q1 %>%
  mutate(
    # Parse datetime
    start_datetime = mdy_hm(start_time),
    end_datetime = mdy_hm(end_time),
    
    # Create hourly bins
    interval60 = floor_date(start_datetime, unit = "hour"),
    
    # Extract time features
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    
    # Create useful indicators
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )

# Look at temporal features
head(indego_q1 %>% select(start_datetime, interval60, week, dotw, hour, weekend))

# A tibble: 6 × 6
  start_datetime      interval60           week dotw   hour weekend
  <dttm>              <dttm>              <dbl> <ord> <int>   <dbl>
1 2025-01-01 00:00:00 2025-01-01 00:00:00     1 Wed       0       0
2 2025-01-01 00:04:00 2025-01-01 00:00:00     1 Wed       0       0
3 2025-01-01 00:05:00 2025-01-01 00:00:00     1 Wed       0       0
4 2025-01-01 00:05:00 2025-01-01 00:00:00     1 Wed       0       0
5 2025-01-01 00:08:00 2025-01-01 00:00:00     1 Wed       0       0
6 2025-01-01 00:14:00 2025-01-01 00:00:00     1 Wed       0       0

Code

# Daily trip counts Q3
daily_trips_q3 <- indego_q3 %>%
  group_by(date) %>%
  summarize(trips = n())

ggplot(daily_trips_q3, aes(x = date, y = trips)) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_smooth(se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Indego Daily Ridership - Q3 2025",
    subtitle = "Late summer demand patterns in Philadelphia",
    x = "Date",
    y = "Daily Trips",
    caption = "Source: Indego bike share"
  ) +
  plotTheme

Code

# Daily trip counts Q1
daily_trips_q1 <- indego_q1 %>%
  group_by(date) %>%
  summarize(trips = n())

ggplot(daily_trips_q1, aes(x = date, y = trips)) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_smooth(se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Indego Daily Ridership - Q1 2025",
    subtitle = "Winter demand patterns in Philadelphia",
    x = "Date",
    y = "Daily Trips",
    caption = "Source: Indego bike share"
  ) +
  plotTheme

Code

#Q1 shows a steady increase over time likely do to warming weather conditions; Q3 has more variation but an overall upward trend. This could be due to the weather being too hot, school being out of session, etc. Overall, Q3 has a significantly greater volume of trips.

Code

# Average trips by hour and day type
hourly_patterns_q3 <- indego_q3 %>%
  group_by(hour, weekend) %>%
  summarize(avg_trips = n() / n_distinct(date)) %>%
  mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))

ggplot(hourly_patterns_q3, aes(x = hour, y = avg_trips, color = day_type)) +
  geom_line(linewidth = 1.2) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Average Hourly Ridership Patterns Q3",
    subtitle = "Clear commute patterns on weekdays",
    x = "Hour of Day",
    y = "Average Trips per Hour",
    color = "Day Type"
  ) +
  plotTheme

Code

hourly_patterns_q1 <- indego_q1 %>%
  group_by(hour, weekend) %>%
  summarize(avg_trips = n() / n_distinct(date)) %>%
  mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))

ggplot(hourly_patterns_q1, aes(x = hour, y = avg_trips, color = day_type)) +
  geom_line(linewidth = 1.2) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Average Hourly Ridership Patterns Q1",
    subtitle = "Clear commute patterns on weekdays",
    x = "Hour of Day",
    y = "Average Trips per Hour",
    color = "Day Type"
  ) +
  plotTheme

Code

#Both Q1 and Q3 show clear patterns of commutes on weekdays. Ridership is higher overall in Q3, but the trends mirror each other almost exactly.

Code

# Most popular origin stations
top_stations_q3 <- indego_q3 %>%
  count(start_station, start_lat, start_lon, name = "trips") %>%
  arrange(desc(trips)) %>%
  head(20)

kable(top_stations_q3, 
      caption = "Top 20 Indego Stations by Trip Origins Q3",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Top 20 Indego Stations by Trip Origins Q3
start_station	start_lat	start_lon	trips
3,010	39.94711	-75.16618	7,716
3,032	39.94527	-75.17971	5,884
3,213	39.93887	-75.16663	5,660
3,163	39.94974	-75.18097	5,174
3,359	39.94888	-75.16978	4,681
3,185	39.95169	-75.15888	4,670
3,028	39.94061	-75.14958	4,607
3,020	39.94855	-75.19007	4,556
3,022	39.95472	-75.18323	4,475
3,066	39.94561	-75.17348	4,353
3,059	39.96244	-75.16121	4,265
3,063	39.94633	-75.16980	4,230
3,161	39.95486	-75.18091	4,223
3,101	39.94295	-75.15955	4,183
3,054	39.96250	-75.17420	4,117
3,030	39.93935	-75.15716	3,986
3,362	39.94816	-75.16226	3,965
3,061	39.95425	-75.17761	3,716
3,057	39.96439	-75.17987	3,668
3,296	39.95134	-75.16758	3,665

Code

top_stations_q1 <- indego_q1 %>%
  count(start_station, start_lat, start_lon, name = "trips") %>%
  arrange(desc(trips)) %>%
  head(20)

kable(top_stations_q1, 
      caption = "Top 20 Indego Stations by Trip Origins Q1",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Top 20 Indego Stations by Trip Origins Q1
start_station	start_lat	start_lon	trips
3,010	39.94711	-75.16618	3,999
3,032	39.94527	-75.17971	2,842
3,359	39.94888	-75.16978	2,699
3,020	39.94855	-75.19007	2,673
3,208	39.95048	-75.19324	2,503
3,244	39.93865	-75.16674	2,486
3,066	39.94561	-75.17348	2,396
3,362	39.94816	-75.16226	2,387
3,012	39.94218	-75.17747	2,361
3,028	39.94061	-75.14958	2,348
3,161	39.95486	-75.18091	2,278
3,101	39.94295	-75.15955	2,274
3,295	39.95028	-75.16027	2,160
3,054	39.96250	-75.17420	2,123
3,185	39.95169	-75.15888	2,116
3,038	39.94781	-75.19409	2,111
3,203	39.94077	-75.17227	2,106
3,059	39.96244	-75.16121	2,027
3,022	39.95472	-75.18323	2,014
3,063	39.94633	-75.16980	2,014

Code

# Get Philadelphia census data
philly_census <- get_acs(
  geography = "tract",
  variables = c(
    "B01003_001",  # Total population
    "B19013_001",  # Median household income
    "B08301_001",  # Total commuters
    "B08301_010",  # Commute by transit
    "B02001_002",  # White alone
    "B25077_001"   # Median home value
  ),
  state = "PA",
  county = "Philadelphia",
  year = 2022,
  geometry = TRUE,
  output = "wide"
) %>%
  rename(
    Total_Pop = B01003_001E,
    Med_Inc = B19013_001E,
    Total_Commuters = B08301_001E,
    Transit_Commuters = B08301_010E,
    White_Pop = B02001_002E,
    Med_Home_Value = B25077_001E
  ) %>%
  mutate(
    Percent_Taking_Transit = (Transit_Commuters / Total_Commuters) * 100,
    Percent_White = (White_Pop / Total_Pop) * 100
  ) %>%
  st_transform(crs = 4326)  #transforming to WGS84 for lat/lon matching, currently in NAD83


  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |===============================================================       |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%

Code

# Check the data
glimpse(philly_census)

Rows: 408
Columns: 17
$ GEOID                  <chr> "42101001500", "42101001800", "42101002802", "4…
$ NAME                   <chr> "Census Tract 15; Philadelphia County; Pennsylv…
$ Total_Pop              <dbl> 3251, 3300, 5720, 4029, 4415, 1815, 3374, 2729,…
$ B01003_001M            <dbl> 677, 369, 796, 437, 853, 210, 480, 734, 763, 11…
$ Med_Inc                <dbl> 110859, 114063, 78871, 61583, 32347, 48581, 597…
$ B19013_001M            <dbl> 24975, 30714, 20396, 22293, 4840, 13812, 6278, …
$ Total_Commuters        <dbl> 2073, 2255, 3032, 2326, 1980, 969, 2427, 708, 2…
$ B08301_001M            <dbl> 387, 308, 478, 383, 456, 189, 380, 281, 456, 68…
$ Transit_Commuters      <dbl> 429, 123, 685, 506, 534, 192, 658, 218, 438, 51…
$ B08301_010M            <dbl> 188, 66, 219, 144, 285, 71, 278, 184, 176, 235,…
$ White_Pop              <dbl> 2185, 2494, 3691, 3223, 182, 984, 2111, 231, 35…
$ B02001_002M            <dbl> 268, 381, 592, 380, 88, 190, 463, 112, 238, 778…
$ Med_Home_Value         <dbl> 568300, 605000, 350600, 296400, 76600, 289700, …
$ B25077_001M            <dbl> 58894, 34876, 12572, 22333, 10843, 118720, 1506…
$ geometry               <MULTIPOLYGON [°]> MULTIPOLYGON (((-75.16558 3..., MU…
$ Percent_Taking_Transit <dbl> 20.694645, 5.454545, 22.592348, 21.754084, 26.9…
$ Percent_White          <dbl> 67.2100892, 75.5757576, 64.5279720, 79.9950360,…

Code

# Map median income
ggplot() +
  geom_sf(data = philly_census, aes(fill = Med_Inc), color = NA) +
  scale_fill_viridis(
    option = "viridis",
    name = "Median\nIncome",
    labels = scales::dollar
  ) +
  labs(
    title = "Philadelphia Median Household Income by Census Tract",
    subtitle = "Context for understanding bike share demand patterns"
  ) +
  # Stations 
  geom_point(
    data = indego_q3,
    aes(x = start_lon, y = start_lat),
    color = "red", size = 0.25, alpha = 0.6
  ) +
  mapTheme

Code

#Stations cluster around areas with higher median income, but those also tend to be in Center City, which has more bike lanes, greater population density, etc.

Code

# Create sf object for stations
stations_sf <- indego_q3 %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326)

# Spatial join to get census tract for each station
stations_census <- st_join(stations_sf, philly_census, left = TRUE) %>%
  st_drop_geometry()

stations_for_map <- indego_q3 %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  left_join(
    stations_census %>% select(start_station, Med_Inc),
    by = "start_station"
  ) %>%
  mutate(has_census = !is.na(Med_Inc))

# Add back to trip data
indego_census_q3 <- indego_q3 %>%
  left_join(
    stations_census %>% 
      select(start_station, Med_Inc, Percent_Taking_Transit, 
             Percent_White, Total_Pop),
    by = "start_station"
  )

indego_census_q1 <- indego_q1 %>%
  left_join(
    stations_census %>% 
      select(start_station, Med_Inc, Percent_Taking_Transit, 
             Percent_White, Total_Pop),
    by = "start_station"
  )


# Prepare data for visualization
stations_for_map <- indego_q3 %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  left_join(
    stations_census %>% select(start_station, Med_Inc),
    by = "start_station"
  ) %>%
  mutate(has_census = !is.na(Med_Inc))

# Create the map showing problem stations
ggplot() +
  geom_sf(data = philly_census, aes(fill = Med_Inc), color = "white", size = 0.05) +
  scale_fill_viridis(
    option = "viridis",
    name = "Median\nIncome",
    labels = scales::dollar,
    na.value = "grey90"
  ) +  geom_point(
    data = stations_for_map %>% filter(has_census),
    aes(x = start_lon, y = start_lat),
    color = "grey30", size = 1, alpha = 0.6
  ) +
  geom_point(
    data = stations_for_map %>% filter(!has_census),
    aes(x = start_lon, y = start_lat),
    color = "red", size = 1, shape = 4, stroke = 1.5
  ) +
  labs(
    title = "Philadelphia Median Household Income by Census Tract",
    subtitle = "Indego stations shown (RED = no census data match)",
    caption = "Red X marks indicate stations that didn't join to census tracts"
  ) +
  mapTheme

Code

#Stations along the river and peripheral, non-residential parts of the city must be accounted for when doing demographic research.

Code

# Identify which stations to keep
valid_stations <- stations_census %>%
  filter(!is.na(Med_Inc)) %>%
  pull(start_station)

# Filter trip data to valid stations only
indego_census <- indego_q3 %>%
  filter(start_station %in% valid_stations) %>%
  left_join(
    stations_census %>% 
      select(start_station, Med_Inc, Percent_Taking_Transit, 
             Percent_White, Total_Pop),
    by = "start_station"
  )

#Here we identify and filter out those stations that do not match a demographic census area.

Code

# Get weather from Philadelphia International Airport (KPHL)
# This covers Q3 2025: July 1 - September 30
weather_data_q3 <- riem_measures(
  station = "PHL",  # Philadelphia International Airport
  date_start = "2025-07-01",
  date_end = "2025-09-30"
)

# Process weather data
weather_processed_q3 <- weather_data_q3 %>%
  mutate(
    interval60 = floor_date(valid, unit = "hour"),
    Temperature = tmpf,  # Temperature in Fahrenheit
    Precipitation = ifelse(is.na(p01i), 0, p01i),  # Hourly precip in inches
    Wind_Speed = sknt  # Wind speed in knots
  ) %>%
  select(interval60, Temperature, Precipitation, Wind_Speed) %>%
  distinct(interval60, .keep_all = TRUE)

# Check for missing hours and interpolate if needed
weather_complete_q3 <- weather_processed_q3 %>%
  complete(interval60 = seq(min(interval60), max(interval60), by = "hour")) %>%
  fill(Temperature, Precipitation, Wind_Speed, .direction = "down")

# Look at the weather
summary(weather_complete_q3 %>% select(Temperature, Precipitation, Wind_Speed))

  Temperature    Precipitation         Wind_Speed    
 Min.   :57.00   Min.   :0.0000000   Min.   : 0.000  
 1st Qu.:70.00   1st Qu.:0.0000000   1st Qu.: 4.000  
 Median :76.00   Median :0.0000000   Median : 6.000  
 Mean   :75.94   Mean   :0.0008342   Mean   : 6.218  
 3rd Qu.:81.00   3rd Qu.:0.0000000   3rd Qu.: 8.000  
 Max.   :98.00   Max.   :0.1000000   Max.   :25.000

Code

# This covers Q1 2025: January 1 - March 31
weather_data_q1 <- riem_measures(
  station = "PHL",  # Philadelphia International Airport
  date_start = "2025-01-01",
  date_end = "2025-03-31"
)

# Process weather data
weather_processed_q1 <- weather_data_q1 %>%
  mutate(
    interval60 = floor_date(valid, unit = "hour"),
    Temperature = tmpf,  # Temperature in Fahrenheit
    Precipitation = ifelse(is.na(p01i), 0, p01i),  # Hourly precip in inches
    Wind_Speed = sknt  # Wind speed in knots
  ) %>%
  select(interval60, Temperature, Precipitation, Wind_Speed) %>%
  distinct(interval60, .keep_all = TRUE)

# Check for missing hours and interpolate if needed
weather_complete_q1 <- weather_processed_q1 %>%
  complete(interval60 = seq(min(interval60), max(interval60), by = "hour")) %>%
  fill(Temperature, Precipitation, Wind_Speed, .direction = "down")

# Look at the weather
summary(weather_complete_q1 %>% select(Temperature, Precipitation, Wind_Speed))

  Temperature    Precipitation        Wind_Speed    
 Min.   :10.00   Min.   :0.000000   Min.   : 0.000  
 1st Qu.:30.00   1st Qu.:0.000000   1st Qu.: 6.000  
 Median :37.00   Median :0.000000   Median : 8.000  
 Mean   :38.36   Mean   :0.001327   Mean   : 9.308  
 3rd Qu.:46.00   3rd Qu.:0.000000   3rd Qu.:13.000  
 Max.   :78.00   Max.   :0.280000   Max.   :30.000

Code

ggplot(weather_complete_q3, aes(x = interval60, y = Temperature)) +
  geom_line(color = "#3182bd", alpha = 0.7) +
  geom_smooth(se = FALSE, color = "red") +
  labs(
    title = "Philadelphia Temperature - Q3 2025",
    subtitle = "Summer to early fall transition",
    x = "Date",
    y = "Temperature (°F)"
  ) +
  plotTheme

Code

ggplot(weather_complete_q1, aes(x = interval60, y = Temperature)) +
  geom_line(color = "#3182bd", alpha = 0.7) +
  geom_smooth(se = FALSE, color = "red") +
  labs(
    title = "Philadelphia Temperature - Q1 2025",
    subtitle = "Winter to early spring transition",
    x = "Date",
    y = "Temperature (°F)"
  ) +
  plotTheme

Code

#Clear temperature trend upward in Q1 with most days averaging below 60. Q3 patterns have less of a trend, with the line fit mostly staying between 70 and 80 on average.

Code

# Count trips by station-hour
trips_panel_q3 <- indego_census_q3 %>%
  group_by(interval60, start_station, start_lat, start_lon,
           Med_Inc, Percent_Taking_Transit, Percent_White, Total_Pop) %>%
  summarize(Trip_Count = n()) %>%
  ungroup()

trips_panel_q1 <- indego_census_q1 %>%
  group_by(interval60, start_station, start_lat, start_lon,
           Med_Inc, Percent_Taking_Transit, Percent_White, Total_Pop) %>%
  summarize(Trip_Count = n()) %>%
  ungroup()

# How many station-hour observations?
nrow(trips_panel_q3)

[1] 225484

Code

# How many unique stations?
length(unique(trips_panel_q3$start_station))

[1] 284

Code

# How many unique hours?
length(unique(trips_panel_q3$interval60))

[1] 2208

Code

hist(trips_panel_q3$Trip_Count)

Code

# Calculate expected panel size
n_stations_q1 <- length(unique(trips_panel_q1$start_station))
n_hours_q1 <- length(unique(trips_panel_q1$interval60))
expected_rows_q1 <- n_stations_q1 * n_hours_q1

cat("Expected panel rows:", format(expected_rows_q1, big.mark = ","), "\n")

Expected panel rows: 570,015

Code

cat("Current rows:", format(nrow(trips_panel_q1), big.mark = ","), "\n")

Current rows: 122,508

Code

cat("Missing rows:", format(expected_rows_q1 - nrow(trips_panel_q1), big.mark = ","), "\n")

Missing rows: 447,507

Code

n_stations_q3 <- length(unique(trips_panel_q3$start_station))
n_hours_q3 <- length(unique(trips_panel_q3$interval60))
expected_rows_q3 <- n_stations_q3 * n_hours_q3

cat("Expected panel rows:", format(expected_rows_q3, big.mark = ","), "\n")

Expected panel rows: 627,072

Code

cat("Current rows:", format(nrow(trips_panel_q3), big.mark = ","), "\n")

Current rows: 225,484

Code

cat("Missing rows:", format(expected_rows_q3 - nrow(trips_panel_q3), big.mark = ","), "\n")

Missing rows: 401,588

Code

# Create complete panel Q3
study_panel_q3 <- expand_grid(
  interval60 = unique(trips_panel_q3$interval60),
  start_station = unique(trips_panel_q3$start_station)
) %>%
  # Join trip counts
  left_join(trips_panel_q3 %>% 
              select(interval60, start_station, Trip_Count),
            by = c("interval60", "start_station")) %>%
  # Replace NA trip counts with 0
  mutate(Trip_Count = replace_na(Trip_Count, 0))

glimpse(study_panel_q3)

Rows: 627,072
Columns: 3
$ interval60    <dttm> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01, 2025-07…
$ start_station <dbl> 3058, 3061, 3078, 3100, 3114, 3123, 3160, 3163, 3168, 32…
$ Trip_Count    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2,…

Code

hist(study_panel_q3$Trip_Count)

Code

# Create complete panel Q1
study_panel_q1 <- expand_grid(
  interval60 = unique(trips_panel_q1$interval60),
  start_station = unique(trips_panel_q1$start_station)
) %>%
  left_join(trips_panel_q1 %>% 
              select(interval60, start_station, Trip_Count),
            by = c("interval60", "start_station")) %>%
  mutate(Trip_Count = replace_na(Trip_Count, 0))

glimpse(study_panel_q1)

Rows: 570,015
Columns: 3
$ interval60    <dttm> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01, 2025-01…
$ start_station <dbl> 3005, 3007, 3015, 3021, 3026, 3028, 3029, 3030, 3032, 30…
$ Trip_Count    <int> 6, 1, 4, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1,…

Code

hist(study_panel_q1$Trip_Count)

Code

# Fill in station attributes (they're the same for all hours)
station_attributes_q3 <- trips_panel_q3 %>%
  group_by(start_station) %>%
  summarize(
    start_lat = first(start_lat),
    start_lon = first(start_lon),
    Med_Inc = first(Med_Inc),
    Percent_Taking_Transit = first(Percent_Taking_Transit),
    Percent_White = first(Percent_White),
    Total_Pop = first(Total_Pop)
  )
station_attributes_q1 <- trips_panel_q1 %>%
  group_by(start_station) %>%
  summarize(
    start_lat = first(start_lat),
    start_lon = first(start_lon),
    Med_Inc = first(Med_Inc),
    Percent_Taking_Transit = first(Percent_Taking_Transit),
    Percent_White = first(Percent_White),
    Total_Pop = first(Total_Pop)
  )

glimpse(station_attributes_q3)

Rows: 284
Columns: 7
$ start_station          <dbl> 3000, 3005, 3006, 3007, 3008, 3009, 3010, 3012,…
$ start_lat              <dbl> NA, 39.94733, 39.95220, 39.94517, 39.98081, 39.…
$ start_lon              <dbl> NA, -75.14403, -75.20311, -75.15993, -75.15067,…
$ Med_Inc                <dbl> NA, 144922, 21064, 58557, 22244, 37674, 88583, …
$ Percent_Taking_Transit <dbl> NA, 11.742827, 9.418565, 11.401515, 27.430556, …
$ Percent_White          <dbl> NA, 86.29640, 46.88109, 62.85341, 39.65227, 60.…
$ Total_Pop              <dbl> NA, 3583, 6156, 4598, 5349, 7375, 3134, 3643, 3…

Code

study_panel_q1 <- study_panel_q1 %>%
  left_join(station_attributes_q1, by = "start_station")
study_panel_q3 <- study_panel_q3 %>%
  left_join(station_attributes_q3, by = "start_station")

glimpse(study_panel_q1)

Rows: 570,015
Columns: 9
$ interval60             <dttm> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01…
$ start_station          <dbl> 3005, 3007, 3015, 3021, 3026, 3028, 3029, 3030,…
$ Trip_Count             <int> 6, 1, 4, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3,…
$ start_lat              <dbl> 39.94733, 39.94517, 39.94735, 39.95390, 39.9418…
$ start_lon              <dbl> -75.14403, -75.15993, -75.14886, -75.16902, -75…
$ Med_Inc                <dbl> 144922, 58557, 144922, 90980, 144922, 146071, 2…
$ Percent_Taking_Transit <dbl> 11.742827, 11.401515, 11.742827, 17.800289, 11.…
$ Percent_White          <dbl> 86.29640, 62.85341, 86.29640, 79.26518, 86.2964…
$ Total_Pop              <dbl> 3583, 4598, 3583, 3130, 3583, 2428, 6267, 3300,…

Code

glimpse(study_panel_q3)

Rows: 627,072
Columns: 9
$ interval60             <dttm> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01…
$ start_station          <dbl> 3058, 3061, 3078, 3100, 3114, 3123, 3160, 3163,…
$ Trip_Count             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1,…
$ start_lat              <dbl> 39.96716, 39.95425, 39.95355, 39.92777, 39.9377…
$ start_lon              <dbl> -75.17001, -75.17761, -75.17192, -75.15103, -75…
$ Med_Inc                <dbl> 89669, 86293, 90980, 95745, 102969, NA, 36574, …
$ Percent_Taking_Transit <dbl> 13.231794, 17.370326, 17.800289, 7.231245, 8.10…
$ Percent_White          <dbl> 77.84227, 57.64486, 79.26518, 76.83231, 44.7735…
$ Total_Pop              <dbl> 2929, 2675, 3130, 4407, 2363, 898, 2231, 1735, …

Code

# Verify we have complete panel
cat("Complete panel rows:", format(nrow(study_panel_q3), big.mark = ","), "\n")

Complete panel rows: 627,072

Code

cat("Complete panel rows:", format(nrow(study_panel_q1), big.mark = ","), "\n")

Complete panel rows: 570,015

Code

#The trends on both Q1 and Q3 look similar, but Q3 operates on a much higher scale.

Code

study_panel_q3 <- study_panel_q3 %>%
  mutate(
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )

glimpse(study_panel_q3)

Rows: 627,072
Columns: 16
$ interval60             <dttm> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01…
$ start_station          <dbl> 3058, 3061, 3078, 3100, 3114, 3123, 3160, 3163,…
$ Trip_Count             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1,…
$ start_lat              <dbl> 39.96716, 39.95425, 39.95355, 39.92777, 39.9377…
$ start_lon              <dbl> -75.17001, -75.17761, -75.17192, -75.15103, -75…
$ Med_Inc                <dbl> 89669, 86293, 90980, 95745, 102969, NA, 36574, …
$ Percent_Taking_Transit <dbl> 13.231794, 17.370326, 17.800289, 7.231245, 8.10…
$ Percent_White          <dbl> 77.84227, 57.64486, 79.26518, 76.83231, 44.7735…
$ Total_Pop              <dbl> 2929, 2675, 3130, 4407, 2363, 898, 2231, 1735, …
$ week                   <dbl> 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,…
$ month                  <ord> Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Ju…
$ dotw                   <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tu…
$ hour                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ date                   <date> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01…
$ weekend                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rush_hour              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

Code

study_panel_q1 <- study_panel_q1 %>%
  mutate(
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )

glimpse(study_panel_q1)

Rows: 570,015
Columns: 16
$ interval60             <dttm> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01…
$ start_station          <dbl> 3005, 3007, 3015, 3021, 3026, 3028, 3029, 3030,…
$ Trip_Count             <int> 6, 1, 4, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3,…
$ start_lat              <dbl> 39.94733, 39.94517, 39.94735, 39.95390, 39.9418…
$ start_lon              <dbl> -75.14403, -75.15993, -75.14886, -75.16902, -75…
$ Med_Inc                <dbl> 144922, 58557, 144922, 90980, 144922, 146071, 2…
$ Percent_Taking_Transit <dbl> 11.742827, 11.401515, 11.742827, 17.800289, 11.…
$ Percent_White          <dbl> 86.29640, 62.85341, 86.29640, 79.26518, 86.2964…
$ Total_Pop              <dbl> 3583, 4598, 3583, 3130, 3583, 2428, 6267, 3300,…
$ week                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ month                  <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Ja…
$ dotw                   <ord> Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, We…
$ hour                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ date                   <date> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01…
$ weekend                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rush_hour              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

Code

study_panel_q3 <- study_panel_q3 %>%
  left_join(weather_complete_q3, by = "interval60")

glimpse(study_panel_q3)

Rows: 627,072
Columns: 19
$ interval60             <dttm> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01…
$ start_station          <dbl> 3058, 3061, 3078, 3100, 3114, 3123, 3160, 3163,…
$ Trip_Count             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1,…
$ start_lat              <dbl> 39.96716, 39.95425, 39.95355, 39.92777, 39.9377…
$ start_lon              <dbl> -75.17001, -75.17761, -75.17192, -75.15103, -75…
$ Med_Inc                <dbl> 89669, 86293, 90980, 95745, 102969, NA, 36574, …
$ Percent_Taking_Transit <dbl> 13.231794, 17.370326, 17.800289, 7.231245, 8.10…
$ Percent_White          <dbl> 77.84227, 57.64486, 79.26518, 76.83231, 44.7735…
$ Total_Pop              <dbl> 2929, 2675, 3130, 4407, 2363, 898, 2231, 1735, …
$ week                   <dbl> 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,…
$ month                  <ord> Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Ju…
$ dotw                   <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tue, Tu…
$ hour                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ date                   <date> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01…
$ weekend                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rush_hour              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Temperature            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,…
$ Precipitation          <dbl> 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001,…
$ Wind_Speed             <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…

Code

study_panel_q1 <- study_panel_q1 %>%
  left_join(weather_complete_q1, by = "interval60")

glimpse(study_panel_q1)

Rows: 570,015
Columns: 19
$ interval60             <dttm> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01…
$ start_station          <dbl> 3005, 3007, 3015, 3021, 3026, 3028, 3029, 3030,…
$ Trip_Count             <int> 6, 1, 4, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3,…
$ start_lat              <dbl> 39.94733, 39.94517, 39.94735, 39.95390, 39.9418…
$ start_lon              <dbl> -75.14403, -75.15993, -75.14886, -75.16902, -75…
$ Med_Inc                <dbl> 144922, 58557, 144922, 90980, 144922, 146071, 2…
$ Percent_Taking_Transit <dbl> 11.742827, 11.401515, 11.742827, 17.800289, 11.…
$ Percent_White          <dbl> 86.29640, 62.85341, 86.29640, 79.26518, 86.2964…
$ Total_Pop              <dbl> 3583, 4598, 3583, 3130, 3583, 2428, 6267, 3300,…
$ week                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ month                  <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Ja…
$ dotw                   <ord> Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, We…
$ hour                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ date                   <date> 2025-01-01, 2025-01-01, 2025-01-01, 2025-01-01…
$ weekend                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rush_hour              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Temperature            <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,…
$ Precipitation          <dbl> 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06,…
$ Wind_Speed             <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,…

Code

# Check for missing values
summary(study_panel_q3 %>% select(Trip_Count, Temperature, Precipitation))

   Trip_Count       Temperature    Precipitation  
 Min.   : 0.0000   Min.   :57.00   Min.   :0.000  
 1st Qu.: 0.0000   1st Qu.:70.00   1st Qu.:0.000  
 Median : 0.0000   Median :76.00   Median :0.000  
 Mean   : 0.7423   Mean   :75.94   Mean   :0.001  
 3rd Qu.: 1.0000   3rd Qu.:81.00   3rd Qu.:0.000  
 Max.   :31.0000   Max.   :98.00   Max.   :0.100  
                   NA's   :6816    NA's   :6816

Code

summary(study_panel_q1 %>% select(Trip_Count, Temperature, Precipitation))

   Trip_Count       Temperature    Precipitation  
 Min.   : 0.0000   Min.   :10.00   Min.   :0.000  
 1st Qu.: 0.0000   1st Qu.:30.00   1st Qu.:0.000  
 Median : 0.0000   Median :37.00   Median :0.000  
 Mean   : 0.3537   Mean   :38.39   Mean   :0.001  
 3rd Qu.: 0.0000   3rd Qu.:46.00   3rd Qu.:0.000  
 Max.   :26.0000   Max.   :78.00   Max.   :0.280  
                   NA's   :6360    NA's   :6360

Code

# Sort by station and time
study_panel_q3 <- study_panel_q3 %>%
  arrange(start_station, interval60)

# Create lag variables WITHIN each station
study_panel_q3 <- study_panel_q3 %>%
  group_by(start_station) %>%
  mutate(
    lag1Hour = lag(Trip_Count, 1),
    lag2Hours = lag(Trip_Count, 2),
    lag3Hours = lag(Trip_Count, 3),
    lag12Hours = lag(Trip_Count, 12),
    lag1day = lag(Trip_Count, 24)
  ) %>%
  ungroup()

# Sort by station and time
study_panel_q1 <- study_panel_q1 %>%
  arrange(start_station, interval60)

# Create lag variables WITHIN each station
study_panel_q1 <- study_panel_q1 %>%
  group_by(start_station) %>%
  mutate(
    lag1Hour = lag(Trip_Count, 1),
    lag2Hours = lag(Trip_Count, 2),
    lag3Hours = lag(Trip_Count, 3),
    lag12Hours = lag(Trip_Count, 12),
    lag1day = lag(Trip_Count, 24)
  ) %>%
  ungroup()

# Remove rows with NA lags (first 24 hours for each station)
study_panel_complete_q3 <- study_panel_q3 %>%
  filter(!is.na(lag1day))
study_panel_complete_q1 <- study_panel_q1 %>%
  filter(!is.na(lag1day))

cat("Rows after removing NA lags:", format(nrow(study_panel_complete_q1), big.mark = ","), "\n")

Rows after removing NA lags: 563,655

Code

cat("Rows after removing NA lags:", format(nrow(study_panel_complete_q3), big.mark = ","), "\n")

Rows after removing NA lags: 620,256

Code

# Sample one station to visualize
example_station <- study_panel_complete_q3 %>%
  filter(start_station == 3188) %>%
  head(168)  # One week

# Plot actual vs lagged demand
ggplot(example_station, aes(x = interval60)) +
  geom_line(aes(y = Trip_Count, color = "Current"), linewidth = 1) +
  geom_line(aes(y = lag1Hour, color = "1 Hour Ago"), linewidth = 1, alpha = 0.7) +
  geom_line(aes(y = lag1day, color = "24 Hours Ago"), linewidth = 1, alpha = 0.7) +
  scale_color_manual(values = c(
    "Current" = "#08519c",
    "1 Hour Ago" = "#3182bd",
    "24 Hours Ago" = "#6baed6"
  )) +
  labs(
    title = "Temporal Lag Patterns at One Station",
    subtitle = "Past demand predicts future demand",
    x = "Date-Time",
    y = "Trip Count",
    color = "Time Period"
  ) +
  plotTheme

Code

# Split by week
# Q3 has weeks  26-39 (Jul-Sep)
# Train on weeks 26-35 (Jul 1- early September)
# Test on weeks 36-39 (rest of September)

# Split by week
# Q1 has weeks 1-13 (Jan-Mar)
# Train on weeks 1-9 (Jan 1 - early March)
# Test on weeks 10-13 (rest of March)

# Which stations have trips in BOTH early and late periods?
early_stations_q3 <- study_panel_complete_q3 %>%
  filter(week < 36) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)

late_stations_q3 <- study_panel_complete_q3 %>%
  filter(week >= 36) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)

early_stations_q1 <- study_panel_complete_q1 %>%
  filter(week < 10) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)

late_stations_q1 <- study_panel_complete_q1 %>%
  filter(week >= 10) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)

# Keep only stations that appear in BOTH periods
common_stations_q3 <- intersect(early_stations_q3, late_stations_q3)
common_stations_q1 <- intersect(early_stations_q1, late_stations_q1)

# Filter panel to only common stations
study_panel_complete_q3 <- study_panel_complete_q3 %>%
  filter(start_station %in% common_stations_q3)

study_panel_complete_q1 <- study_panel_complete_q1 %>%
  filter(start_station %in% common_stations_q1)

# Create train/test split
train_q3 <- study_panel_complete_q3 %>%
  filter(week < 36)

test_q3 <- study_panel_complete_q3 %>%
  filter(week >= 36)

train_q1 <- study_panel_complete_q1 %>%
  filter(week < 10)

test_q1 <- study_panel_complete_q1 %>%
  filter(week >= 10)

# Convert start_station to factor in training data first,
# then align test levels to training levels so predict() doesn't error
# Convert to factor FIRST
train_q3 <- train_q3 %>%
  mutate(start_station = factor(start_station))

train_q1 <- train_q1 %>%
  mutate(start_station = factor(start_station))

train_q3 <- train_q3 %>% droplevels()
train_q1 <- train_q1 %>% droplevels()

cat("Training observations Q3:", format(nrow(train_q3), big.mark = ","), "\n")

Training observations Q3: 429,408

Code

cat("Testing observations Q3:", format(nrow(test_q3), big.mark = ","), "\n")

Testing observations Q3: 190,848

Code

cat("Training date range Q3:", min(train_q3$date), "to", max(train_q3$date), "\n")

Training date range Q3: 20271 to 20333

Code

cat("Testing date range Q3:", min(test_q3$date), "to", max(test_q3$date), "\n")

Testing date range Q3: 20334 to 20361

Code

cat("Training observations Q1:", format(nrow(train_q1), big.mark = ","), "\n")

Training observations Q1: 382,098

Code

cat("Testing observations Q1:", format(nrow(test_q1), big.mark = ","), "\n")

Testing observations Q1: 166,668

Code

cat("Training date range Q1:", min(train_q1$date), "to", max(train_q1$date), "\n")

Training date range Q1: 20090 to 20151

Code

cat("Testing date range Q1:", min(test_q1$date), "to", max(test_q1$date), "\n")

Testing date range Q1: 20152 to 20178

Code

# Q3
station_volume_q3 <- train_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  group_by(start_station) %>%
  summarize(avg_trips = mean(Trip_Count)) %>%
  mutate(volume_tier = case_when(
    avg_trips >= quantile(avg_trips, 0.67) ~ "High",
    avg_trips >= quantile(avg_trips, 0.33) ~ "Medium",
    TRUE ~ "Low"
  )) %>%
  select(start_station, volume_tier)

train_q3 <- train_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(station_volume_q3, by = "start_station") %>%
  mutate(start_station = factor(start_station))

test_q3 <- test_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(station_volume_q3, by = "start_station") %>%
  mutate(
    volume_tier = replace_na(volume_tier, "Low"),
    start_station = factor(start_station, levels = levels(train_q3$start_station))
  )

# Q1
station_volume_q1 <- train_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  group_by(start_station) %>%
  summarize(avg_trips = mean(Trip_Count)) %>%
  mutate(volume_tier = case_when(
    avg_trips >= quantile(avg_trips, 0.67) ~ "High",
    avg_trips >= quantile(avg_trips, 0.33) ~ "Medium",
    TRUE ~ "Low"
  )) %>%
  select(start_station, volume_tier)

train_q1 <- train_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(station_volume_q1, by = "start_station") %>%
  mutate(start_station = factor(start_station))

test_q1 <- test_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(station_volume_q1, by = "start_station") %>%
  mutate(
    volume_tier = replace_na(volume_tier, "Low"),
    start_station = factor(start_station, levels = levels(train_q1$start_station))
  )

# Factor levels
train_q3 <- train_q3 %>% mutate(volume_tier = factor(volume_tier, levels = c("Low", "Medium", "High")))
test_q3  <- test_q3  %>% mutate(volume_tier = factor(volume_tier, levels = c("Low", "Medium", "High")))
train_q1 <- train_q1 %>% mutate(volume_tier = factor(volume_tier, levels = c("Low", "Medium", "High")))
test_q1  <- test_q1  %>% mutate(volume_tier = factor(volume_tier, levels = c("Low", "Medium", "High")))

Code

# Center City Philadelphia coordinates (City Hall)
center_city_lat <- 39.9526
center_city_lon <- -75.1652

# Haversine function
haversine_miles <- function(lat1, lon1, lat2, lon2) {
  R <- 3958.8
  dlat <- (lat2 - lat1) * pi / 180
  dlon <- (lon2 - lon1) * pi / 180
  a <- sin(dlat/2)^2 + cos(lat1 * pi/180) * cos(lat2 * pi/180) * sin(dlon/2)^2
  2 * R * asin(sqrt(a))
}

# Calculate distances from training data only
station_distances_q3 <- train_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat)) %>%
  mutate(dist_to_center = haversine_miles(start_lat, start_lon,
                                          center_city_lat, center_city_lon))

station_distances_q1 <- train_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat)) %>%
  mutate(dist_to_center = haversine_miles(start_lat, start_lon,
                                          center_city_lat, center_city_lon))

# Join to train (remove first in case chunk is re-run)
train_q3 <- train_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  select(-any_of("dist_to_center")) %>%
  left_join(station_distances_q3 %>% select(start_station, dist_to_center),
            by = "start_station") %>%
  mutate(start_station = factor(start_station))

train_q1 <- train_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  select(-any_of("dist_to_center")) %>%
  left_join(station_distances_q1 %>% select(start_station, dist_to_center),
            by = "start_station") %>%
  mutate(start_station = factor(start_station))

# Join to test
test_q3 <- test_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  select(-any_of("dist_to_center")) %>%
  left_join(station_distances_q3 %>% select(start_station, dist_to_center),
            by = "start_station") %>%
  mutate(
    dist_to_center = replace_na(dist_to_center,
                                median(station_distances_q3$dist_to_center)),
    start_station = factor(start_station, levels = levels(train_q3$start_station))
  )

test_q1 <- test_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  select(-any_of("dist_to_center")) %>%
  left_join(station_distances_q1 %>% select(start_station, dist_to_center),
            by = "start_station") %>%
  mutate(
    dist_to_center = replace_na(dist_to_center,
                                median(station_distances_q1$dist_to_center)),
    start_station = factor(start_station, levels = levels(train_q1$start_station))
  )

# Verify
cat("Q3 train dist_to_center - min:", round(min(train_q3$dist_to_center, na.rm=TRUE), 2), 
    "max:", round(max(train_q3$dist_to_center, na.rm=TRUE), 2), "miles\n")

Q3 train dist_to_center - min: 0.09 max: 6.17 miles

Code

cat("Q1 train dist_to_center - min:", round(min(train_q1$dist_to_center, na.rm=TRUE), 2),
    "max:", round(max(train_q1$dist_to_center, na.rm=TRUE), 2), "miles\n")

Q1 train dist_to_center - min: 0.09 max: 6.17 miles

Code

cat("Q3 test NAs in dist_to_center:", sum(is.na(test_q3$dist_to_center)), "\n")

Q3 test NAs in dist_to_center: 0

Code

cat("Q1 test NAs in dist_to_center:", sum(is.na(test_q1$dist_to_center)), "\n")

Q1 test NAs in dist_to_center: 0

Code

# Create day of week factor with treatment (dummy) coding
train_q3 <- train_q3 %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

# Set contrasts to treatment coding (dummy variables)
contrasts(train_q3$dotw_simple) <- contr.treatment(7)

# Now run the model
model1_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation,
  data = train_q3
)

summary(model1_q3)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation, data = train_q3)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7751 -0.7607 -0.2115  0.1990 30.5775 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)        0.5552447  0.0292999  18.950 < 0.0000000000000002 ***
as.factor(hour)1  -0.0805044  0.0135087  -5.959     0.00000000253307 ***
as.factor(hour)2  -0.1491618  0.0135178 -11.034 < 0.0000000000000002 ***
as.factor(hour)3  -0.2057658  0.0135533 -15.182 < 0.0000000000000002 ***
as.factor(hour)4  -0.2059895  0.0135906 -15.157 < 0.0000000000000002 ***
as.factor(hour)5  -0.0934305  0.0136151  -6.862     0.00000000000679 ***
as.factor(hour)6   0.1370608  0.0136578  10.035 < 0.0000000000000002 ***
as.factor(hour)7   0.4197828  0.0136762  30.694 < 0.0000000000000002 ***
as.factor(hour)8   0.7988544  0.0137190  58.230 < 0.0000000000000002 ***
as.factor(hour)9   0.5460090  0.0137285  39.772 < 0.0000000000000002 ***
as.factor(hour)10  0.4395751  0.0137063  32.071 < 0.0000000000000002 ***
as.factor(hour)11  0.5163151  0.0136153  37.922 < 0.0000000000000002 ***
as.factor(hour)12  0.6008212  0.0135203  44.438 < 0.0000000000000002 ***
as.factor(hour)13  0.6578191  0.0134845  48.783 < 0.0000000000000002 ***
as.factor(hour)14  0.7402729  0.0134886  54.881 < 0.0000000000000002 ***
as.factor(hour)15  0.8670706  0.0135223  64.122 < 0.0000000000000002 ***
as.factor(hour)16  1.1224602  0.0135745  82.689 < 0.0000000000000002 ***
as.factor(hour)17  1.4297544  0.0136267 104.923 < 0.0000000000000002 ***
as.factor(hour)18  1.1211782  0.0136683  82.028 < 0.0000000000000002 ***
as.factor(hour)19  0.8976638  0.0136756  65.640 < 0.0000000000000002 ***
as.factor(hour)20  0.6050844  0.0136644  44.282 < 0.0000000000000002 ***
as.factor(hour)21  0.4312721  0.0136005  31.710 < 0.0000000000000002 ***
as.factor(hour)22  0.2788083  0.0135455  20.583 < 0.0000000000000002 ***
as.factor(hour)23  0.1477672  0.0134939  10.951 < 0.0000000000000002 ***
dotw_simple2       0.1075925  0.0072775  14.784 < 0.0000000000000002 ***
dotw_simple3       0.0466286  0.0072763   6.408     0.00000000014728 ***
dotw_simple4       0.0744778  0.0073128  10.185 < 0.0000000000000002 ***
dotw_simple5       0.0772038  0.0072910  10.589 < 0.0000000000000002 ***
dotw_simple6      -0.0069404  0.0073020  -0.950                0.342    
dotw_simple7      -0.0678209  0.0072821  -9.313 < 0.0000000000000002 ***
Temperature       -0.0043092  0.0003455 -12.473 < 0.0000000000000002 ***
Precipitation     -4.1554687  0.3986394 -10.424 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.274 on 429376 degrees of freedom
Multiple R-squared:  0.1069,    Adjusted R-squared:  0.1068 
F-statistic:  1658 on 31 and 429376 DF,  p-value: < 0.00000000000000022

Code

# Create day of week factor with treatment (dummy) coding
train_q1 <- train_q1 %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

# Set contrasts to treatment coding (dummy variables)
contrasts(train_q1$dotw_simple) <- contr.treatment(7)

# Now run the model
model1_q1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation,
  data = train_q1
)

summary(model1_q1)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation, data = train_q1)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.9751 -0.3949 -0.1786  0.0195 18.5457 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)       -0.1834228  0.0085842 -21.367 < 0.0000000000000002 ***
as.factor(hour)1  -0.0242730  0.0085943  -2.824             0.004738 ** 
as.factor(hour)2  -0.0346059  0.0086723  -3.990  0.00006597929894139 ***
as.factor(hour)3  -0.0493839  0.0086364  -5.718  0.00000001078098821 ***
as.factor(hour)4  -0.0333981  0.0086104  -3.879             0.000105 ***
as.factor(hour)5   0.0207478  0.0085797   2.418             0.015596 *  
as.factor(hour)6   0.1657362  0.0085876  19.299 < 0.0000000000000002 ***
as.factor(hour)7   0.3169637  0.0085908  36.896 < 0.0000000000000002 ***
as.factor(hour)8   0.5198860  0.0085954  60.484 < 0.0000000000000002 ***
as.factor(hour)9   0.3713574  0.0085935  43.214 < 0.0000000000000002 ***
as.factor(hour)10  0.2684743  0.0085994  31.220 < 0.0000000000000002 ***
as.factor(hour)11  0.2840066  0.0086046  33.007 < 0.0000000000000002 ***
as.factor(hour)12  0.3539830  0.0085925  41.197 < 0.0000000000000002 ***
as.factor(hour)13  0.3427402  0.0085703  39.992 < 0.0000000000000002 ***
as.factor(hour)14  0.3476331  0.0085618  40.603 < 0.0000000000000002 ***
as.factor(hour)15  0.4115121  0.0085634  48.054 < 0.0000000000000002 ***
as.factor(hour)16  0.5221996  0.0085680  60.947 < 0.0000000000000002 ***
as.factor(hour)17  0.6440474  0.0085787  75.076 < 0.0000000000000002 ***
as.factor(hour)18  0.4380710  0.0085875  51.013 < 0.0000000000000002 ***
as.factor(hour)19  0.2848124  0.0085940  33.141 < 0.0000000000000002 ***
as.factor(hour)20  0.1483139  0.0085894  17.267 < 0.0000000000000002 ***
as.factor(hour)21  0.0878728  0.0085799  10.242 < 0.0000000000000002 ***
as.factor(hour)22  0.0679936  0.0085708   7.933  0.00000000000000214 ***
as.factor(hour)23  0.0419470  0.0085613   4.900  0.00000096072927124 ***
dotw_simple2       0.0533571  0.0046180  11.554 < 0.0000000000000002 ***
dotw_simple3       0.0585472  0.0047687  12.277 < 0.0000000000000002 ***
dotw_simple4       0.0318514  0.0046107   6.908  0.00000000000491616 ***
dotw_simple5       0.0385290  0.0046037   8.369 < 0.0000000000000002 ***
dotw_simple6      -0.0735625  0.0046018 -15.986 < 0.0000000000000002 ***
dotw_simple7      -0.0690652  0.0046282 -14.923 < 0.0000000000000002 ***
Temperature        0.0078615  0.0001521  51.672 < 0.0000000000000002 ***
Precipitation     -2.5442142  0.1885686 -13.492 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7654 on 382066 degrees of freedom
Multiple R-squared:  0.07688,   Adjusted R-squared:  0.07681 
F-statistic:  1027 on 31 and 382066 DF,  p-value: < 0.00000000000000022

Code

model2_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day,
  data = train_q3
)
summary(model2_q3)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation + lag1Hour + lag3Hours + lag1day, data = train_q3)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.1240 -0.5129 -0.1443  0.1653 30.8260 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)        0.2314121  0.0253638   9.124 < 0.0000000000000002 ***
as.factor(hour)1  -0.0070870  0.0116834  -0.607               0.5441    
as.factor(hour)2  -0.0183323  0.0116966  -1.567               0.1170    
as.factor(hour)3  -0.0288075  0.0117349  -2.455               0.0141 *  
as.factor(hour)4  -0.0049322  0.0117740  -0.419               0.6753    
as.factor(hour)5   0.0779153  0.0117994   6.603  0.00000000004025369 ***
as.factor(hour)6   0.2191581  0.0118478  18.498 < 0.0000000000000002 ***
as.factor(hour)7   0.3574748  0.0118813  30.087 < 0.0000000000000002 ***
as.factor(hour)8   0.5417481  0.0119553  45.314 < 0.0000000000000002 ***
as.factor(hour)9   0.2366865  0.0119372  19.828 < 0.0000000000000002 ***
as.factor(hour)10  0.1955542  0.0118756  16.467 < 0.0000000000000002 ***
as.factor(hour)11  0.2339657  0.0118009  19.826 < 0.0000000000000002 ***
as.factor(hour)12  0.3034943  0.0117221  25.891 < 0.0000000000000002 ***
as.factor(hour)13  0.3382933  0.0117002  28.913 < 0.0000000000000002 ***
as.factor(hour)14  0.3753055  0.0117117  32.045 < 0.0000000000000002 ***
as.factor(hour)15  0.4389240  0.0117556  37.337 < 0.0000000000000002 ***
as.factor(hour)16  0.5852189  0.0118385  49.434 < 0.0000000000000002 ***
as.factor(hour)17  0.7306476  0.0119494  61.145 < 0.0000000000000002 ***
as.factor(hour)18  0.4215442  0.0119749  35.202 < 0.0000000000000002 ***
as.factor(hour)19  0.3135822  0.0119267  26.293 < 0.0000000000000002 ***
as.factor(hour)20  0.1289772  0.0119087  10.831 < 0.0000000000000002 ***
as.factor(hour)21  0.1150041  0.0118033   9.743 < 0.0000000000000002 ***
as.factor(hour)22  0.0761434  0.0117315   6.490  0.00000000008565500 ***
as.factor(hour)23  0.0518285  0.0116719   4.440  0.00000897892852838 ***
dotw_simple2       0.0522938  0.0062952   8.307 < 0.0000000000000002 ***
dotw_simple3      -0.0092910  0.0062945  -1.476               0.1399    
dotw_simple4       0.0159215  0.0063257   2.517               0.0118 *  
dotw_simple5       0.0137169  0.0063073   2.175               0.0296 *  
dotw_simple6      -0.0489197  0.0063177  -7.743  0.00000000000000971 ***
dotw_simple7      -0.0648830  0.0062991 -10.300 < 0.0000000000000002 ***
Temperature       -0.0028286  0.0002988  -9.467 < 0.0000000000000002 ***
Precipitation     -1.7444529  0.3448869  -5.058  0.00000042375515913 ***
lag1Hour           0.2564446  0.0014605 175.582 < 0.0000000000000002 ***
lag3Hours          0.1118308  0.0014139  79.096 < 0.0000000000000002 ***
lag1day            0.2938194  0.0014365 204.532 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.102 on 429373 degrees of freedom
Multiple R-squared:  0.3322,    Adjusted R-squared:  0.3321 
F-statistic:  6282 on 34 and 429373 DF,  p-value: < 0.00000000000000022

Code

model2_q1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day,
  data = train_q1
)
summary(model2_q1)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation + lag1Hour + lag3Hours + lag1day, data = train_q1)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.3484 -0.2715 -0.0910  0.0164 17.7743 

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)       -0.0879220  0.0077205 -11.388 < 0.0000000000000002 ***
as.factor(hour)1  -0.0054807  0.0077226  -0.710              0.47789    
as.factor(hour)2  -0.0026258  0.0077932  -0.337              0.73617    
as.factor(hour)3  -0.0055725  0.0077619  -0.718              0.47280    
as.factor(hour)4   0.0112757  0.0077391   1.457              0.14512    
as.factor(hour)5   0.0524594  0.0077123   6.802 0.000000000010327923 ***
as.factor(hour)6   0.1499659  0.0077240  19.415 < 0.0000000000000002 ***
as.factor(hour)7   0.2270021  0.0077373  29.339 < 0.0000000000000002 ***
as.factor(hour)8   0.3374065  0.0077647  43.454 < 0.0000000000000002 ***
as.factor(hour)9   0.1567663  0.0077610  20.199 < 0.0000000000000002 ***
as.factor(hour)10  0.1031398  0.0077467  13.314 < 0.0000000000000002 ***
as.factor(hour)11  0.1230232  0.0077604  15.853 < 0.0000000000000002 ***
as.factor(hour)12  0.1879950  0.0077426  24.281 < 0.0000000000000002 ***
as.factor(hour)13  0.1731522  0.0077218  22.424 < 0.0000000000000002 ***
as.factor(hour)14  0.1808445  0.0077136  23.445 < 0.0000000000000002 ***
as.factor(hour)15  0.2227993  0.0077213  28.855 < 0.0000000000000002 ***
as.factor(hour)16  0.2879709  0.0077407  37.202 < 0.0000000000000002 ***
as.factor(hour)17  0.3540352  0.0077728  45.548 < 0.0000000000000002 ***
as.factor(hour)18  0.1568594  0.0077746  20.176 < 0.0000000000000002 ***
as.factor(hour)19  0.0824594  0.0077562  10.631 < 0.0000000000000002 ***
as.factor(hour)20  0.0028929  0.0077559   0.373              0.70915    
as.factor(hour)21  0.0101926  0.0077238   1.320              0.18696    
as.factor(hour)22  0.0227490  0.0077060   2.952              0.00316 ** 
as.factor(hour)23  0.0199463  0.0076932   2.593              0.00952 ** 
dotw_simple2       0.0164234  0.0041515   3.956 0.000076224372168892 ***
dotw_simple3       0.0051431  0.0042898   1.199              0.23057    
dotw_simple4      -0.0065729  0.0041465  -1.585              0.11293    
dotw_simple5       0.0042948  0.0041388   1.038              0.29941    
dotw_simple6      -0.0794531  0.0041443 -19.172 < 0.0000000000000002 ***
dotw_simple7      -0.0462975  0.0041619 -11.124 < 0.0000000000000002 ***
Temperature        0.0033671  0.0001376  24.462 < 0.0000000000000002 ***
Precipitation     -1.3627100  0.1695047  -8.039 0.000000000000000906 ***
lag1Hour           0.2488613  0.0015596 159.562 < 0.0000000000000002 ***
lag3Hours          0.0913834  0.0015228  60.011 < 0.0000000000000002 ***
lag1day            0.2559832  0.0015400 166.221 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6877 on 382063 degrees of freedom
Multiple R-squared:  0.2547,    Adjusted R-squared:  0.2546 
F-statistic:  3840 on 34 and 382063 DF,  p-value: < 0.00000000000000022

Code

model3_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc + Percent_Taking_Transit + Percent_White,
  data = train_q3
)

summary(model3_q3)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation + lag1Hour + lag3Hours + lag1day + Med_Inc + 
    Percent_Taking_Transit + Percent_White, data = train_q3)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.8283 -0.5230 -0.1567  0.2146 17.0890 

Coefficients:
                             Estimate     Std. Error t value
(Intercept)             0.10773357692  0.02696503949   3.995
as.factor(hour)1       -0.01558819707  0.01199395592  -1.300
as.factor(hour)2       -0.03009411562  0.01200824460  -2.506
as.factor(hour)3       -0.04594668288  0.01204825493  -3.814
as.factor(hour)4       -0.02232469523  0.01208925762  -1.847
as.factor(hour)5        0.06811950152  0.01211530649   5.623
as.factor(hour)6        0.22217789036  0.01216525549  18.263
as.factor(hour)7        0.36556135843  0.01219967923  29.965
as.factor(hour)8        0.59068832596  0.01228264084  48.091
as.factor(hour)9        0.26831361218  0.01226388072  21.878
as.factor(hour)10       0.21753390755  0.01219578024  17.837
as.factor(hour)11       0.25224000798  0.01212128853  20.810
as.factor(hour)12       0.32637274154  0.01203827493  27.111
as.factor(hour)13       0.36404310756  0.01201588837  30.297
as.factor(hour)14       0.39830062530  0.01202823357  33.114
as.factor(hour)15       0.45927398057  0.01207335156  38.040
as.factor(hour)16       0.59000408117  0.01215492523  48.540
as.factor(hour)17       0.77665752100  0.01227540622  63.269
as.factor(hour)18       0.47967897428  0.01230784667  38.973
as.factor(hour)19       0.35931339331  0.01225519790  29.319
as.factor(hour)20       0.17139320152  0.01223532552  14.008
as.factor(hour)21       0.12887783846  0.01212326327  10.631
as.factor(hour)22       0.09269729000  0.01204591289   7.695
as.factor(hour)23       0.05244294796  0.01198288588   4.376
dotw_simple2            0.06061697763  0.00646288664   9.379
dotw_simple3           -0.00425607809  0.00646180965  -0.659
dotw_simple4            0.02464608992  0.00649390324   3.795
dotw_simple5            0.02156673863  0.00647523225   3.331
dotw_simple6           -0.03821970547  0.00648547670  -5.893
dotw_simple7           -0.06163979654  0.00646639013  -9.532
Temperature            -0.00292197518  0.00030672626  -9.526
Precipitation          -1.97406934420  0.35407317636  -5.575
lag1Hour                0.23793872049  0.00152892221 155.625
lag3Hours               0.10696504520  0.00148907742  71.833
lag1day                 0.27041973549  0.00150852511 179.261
Med_Inc                 0.00000131085  0.00000006522  20.098
Percent_Taking_Transit -0.00272973350  0.00019858324 -13.746
Percent_White           0.00192758671  0.00009971837  19.330
                                   Pr(>|t|)    
(Intercept)              0.0000646224404398 ***
as.factor(hour)1                   0.193714    
as.factor(hour)2                   0.012207 *  
as.factor(hour)3                   0.000137 ***
as.factor(hour)4                   0.064798 .  
as.factor(hour)5         0.0000000188232097 ***
as.factor(hour)6       < 0.0000000000000002 ***
as.factor(hour)7       < 0.0000000000000002 ***
as.factor(hour)8       < 0.0000000000000002 ***
as.factor(hour)9       < 0.0000000000000002 ***
as.factor(hour)10      < 0.0000000000000002 ***
as.factor(hour)11      < 0.0000000000000002 ***
as.factor(hour)12      < 0.0000000000000002 ***
as.factor(hour)13      < 0.0000000000000002 ***
as.factor(hour)14      < 0.0000000000000002 ***
as.factor(hour)15      < 0.0000000000000002 ***
as.factor(hour)16      < 0.0000000000000002 ***
as.factor(hour)17      < 0.0000000000000002 ***
as.factor(hour)18      < 0.0000000000000002 ***
as.factor(hour)19      < 0.0000000000000002 ***
as.factor(hour)20      < 0.0000000000000002 ***
as.factor(hour)21      < 0.0000000000000002 ***
as.factor(hour)22        0.0000000000000141 ***
as.factor(hour)23        0.0000120637886159 ***
dotw_simple2           < 0.0000000000000002 ***
dotw_simple3                       0.510120    
dotw_simple4                       0.000148 ***
dotw_simple5                       0.000867 ***
dotw_simple6             0.0000000037927021 ***
dotw_simple7           < 0.0000000000000002 ***
Temperature            < 0.0000000000000002 ***
Precipitation            0.0000000247242634 ***
lag1Hour               < 0.0000000000000002 ***
lag3Hours              < 0.0000000000000002 ***
lag1day                < 0.0000000000000002 ***
Med_Inc                < 0.0000000000000002 ***
Percent_Taking_Transit < 0.0000000000000002 ***
Percent_White          < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.089 on 397618 degrees of freedom
  (31752 observations deleted due to missingness)
Multiple R-squared:  0.3309,    Adjusted R-squared:  0.3308 
F-statistic:  5313 on 37 and 397618 DF,  p-value: < 0.00000000000000022

Code

model3_q1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc + Percent_Taking_Transit + Percent_White,
  data = train_q1
)

summary(model3_q1)


Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + 
    Precipitation + lag1Hour + lag3Hours + lag1day + Med_Inc + 
    Percent_Taking_Transit + Percent_White, data = train_q1)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6594 -0.2908 -0.1036  0.0406 17.7774 

Coefficients:
                             Estimate     Std. Error t value
(Intercept)            -0.18575448097  0.00925708761 -20.066
as.factor(hour)1       -0.00870553259  0.00796367962  -1.093
as.factor(hour)2       -0.00774410394  0.00803664018  -0.964
as.factor(hour)3       -0.01197490863  0.00800445687  -1.496
as.factor(hour)4        0.00530618796  0.00798096451   0.665
as.factor(hour)5        0.05294852942  0.00795335942   6.657
as.factor(hour)6        0.16018098581  0.00796603710  20.108
as.factor(hour)7        0.24288130427  0.00798127883  30.431
as.factor(hour)8        0.37896167176  0.00801505812  47.281
as.factor(hour)9        0.18436046428  0.00801297381  23.008
as.factor(hour)10       0.12430665866  0.00799407277  15.550
as.factor(hour)11       0.14247230195  0.00801144188  17.784
as.factor(hour)12       0.20892748539  0.00799008158  26.148
as.factor(hour)13       0.18841969018  0.00796712110  23.650
as.factor(hour)14       0.19722143291  0.00795773248  24.784
as.factor(hour)15       0.23574071242  0.00796613833  29.593
as.factor(hour)16       0.28677035236  0.00798362645  35.920
as.factor(hour)17       0.36608755304  0.00801578702  45.671
as.factor(hour)18       0.17659049027  0.00801883364  22.022
as.factor(hour)19       0.09647240502  0.00799828892  12.062
as.factor(hour)20       0.02115968129  0.00799736957   2.646
as.factor(hour)21       0.01627997629  0.00796454453   2.044
as.factor(hour)22       0.02765112367  0.00794624234   3.480
as.factor(hour)23       0.02008860585  0.00793352889   2.532
dotw_simple2            0.02267324839  0.00428116549   5.296
dotw_simple3            0.01318203377  0.00442395923   2.980
dotw_simple4            0.00007381960  0.00427596477   0.017
dotw_simple5            0.01255911052  0.00426812980   2.943
dotw_simple6           -0.07234381246  0.00427301230 -16.930
dotw_simple7           -0.04441576652  0.00429168988 -10.349
Temperature             0.00389083598  0.00014211548  27.378
Precipitation          -1.44502547781  0.17480826683  -8.266
lag1Hour                0.22027888541  0.00164683932 133.759
lag3Hours               0.08829072892  0.00162045081  54.485
lag1day                 0.22622825319  0.00163192286 138.627
Med_Inc                 0.00000017104  0.00000004247   4.027
Percent_Taking_Transit -0.00097161201  0.00013350842  -7.278
Percent_White           0.00168296474  0.00006831168  24.637
                                   Pr(>|t|)    
(Intercept)            < 0.0000000000000002 ***
as.factor(hour)1                   0.274327    
as.factor(hour)2                   0.335247    
as.factor(hour)3                   0.134647    
as.factor(hour)4                   0.506143    
as.factor(hour)5          0.000000000027916 ***
as.factor(hour)6       < 0.0000000000000002 ***
as.factor(hour)7       < 0.0000000000000002 ***
as.factor(hour)8       < 0.0000000000000002 ***
as.factor(hour)9       < 0.0000000000000002 ***
as.factor(hour)10      < 0.0000000000000002 ***
as.factor(hour)11      < 0.0000000000000002 ***
as.factor(hour)12      < 0.0000000000000002 ***
as.factor(hour)13      < 0.0000000000000002 ***
as.factor(hour)14      < 0.0000000000000002 ***
as.factor(hour)15      < 0.0000000000000002 ***
as.factor(hour)16      < 0.0000000000000002 ***
as.factor(hour)17      < 0.0000000000000002 ***
as.factor(hour)18      < 0.0000000000000002 ***
as.factor(hour)19      < 0.0000000000000002 ***
as.factor(hour)20                  0.008149 ** 
as.factor(hour)21                  0.040949 *  
as.factor(hour)22                  0.000502 ***
as.factor(hour)23                  0.011338 *  
dotw_simple2              0.000000118408788 ***
dotw_simple3                       0.002886 ** 
dotw_simple4                       0.986226    
dotw_simple5                       0.003256 ** 
dotw_simple6           < 0.0000000000000002 ***
dotw_simple7           < 0.0000000000000002 ***
Temperature            < 0.0000000000000002 ***
Precipitation          < 0.0000000000000002 ***
lag1Hour               < 0.0000000000000002 ***
lag3Hours              < 0.0000000000000002 ***
lag1day                < 0.0000000000000002 ***
Med_Inc                   0.000056428198993 ***
Percent_Taking_Transit    0.000000000000341 ***
Percent_White          < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6768 on 347997 degrees of freedom
  (34063 observations deleted due to missingness)
Multiple R-squared:  0.2318,    Adjusted R-squared:  0.2317 
F-statistic:  2838 on 37 and 347997 DF,  p-value: < 0.00000000000000022

Code

model4_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station,
  data = train_q3
)
cat("Model 4 R-squared Q3:", summary(model4_q3)$r.squared, "\n")

Model 4 R-squared Q3: 0.3603716

Code

cat("Model 4 Adj R-squared Q3:", summary(model4_q3)$adj.r.squared, "\n")

Model 4 Adj R-squared Q3: 0.3598952

Code

model4_q1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station,
  data = train_q1
)
cat("Model 4 R-squared Q1:", summary(model4_q1)$r.squared, "\n")

Model 4 R-squared Q1: 0.2601801

Code

cat("Model 4 Adj R-squared Q1:", summary(model4_q1)$adj.r.squared, "\n")

Model 4 Adj R-squared Q1: 0.25961

Code

model5_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station +
    rush_hour * weekend,  # Rush hour effects different on weekends
  data = train_q3
)

cat("Model 5 R-squared Q3:", summary(model5_q3)$r.squared, "\n")

Model 5 R-squared Q3: 0.3641351

Code

cat("Model 5 Adj R-squared Q3:", summary(model5_q3)$adj.r.squared, "\n")

Model 5 Adj R-squared Q3: 0.3636566

Code

model5_q1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station +
    rush_hour * weekend,  # Rush hour effects different on weekends
  data = train_q1
)

cat("Model 5 R-squared Q1:", summary(model5_q1)$r.squared, "\n")

Model 5 R-squared Q1: 0.2651482

Code

cat("Model 5 Adj R-squared Q1:", summary(model5_q1)$adj.r.squared, "\n")

Model 5 Adj R-squared Q1: 0.2645756

Code

# Get predictions on test set

# Create day of week factor with treatment (dummy) coding
test_q3 <- test_q3 %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

# Set contrasts to treatment coding (dummy variables)
contrasts(test_q3$dotw_simple) <- contr.treatment(7)

test_q3$start_station <- factor(
  test_q3$start_station,
  levels = model4_q3$xlevels$start_station
)

test_q1$start_station <- factor(
  test_q1$start_station,
  levels = model4_q1$xlevels$start_station
)

test_q3 <- test_q3 %>% droplevels()
test_q1 <- test_q1 %>% droplevels()

test_q3 <- test_q3 %>%
  mutate(
    pred1 = predict(model1_q3, newdata = test_q3),
    pred2 = predict(model2_q3, newdata = test_q3),
    pred3 = predict(model3_q3, newdata = test_q3),
    pred4 = predict(model4_q3, newdata = test_q3),
    pred5 = predict(model5_q3, newdata = test_q3)
  )

# Calculate MAE for each model
mae_results_q3 <- data.frame(
  Model = c(
    "1. Time + Weather",
    "2. + Temporal Lags",
    "3. + Demographics",
    "4. + Station FE",
    "5. + Rush Hour Interaction"
  ),
  MAE = c(
    mean(abs(test_q3$Trip_Count - test_q3$pred1), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred2), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred3), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred4), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred5), na.rm = TRUE)
  )
)

kable(mae_results_q3, 
      digits = 2,
      caption = "Mean Absolute Error by Model, Q3 (Test Set)",
      col.names = c("Model", "MAE (trips)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Mean Absolute Error by Model, Q3 (Test Set)
Model	MAE (trips)
1. Time + Weather	0.87
2. + Temporal Lags	0.74
3. + Demographics	0.75
4. + Station FE	0.74
5. + Rush Hour Interaction	0.74

Code

# Create day of week factor with treatment (dummy) coding
test_q1 <- test_q1 %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

# Set contrasts to treatment coding (dummy variables)
contrasts(test_q1$dotw_simple) <- contr.treatment(7)

test_q1 <- test_q1 %>%
  mutate(
    pred1 = predict(model1_q1, newdata = test_q1),
    pred2 = predict(model2_q1, newdata = test_q1),
    pred3 = predict(model3_q1, newdata = test_q1),
    pred4 = predict(model4_q1, newdata = test_q1),
    pred5 = predict(model5_q1, newdata = test_q1)
  )

# Calculate MAE for each model
mae_results_q1 <- data.frame(
  Model = c(
    "1. Time + Weather",
    "2. + Temporal Lags",
    "3. + Demographics",
    "4. + Station FE",
    "5. + Rush Hour Interaction"
  ),
  MAE = c(
    mean(abs(test_q1$Trip_Count - test_q1$pred1), na.rm = TRUE),
    mean(abs(test_q1$Trip_Count - test_q1$pred2), na.rm = TRUE),
    mean(abs(test_q1$Trip_Count - test_q1$pred3), na.rm = TRUE),
    mean(abs(test_q1$Trip_Count - test_q1$pred4), na.rm = TRUE),
    mean(abs(test_q1$Trip_Count - test_q1$pred5), na.rm = TRUE)
  )
)

kable(mae_results_q1, 
      digits = 2,
      caption = "Mean Absolute Error by Model, q1 (Test Set)",
      col.names = c("Model", "MAE (trips)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Mean Absolute Error by Model, q1 (Test Set)
Model	MAE (trips)
1. Time + Weather	0.62
2. + Temporal Lags	0.53
3. + Demographics	0.53
4. + Station FE	0.52
5. + Rush Hour Interaction	0.53

Code

#MAE is higher for Q3 than Q1. Trends such as weather/temperature and weekend/weekday divide are trend more clearly in Q1, whereas more outliers exist in the Q3 dataset. This makes the models predict significantly better in Q1 than Q3. For Q1, the model perfoms best at stage 4 after adding fixed effects. For Q3, stage 4 and 5 half equal MAE.

Code

ggplot(mae_results_q3, aes(x = reorder(Model, -MAE), y = MAE)) +
  geom_col(fill = "#3182bd", alpha = 0.8) +
  geom_text(aes(label = round(MAE, 2)), vjust = -0.5) +
  labs(
    title = "Model Performance Comparison Q3",
    subtitle = "Lower MAE = Better Predictions",
    x = "Model",
    y = "Mean Absolute Error (trips)"
  ) +
  plotTheme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code

ggplot(mae_results_q1, aes(x = reorder(Model, -MAE), y = MAE)) +
  geom_col(fill = "#3182bd", alpha = 0.8) +
  geom_text(aes(label = round(MAE, 2)), vjust = -0.5) +
  labs(
    title = "Model Performance Comparison Q1",
    subtitle = "Lower MAE = Better Predictions",
    x = "Model",
    y = "Mean Absolute Error (trips)"
  ) +
  plotTheme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code

test_q3 <- test_q3 %>%
  mutate(
    error = Trip_Count - pred2,
    abs_error = abs(error),
    time_of_day = case_when(
      hour < 7 ~ "Overnight",
      hour >= 7 & hour < 10 ~ "AM Rush",
      hour >= 10 & hour < 15 ~ "Mid-Day",
      hour >= 15 & hour <= 18 ~ "PM Rush",
      hour > 18 ~ "Evening"
    )
  )

# Scatter plot by time and day type
ggplot(test_q3, aes(x = Trip_Count, y = pred2)) +
  geom_point(alpha = 0.2, color = "#3182bd") +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  geom_smooth(method = "lm", se = FALSE, color = "darkgreen") +
  facet_grid(weekend ~ time_of_day) +
  labs(
    title = "Observed vs. Predicted Bike Trips Q3",
    subtitle = "Model 2 performance by time period Q3",
    x = "Observed Trips",
    y = "Predicted Trips",
    caption = "Red line = perfect predictions; Green line = actual model fit"
  ) +
  plotTheme

Code

test_q1 <- test_q1 %>%
  mutate(
    error = Trip_Count - pred2,
    abs_error = abs(error),
    time_of_day = case_when(
      hour < 7 ~ "Overnight",
      hour >= 7 & hour < 10 ~ "AM Rush",
      hour >= 10 & hour < 15 ~ "Mid-Day",
      hour >= 15 & hour <= 18 ~ "PM Rush",
      hour > 18 ~ "Evening"
    )
  )

# Scatter plot by time and day type
ggplot(test_q1, aes(x = Trip_Count, y = pred2)) +
  geom_point(alpha = 0.2, color = "#3182bd") +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  geom_smooth(method = "lm", se = FALSE, color = "darkgreen") +
  facet_grid(weekend ~ time_of_day) +
  labs(
    title = "Observed vs. Predicted Bike Trips q1",
    subtitle = "Model 2 performance by time period q1",
    x = "Observed Trips",
    y = "Predicted Trips",
    caption = "Red line = perfect predictions; Green line = actual model fit"
  ) +
  plotTheme

Code

#There is less variation in the errors in Q1. The Q1 model does well at overnight and decently well with mid-day frames. Q3 shows more variation in error overall, especially struggling with the PM rush period.

Code

# Calculate MAE by station
station_errors_q3 <- test_q3 %>%
  group_by(start_station, start_lat, start_lon) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat), !is.na(start_lon))

station_errors_q1 <- test_q1 %>%
  group_by(start_station, start_lat, start_lon) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat), !is.na(start_lon))


## Create Two Maps Side-by-Side with Proper Legends (sorry these maps are ugly)

# Calculate station errors
station_errors_q3 <- test_q3 %>%
  filter(!is.na(pred2)) %>%
  group_by(start_station, start_lat, start_lon) %>%
  summarize(
    MAE = mean(abs(Trip_Count - pred2), na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat), !is.na(start_lon))

station_errors_q1 <- test_q1 %>%
  filter(!is.na(pred2)) %>%
  group_by(start_station, start_lat, start_lon) %>%
  summarize(
    MAE = mean(abs(Trip_Count - pred2), na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat), !is.na(start_lon))

# Map 1: Prediction Errors
p1_q3 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors_q3,
    aes(x = start_lon, y = start_lat, color = MAE),
    size = 3.5,
    alpha = 0.7
  ) +
  scale_color_viridis(
    option = "plasma",
    name = "MAE\n(trips)",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5),  # Fewer, cleaner breaks
    labels = c("0.5", "1.0", "1.5")
  ) +
  labs(title = "Prediction Errors Q3",
       subtitle = "Higher in Center City") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  guides(color = guide_colorbar(
    barwidth = 1.5,
    barheight = 12,
    title.position = "top",
    title.hjust = 0.5
  ))

p1_q1 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors_q1,
    aes(x = start_lon, y = start_lat, color = MAE),
    size = 3.5,
    alpha = 0.7
  ) +
  scale_color_viridis(
    option = "plasma",
    name = "MAE\n(trips)",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5),  # Fewer, cleaner breaks
    labels = c("0.5", "1.0", "1.5")
  ) +
  labs(title = "Prediction Errors Q1",
       subtitle = "Higher in Center City") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  guides(color = guide_colorbar(
    barwidth = 1.5,
    barheight = 12,
    title.position = "top",
    title.hjust = 0.5
  ))

# Map 2: Average Demand
p2_q3 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors_q3,
    aes(x = start_lon, y = start_lat, color = avg_demand),
    size = 3.5,
    alpha = 0.7
  ) +
  scale_color_viridis(
    option = "viridis",
    name = "Avg\nDemand",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5, 2.0, 2.5),  # Clear breaks
    labels = c("0.5", "1.0", "1.5", "2.0", "2.5")
  ) +
  labs(title = "Average Demand Q3",
       subtitle = "Trips per station-hour") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  guides(color = guide_colorbar(
    barwidth = 1.5,
    barheight = 12,
    title.position = "top",
    title.hjust = 0.5
  ))

p2_q1 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors_q1,
    aes(x = start_lon, y = start_lat, color = avg_demand),
    size = 3.5,
    alpha = 0.7
  ) +
  scale_color_viridis(
    option = "viridis",
    name = "Avg\nDemand",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5, 2.0, 2.5),  # Clear breaks
    labels = c("0.5", "1.0", "1.5", "2.0", "2.5")
  ) +
  labs(title = "Average Demand q1",
       subtitle = "Trips per station-hour") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  guides(color = guide_colorbar(
    barwidth = 1.5,
    barheight = 12,
    title.position = "top",
    title.hjust = 0.5
  ))



# Combine
(p1_q3 | p2_q3)

Code

(p1_q1 | p2_q1)

Code

#Stations with higher volumes have higher MAE, and Q3 has higher MAE overall than Q1. This shows us the model predicts better on stations further from Center City where there is a lower volume of trips.

Code

# MAE by time of day and day type
temporal_errors_q3 <- test_q3 %>%
  group_by(time_of_day, weekend) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))

ggplot(temporal_errors_q3, aes(x = time_of_day, y = MAE, fill = day_type)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Prediction Errors by Time Period Q3",
    subtitle = "When is the model struggling most?",
    x = "Time of Day",
    y = "Mean Absolute Error (trips)",
    fill = "Day Type"
  ) +
  plotTheme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code

temporal_errors_q1 <- test_q1 %>%
  group_by(time_of_day, weekend) %>%
  summarize(
    MAE = mean(abs_error, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))

ggplot(temporal_errors_q1, aes(x = time_of_day, y = MAE, fill = day_type)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Prediction Errors by Time Period Q1",
    subtitle = "When is the model struggling most?",
    x = "Time of Day",
    y = "Mean Absolute Error (trips)",
    fill = "Day Type"
  ) +
  plotTheme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code

#Following with the trends observed so far, both models struggle more with high volume times, especially the PM rush. Both models excel during the overnight time period.

Code

# Join demographic data to station errors

station_errors_demo_q3 <- station_errors_q3 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(
    station_attributes_q3 %>%
      mutate(start_station = as.character(start_station)) %>%
      select(start_station, Med_Inc, Percent_Taking_Transit, Percent_White),
    by = "start_station"
  ) %>%
  filter(!is.na(Med_Inc))

station_errors_demo_q1 <- station_errors_q1 %>%
  mutate(start_station = as.character(start_station)) %>%
  left_join(
    station_attributes_q1 %>%
      mutate(start_station = as.character(start_station)) %>%
      select(start_station, Med_Inc, Percent_Taking_Transit, Percent_White),
    by = "start_station"
  ) %>%
  filter(!is.na(Med_Inc))


# Create plots
p1_q3 <- ggplot(station_errors_demo_q3, aes(x = Med_Inc, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Errors vs. Median Income Q3", x = "Median Income", y = "MAE") +
  plotTheme

p2_q3 <- ggplot(station_errors_demo_q3, aes(x = Percent_Taking_Transit, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Errors vs. Transit Usage Q3", x = "% Taking Transit", y = "MAE") +
  plotTheme

p3_q3 <- ggplot(station_errors_demo_q3, aes(x = Percent_White, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Errors vs. Race Q3", x = "% White", y = "MAE") +
  plotTheme

(p1_q3 | p2_q3) / (p3_q3 | plot_spacer())

Code

p1_q1 <- ggplot(station_errors_demo_q1, aes(x = Med_Inc, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Errors vs. Median Income q1", x = "Median Income", y = "MAE") +
  plotTheme

p2_q1 <- ggplot(station_errors_demo_q1, aes(x = Percent_Taking_Transit, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Errors vs. Transit Usage q1", x = "% Taking Transit", y = "MAE") +
  plotTheme

p3_q1 <- ggplot(station_errors_demo_q1, aes(x = Percent_White, y = MAE)) +
  geom_point(alpha = 0.5, color = "#3182bd") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Errors vs. Race q1", x = "% White", y = "MAE") +
  plotTheme

(p1_q1 | p2_q1) / (p3_q1 | plot_spacer())

Code

#Q1 and Q3models have errors distributed fairly randomly around their trend lines, which show clear trends that differ based on the demographic measure. While the scales of error and volume of trips varies widely, the slopes of the fitted error lines are nearly identical. The models have more errors in higher income areas and areas with a higher percentage of white people. Both models see a decrease in error as the % of transit ridership increases.

Code

# Model 6: Best previous model (5) + new features
# Adding dist_to_center and volume_tier to model 5's formula
model6_q3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station +
    rush_hour * weekend +
    dist_to_center + volume_tier,
  data = train_q3
)
cat("Model 6 Q3 R-squared:", summary(model6_q3)$r.squared, "\n")

Model 6 Q3 R-squared: 0.3641351

Code

cat("Model 6 Q3 Adj R-squared:", summary(model6_q3)$adj.r.squared, "\n")

Model 6 Q3 Adj R-squared: 0.3636566

Code

# Poisson is appropriate for count data like trip counts
# It assumes variance increases with the mean (unlike OLS which assumes constant variance)

model_poisson_q3 <- glm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
    Med_Inc + Percent_Taking_Transit + Percent_White +
    start_station +
    rush_hour * weekend +
    dist_to_center + volume_tier,
  data = train_q3,
  family = poisson(link = "log")
)
cat("Poisson Q3 AIC:", AIC(model_poisson_q3), "\n")

Poisson Q3 AIC: 753566.5

Code

cat("OLS Model 6 Q3 AIC:", AIC(model6_q3), "\n")

OLS Model 6 Q3 AIC: 1176271

Code

# Get predictions on test set

# Create day of week factor with treatment (dummy) coding
test_q3 <- test_q3 %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

# Set contrasts to treatment coding (dummy variables)
contrasts(test_q3$dotw_simple) <- contr.treatment(7)

test_q3$start_station <- factor(
  test_q3$start_station,
  levels = model4_q3$xlevels$start_station
)

test_q3 <- test_q3 %>% droplevels()

test_q3 <- test_q3 %>%
  mutate(
    pred1 = predict(model1_q3, newdata = test_q3),
    pred2 = predict(model2_q3, newdata = test_q3),
    pred3 = predict(model3_q3, newdata = test_q3),
    pred4 = predict(model4_q3, newdata = test_q3),
    pred5 = predict(model5_q3, newdata = test_q3),
    pred6 = predict(model6_q3, newdata = test_q3),
    pred_poisson = predict(model_poisson_q3, newdata = test_q3, type = "response")  )

# Calculate MAE for each model
mae_results_newmodels <- data.frame(
  Model = c(
    "1. Time + Weather",
    "2. + Temporal Lags",
    "3. + Demographics",
    "4. + Station FE",
    "5. + Rush Hour Interaction",
    "6. + Station Volume + Distance to Center City",
    "7. Poisson fit"
  ),
  MAE = c(
    mean(abs(test_q3$Trip_Count - test_q3$pred1), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred2), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred3), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred4), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred5), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred6), na.rm = TRUE),
    mean(abs(test_q3$Trip_Count - test_q3$pred_poisson), na.rm = TRUE)
  )
)

kable(mae_results_newmodels,
      digits = 2,
      caption = "Mean Absolute Error by Model, Q3 (Test Set)",
      col.names = c("Model", "MAE (trips)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Mean Absolute Error by Model, Q3 (Test Set)
Model	MAE (trips)
1. Time + Weather	0.87
2. + Temporal Lags	0.74
3. + Demographics	0.75
4. + Station FE	0.74
5. + Rush Hour Interaction	0.74
6. + Station Volume + Distance to Center City	0.74
7. Poisson fit	0.69

Code

#Station volume and distance to center city had no effect on the MAE, which I was surprised by. By creating volume tiers, I thought that the model would improve because the volume of trips seemed to have a large effect on the errors, as seen in the maps showing trip volume vs. error. Again, because Center City seemed to be a hotspot of errors, I thought that distance to City Hall might improve the model. In the future, perhaps a different engineering method of these two features would result in a better model. However, the Poisson fit improved the model greatly. This makes sense; the trip volumes are not continuous, so using a regression model that allows for a longer tail (like Poisson) improved the model.

Code

# MAE by time of day and day type
# Calculate errors for new models
test_q3 <- test_q3 %>%
  mutate(
    abs_error_m6      = abs(Trip_Count - pred6),
    abs_error_poisson = abs(Trip_Count - pred_poisson)
  )

# Pivot longer so both models can be faceted/compared
temporal_errors_newmodels <- test_q3 %>%
  select(time_of_day, weekend, abs_error_m6, abs_error_poisson) %>%
  pivot_longer(
    cols = c(abs_error_m6, abs_error_poisson),
    names_to = "model",
    values_to = "abs_error"
  ) %>%
  mutate(
    model    = recode(model,
                      "abs_error_m6"      = "Model 6: + Distance + Volume Tier",
                      "abs_error_poisson" = "Poisson"),
    day_type = ifelse(weekend == 1, "Weekend", "Weekday")
  ) %>%
  group_by(time_of_day, weekend, day_type, model) %>%
  summarize(MAE = mean(abs_error, na.rm = TRUE), .groups = "drop")

ggplot(temporal_errors_newmodels, aes(x = time_of_day, y = MAE, fill = day_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~ model) +
  scale_fill_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Prediction Errors by Time Period - New Models",
    subtitle = "Model 6 vs. Poisson",
    x = "Time of Day",
    y = "Mean Absolute Error (trips)",
    fill = "Day Type"
  ) +
  plotTheme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code

#The model 6 MAE by hour is similar to that of the first five models, but the Poisson model reduces the overall MAE. It still follows the same temporal error patterns, showing that while the Poisson model improves the model overall, it does not account for some of the temporal error.

Code

#Operational Limitations

#The best OLS models (4 and 5) achieve MAE around 0.74 trips per station-hour in Q3, with the Poisson model improving on this further (MAE = 0.69). For low-volume stations averaging 0.3-0.5 trips per hour in Q1, an MAE of 0.8 represents substantial  error; this shows the the model is often off by more than the average demand itself. For high-volume Center City stations, the same absolute error is more tolerable. The model really begins to struggle PM rush on weekdays, which is likely when rebalancing demand is highest. Incorrect predictions could leave commuters, especially at lower-volume stations without a bike. I would recommend deploying this system only for high-volume stations where the model performs more reliably until more features can improve the model predictions.

#Equity Considerations
#The demographic error analysis reveals a consistent pattern across both Q1 and Q3: prediction errors are higher in higher-income, majority-white neighborhoods, and lower in higher-transit-usage areas. At first glance this seems to favor lower-income neighborhoods, but higher errors in wealthier Center City areas likely reflect those stations' higher volumes, not better service for lower-income communities. We can see this as the high median income areas that also have stations are in Center City, where population density and trip volume are higher. One possible negative implication is that if Indego uses this model to allocate rebalancing , lower-income and minority neighborhoods that the model predicts more "accurately" (because they have lower and more stable demand) may receive less attention, entrenching existing patterns of underservice. A system that performs well where demand is already high and poorly where demand risks creating a circular logic and perpetuating service flaws for minority communities or lower income areas. I  recommend reevaluating decisions by neighborhood demographics, setting a floor on service level for all stations, and not using prediction accuracy as a sole proxy for evaluating service levels.

#Model Limitations

#The most significant pattern the model is missing is event-driven demand — outlier events (such as the spike in Feb 2025 on the day the Eagles won the Super Bowl) that temporal lag  can anticipate because the model is not trained to factor in such irregularities. This could be one reason that Q1 performs so much better than Q3 overall; summer brings more irregular events, tourism, and changing school schedules. Further, using the temporal train/test split (training on July through early September and testing on late September) is important to not create a circular model through a different validation (such as k-fold), but if demand changes heavily based on holidays and summer patterns, this means the model is trained on summer demand patterns, which may differ from fall in ways the month fixed effect only shows a part of. With more time, the most valuable additions would be special event calendars and a cross-validation strategy that tests across multiple time windows while still paying attention to creating a logically sound temporal validation.