library(sf) library(raster) setwd("") # --- Inputs (files should be in your working directory) --- Counties <- st_read("US_county_2020.shp", quiet = TRUE) States <- st_read("tl_2020_us_state.shp", quiet = TRUE) |> dplyr::select(STUSPS, STATEFP) |> st_drop_geometry() |> dplyr::filter(!STUSPS %in% c("PR","AK","GU","VI","HI","AS","MP")) State_list <- States$STUSPS foot_path <- "STATE_preds" County_list <- list() i <- 1L for (abbr in State_list) { message(abbr) # Read the state-specific layer from the GeoPackage layer_name <- sub("STATE", abbr, foot_path) footprints <- st_read("Foot_preds.gpkg", layer = layer_name, quiet = TRUE) # Align CRS to Counties footprints <- st_transform(footprints, st_crs(Counties)) # Keep only county ID to avoid bloating attribute joins foot_join <- st_join(footprints, Counties["GISJOIN"]) # Drop features that didn't join to a county foot_join <- foot_join[!is.na(foot_join$GISJOIN), ] # Ensure an 'index' column exists if (!"index" %in% names(foot_join)) { foot_join$index <- seq_len(nrow(foot_join)) } # Binary target foot_join$MH <- as.integer(foot_join$pred == "MH") # Per-county counts foot_join <- foot_join |> group_by(GISJOIN) |> mutate(Foot_count = n(), MH_count = sum(MH, na.rm = TRUE)) |> ungroup() |> dplyr::select(index, GISJOIN, MH, Foot_count, MH_count, geometry) # Exclusions (insufficient pool size) excluded_cases_1 <- foot_join[(foot_join$Foot_count < 500) & (foot_join$MH == 0), ] excluded_cases_2 <- foot_join[(foot_join$MH_count < 500) & (foot_join$MH == 1), ] excluded_ID_1 <- excluded_cases_1$index excluded_ID_2 <- excluded_cases_2$index not_excluded <- foot_join[!(foot_join$index %in% excluded_ID_2), ] not_excluded <- not_excluded[!(not_excluded$index %in% excluded_ID_1), ] # Sample 500 per (county, MH) among the non-excluded pools selected_cases <- not_excluded |> group_by(GISJOIN, MH) |> slice_sample(n = 500) |> ungroup() # Add back excluded (kept in full) selected_cases <- dplyr::bind_rows(selected_cases, excluded_cases_1, excluded_cases_2) # Selection counts & probabilities selected_cases <- selected_cases |> group_by(GISJOIN, MH) |> mutate(Selected_count = n()) |> ungroup() |> mutate( prob_select = ifelse( MH == 1, Selected_count / MH_count, Selected_count / Foot_count ) ) County_list[[i]] <- selected_cases i <- i + 1L } Footprints_county_sample <- do.call(rbind, County_list) st_write(Footprints_county_sample, "Footprints_county_sample.shp", driver = "ESRI Shapefile", delete_layer = TRUE)