#### Load packages #### library(lwgeom) library(sf) library(dplyr) library(caTools) library(caret) library(kernlab) #setwd("") #### Data pre-processing #### # # buildings<-st_read("Harris_MH_final.gpkg") # buildings<-buildings%>% # select(MH) # # calc_lw_simple <- function(X) { # # Minimum rotated rectangle for each feature # mbr <- sf::st_as_sfc(geos::geos_minimum_rotated_rectangle(X)) # # # For each rectangle polygon, compute its two edge lengths # # Then take L = max edge, W = min edge # dims <- lapply(mbr, function(poly) { # coords <- sf::st_coordinates(poly) # n x (X,Y,...) with last == first # # edge lengths between consecutive coords (ignore the repeated last->first) # seg <- sqrt(rowSums((coords[-nrow(coords), 1:2] - coords[-1, 1:2])^2)) # # A rectangle has two unique edge lengths; guard against FP jitter # u <- unique(round(seg, 12)) # c(length = max(u), width = min(u)) # }) # dim_mat <- do.call(rbind, dims) # # R_leqw <- as.numeric(dim_mat[, "length"] / dim_mat[, "width"]) # R_peri <- as.numeric(sf::st_perimeter(X)) # R_f_area <- as.numeric(sf::st_area(X)) # # # Bind back to the sf object # cbind(X, # R_leqw = R_leqw, # R_peri = R_peri, # R_f_area = R_f_area) # } # # buildings<-calc_lw_simple(buildings) # # buildings_centroid<-st_centroid(buildings) # # buildings$index<-row.names(buildings) # # ## Selecting variables to keep # Buildings_short<-buildings # # colnames(Buildings_short)<-c('MH', 'leqw', # 'peri', 'area', "geometry", 'index') # # # Buildings_short$leqw<-log(Buildings_short$leqw) # # # Calculating mean and sd for Harris County sample # mean_leqw<-mean(Buildings_short$leqw) # sd_leqw<-sd(Buildings_short$leqw) # # mean_peri<-mean(Buildings_short$peri) # sd_peri<-sd(Buildings_short$peri) # # mean_area<-mean(Buildings_short$area) # sd_area<-sd(Buildings_short$area) # # Buildings_short$leqw<-(Buildings_short$leqw-mean_leqw)/sd_leqw # Buildings_short$peri<-(Buildings_short$peri-mean_peri)/sd_peri # Buildings_short$area<-(Buildings_short$area-mean_area)/sd_area # # Buildings_short$MH <- factor(Buildings_short$MH, levels = c("Other","MH")) # Buildings_short$id <- seq_len(nrow(Buildings_short)) # # # ========= Distances to nearest MHP (meters) for ALL buildings ========= # mh_parks <- st_read("./Mobile_Home_Parks.shp", quiet = TRUE) # mh_parks <- st_transform(mh_parks, st_crs(Buildings_short)) # # b_cent <- st_centroid(Buildings_short) # nn_idx <- st_nearest_feature(b_cent, mh_parks) # dist_m <- st_distance(b_cent, mh_parks[nn_idx, ], by_element = TRUE) # Buildings_short$dist_to_mhp_m <- as.numeric(dist_m) # # save(Buildings_short, file="Buildings_Prediction_Final.RData") #### Loading dataset #### load(file="Buildings_Prediction_Final.RData") # Work with a non-geometry copy for modeling Buildings_short_no_geo <- st_drop_geometry(Buildings_short) # ========= Train / validate split (training only affects model fit) ========= set.seed(123) split <- sample.split(Buildings_short_no_geo$MH, SplitRatio = 0.75) svm.train <- subset(Buildings_short_no_geo, split == TRUE) svm.validate <- subset(Buildings_short_no_geo, split == FALSE) trainX <- svm.train[, c("leqw","peri","area")] ctrl <- trainControl(method = "repeatedcv", number = 2, repeats = 2, summaryFunction = twoClassSummary, classProbs = TRUE) grid <- expand.grid( degree = 1:3, C = c(0.1, 0.5, 1, 2), scale = c(0.001, 0.01, 0.1) ) svm.tune <- train( x = trainX, y = svm.train$MH, method = "svmPoly", metric = "ROC", tuneGrid = grid, trControl = ctrl ) svm.tune$results svm.tune$bestTune # ========= Predict on ALL buildings ========= allX <- Buildings_short_no_geo[, c("leqw","peri","area")] pred_all <- predict(svm.tune, allX) prob_all <- predict(svm.tune, allX, type = "prob")[, "MH"] Buildings_short_no_geo$pred <- pred_all Buildings_short_no_geo$prob_MH <- prob_all # --- deps (no 'scales') --- library(dplyr) library(knitr) library(kableExtra) # simple comma formatter (base R) comma_fmt <- function(x) format(x, big.mark = ",", scientific = FALSE, trim = TRUE) # ========= Metrics on ALL buildings + distance bands ========= metrics <- function(df, positive = "MH") { truth <- df$MH pr <- df$pred tp <- sum(pr == positive & truth == positive, na.rm = TRUE) fp <- sum(pr == positive & truth != positive, na.rm = TRUE) tn <- sum(pr != positive & truth != positive, na.rm = TRUE) fn <- sum(pr != positive & truth == positive, na.rm = TRUE) n <- tp + fp + tn + fn data.frame( n = n, accuracy = (tp + tn) / n, precision = ifelse(tp + fp > 0, tp / (tp + fp), NA_real_), recall = ifelse(tp + fn > 0, tp / (tp + fn), NA_real_) ) } thresh <- 500 # meters # core cols all_core <- Buildings_short_no_geo[, c("MH","pred","dist_to_mhp_m")] # distance bands (exclude NA distances for band splits) in_band <- all_core %>% dplyr::filter(!is.na(dist_to_mhp_m), dist_to_mhp_m <= thresh) out_band <- all_core %>% dplyr::filter(!is.na(dist_to_mhp_m), dist_to_mhp_m > thresh) # ----- Observations block ----- obs_tbl <- tibble::tibble( rowname = c("MH", "Other", "All Buildings"), `Full Sample` = c( sum(all_core$MH == "MH", na.rm = TRUE), sum(all_core$MH != "MH" & !is.na(all_core$MH), na.rm = TRUE), sum(!is.na(all_core$MH)) ), `≤500m MHC` = c( sum(in_band$MH == "MH", na.rm = TRUE), sum(in_band$MH != "MH" & !is.na(in_band$MH), na.rm = TRUE), nrow(in_band) ), `>500m MHC` = c( sum(out_band$MH == "MH", na.rm = TRUE), sum(out_band$MH != "MH" & !is.na(out_band$MH), na.rm = TRUE), nrow(out_band) ) ) %>% dplyr::mutate(across(-rowname, ~ comma_fmt(.x))) # ----- Performance metrics block ----- m_all <- metrics(all_core) m_in <- metrics(in_band) m_out <- metrics(out_band) fmt_num <- function(x) ifelse(is.na(x), "---", sprintf("%.2f", x)) perf_tbl <- tibble::tibble( rowname = c("Accuracy","Precision","Recall"), `Full Sample` = c(m_all$accuracy, m_all$precision, m_all$recall) |> fmt_num(), `≤500m MHC` = c(m_in$accuracy, m_in$precision, m_in$recall) |> fmt_num(), `>500m MHC` = c(m_out$accuracy, m_out$precision, m_out$recall) |> fmt_num() ) # ----- Combine + render LaTeX ----- final_tbl <- dplyr::bind_rows(obs_tbl, perf_tbl) latex_tab <- final_tbl |> knitr::kable( format = "latex", booktabs = TRUE, align = c("l","c","c","c"), col.names = c("", "\\textbf{Full Sample}", "\\textbf{$\\leq$ 500m MHC}", "\\textbf{$>$ 500m MHC}"), escape = FALSE, caption = "Summary Statistics" ) |> kableExtra::add_header_above( c(" " = 1, " " = 1, "\\textbf{Buildings}" = 2), escape = FALSE ) |> kableExtra::pack_rows(index = c("\\textit{\\textbf{Observations}}" = 3, "\\textit{\\textbf{Performance Metrics}}" = 3), bold = TRUE, italic = TRUE, escape = FALSE) |> kableExtra::add_indent(positions = 1:6) |> kableExtra::kable_styling(font_size = 8, latex_options = "hold_position") latex_tab