#### Load packages ####
library(lwgeom)
library(sf)
library(dplyr)
library(caTools)
library(caret)
library(kernlab)
#setwd("")

#### Data pre-processing ####
# 
# buildings<-st_read("Harris_MH_final.gpkg")
# buildings<-buildings%>%
#   select(MH)
# 
# calc_lw_simple <- function(X) {
#   # Minimum rotated rectangle for each feature
#   mbr <- sf::st_as_sfc(geos::geos_minimum_rotated_rectangle(X))
#   
#   # For each rectangle polygon, compute its two edge lengths
#   # Then take L = max edge, W = min edge
#   dims <- lapply(mbr, function(poly) {
#     coords <- sf::st_coordinates(poly)  # n x (X,Y,...) with last == first
#     # edge lengths between consecutive coords (ignore the repeated last->first)
#     seg <- sqrt(rowSums((coords[-nrow(coords), 1:2] - coords[-1, 1:2])^2))
#     # A rectangle has two unique edge lengths; guard against FP jitter
#     u <- unique(round(seg, 12))
#     c(length = max(u), width = min(u))
#   })
#   dim_mat <- do.call(rbind, dims)
#   
#   R_leqw  <- as.numeric(dim_mat[, "length"] / dim_mat[, "width"])
#   R_peri  <- as.numeric(sf::st_perimeter(X))
#   R_f_area <- as.numeric(sf::st_area(X))
#   
#   # Bind back to the sf object
#   cbind(X,
#         R_leqw = R_leqw,
#         R_peri = R_peri,
#         R_f_area = R_f_area)
# }
# 
# buildings<-calc_lw_simple(buildings)
# 
# buildings_centroid<-st_centroid(buildings)
# 
# buildings$index<-row.names(buildings)
# 
# ## Selecting variables to keep
# Buildings_short<-buildings
# 
# colnames(Buildings_short)<-c('MH', 'leqw',
#                              'peri', 'area',  "geometry", 'index')
# 
# 
# Buildings_short$leqw<-log(Buildings_short$leqw)
# 
# # Calculating mean and sd for Harris County sample
# mean_leqw<-mean(Buildings_short$leqw)
# sd_leqw<-sd(Buildings_short$leqw)
# 
# mean_peri<-mean(Buildings_short$peri)
# sd_peri<-sd(Buildings_short$peri)
# 
# mean_area<-mean(Buildings_short$area)
# sd_area<-sd(Buildings_short$area)
# 
# Buildings_short$leqw<-(Buildings_short$leqw-mean_leqw)/sd_leqw
# Buildings_short$peri<-(Buildings_short$peri-mean_peri)/sd_peri
# Buildings_short$area<-(Buildings_short$area-mean_area)/sd_area
# 
# Buildings_short$MH <- factor(Buildings_short$MH, levels = c("Other","MH"))
# Buildings_short$id <- seq_len(nrow(Buildings_short))
# 
# # ========= Distances to nearest MHP (meters) for ALL buildings =========
# mh_parks <- st_read("./Mobile_Home_Parks.shp", quiet = TRUE)
# mh_parks <- st_transform(mh_parks, st_crs(Buildings_short))
# 
# b_cent  <- st_centroid(Buildings_short)
# nn_idx  <- st_nearest_feature(b_cent, mh_parks)
# dist_m  <- st_distance(b_cent, mh_parks[nn_idx, ], by_element = TRUE)
# Buildings_short$dist_to_mhp_m <- as.numeric(dist_m)
# 
# save(Buildings_short, file="Buildings_Prediction_Final.RData")

#### Loading dataset ####
load(file="Buildings_Prediction_Final.RData")

# Work with a non-geometry copy for modeling
Buildings_short_no_geo <- st_drop_geometry(Buildings_short)

# ========= Train / validate split (training only affects model fit) =========
set.seed(123)
split <- sample.split(Buildings_short_no_geo$MH, SplitRatio = 0.75)
svm.train    <- subset(Buildings_short_no_geo, split == TRUE)
svm.validate <- subset(Buildings_short_no_geo, split == FALSE)

trainX <- svm.train[, c("leqw","peri","area")]

ctrl <- trainControl(method = "repeatedcv",
                     number = 2, repeats = 2,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)

grid <- expand.grid(
  degree = 1:3,
  C      = c(0.1, 0.5, 1, 2),
  scale  = c(0.001, 0.01, 0.1)
)

svm.tune <- train(
  x = trainX,
  y = svm.train$MH,
  method = "svmPoly",
  metric = "ROC",
  tuneGrid = grid,
  trControl = ctrl
)

svm.tune$results
svm.tune$bestTune

# ========= Predict on ALL buildings =========
allX <- Buildings_short_no_geo[, c("leqw","peri","area")]
pred_all <- predict(svm.tune, allX)
prob_all <- predict(svm.tune, allX, type = "prob")[, "MH"]

Buildings_short_no_geo$pred    <- pred_all
Buildings_short_no_geo$prob_MH <- prob_all

# --- deps (no 'scales') ---
library(dplyr)
library(knitr)
library(kableExtra)

# simple comma formatter (base R)
comma_fmt <- function(x) format(x, big.mark = ",", scientific = FALSE, trim = TRUE)

# ========= Metrics on ALL buildings + distance bands =========
metrics <- function(df, positive = "MH") {
  truth <- df$MH
  pr    <- df$pred
  tp <- sum(pr == positive & truth == positive, na.rm = TRUE)
  fp <- sum(pr == positive & truth != positive, na.rm = TRUE)
  tn <- sum(pr != positive & truth != positive, na.rm = TRUE)
  fn <- sum(pr != positive & truth == positive, na.rm = TRUE)
  n  <- tp + fp + tn + fn
  data.frame(
    n         = n,
    accuracy  = (tp + tn) / n,
    precision = ifelse(tp + fp > 0, tp / (tp + fp), NA_real_),
    recall    = ifelse(tp + fn > 0, tp / (tp + fn), NA_real_)
  )
}

thresh <- 500  # meters

# core cols
all_core <- Buildings_short_no_geo[, c("MH","pred","dist_to_mhp_m")]

# distance bands (exclude NA distances for band splits)
in_band  <- all_core %>% dplyr::filter(!is.na(dist_to_mhp_m), dist_to_mhp_m <= thresh)
out_band <- all_core %>% dplyr::filter(!is.na(dist_to_mhp_m), dist_to_mhp_m  > thresh)

# ----- Observations block -----
obs_tbl <- tibble::tibble(
  rowname = c("MH", "Other", "All Buildings"),
  `Full Sample` = c(
    sum(all_core$MH == "MH", na.rm = TRUE),
    sum(all_core$MH != "MH" & !is.na(all_core$MH), na.rm = TRUE),
    sum(!is.na(all_core$MH))
  ),
  `≤500m MHC` = c(
    sum(in_band$MH == "MH", na.rm = TRUE),
    sum(in_band$MH != "MH" & !is.na(in_band$MH), na.rm = TRUE),
    nrow(in_band)
  ),
  `>500m MHC` = c(
    sum(out_band$MH == "MH", na.rm = TRUE),
    sum(out_band$MH != "MH" & !is.na(out_band$MH), na.rm = TRUE),
    nrow(out_band)
  )
) %>%
  dplyr::mutate(across(-rowname, ~ comma_fmt(.x)))

# ----- Performance metrics block -----
m_all <- metrics(all_core)
m_in  <- metrics(in_band)
m_out <- metrics(out_band)

fmt_num <- function(x) ifelse(is.na(x), "---", sprintf("%.2f", x))

perf_tbl <- tibble::tibble(
  rowname = c("Accuracy","Precision","Recall"),
  `Full Sample` = c(m_all$accuracy, m_all$precision, m_all$recall) |> fmt_num(),
  `≤500m MHC`   = c(m_in$accuracy,  m_in$precision,  m_in$recall)  |> fmt_num(),
  `>500m MHC`   = c(m_out$accuracy, m_out$precision, m_out$recall) |> fmt_num()
)

# ----- Combine + render LaTeX -----
final_tbl <- dplyr::bind_rows(obs_tbl, perf_tbl)

latex_tab <- final_tbl |>
  knitr::kable(
    format = "latex",
    booktabs = TRUE,
    align = c("l","c","c","c"),
    col.names = c("", "\\textbf{Full Sample}",
                  "\\textbf{$\\leq$ 500m MHC}", "\\textbf{$>$ 500m MHC}"),
    escape = FALSE,
    caption = "Summary Statistics"
  ) |>
  kableExtra::add_header_above(
    c(" " = 1, " " = 1, "\\textbf{Buildings}" = 2),
    escape = FALSE
  ) |>
  kableExtra::pack_rows(index = c("\\textit{\\textbf{Observations}}" = 3,
                                  "\\textit{\\textbf{Performance Metrics}}" = 3),
                        bold = TRUE, italic = TRUE, escape = FALSE) |>
  kableExtra::add_indent(positions = 1:6) |>
  kableExtra::kable_styling(font_size = 8, latex_options = "hold_position")

latex_tab