############################################################
# Hom vs Classical Statistical Modelling for Climate Data
# Real dataset: Mauna Loa atmospheric CO2
# Author: Halim Zeghdoudi / ChatGPT assisted version
#
# Objective:
# Compare a classical seasonal-trend model with Hom empirical-risk
# estimators using concave and convex twisting maps alpha.
#
# Classical loss:
#   sum(r^2)
#
# Hom concave robust loss:
#   alpha(u) = log(1 + u)
#   sum(log(1 + r^2 / c^2))
#
# Hom convex extreme-sensitive loss:
#   alpha(u) = u^p, p > 1
#   sum((r^2 / c^2)^p)
############################################################

############################
# 1. Packages
############################

required_packages <- c("zoo", "ggplot2")

for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(zoo)
library(ggplot2)

############################
# 2. Load Mauna Loa CO2 data
############################

# R base contains the monthly Mauna Loa CO2 time series.
# This is not weekly, but it is real Mauna Loa CO2 data and is easy
# to reproduce in R without downloading external files.
# For the weekly statsmodels version, see note at the end of this file.

data(co2)

# Convert time-series object into a data frame
# time(co2) gives decimal dates, e.g. 1959.000, 1959.083, etc.

df <- data.frame(
  date_decimal = as.numeric(time(co2)),
  co2 = as.numeric(co2)
)

# Remove or interpolate missing values
# Base R co2 is usually complete, but this makes the code safer.
if (any(is.na(df$co2))) {
  df$co2 <- zoo::na.approx(df$co2, x = df$date_decimal, na.rm = FALSE)
  df$co2 <- zoo::na.locf(df$co2, na.rm = FALSE)
  df$co2 <- zoo::na.locf(df$co2, fromLast = TRUE, na.rm = FALSE)
}

df <- na.omit(df)

############################
# 3. Build model variables
############################

# Time from first observation

df$t <- df$date_decimal - min(df$date_decimal)

# Fraction of year for seasonality

df$season <- df$date_decimal %% 1

# First and second annual harmonics

df$sin1 <- sin(2 * pi * df$season)
df$cos1 <- cos(2 * pi * df$season)
df$sin2 <- sin(4 * pi * df$season)
df$cos2 <- cos(4 * pi * df$season)

# Train/test split
# Training: before 1995
# Test: 1995 onward
# This split works well for the base R monthly dataset.

train <- subset(df, date_decimal < 1995)
test  <- subset(df, date_decimal >= 1995)

cat("Number of training observations:", nrow(train), "\n")
cat("Number of test observations:", nrow(test), "\n")

# Safety checks
needed_cols <- c("co2", "t", "sin1", "cos1", "sin2", "cos2")

if (nrow(train) == 0) {
  stop("Training set is empty. Check the train/test split date.")
}

if (any(colSums(is.na(train[, needed_cols])) > 0)) {
  print(colSums(is.na(train[, needed_cols])))
  stop("There are NA values in the training variables.")
}

############################
# 4. Classical least-squares model
############################

classical_model <- lm(
  co2 ~ t + I(t^2) + sin1 + cos1 + sin2 + cos2,
  data = train
)

cat("\nClassical model summary:\n")
print(summary(classical_model))

############################
# 5. Hom model functions
############################

# Prediction function for the same seasonal-trend structure
predict_hom <- function(par, data) {
  par[1] +
    par[2] * data$t +
    par[3] * data$t^2 +
    par[4] * data$sin1 +
    par[5] * data$cos1 +
    par[6] * data$sin2 +
    par[7] * data$cos2
}

# Starting parameters from classical least squares
start_par <- as.numeric(coef(classical_model))

# Scale constant for residuals.
# Using a robust scale makes the Hom risks numerically stable.
resid_train_classical <- residuals(classical_model)
c_scale <- median(abs(resid_train_classical - median(resid_train_classical)))

# Avoid zero scale
if (is.na(c_scale) || c_scale <= 0) {
  c_scale <- sd(resid_train_classical)
}
if (is.na(c_scale) || c_scale <= 0) {
  c_scale <- 1
}

cat("\nResidual scale c =", c_scale, "ppm\n")

# Classical empirical risk
classical_loss <- function(par, data) {
  yhat <- predict_hom(par, data)
  r <- data$co2 - yhat
  sum(r^2, na.rm = TRUE)
}

# Hom concave robust empirical risk
# alpha(u) = log(1 + u)
# Here u = (r / c)^2
# This compresses large residuals and reduces outlier influence.
hom_concave_loss <- function(par, data, c_scale) {
  yhat <- predict_hom(par, data)
  r <- data$co2 - yhat
  u <- (r / c_scale)^2
  sum(log(1 + u), na.rm = TRUE)
}

# Hom convex extreme-sensitive empirical risk
# alpha(u) = u^p, p > 1
# This emphasizes large residuals and tries to reduce extreme errors.
hom_convex_loss <- function(par, data, c_scale, p = 1.25) {
  yhat <- predict_hom(par, data)
  r <- data$co2 - yhat
  u <- (r / c_scale)^2
  sum(u^p, na.rm = TRUE)
}

############################
# 6. Fit Hom estimators
############################

fit_concave <- optim(
  par = start_par,
  fn = hom_concave_loss,
  data = train,
  c_scale = c_scale,
  method = "BFGS",
  control = list(maxit = 10000, reltol = 1e-10)
)

fit_convex <- optim(
  par = start_par,
  fn = hom_convex_loss,
  data = train,
  c_scale = c_scale,
  p = 1.25,
  method = "BFGS",
  control = list(maxit = 10000, reltol = 1e-10)
)

cat("\nHom concave optimization convergence:", fit_concave$convergence, "\n")
cat("Hom convex optimization convergence:", fit_convex$convergence, "\n")

############################
# 7. Predictions
############################

df$pred_classical <- predict(classical_model, newdata = df)
df$pred_hom_concave <- predict_hom(fit_concave$par, df)
df$pred_hom_convex <- predict_hom(fit_convex$par, df)

test$pred_classical <- predict(classical_model, newdata = test)
test$pred_hom_concave <- predict_hom(fit_concave$par, test)
test$pred_hom_convex <- predict_hom(fit_convex$par, test)

############################
# 8. Evaluation metrics
############################

rmse <- function(y, yhat) {
  sqrt(mean((y - yhat)^2, na.rm = TRUE))
}

mae <- function(y, yhat) {
  mean(abs(y - yhat), na.rm = TRUE)
}

maxae <- function(y, yhat) {
  max(abs(y - yhat), na.rm = TRUE)
}

bias <- function(y, yhat) {
  mean(y - yhat, na.rm = TRUE)
}

results <- data.frame(
  Model = c(
    "Classical least squares",
    "Hom concave robust: alpha(u)=log(1+u)",
    "Hom convex extreme-sensitive: alpha(u)=u^1.25"
  ),
  MAE = c(
    mae(test$co2, test$pred_classical),
    mae(test$co2, test$pred_hom_concave),
    mae(test$co2, test$pred_hom_convex)
  ),
  RMSE = c(
    rmse(test$co2, test$pred_classical),
    rmse(test$co2, test$pred_hom_concave),
    rmse(test$co2, test$pred_hom_convex)
  ),
  MaxAE = c(
    maxae(test$co2, test$pred_classical),
    maxae(test$co2, test$pred_hom_concave),
    maxae(test$co2, test$pred_hom_convex)
  ),
  Bias = c(
    bias(test$co2, test$pred_classical),
    bias(test$co2, test$pred_hom_concave),
    bias(test$co2, test$pred_hom_convex)
  )
)

cat("\nHeld-out test performance:\n")
print(results)

############################
# 9. Goodness of Hom summaries
############################

# Hom means for the CO2 series
# Classical mean: alpha(x)=x
# Concave Hom mean: alpha(x)=log(1+x)
# Square-root Hom mean: alpha(x)=sqrt(x)

classical_mean <- mean(df$co2)
hom_log_mean <- exp(mean(log(1 + df$co2))) - 1
hom_sqrt_mean <- mean(sqrt(df$co2))^2

hom_means <- data.frame(
  Summary = c(
    "Classical mean",
    "Hom mean: alpha(x)=log(1+x)",
    "Hom mean: alpha(x)=sqrt(x)"
  ),
  Value_ppm = c(classical_mean, hom_log_mean, hom_sqrt_mean)
)

cat("\nClassical and Hom means:\n")
print(hom_means)

# Goodness table: which model is best by each metric?
best_rmse <- results$Model[which.min(results$RMSE)]
best_mae <- results$Model[which.min(results$MAE)]
best_maxae <- results$Model[which.min(results$MaxAE)]

cat("\nGoodness of Hom interpretation:\n")
cat("Best RMSE model:", best_rmse, "\n")
cat("Best MAE model:", best_mae, "\n")
cat("Best maximum-error model:", best_maxae, "\n")
cat("\nInterpretation:\n")
cat("- Concave Hom alpha compresses large residuals, useful for robust baselines.\n")
cat("- Convex Hom alpha emphasizes large residuals, useful for extreme-aware prediction.\n")
cat("- Classical statistics is recovered when alpha(u)=u.\n")

############################
# 10. Plots
############################

# Mark train/test period
split_date <- 1995

p1 <- ggplot(df, aes(x = date_decimal, y = co2)) +
  geom_line(linewidth = 0.4) +
  geom_line(aes(y = pred_classical), linewidth = 0.4, linetype = "dashed") +
  geom_line(aes(y = pred_hom_concave), linewidth = 0.4, linetype = "dotdash") +
  geom_line(aes(y = pred_hom_convex), linewidth = 0.4, linetype = "longdash") +
  geom_vline(xintercept = split_date, linetype = "dotted") +
  labs(
    title = "Mauna Loa CO2: Classical vs Hom seasonal-trend models",
    subtitle = "Solid = observed; dashed/dotdash/longdash = fitted models; vertical line = test period start",
    x = "Year",
    y = "CO2 concentration (ppm)"
  ) +
  theme_minimal()

print(p1)

ggsave(
  filename = "hom_co2_forecast_R.png",
  plot = p1,
  width = 10,
  height = 6,
  dpi = 300
)

# Test residual plot
error_df <- data.frame(
  date_decimal = rep(test$date_decimal, 3),
  Error = c(
    test$co2 - test$pred_classical,
    test$co2 - test$pred_hom_concave,
    test$co2 - test$pred_hom_convex
  ),
  Model = rep(
    c(
      "Classical",
      "Hom concave robust",
      "Hom convex extreme-sensitive"
    ),
    each = nrow(test)
  )
)

p2 <- ggplot(error_df, aes(x = date_decimal, y = Error, linetype = Model)) +
  geom_hline(yintercept = 0) +
  geom_line(linewidth = 0.4) +
  labs(
    title = "Held-out forecast errors",
    x = "Year",
    y = "Forecast error: observed - predicted (ppm)"
  ) +
  theme_minimal()

print(p2)

ggsave(
  filename = "hom_co2_test_errors_R.png",
  plot = p2,
  width = 10,
  height = 6,
  dpi = 300
)

# Bar plot of RMSE
p3 <- ggplot(results, aes(x = Model, y = RMSE)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "RMSE comparison: Classical vs Hom models",
    x = "Model",
    y = "RMSE on test period (ppm)"
  ) +
  theme_minimal()

print(p3)

ggsave(
  filename = "hom_co2_rmse_R.png",
  plot = p3,
  width = 9,
  height = 5,
  dpi = 300
)

############################
# 11. Save numerical outputs
############################

write.csv(results, "hom_co2_results_R.csv", row.names = FALSE)
write.csv(hom_means, "hom_co2_hom_means_R.csv", row.names = FALSE)

cat("\nSaved files:\n")
cat("- hom_co2_forecast_R.png\n")
cat("- hom_co2_test_errors_R.png\n")
cat("- hom_co2_rmse_R.png\n")
cat("- hom_co2_results_R.csv\n")
cat("- hom_co2_hom_means_R.csv\n")

############################
# 12. Optional: formulas for paper
############################

cat("\nUseful formulas for the paper:\n")
cat("Classical risk: R(theta) = sum r_t^2\n")
cat("Hom concave risk: R_H(theta) = sum log(1 + (r_t/c)^2)\n")
cat("Hom convex risk: R_H(theta) = sum ((r_t/c)^2)^p, p > 1\n")
cat("Hom mean: E_H[X] = alpha^{-1}( E[alpha(X)] )\n")

############################################################
# End of script
############################################################