---
title: "Monitoring changes in vitamin D levels during the COVID-19 pandemic with routinely-collected laboratory data: Data Analysis"
output: html_document
date: "2025-01-30"
author: "Lea Skapetze"
---


```{r setup, include=FALSE}
# Set chunk options
knitr::opts_chunk$set(echo = TRUE)

# Load required libraries
library(lubridate)
library(ggplot2)
library(rlang)
library(tidyr)
library(stringr)
library(stats)
library(purrr)
library(ggsignif)
library(MatchIt)
library(lmtest)
library(sandwich)
library(parallel)
library(broom)
library(cobalt)
library(optmatch)
library(lm.beta)
library(grf)
library(tidyverse)
library(mediation)
library(dplyr)
library(sensemakr)
library(ggrepel)
library(tableone)
library(cobalt)
library(readr)
library(xtable)

# Turn off scientific notations
options(scipen = 999)

# Check versions of packages
R.version.string
packageVersion("grf")
packageVersion("MatchIt")
packageVersion("sensemakr")
```

## 0. Load the data and filter for only Vitamin + filter out undefined gender

```{r}
# Read the data
df <- read.csv("data.csv")

# Filter for Vitamin D measurements only
vd3_df <- df %>%
  filter(param_short == "VD3")

# Count and calculate percentage of gender "X" before removing it later
gender_x_summary <- vd3_df %>%
  filter(gender == "X") %>%
  summarise(
    count_gender_x = n(),
    percentage_gender_x = (n() / nrow(vd3_df)) * 100
  )

# Print the summary of gender "X"
print(gender_x_summary)

# Filter df accordingly
df <- vd3_df %>%
  filter(gender != "X") %>%
  mutate(
    full_name = "Vitamin D",
    unit = "µg/l"
  )

head(df)
```

## 1. Prepare the data

```{r}
# AGE GROUPS

df <- df %>%
  mutate(age_group = case_when(
    age_at_first_measurement %in% c("[18, 30[", "[30, 40[") ~ "[18, 39]",
    age_at_first_measurement %in% c("[40, 50[", "[50, 60[") ~ "[40, 59]",
    age_at_first_measurement == ">=60" ~ "60+",
    TRUE ~ "Unknown" # In case there are entries that don't fit the above categories
  ))

# DATES, PERIODS, SEASONS

# Convert 'cohort_year' and 'test_month' to date format
df$test_date <- with(df, paste(cohort_year, test_month, "01", sep="-"))
df$test_date <- as.Date(df$test_date, format="%Y-%m-%d")

# Create a new column `year_month` combining `cohort_year` and `test_month`
df <- df %>%
  mutate(
    year_month = as.Date(paste(cohort_year, test_month, "01", sep = "-"), format = "%Y-%m-%d")
  )

# Define periods “B” for before the pandemic and “D” for during the pandemic
period1 <- c(as.Date("2018-03-01"), as.Date("2020-02-29"))
period2 <- c(as.Date("2020-03-01"), as.Date("2022-02-28"))

# Assign each entry to a period
assign_period <- function(date) {
  if (date >= period1[1] && date <= period1[2]) {
    return("B")
  } else if (date >= period2[1] && date <= period2[2]) {
    return("D")
  } else {
    return(NA) 
  }
}

# Create a new column for the period
df$period <- as.character(sapply(df$test_date, assign_period))

# Filter out entries that do not belong to either period
df <- df[!is.na(df$period),]

# Define a function to convert test month to season and adjust cohort_year
# The get_season() function adjusts the cohort_year for December by incrementing the year (year + 1), ensuring that December 2019 belongs to Winter 2020
get_season <- function(month, year) {
  if (month %in% c(12)) {
    return(list(season = "Winter", cohort_year = year + 1)) # December moves to the next year's Winter
  } else if (month %in% c(1, 2)) {
    return(list(season = "Winter", cohort_year = year)) # January and February are Winter of the current year 
  } else if (month %in% c(3, 4, 5)) {
    return(list(season = "Spring", cohort_year = year))
  } else if (month %in% c(6, 7, 8)) {
    return(list(season = "Summer", cohort_year = year))
  } else if (month %in% c(9, 10, 11)) {
    return(list(season = "Fall", cohort_year = year))
  } else {
    return(list(season = NA, cohort_year = year)) # Handle unexpected values
  }
}
# For December measurements: In the end the column cohort_year is the reassigned year, and the column year is the original year

# Apply the function to adjust both season and cohort_year
adjusted_season <- mapply(get_season, df$test_month, df$cohort_year, SIMPLIFY = FALSE)
df$season <- factor(sapply(adjusted_season, function(x) x$season), levels = c("Winter", "Spring", "Summer", "Fall"))
df$cohort_year <- sapply(adjusted_season, function(x) x$cohort_year)

# Create ordered season-year by sorting year and season
df <- df %>%
  mutate(
    season_year = paste(cohort_year, season)
  ) %>%
  arrange(cohort_year, season) %>%
  mutate(
    season_year = factor(season_year, levels = unique(season_year), ordered = TRUE)
  )

# VIT D DEFICIENCY STATUS

# Determining deficiency status 
thresholds <- list(
  VD3 = 20  # to be changed for a different param
)

# Function to check deficiency for Vitamin D
get_deficiency_status <- function(value) {
  threshold <- thresholds$VD3 # to be changed for a different param
  if (!is.na(value) && value < threshold) {
    return(1)  # Deficient
  } else {
    return(0)  # Not deficient
  }
}


# Calculation of deficiency status based on numeric first_value column
df <- df %>%
  rowwise() %>%
  mutate(deficient = get_deficiency_status(first_value)) %>%
  ungroup()

# Make sure there are not NAs
df <- df %>% filter(!is.na(first_value), !is.na(period), !is.na(gender), !is.na(age_group))

# Factorize relevant columns for Propensity-Score Matching and Machine Learning
df$gender <- factor(df$gender)
df$age_group <- factor(df$age_group)
df$period <- factor(df$period)
df$test_month <- factor(df$test_month)
df$season <- factor(df$season)


head(df)

# STATISTICS

# Define Z Scores
z_99 <- 2.576
z_95 <- 1.960 

```

## 2. Descriptive Statistics

### 2.1 Basic comparison
#### 2.1.1 Sample sizes, counts, mean levels and deficiency rates (unstratified)
```{r}
# Count sample size in period "B" and period "D"
sample_size_period <- df %>% 
  count(period)

sample_size_contingency <- with(df, table(period))
sample_size_p_value <- chisq.test(sample_size_contingency)$p.value

# Count how many entries for each age group in period "B" and "D" (absolute nr and percentage)
age_group_counts <- df %>%
  group_by(period, age_group) %>%
  summarise(n = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ungroup()

age_group_contingency <- with(df, table(period, age_group))
age_group_p_value <- chisq.test(age_group_contingency)$p.value

# Count how many entries for each gender group in period "B" and "D" (absolute nr and percentage)
gender_group_counts <- df %>%
  group_by(period, gender) %>%
  summarise(n = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ungroup()


gender_group_contingency <- with(df, table(period, gender))
gender_group_p_value <- chisq.test(gender_group_contingency)$p.value

# Count how many entries for each season group in period "B" and "D" (absolute nr and percentage)
season_counts <- df %>%
  group_by(period, season) %>%
  summarise(n = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ungroup()

season_contingency <- with(df, table(period, season))
season_counts_p_value <- chisq.test(season_contingency)$p.value

# The mean vitamin D level in period B and D (with standard deviation)
mean_vitamin_d_levels <- df %>%
  group_by(period) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

# Vitamin D deficiency rate for period B and D (in percent)
vitamin_d_deficiency_rate <- df %>%
  group_by(period) %>%
  summarise(
    deficiency_count = sum(deficient),             
    total_count = n(), 
    deficiency_rate = sum(deficient) / n() * 100, 
    .groups = 'drop')

# Printing results
print(sample_size_period)
print(sample_size_p_value)
print(age_group_counts)
print(age_group_p_value)
print(gender_group_counts)
print(gender_group_p_value)
print(season_counts)
print(season_counts_p_value)
print(mean_vitamin_d_levels)
print(vitamin_d_deficiency_rate)
```

#### 2.1.2 Effect size (Cohen's d) - Mean vitamin D levels
```{r}
# Means and standard deviations
mean_b <- mean(df$first_value[df$period == "B"])
mean_d <- mean(df$first_value[df$period == "D"])
sd_b <- sd(df$first_value[df$period == "B"])
sd_d <- sd(df$first_value[df$period == "D"])

# Pooled SD
n1 <- sum(df$period == "B")
n2 <- sum(df$period == "D")
sd_pooled <- sqrt(((n1 - 1) * sd_b^2 + (n2 - 1) * sd_d^2) / (n1 + n2 - 2))

# Cohen's d (flipped sign for "During - Before")
cohens_d <- (mean_d - mean_b) / sd_pooled  # Flips the direction
print(paste("Cohen's d:", round(cohens_d, 4)))

# Convert sample sizes to numeric to avoid integer overflow
n1 <- as.numeric(n1)  # Before pandemic
n2 <- as.numeric(n2)  # During pandemic

# Standard error of Cohen's d
se_d <- sqrt((n1 + n2) / (n1 * n2) + (cohens_d^2) / (2 * (n1 + n2)))

# Calculate CIs
ci_99_lower_d <- cohens_d - z_99 * se_d
ci_99_upper_d <- cohens_d + z_99 * se_d

ci_95_lower_d <- cohens_d - z_95 * se_d  
ci_95_upper_d <- cohens_d + z_95 * se_d  

# Print results
cat("99% CI for Cohen's d:", ci_99_lower_d, "to", ci_99_upper_d, "\n")
cat("95% CI for Cohen's d:", ci_95_lower_d, "to", ci_95_upper_d, "\n") 

# Note: Confidence intervals for Cohen's d can be calculated using bootstrapping for more precise estimates, but this script uses an analytical approximation based on the standard error of d and Z-scores (valid for sufficiently large sample sizes and normal effect size distribution).

```

#### 2.1.3 Effect size (Cohen's d) - Vitamin D deficiency rates

```{r}
# Ensure sample sizes are numeric to avoid integer overflow
n1_def <- as.numeric(sum(df$period == "B"))
n2_def <- as.numeric(sum(df$period == "D"))

# Compute means and standard deviations for deficiency rates
mean_b_def <- mean(df$deficient[df$period == "B"])
mean_d_def <- mean(df$deficient[df$period == "D"])
sd_b_def <- sd(df$deficient[df$period == "B"])
sd_d_def <- sd(df$deficient[df$period == "D"])

# Pooled SD for deficiency
sd_pooled_def <- sqrt(((n1_def - 1) * sd_b_def^2 + (n2_def - 1) * sd_d_def^2) / (n1_def + n2_def - 2))

# Cohen's d (flipped sign for "During - Before")
cohens_d_def <- (mean_d_def - mean_b_def) / sd_pooled_def
print(paste("Cohen's d for deficiency:", round(cohens_d_def, 4)))

# Standard error for Cohen's d (now using numeric sample sizes)
se_d_def <- sqrt((n1_def + n2_def) / (n1_def * n2_def) + (cohens_d_def^2) / (2 * (n1_def + n2_def)))

# Confidence intervals for Cohen's d
ci_99_lower_d_def <- cohens_d_def - z_99 * se_d_def
ci_99_upper_d_def <- cohens_d_def + z_99 * se_d_def
ci_95_lower_d_def <- cohens_d_def - z_95 * se_d_def
ci_95_upper_d_def <- cohens_d_def + z_95 * se_d_def

# Print results
cat("99% CI for Cohen's d:", ci_99_lower_d_def, "to", ci_99_upper_d_def, "\n")
cat("95% CI for Cohen's d:", ci_95_lower_d_def, "to", ci_95_upper_d_def, "\n")

```


### 2.2 Mean levels and deficiency rates, stratified by age, gender and season
```{r}
# Compare mean vitamin D levels between periods using t-test
vitamin_d3_levels_B <- df %>% filter(period == "B") %>% pull(first_value)
vitamin_d3_levels_D <- df %>% filter(period == "D") %>% pull(first_value)

t_test_results <- t.test(vitamin_d3_levels_B, vitamin_d3_levels_D, na.rm = TRUE)
mean_comparison_p_value <- t_test_results$p.value

print(mean_comparison_p_value)


# Construct a contingency table for deficiencies in periods B and D
deficiency_table <- df %>% 
  group_by(period) %>% 
  summarise(deficient = sum(deficient), not_deficient = n() - sum(deficient)) %>%
  dplyr::select(-period) %>% 
  as.matrix()

chi_square_test_result <- chisq.test(deficiency_table)
  deficiency_comparison_p_value <- chi_square_test_result$p.value
  

print(deficiency_comparison_p_value)

# Mean Vitamin D levels by age group
vitamin_d3_age_means <- df %>%
  group_by(period, age_group) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

age_group_stats <- df %>%
  group_by(period, age_group) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )


# Mean Vitamin D Levels by gender group
vitamin_d3_gender_means <- df %>%
  group_by(period, gender) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

gender_group_stats <- df %>%
  filter(gender %in% c("M", "F")) %>%
  group_by(period, gender) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

# Mean Vitamin D Levels by season 
vitamin_d3_season_means <- df %>%
  group_by(period, season) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

season_group_stats <- df %>%
  group_by(period, season) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

print(age_group_stats)
print(gender_group_stats)
print(season_group_stats)

```

### 2.4 Statistical testing for age, gender and season
```{r}
# Statistical Testing within each sub-group

# Function to perform a t-test and return p-value
perform_t_test <- function(data_group_1, data_group_2) {
  test_result <- t.test(data_group_1, data_group_2, na.rm = TRUE)
  return(test_result$p.value)
}

# Calculate p-values for age groups
p_values_age_groups <- df %>%
  group_by(age_group) %>%
  summarise(
    t_test_p_value = perform_t_test(
      first_value[period == "B"],
      first_value[period == "D"]
    ),
    .groups = 'drop'  
  )

# Calculate p-values for gender groups
p_values_gender_groups <- df %>%
  filter(gender %in% c("M", "F")) %>%
  group_by(gender) %>%
  summarise(
    t_test_p_value = perform_t_test(
      first_value[period == "B"],
      first_value[period == "D"]
    ),
    .groups = 'drop'  
  )

# Calculate p-values for gender groups
p_values_season_groups <- df %>%
  group_by(season) %>%
  summarise(
    t_test_p_value = t.test(
      first_value[period == "B"],
      first_value[period == "D"],
    )$p.value,
    .groups = 'drop'
  )

# Print p-values
print(p_values_age_groups)
print(p_values_gender_groups)
print(p_values_season_groups)
```


### 2.5 Deficiency rates for gender, age and season groups
```{r}
# Calculate the deficiency rates for each gender group
deficiency_gender_groups <- df %>%
  group_by(gender, period) %>%
  summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>%
  pivot_wider(
    names_from = period,
    values_from = c(deficient_count, total_count),
    values_fill = list(deficient_count = 0, total_count = 0)
  ) %>%
  mutate(
    rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0),
    rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0)
  ) %>%
  # Calculate p-values
  rowwise() %>%
  mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), 
                                    c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>%
  ungroup()

print(deficiency_gender_groups)
```

```{r}
# Calculate the deficiency rates for each age group
deficiency_age_groups <- df %>%
  group_by(age_group, period) %>%
  summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>%
  pivot_wider(
    names_from = period,
    values_from = c(deficient_count, total_count),
    values_fill = list(deficient_count = 0, total_count = 0) 
  ) %>%
  mutate(
    rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0),
    rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0)
  ) %>%
  rowwise() %>%
  mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), 
                                    c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>%
  ungroup()

print(deficiency_age_groups)
```


```{r}
# Calculate the deficiency rates for each season group
deficiency_season_groups <- df %>%
  group_by(season, period) %>%
  summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>%
  pivot_wider(
    names_from = period,
    values_from = c(deficient_count, total_count),
    values_fill = list(deficient_count = 0, total_count = 0)
  ) %>%
  mutate(
    rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0),
    rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0)
  ) %>%
  # Calculate p-values
  rowwise() %>%
  mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), 
                                    c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>%
  ungroup()

print(deficiency_season_groups)
```

### 2.6 Plots: Vit D levels over time
```{r}
# Vit D Levels Over Time (stratified by age and gender) - with Standard Error

# Calculate mean and standard error for Vitamin D levels by gender and season-year
gender_stats <- df %>%
  group_by(season_year, gender) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    se_value = sd(first_value, na.rm = TRUE) / sqrt(n()),  # Standard error calculation
    .groups = 'drop'
  )

# Plot Vitamin D levels over time by gender group (with standard error)
ggplot(gender_stats, aes(x = season_year, y = mean_value, color = gender, fill = gender, group = gender)) +
  geom_line(linewidth = 1) +  
  geom_point(size = 2) +  
  geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)"
  ) +
  ylim(0, 60) +  
  theme_minimal() +
  scale_fill_manual(values = c("M" = "blue", "F" = "red")) +  
  scale_color_manual(values = c("M" = "blue", "F" = "red")) +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

# Calculate mean and standard error for Vitamin D levels by age group and season-year
age_group_stats <- df %>%
  group_by(season_year, age_group) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    se_value = sd(first_value, na.rm = TRUE) / sqrt(n()),  # Standard error calculation
    .groups = 'drop'
  )

# Plot Vitamin D levels over time by age group (with standard error)
ggplot(age_group_stats, aes(x = season_year, y = mean_value, color = age_group, fill = age_group, group = age_group)) +
  geom_line(linewidth = 1) +  
  geom_point(size = 2) +  
  geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)"
  ) +
  ylim(0, 60) + 
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +  
  scale_color_brewer(palette = "Set2") +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

```
```{r}
# Vit D Levels Over Time (stratified by age and gender) - with Standard Deviation

# Calculate mean and standard deviation for Vitamin D levels by gender and season-year
gender_stats <- df %>%
  group_by(season_year, gender) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE), 
    .groups = 'drop'
  )

# Plot Vitamin D levels over time by gender group (with standard deviation)
ggplot(gender_stats, aes(x = season_year, y = mean_value, color = gender, fill = gender, group = gender)) +
  geom_line(linewidth = 1) +  
  geom_point(size = 2) + 
  geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)"
  ) +
  ylim(0, 60) +  
  theme_minimal() +
  scale_fill_manual(values = c("M" = "blue", "F" = "red")) +  
  scale_color_manual(values = c("M" = "blue", "F" = "red")) +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

# Calculate mean and standard deviation for Vitamin D levels by age group and season-year
age_group_stats <- df %>%
  group_by(season_year, age_group) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE),  # Standard deviation calculation
    .groups = 'drop'
  )

# Plot Vitamin D levels over time by age group (with standard deviation)
ggplot(age_group_stats, aes(x = season_year, y = mean_value, color = age_group, fill = age_group, group = age_group)) +
  geom_line(linewidth = 1) +  
  geom_point(size = 2) +  
  geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)"
  ) +
  ylim(0, 60) +  
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +  
  scale_color_brewer(palette = "Set2") +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

```

```{r}
# Vit D Levels Over Time (not stratified by age and gender)

# Calculate mean and standard error for Vitamin D levels for the entire cohort by season-year
cohort_stats <- df %>%
  group_by(season_year) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    se_value = sd(first_value, na.rm = TRUE) / sqrt(n()),
    sd_value = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )

print(cohort_stats)

# Plot Vitamin D levels over time for the entire cohort with standard error ribbon
ggplot(cohort_stats, aes(x = season_year, y = mean_value, group = 1)) +
  geom_line(color = "#1f77b4", linewidth = 1) +  
  geom_point(color = "#ff7f0e", size = 2) +  
  geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), fill = "#ff7f0e", alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)",
  ) +
  ylim(0, 60) + 
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )


# Plot Vitamin D levels over time for the entire cohort with standard deviation ribbon
ggplot(cohort_stats, aes(x = season_year, y = mean_value, group = 1)) +
  geom_line(color = "#1f77b4", linewidth = 1) +  
  geom_point(color = "#ff7f0e", size = 2) +  
  geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), fill = "#ff7f0e", alpha = 0.2) +  
  labs(
    x = "Season",
    y = "Vitamin D Level (µg/l)",
  ) +
  ylim(0, 60) +  
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

```


### 2.7 Plots: Comparison Vit D levels before and during the pandemic

```{r}
# Remove outliers
# Count values below 0 and above 70
below_0 <- sum(df$first_value < 0, na.rm = TRUE)
above_70 <- sum(df$first_value > 70, na.rm = TRUE)

# Print the counts
print(paste("Number of values removed below 0:", below_0))
print(paste("Number of values removed above 70:", above_70))

# Filter data to remove outliers (values below 0 or above 70)
df_filtered <- df %>%
  filter(first_value >= 0 & first_value <= 70)

# Create a new column for combined gender and age group for specific grouping
df_filtered <- df_filtered %>%
  mutate(gender_age_group = paste(gender, age_group, sep = " - "))

# Calculate statistics for each gender-age group and period combination
gender_age_group_stats <- df_filtered %>%
  group_by(gender_age_group, period) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    sd_value = sd(first_value, na.rm = TRUE)
  ) %>%
  ungroup()

# Perform t-test comparing Vitamin D levels before and during the pandemic for each gender-age group
t_test_gender_age_group <- df_filtered %>%
  group_by(gender_age_group) %>%
  summarise(
    p_value = t.test(first_value[period == "B"], first_value[period == "D"], na.rm = TRUE)$p.value
  )

# Add significance stars based on p-values
t_test_gender_age_group <- t_test_gender_age_group %>%
  mutate(signif_label = case_when(
    p_value < 0.001 ~ "***",
    p_value < 0.01 ~ "**",
    p_value < 0.05 ~ "*",
    TRUE ~ "ns"  
  ))

# Print p-values and significance labels
print(t_test_gender_age_group)

# Calculate statistics for absolute and percentage change
gender_age_group_changes <- gender_age_group_stats %>%
  pivot_wider(names_from = period, values_from = c(mean_value, sd_value)) %>%
  rename(
    mean_value_Pre_Pandemic = `mean_value_B`,
    mean_value_Pandemic = `mean_value_D`
  ) %>%
  mutate(
    absolute_change = mean_value_Pandemic - mean_value_Pre_Pandemic,
    percentage_change = ((mean_value_Pandemic - mean_value_Pre_Pandemic) / mean_value_Pre_Pandemic) * 100
  )

# Create updated facet labels with absolute and percentage changes
facet_labels <- setNames(
  paste0(
    gender_age_group_changes$gender_age_group,
    "\nAbs. Change: ", round(gender_age_group_changes$absolute_change, 2),
    " µg/l (", round(gender_age_group_changes$percentage_change, 1), "%)"
  ),
  gender_age_group_changes$gender_age_group)

# Violin plot with updated facet labels
p_gender_age_violin_box <- ggplot(df_filtered, aes(x = period, y = first_value, fill = period)) +
  geom_violin(position = position_dodge(width = 0.8), width = 0.6, trim = FALSE) +  # Violin plot
  geom_boxplot(position = position_dodge(width = 0.8), width = 0.15, alpha = 0.5, outlier.shape = NA) +  # Boxplot without outliers
  stat_summary(
    fun.data = "mean_sdl", geom = "pointrange", 
    position = position_dodge(width = 0.8)
  ) +
  labs(
    x = "Period",
    y = "Vitamin D Level (µg/l)"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("B" = "#1f77b4", "D" = "#ff7f0e"),  
    name = "Period",
    labels = c("B" = "Pre-Pandemic", "D" = "Pandemic")
  ) +
  theme(
  legend.title = element_blank(),
  strip.text = element_text(size = 7),
  axis.text.x = element_blank(),  # Remove x-axis labels
  axis.ticks.length = unit(0.2, "cm")
) +
  facet_wrap(~ gender_age_group, scales = "free", labeller = as_labeller(facet_labels))  

# Add significance stars for each gender-age group
p_gender_age_violin_box <- p_gender_age_violin_box +
  geom_text(data = t_test_gender_age_group, aes(x = 1.5, y = 65, label = signif_label), size = 5, inherit.aes = FALSE)

print(p_gender_age_violin_box)
```


### 2.8 Plots: Seasonality 

```{r}
# Filter data to remove outliers (values below 0 or above 70)
df_filtered <- df %>%
  filter(first_value >= 0 & first_value <= 70)

# Prepare the data for the updated plot
season_year_stats <- df_filtered %>%
  filter(!(cohort_year == 2018 & season == "Winter")) %>% # Exclude Winter 2018
  mutate(
    cohort_year = factor(cohort_year),  
    period_label = case_when(
      period == "B" ~ "Pre-Pandemic",
      period == "D" ~ "Pandemic",
      TRUE ~ NA_character_
    ),
    period_label = factor(period_label, levels = c("Pre-Pandemic", "Pandemic"))
  )

# Calculate mean and median differences between pre-pandemic and pandemic periods for each season
seasonal_changes <- season_year_stats %>%
  group_by(season, period_label) %>%
  summarise(
    mean_value = mean(first_value, na.rm = TRUE),
    median_value = median(first_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = period_label, 
    values_from = c(mean_value, median_value)
  ) %>%
  rename(
    mean_value_Pre_Pandemic = `mean_value_Pre-Pandemic`,
    mean_value_Pandemic = `mean_value_Pandemic`,
    median_value_Pre_Pandemic = `median_value_Pre-Pandemic`,
    median_value_Pandemic = `median_value_Pandemic`
  ) %>%
  mutate(
    mean_diff = mean_value_Pandemic - mean_value_Pre_Pandemic,
    median_diff = median_value_Pandemic - median_value_Pre_Pandemic
  )

seasonal_changes <- seasonal_changes %>%
  dplyr::select(season, mean_diff) %>%
  mutate(mean_diff = round(mean_diff, 2))

# Create facet labels with mean difference included
facet_labels <- setNames(
  paste0(c("Winter", "Spring", "Summer", "Fall"), "\nMean Diff: ", seasonal_changes$mean_diff),
  c("Winter", "Spring", "Summer", "Fall")
)

# Updated plot with mean difference in facet labels
p_seasonal_violin_box <- ggplot(season_year_stats, aes(x = cohort_year, y = first_value, fill = period_label)) +
  geom_violin(alpha = 0.8, trim = TRUE, scale = "width", position = position_dodge(width = 0.8)) +
  geom_boxplot(width = 0.2, position = position_dodge(width = 0.8), alpha = 0.6, outlier.shape = NA) + 
  facet_wrap(~ season, ncol = 2, labeller = as_labeller(facet_labels)) +
  geom_vline(
  xintercept = seq_along(unique(season_year_stats$cohort_year)) - 0.5,
  color = "grey85",
  linetype = "dashed",
  size = 0.3
) +
  
  # Labels and colors
  labs(
    x = "Year",
    y = "Vitamin D Level (µg/l)",
    fill = "Period"
  ) +
  scale_fill_manual(
  values = c("Pre-Pandemic" = "#1f77b4", "Pandemic" = "#ff7f0e"),
  aesthetics = c("fill", "color")
) +
  
  # Adjust theme
  theme_classic() +
  theme(
    text = element_text(size = 10),
    legend.text = element_text(size = 8),
    panel.grid.major.y = element_line(color = "grey90", size = 0.2),
    panel.grid.minor.y = element_line(color = "grey95", size = 0.1),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(size = 14, hjust = 0.5, face = "bold")
  )

print(p_seasonal_violin_box)
```

## 3. Propensity score matching (PSM)

### 3.1 Matching
```{r}
# performing exact matching parameters on the entire sample
test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df, method = "nearest", distance = "glm", exact = c("gender", "age_group", "test_month")) 

matched_data <- match.data(test_match_obj)

# Calculate the sample sizes
original_size <- nrow(df)  # Total original sample size
matched_size <- nrow(matched_data)  # Total matched sample size

# Calculate the number and percentage of observations lost
num_lost <- original_size - matched_size
percent_lost <- (num_lost / original_size) * 100

# Print results
cat("Original Sample Size: ", original_size, "\n")
cat("Matched Sample Size: ", matched_size, "\n")
cat("Number of Observations Lost: ", num_lost, "\n")
cat("Percentage Lost: ", percent_lost, "%\n")

# Summary
summary(test_match_obj)
```
#### 3.1.1 Statistical testing

```{r}
# Calculate summaries for Age with Chi-squared Test for Independence
age_summary <- matched_data %>%
  group_by(period, age_group) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = count / sum(count) * 100)

# Conduct Chi-squared Test and append the result
age_table <- xtabs(count ~ period + age_group, data = age_summary)
age_chi_sq_test <- tryCatch(chisq.test(age_table), error = function(e) e)
age_p_value <- ifelse(class(age_chi_sq_test) == "try-error", NA, age_chi_sq_test$p.value)

# Calculate summaries for Gender with Chi-squared Test
gender_summary <- matched_data %>%
  group_by(period, gender) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = count / sum(count) * 100)

# Conduct Chi-squared Test and append the result
gender_table <- xtabs(count ~ period + gender, data = gender_summary)
gender_chi_sq_test <- tryCatch(chisq.test(gender_table), error = function(e) e)
gender_p_value <- ifelse(class(gender_chi_sq_test) == "try-error", NA, gender_chi_sq_test$p.value)

# Calculate summaries for Test month with Chi-squared Test
test_month_summary <- matched_data %>%
  group_by(period, test_month) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = count / sum(count) * 100)

# Conduct Chi-squared Test and append the result
test_month_table <- xtabs(count ~ period + test_month, data = test_month_summary)
test_month_chi_sq_test <- tryCatch(chisq.test(test_month_table), error = function(e) e)
test_month_p_value <- ifelse(class(test_month_chi_sq_test) == "try-error", NA, test_month_chi_sq_test$p.value)

# Calculate summaries for Season with Chi-squared Test
season_summary <- matched_data %>%
  group_by(period, season) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(period) %>%
  mutate(percentage = count / sum(count) * 100)

# Conduct Chi-squared Test and append the result
season_table <- xtabs(count ~ period + season, data = season_summary)
season_chi_sq_test <- tryCatch(chisq.test(season_table), error = function(e) e)
season_p_value <- ifelse(class(season_chi_sq_test) == "try-error", NA, season_chi_sq_test$p.value)

# Calculate mean and standard deviation for Vitamin D levels and perform t-test
vitamin_d_summary <- matched_data %>%
  group_by(period) %>%
  summarise(
    mean_vitamin_d = mean(first_value, na.rm = TRUE),
    sd_vitamin_d = sd(first_value, na.rm = TRUE),
    .groups = 'drop'
  )
vitamin_d_t_test <- t.test(first_value ~ period, data = matched_data)
vitamin_d_p_value <- vitamin_d_t_test$p.value

# Calculate the percentage of deficiencies for each period and perform Chi-squared Test
deficiency_summary <- matched_data %>%
  group_by(period) %>%
  summarise(
    total = n(),
    deficient_count = sum(deficient == 1, na.rm = TRUE),
    non_deficient_count = sum(deficient == 0, na.rm = TRUE),
    deficiency_rate = deficient_count / total * 100,
    .groups = 'drop'
  )

# Create a contingency table for the Chi-squared test
deficiency_table <- xtabs(~ period + deficient, data = matched_data)

# Conduct Chi-squared Test and append the result
deficiency_chi_sq_test <- tryCatch(chisq.test(deficiency_table), error = function(e) e)
deficiency_p_value <- ifelse(class(deficiency_chi_sq_test) == "try-error", NA, deficiency_chi_sq_test$p.value)

# Print the summaries and p-values
print(age_summary)
print(paste("Age Group P-value:", age_p_value))
print(gender_summary)
print(paste("Gender P-value:", gender_p_value))
print(test_month_summary)
print(paste("Test Month P-value:", test_month_p_value))
print(season_summary)
print(paste("Season P-value:", season_p_value))
print(vitamin_d_summary)
print(paste("Vitamin D Levels P-value:", vitamin_d_p_value))
print(deficiency_summary)
print(paste("Deficiency Rate P-value:", deficiency_p_value))

```

#### 3.1.2 Effect size (SMD) - Mean vitamin D levels

```{r}
# Effect Size

# Calculate means and SDs for matched data
mean_matched_b <- mean(matched_data$first_value[matched_data$period == "B"])
mean_matched_d <- mean(matched_data$first_value[matched_data$period == "D"])
sd_matched_b <- sd(matched_data$first_value[matched_data$period == "B"])
sd_matched_d <- sd(matched_data$first_value[matched_data$period == "D"])

# Pooled SD for matched data
n1_matched <- nrow(matched_data[matched_data$period == "B", ])
n2_matched <- nrow(matched_data[matched_data$period == "D", ])
sd_matched_pooled <- sqrt(((n1_matched - 1) * sd_matched_b^2 + 
                           (n2_matched - 1) * sd_matched_d^2) / 
                          (n1_matched + n2_matched - 2))

# SMD (flipped sign for "During - Before")
smd <- (mean_matched_d - mean_matched_b) / sd_matched_pooled  # Flips the direction
print(paste("Standardized Mean Difference (SMD):", round(smd, 4)))

# Standard error for SMD (same formula as Cohen's d)
se_smd <- sqrt((n1 + n2) / (n1 * n2) + (smd^2) / (2 * (n1 + n2)))

# Confidence intervals
ci_99_lower_smd <- smd - z_99 * se_smd
ci_99_upper_smd <- smd + z_99 * se_smd

ci_95_lower_smd <- smd - z_95 * se_smd  
ci_95_upper_smd <- smd + z_95 * se_smd  

# Print results
cat("99% CI for SMD:", ci_99_lower_smd, "to", ci_99_upper_smd, "\n")
cat("95% CI for SMD:", ci_95_lower_smd, "to", ci_95_upper_smd, "\n")  

# Note: Confidence intervals for the Standardized Mean Difference (SMD) are calculated using the formula:CI = SMD ± Z * SE_SMD where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_SMD is the standard error of the SMD estimate.
```

#### 3.1.3 Effect size (SMD) - vitamin D deficiency rate

```{r}
# Ensure sample sizes are numeric to avoid integer overflow
n1_matched_def <- as.numeric(nrow(matched_data[matched_data$period == "B", ]))
n2_matched_def <- as.numeric(nrow(matched_data[matched_data$period == "D", ]))

# Compute means and standard deviations for matched deficiency rates
mean_matched_b_def <- mean(matched_data$deficient[matched_data$period == "B"])
mean_matched_d_def <- mean(matched_data$deficient[matched_data$period == "D"])
sd_matched_b_def <- sd(matched_data$deficient[matched_data$period == "B"])
sd_matched_d_def <- sd(matched_data$deficient[matched_data$period == "D"])

# Pooled SD for matched deficiency data
sd_matched_pooled_def <- sqrt(((n1_matched_def - 1) * sd_matched_b_def^2 + 
                               (n2_matched_def - 1) * sd_matched_d_def^2) / 
                              (n1_matched_def + n2_matched_def - 2))

# SMD for deficiency (flipped sign for "During - Before")
smd_def <- (mean_matched_d_def - mean_matched_b_def) / sd_matched_pooled_def
print(paste("SMD for deficiency:", round(smd_def, 4)))

# Standard error for SMD
se_smd_def <- sqrt((n1_matched_def + n2_matched_def) / (n1_matched_def * n2_matched_def) + (smd_def^2) / (2 * (n1_matched_def + n2_matched_def)))

# Confidence intervals for SMD
ci_99_lower_smd_def <- smd_def - z_99 * se_smd_def
ci_99_upper_smd_def <- smd_def + z_99 * se_smd_def
ci_95_lower_smd_def <- smd_def - z_95 * se_smd_def
ci_95_upper_smd_def <- smd_def + z_95 * se_smd_def

# Print results
cat("99% CI for SMD:", ci_99_lower_smd_def, "to", ci_99_upper_smd_def, "\n")
cat("95% CI for SMD:", ci_95_lower_smd_def, "to", ci_95_upper_smd_def, "\n")

# Note: Confidence intervals for the Standardized Mean Difference (SMD) are calculated using the formula:CI = SMD ± Z * SE_SMD where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_SMD is the standard error of the SMD estimate.
```


## 4.Causal Machine Learning: Random Forest ON UNMATCHED DATA

### 4.1 Change in mean vitamin D levels
```{r}
# Convert categorical variables to numeric for causal forest
df$age_group_numeric <- as.integer(df$age_group)
df$gender_numeric <- as.integer(df$gender)
df$season_numeric <- as.integer(df$season)

# Convert period to a binary treatment indicator
df$treatment <- ifelse(df$period == "D", 1, 0)

# Calculate the pre-pandemic average combined value directly from the dataset
average_first_value <- df %>%
  filter(period == "B") %>%
  summarise(avg_value = mean(first_value, na.rm = TRUE)) %>%
  pull(avg_value)

print(paste("Pre-pandemic average combined value:", round(average_first_value, 4)))

# Run causal forest model
causal_forest_model <- causal_forest(
  X = df[c("age_group_numeric", "gender_numeric", "season_numeric")], 
  Y = df$first_value, 
  W = df$treatment, 
  num.trees = 2000
)

# Estimating the average treatment effect (ATE) with confidence intervals
ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all")

# Extract the point estimate and standard error
ate_estimate <- ate_result[1]
ate_std_err <- ate_result[2]

# Print the results
print(paste("ATE Estimate:", round(ate_estimate, 4)))

# Calculate percentage change
percentage_change <- (ate_estimate / average_first_value) * 100

# Print the percentage change
print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%"))

# Confidence intervals
ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err
ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err

ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err
ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err

cat("99% CI for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n")
cat("95% CI for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n")

# Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate.
```

#### 4.1.1 Feature importance

```{r}
# Calculate variable importance and convert to percentages
importance <- variable_importance(causal_forest_model)
importance_percentage <- (importance / sum(importance)) * 100

# Create a data frame for plotting
importance_df <- data.frame(
  covariate = c("Age Group", "Gender", "Season"),
  importance = importance,
  percentage = importance_percentage
)

# Print the numeric importance values and percentages
print(importance_df)

# Plot
ggplot(importance_df, aes(x = reorder(covariate, importance), y = percentage, fill = covariate)) +
  geom_bar(stat = "identity", color = "black", width = 0.7) +  
  geom_text(aes(label = sprintf("%.1f%%", percentage)), hjust = -0.2, size = 5) +  
  labs(
    x = "Covariate",
    y = "Feature importance (explained variance in %)"
  ) +
  scale_fill_manual(values = c("Age Group" = "#1f77b4", "Gender" = "#1f77b4", "Season" = "#1f77b4")) +  
  coord_flip() +  
  ylim(0, 80) +  
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),  
    axis.text.x = element_text(size = 12),  
    axis.text.y = element_text(size = 12),  
    axis.title.x = element_text(size = 14),  
    axis.title.y = element_text(size = 14),  
    legend.position = "none"  
  )

```
#### 4.1.2 Subgroup Analysis - for each subgroup

```{r}
# Define the subgroups for age_group, gender, and season
age_groups <- unique(df$age_group)
gender_groups <- unique(df$gender)
season_groups <- unique(df$season)

# Function to run the causal forest model on a given subset and return the ATE and CI
run_causal_forest <- function(subset_df) {
  causal_forest_model <- causal_forest(
    X = subset_df[c("age_group_numeric", "gender_numeric", "season_numeric")], 
    Y = subset_df$first_value, 
    W = subset_df$treatment, 
    num.trees = 2000
  )
  
  # Estimate ATE
  ate_result <- average_treatment_effect(causal_forest_model)
  ate_estimate <- ate_result[1]
  ate_std_err <- ate_result[2]
  
  # Calculate the 99% confidence interval
  z_score_99 <- 2.576
  ate_ci_lower_99 <- ate_estimate - z_score_99 * ate_std_err
  ate_ci_upper_99 <- ate_estimate + z_score_99 * ate_std_err
  
  # Calculate the 95% confidence interval 
  z_score_95 <- 1.960  
  ate_ci_lower_95 <- ate_estimate - z_score_95 * ate_std_err
  ate_ci_upper_95 <- ate_estimate + z_score_95 * ate_std_err
  
  # Calculate percentage change
  average_first_value <- 26.30
  percentage_change <- (ate_estimate / average_first_value) * 100
  
  return(data.frame(
    Category = "",
    ATE_Estimate = round(ate_estimate, 4),
    CI_Lower_99 = round(ate_ci_lower_99, 4),
    CI_Upper_99 = round(ate_ci_upper_99, 4),
    CI_Lower_95 = round(ate_ci_lower_95, 4),  
    CI_Upper_95 = round(ate_ci_upper_95, 4),  
    Percentage_Change = round(percentage_change, 2)
  ))
}

# Run the analysis for each age group, gender group, and season group individually
age_group_results <- lapply(age_groups, function(age) {
  subset_df <- df[df$age_group == age, ]
  result <- run_causal_forest(subset_df)
  result$Category <- paste("Age Group:", age)
  return(result)
})

gender_group_results <- lapply(gender_groups, function(gender) {
  subset_df <- df[df$gender == gender, ]
  result <- run_causal_forest(subset_df)
  result$Category <- paste("Gender:", gender)
  return(result)
})

season_group_results <- lapply(season_groups, function(season) {
  subset_df <- df[df$season == season, ]
  result <- run_causal_forest(subset_df)
  result$Category <- paste("Season:", season)
  return(result)
})

# Combine all results into a single data frame
final_results <- do.call(rbind, c(age_group_results, gender_group_results, season_group_results))

# Display the final results as a data frame
print(final_results)
```

```{r}
# Adjust the order for seasons
season_order <- c("Season: Fall", "Season: Summer", "Season: Spring", "Season: Winter")

# Set factor levels for the Category column
final_results$Category <- factor(
  final_results$Category,
  levels = c(season_order, setdiff(unique(final_results$Category), season_order))
)

forest_plot <- ggplot(final_results, aes(x = Category, y = ATE_Estimate)) +
  # Add the points for ATE estimates
  geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) +
  
  # Add 99% CI as the longest bars
  geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") +
  
  # Add 95% CI as shorter bars
  geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") +
  
  # Add a dashed line at 0 for reference
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  
  # Flip coordinates for horizontal plot
  coord_flip() +
  
  # Labels and theme
  labs(
    x = "Subgroup category",
    y = "Estimated reduction (in µg/l)"  
  ) +
  theme_classic(base_size = 16) +  # Increased base font size
  theme(
    axis.text.x = element_text(hjust = 0.5, size = 14),  
    axis.text.y = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    panel.grid.major.y = element_line(color = "grey80"),
    panel.grid.minor.y = element_blank()
  )

# Display the updated plot
print(forest_plot)
```

#### 4.1.3 Comparison of effect sizes across all 3 methods used
```{r}
# Cohens d and SMD are standardized, transform back by multiplying with pooled SD

# Calculate pooled SD for original data
s_pooled <- sqrt(((n1 - 1) * sd_b^2 + (n2 - 1) * sd_d^2) / (n1 + n2 - 2))

# ATE from Cohen's d
ate_from_cohens_d <- cohens_d * s_pooled

# ATE confidence intervals for Cohen's d (95% instead of 90%)
ci_99_lower_ate_d <- ci_99_lower_d * s_pooled
ci_99_upper_ate_d <- ci_99_upper_d * s_pooled
ci_95_lower_ate_d <- ci_95_lower_d * s_pooled
ci_95_upper_ate_d <- ci_95_upper_d * s_pooled

# ATE from SMD
ate_from_smd <- smd * s_pooled

# ATE confidence intervals for SMD (95% instead of 90%)
ci_99_lower_ate_smd <- ci_99_lower_smd * s_pooled
ci_99_upper_ate_smd <- ci_99_upper_smd * s_pooled
ci_95_lower_ate_smd <- ci_95_lower_smd * s_pooled
ci_95_upper_ate_smd <- ci_95_upper_smd * s_pooled

# Create updated data frame with recalculated ATE values
effect_sizes_updated <- data.frame(
  Category = c("Cohen's d", "SMD (PSM)", "ATE (ML)"),
  Estimate = c(ate_from_cohens_d, ate_from_smd, ate_estimate),
  CI_Lower_99 = c(ci_99_lower_ate_d, ci_99_lower_ate_smd, ci_99_lower_ate),
  CI_Upper_99 = c(ci_99_upper_ate_d, ci_99_upper_ate_smd, ci_99_upper_ate),
  CI_Lower_95 = c(ci_95_lower_ate_d, ci_95_lower_ate_smd, ci_95_lower_ate),
  CI_Upper_95 = c(ci_95_upper_ate_d, ci_95_upper_ate_smd, ci_95_upper_ate)
)

# Reorder the Category levels to match the new desired order (Cohen’s d on top, ATE ML at bottom)
effect_sizes_updated$Category <- factor(effect_sizes_updated$Category, levels = c("Cohen's d", "SMD (PSM)", "ATE (ML)"))

# Create the forest plot with overlapping whiskers for CIs
forest_plot_updated <- ggplot(effect_sizes_updated, aes(x = Category, y = Estimate)) +
  # Add points for the ATE estimates
  geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) +
  
  # Add 99% CI as the longest bars
  geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") +
  
  # Add 95% CI as shorter bars
  geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") +
  
  # Add a dashed line at 0 for reference
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  
  # Flip coordinates for horizontal orientation
  coord_flip() +
  
  # Labels and theme
  labs(
    y = "Estimated effect size (in µg/l)",  # Updated y-axis label
    x = NULL  # Remove x-axis label
  ) +
  theme_classic(base_size = 16) +  # Set a clean, classic theme
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18, face = "bold"),
    panel.grid.major.y = element_line(color = "grey80"),
    panel.grid.minor.y = element_blank()
  )

# Display the updated plot
print(forest_plot_updated)
```


#### 4.1.4 Sensitivity analysis with sensemakr

```{r}
# since sensemakr works directly with linear regression models, we need to approximate the causal forest results in a linear regression framework

# Prepare data for linear regression
df_linear <- df

# Fit a linear regression model
linear_model <- lm(
  first_value ~ treatment + age_group_numeric + gender_numeric + season_numeric,
  data = df_linear
)

# Summarize the model
summary(linear_model)

```

```{r}
# Run sensitivity analysis using sensemakr
sensitivity_analysis <- sensemakr(
  model = linear_model,
  treatment = "treatment", 
  benchmark_covariates = c("age_group_numeric", "gender_numeric", "season_numeric"),
  kd = 1 # Specify robustness threshold (default is 1)
)

summary(sensitivity_analysis)

sensitivity_summary <- summary(sensitivity_analysis)

# Generate minimal reporting in LaTeX format
latex_table <- ovb_minimal_reporting(sensitivity_analysis, format = "latex")
```
#### 4.1.5 Weighted ASMD Analysis
```{r}
df$period <- as.factor(df$period)

# Convert categorical variables to numeric for mean calculation
df <- df %>%
  mutate(
    age_group_numeric = as.numeric(as.factor(age_group)),  # Encodes age groups as 1, 2, 3
    gender_numeric = as.numeric(as.factor(gender)),        # Encodes gender (F=1, M=2)
    test_month_numeric = as.numeric(as.factor(test_month))  # Encodes months as 1-12
  )

# Estimate Propensity Scores
ps_model <- glm(period ~ age_group + gender + test_month, data = df, family = binomial)
df$pscore <- predict(ps_model, type = "response")  

# Compute Inverse Probability of Treatment Weights (IPTW)
df$iptw <- ifelse(df$period == "D", 1 / df$pscore, 1 / (1 - df$pscore))

# Compute Balance Statistics Using Weighted Data
balance_stats <- bal.tab(ps_model, weights = df$iptw, 
                         covs = df[, c("age_group", "gender", "test_month")], 
                         estimand = "ATE", thresholds = c(m = 0.1)) 

# Convert to Data Frame
asmd_results <- as.data.frame(balance_stats$Balance)

# Compute Mean ASMD for Each Category
asmd_summary <- data.frame(
  Covariate = c("Age Group", "Gender", "Test Month"),
  Weighted_ASMD = c(  # Fix column name issue
    mean(asmd_results[grep("age_group", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE),
    mean(asmd_results[grep("gender", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE),
    mean(asmd_results[grep("test_month", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE)
  ),
  M.Threshold = "Balanced, <0.1"
)

# Fix Covariate Naming to Match `asmd_results`
means_table <- df %>%
  group_by(period) %>%
  summarise(
    `Age Group` = mean(age_group_numeric, na.rm = TRUE),
    `Gender` = mean(gender_numeric, na.rm = TRUE),
    `Test Month` = mean(test_month_numeric, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = -period, names_to = "Covariate", values_to = "Mean")

# Reshape Table to Wide Format (Pre-Pandemic & Pandemic Means)
means_table_wide <- means_table %>%
  pivot_wider(names_from = period, values_from = Mean) %>%
  rename(`Pre-Pandemic Mean` = `B`, `Pandemic Mean` = `D`)

# Merge Means with ASMD Data
final_balance_table <- asmd_summary %>%
  rename(`Weighted ASMD` = Weighted_ASMD) %>%  # Ensure correct column name
  left_join(means_table_wide, by = "Covariate") %>%
  dplyr::select(Covariate, `Pre-Pandemic Mean`, `Pandemic Mean`, `Weighted ASMD`, M.Threshold)

# Print Final Table for Review
print(final_balance_table)
```

### 4.2 Change in vitamin D deficiency rates

```{r}
# Convert categorical variables to numeric for causal forest
df$age_group_numeric <- as.integer(df$age_group)
df$gender_numeric <- as.integer(df$gender)
df$season_numeric <- as.integer(df$season)

# Convert period to a binary treatment indicator
df$treatment <- ifelse(df$period == "D", 1, 0)

# Ensure the deficiency column is numeric (Binary: 1 = Deficient, 0 = Not Deficient)
df$deficient <- as.numeric(df$deficient)

# Calculate the pre-pandemic average deficiency rate directly from the dataset
pre_pandemic_deficiency_rate <- df %>%
  filter(period == "B") %>%
  summarise(avg_deficiency = mean(deficient, na.rm = TRUE)) %>%
  pull(avg_deficiency)

print(paste("Pre-pandemic average deficiency rate:", round(pre_pandemic_deficiency_rate, 4)))

# Run causal forest model for deficiency rates
causal_forest_deficiency <- causal_forest(
  X = df[c("age_group_numeric", "gender_numeric", "season_numeric")], 
  Y = df$deficient, 
  W = df$treatment, 
  num.trees = 2000
)

# Estimating the average treatment effect (ATE) with confidence intervals
ate_result_deficiency <- average_treatment_effect(causal_forest_deficiency, target.sample = "all")

# Extract the point estimate and standard error
ate_estimate_deficiency <- ate_result_deficiency[1]
ate_std_err_deficiency <- ate_result_deficiency[2]

# Print the results
print(paste("ATE Estimate (Effect on Deficiency Rate):", round(ate_estimate_deficiency, 4)))

# Calculate percentage change based on the actual pre-pandemic deficiency rate
percentage_change_deficiency <- (ate_estimate_deficiency / pre_pandemic_deficiency_rate) * 100

# Print the percentage change
print(paste("Percentage change due to pandemic:", round(percentage_change_deficiency, 2), "%"))

# Confidence intervals
z_99 <- 2.576
z_95 <- 1.960 

ci_99_lower_ate_deficiency <- ate_estimate_deficiency - z_99 * ate_std_err_deficiency
ci_99_upper_ate_deficiency <- ate_estimate_deficiency + z_99 * ate_std_err_deficiency

ci_95_lower_ate_deficiency <- ate_estimate_deficiency - z_95 * ate_std_err_deficiency
ci_95_upper_ate_deficiency <- ate_estimate_deficiency + z_95 * ate_std_err_deficiency

cat("99% CI for ATE (Deficiency Rate):", ci_99_lower_ate_deficiency, "to", ci_99_upper_ate_deficiency, "\n")
cat("95% CI for ATE (Deficiency Rate):", ci_95_lower_ate_deficiency, "to", ci_95_upper_ate_deficiency, "\n")

```
#### 4.2.1 Comparison of effect sizes across all 3 methods used

```{r}
# Extracting ATE estimate and standard error from causal forest model
ate_estimate_def <- ate_result_deficiency[1]
ate_std_err_def <- ate_result_deficiency[2]

# Confidence intervals for ATE
ci_99_lower_ate_def <- ate_estimate_def - z_99 * ate_std_err_def
ci_99_upper_ate_def <- ate_estimate_def + z_99 * ate_std_err_def
ci_95_lower_ate_def <- ate_estimate_def - z_95 * ate_std_err_def
ci_95_upper_ate_def <- ate_estimate_def + z_95 * ate_std_err_def

# Bring all effect size metrics together in one df
effect_sizes_deficiency <- data.frame(
  Category = c("Cohen's d", "SMD (PSM)", "ATE (ML)"),
  Estimate = c(cohens_d_def * sd_pooled_def, smd_def * sd_matched_pooled_def, ate_estimate_def),
  CI_Lower_99 = c(ci_99_lower_d_def * sd_pooled_def, ci_99_lower_smd_def * sd_matched_pooled_def, ci_99_lower_ate_def),
  CI_Upper_99 = c(ci_99_upper_d_def * sd_pooled_def, ci_99_upper_smd_def * sd_matched_pooled_def, ci_99_upper_ate_def),
  CI_Lower_95 = c(ci_95_lower_d_def * sd_pooled_def, ci_95_lower_smd_def * sd_matched_pooled_def, ci_95_lower_ate_def),
  CI_Upper_95 = c(ci_95_upper_d_def * sd_pooled_def, ci_95_upper_smd_def * sd_matched_pooled_def, ci_95_upper_ate_def)
)

# Reorder factor levels for display order
effect_sizes_deficiency$Category <- factor(effect_sizes_deficiency$Category, levels = c("Cohen's d", "SMD (PSM)", "ATE (ML)"))

forest_plot_deficiency <- ggplot(effect_sizes_deficiency, aes(x = Category, y = Estimate)) +
  # Add points for the ATE estimates
  geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) +
  
  # Add 99% CI as the longest bars
  geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") +
  
  # Add 95% CI as shorter bars
  geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") +
  
  # Add a dashed line at 0 for reference
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  
  # Flip coordinates for horizontal orientation
  coord_flip() +
  
  # Labels and theme
  labs(
    y = "Estimated effect size (percentage points)",  
    x = NULL  
  ) +
  theme_classic(base_size = 16) +  
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18, face = "bold"),
    panel.grid.major.y = element_line(color = "grey80"),
    panel.grid.minor.y = element_blank()
  )

# Display the updated plot
print(forest_plot_deficiency)

```


## 5. Negative Controls Outcome Analysis 1: Entire analysis on a different time frame

```{r}
# Define the new periods - uncomment the desired time frame to be tested

# Time Frame 1
period1 <- c(as.Date("2018-03-01"), as.Date("2020-01-31"))
period2 <- c(as.Date("2020-02-01"), as.Date("2022-02-28"))

# Time Frame 2
period1 <- c(as.Date("2018-03-01"), as.Date("2020-03-31"))
period2 <- c(as.Date("2020-04-01"), as.Date("2022-02-28"))

# Create a new data frame for robustness check
df_robust <- df

# Assign each entry to a period
assign_period <- function(date) {
  if (date >= period1[1] && date <= period1[2]) {
    return("B")
  } else if (date >= period2[1] && date <= period2[2]) {
    return("D")
  } else {
    return(NA) 
  }
}

df_robust$period <- as.character(sapply(df_robust$test_date, assign_period))
df_robust <- df_robust[!is.na(df_robust$period),]
```


### 5.1 Descriptive
```{r}
sample_size_period <- df_robust %>% count(period)
print("Sample size per period:")
print(sample_size_period)

sample_size_contingency <- with(df_robust, table(period))
sample_size_p_value <- chisq.test(sample_size_contingency)$p.value
print("P-value for sample size distribution:")
print(sample_size_p_value)

age_group_counts <- df_robust %>% group_by(period, age_group) %>%
  summarise(n = n(), .groups = 'drop') %>% group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>% ungroup()
print("Age group counts per period:")
print(age_group_counts)

age_group_p_value <- chisq.test(with(df_robust, table(period, age_group)))$p.value
print("P-value for age group distribution:")
print(age_group_p_value)

gender_group_counts <- df_robust %>% group_by(period, gender) %>%
  summarise(n = n(), .groups = 'drop') %>% group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>% ungroup()
print("Gender group counts per period:")
print(gender_group_counts)

gender_group_p_value <- chisq.test(with(df_robust, table(period, gender)))$p.value
print("P-value for gender distribution:")
print(gender_group_p_value)

season_counts <- df_robust %>% group_by(period, season) %>%
  summarise(n = n(), .groups = 'drop') %>% group_by(period) %>%
  mutate(percentage = n / sum(n) * 100) %>% ungroup()
print("Seasonal distribution per period:")
print(season_counts)

season_counts_p_value <- chisq.test(with(df_robust, table(period, season)))$p.value
print("P-value for seasonal distribution:")
print(season_counts_p_value)

mean_vitamin_d_levels <- df_robust %>% group_by(period) %>%
  summarise(mean_value = mean(first_value, na.rm = TRUE),
            sd_value = sd(first_value, na.rm = TRUE),
            .groups = 'drop')
print("Mean and standard deviation of Vitamin D levels per period:")
print(mean_vitamin_d_levels)

vitamin_d_t_test <- t.test(first_value ~ period, data = df_robust)
print("T-test results for Vitamin D levels:")
print(vitamin_d_t_test)

vitamin_d_deficiency_rate <- df_robust %>% group_by(period) %>%
  summarise(deficiency_count = sum(deficient), 
            total_count = n(), 
            deficiency_rate = sum(deficient) / n() * 100, 
            .groups = 'drop')
print("Vitamin D deficiency rates per period:")
print(vitamin_d_deficiency_rate)

deficiency_contingency <- with(df_robust, table(period, deficient))
deficiency_p_value <- chisq.test(deficiency_contingency)$p.value
print("P-value for Vitamin D deficiency rate differences:")
print(deficiency_p_value)

```

### 5.2 PSM

```{r}
df_robust$gender <- factor(df_robust$gender)
df_robust$age_group <- factor(df_robust$age_group)
df_robust$period <- factor(df_robust$period)
df_robust$test_month <- factor(df_robust$test_month)

test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df_robust, 
                          method = "nearest", distance = "glm", 
                          exact = c("gender", "age_group", "test_month")) 
matched_data <- match.data(test_match_obj)

# Statistical testing for matched data
vitamin_d_t_test_matched <- t.test(first_value ~ period, data = matched_data)
print("T-test results for Vitamin D levels in matched data:")
print(vitamin_d_t_test_matched)

# Contingency table for deficiency rates
deficiency_contingency_matched <- with(matched_data, table(period, deficient))
deficiency_p_value_matched <- chisq.test(deficiency_contingency_matched)$p.value
print("P-value for Vitamin D deficiency rate differences in matched data:")
print(deficiency_p_value_matched)

# Calculate deficiency rates before and during the pandemic
deficiency_rates_matched <- prop.table(deficiency_contingency_matched, margin = 1) * 100
print("Deficiency rates in matched sample (before and during the pandemic):")
print(deficiency_rates_matched)

```

### 5.3 ML

```{r}
df_robust$age_group_numeric <- as.integer(factor(df_robust$age_group))
df_robust$gender_numeric <- as.integer(factor(df_robust$gender))
df_robust$season_numeric <- as.integer(factor(df_robust$season))
df_robust$treatment <- ifelse(df_robust$period == "D", 1, 0)

causal_forest_model <- causal_forest(
  X = df_robust[c("age_group_numeric", "gender_numeric", "season_numeric")], 
  Y = df_robust$first_value, 
  W = df_robust$treatment, 
  num.trees = 2000
)

# Estimating the average treatment effect (ATE) with confidence intervals
ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all")

# Extract the point estimate and standard error
ate_estimate <- ate_result[1]
ate_std_err <- ate_result[2]

# Print the results
print(paste("ATE Estimate:", round(ate_estimate, 4)))

# Calculate percentage change
average_first_value <- mean(df_robust$first_value, na.rm = TRUE)
percentage_change <- (ate_estimate / average_first_value) * 100

# Print the percentage change
print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%"))

# Confidence intervals
ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err
ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err
ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err
ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err

cat("99% Confidence Interval for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n")
cat("95% Confidence Interval for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n")

# Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate.
```

## 6. Negative Controls Outcome Analysis 2: Compare different months of different years
### 6.1 Descriptive
```{r}
# Filter data for the months of March, April, and May for the years 2018, 2019, 2020, and 2021
df_spring <- df %>%
  filter(year(test_date) %in% c(2018, 2019, 2020, 2021) &
           month(test_date) %in% c(3, 4, 5))

# Ensure factors
df_spring$period <- factor(df_spring$period)
df_spring$gender <- factor(df_spring$gender)
df_spring$age_group <- factor(df_spring$age_group)
df_spring$season <- factor(df_spring$season)
df_spring$test_month <- factor(month(df_spring$test_date))

# 1. Descriptive Statistics
spring_vitamin_d_stats <- df_spring %>%
  group_by(cohort_year, test_month) %>%
  summarise(mean_vit_d = mean(first_value, na.rm = TRUE),
            sd_vit_d = sd(first_value, na.rm = TRUE),
            deficiency_count = sum(deficient, na.rm = TRUE),
            total_count = n(),
            deficiency_rate = (sum(deficient, na.rm = TRUE) / n()) * 100,
            .groups = "drop") %>%
  arrange(test_month, cohort_year)

print("Vitamin D statistics by year and month:")
print(spring_vitamin_d_stats)

# Mean Vitamin D levels before and during the pandemic
mean_vit_d_period <- df_spring %>%
  group_by(period) %>%
  summarise(mean_vit_d = mean(first_value, na.rm = TRUE),
            sd_vit_d = sd(first_value, na.rm = TRUE),
            .groups = "drop")

print("Mean Vitamin D levels before and during the pandemic:")
print(mean_vit_d_period)

# Deficiency rates before and during the pandemic
deficiency_rate_period <- df_spring %>%
  group_by(period) %>%
  summarise(deficiency_count = sum(deficient, na.rm = TRUE),
            total_count = n(),
            deficiency_rate = (sum(deficient, na.rm = TRUE) / n()) * 100,
            .groups = "drop")

print("Deficiency rates before and during the pandemic:")
print(deficiency_rate_period)

# ANOVA test for mean Vitamin D levels
vitamin_d_anova <- aov(first_value ~ factor(cohort_year) * factor(test_month), data = df_spring)
summary(vitamin_d_anova)

# Chi-square test for deficiency rate
deficiency_contingency <- with(df_spring, table(cohort_year, test_month, deficient))
deficiency_p_value <- chisq.test(rowSums(deficiency_contingency))$p.value
print("P-value for Vitamin D deficiency rate differences across years and months:")
print(deficiency_p_value)

```
### 6.2 PSM
```{r}
test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df_spring, 
                          method = "nearest", distance = "glm", 
                          exact = c("gender", "age_group", "test_month"))
matched_data <- match.data(test_match_obj)

# T-test for Vitamin D levels in matched data
vitamin_d_t_test_matched <- t.test(first_value ~ period, data = matched_data)
print("T-test results for Vitamin D levels in matched data:")
print(vitamin_d_t_test_matched)

# Deficiency rates in matched data
deficiency_contingency_matched <- with(matched_data, table(period, deficient))
deficiency_p_value_matched <- chisq.test(deficiency_contingency_matched)$p.value
print("P-value for Vitamin D deficiency rate differences in matched data:")
print(deficiency_p_value_matched)

deficiency_rates_matched <- prop.table(deficiency_contingency_matched, margin = 1) * 100
print("Deficiency rates in matched sample (before and during the pandemic):")
print(deficiency_rates_matched)
```

### 6.3 ML
```{r}
df_spring$age_group_numeric <- as.integer(factor(df_spring$age_group))
df_spring$gender_numeric <- as.integer(factor(df_spring$gender))
df_spring$season_numeric <- as.integer(factor(df_spring$season))
df_spring$treatment <- ifelse(df_spring$period == "D", 1, 0)

causal_forest_model <- causal_forest(
  X = df_spring[c("age_group_numeric", "gender_numeric", "season_numeric")], 
  Y = df_spring$first_value, 
  W = df_spring$treatment, 
  num.trees = 2000
)

ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all")
ate_estimate <- ate_result[1]
ate_std_err <- ate_result[2]

print(paste("ATE Estimate:", round(ate_estimate, 4)))

average_first_value <- mean(df_spring$first_value, na.rm = TRUE)
percentage_change <- (ate_estimate / average_first_value) * 100
print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%"))

# Confidence intervals
ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err
ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err
ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err
ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err

cat("99% Confidence Interval for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n")
cat("95% Confidence Interval for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n")

# Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate.

```


## 7. Negative Controls Outcome Analysis 3: Hours of sunshine
```{r}
# Define the base URL
base_url <- "https://opendata.dwd.de/climate_environment/CDC/regional_averages_DE/monthly/sunshine_duration/"

# Initialize an empty data frame to store sunshine data
sunshine_data <- data.frame()

# Loop through each month to read and combine the data
for (i in 1:12) {
  # Format the file name
  file_name <- sprintf("regional_averages_sd_%02d.txt", i)
  file_url <- paste0(base_url, file_name)
  
  # Read the data, skipping the first row to use the second row as headers
  monthly_data <- read_delim(file_url, delim = ";", col_types = cols(), skip = 1)
  
  # Select relevant columns (Jahr = year, Monat = month, Bayern = sunshine duration)
  monthly_data <- monthly_data %>%
    dplyr::select(Jahr, Monat, Bayern) %>%
    rename(sunshine_duration = Bayern) %>%
    mutate(Monat = as.numeric(Monat)) # Ensure month is numeric
  
  # Append the monthly data to the combined data frame
  sunshine_data <- bind_rows(sunshine_data, monthly_data)
}

# Ensuring year and month are numeric for merging
df <- df %>%
  mutate(
    cohort_year = as.numeric(cohort_year),
    test_month = as.numeric(test_month)
  )

# Merge the sunshine data with your existing data frame
df <- df %>%
  left_join(sunshine_data, by = c("cohort_year" = "Jahr", "test_month" = "Monat"))

# Check for any unmatched entries
unmatched_entries <- df %>%
  filter(is.na(sunshine_duration))

# Print the number of unmatched entries, if any
cat("Number of entries with missing sunshine duration data:", nrow(unmatched_entries), "\n")

# View the first few rows of the updated data frame
head(df)

```

```{r}
# Convert categorical variables to numeric for causal forest
df$age_group <- as.factor(df$age_group)
df$age_group_numeric <- as.integer(df$age_group)

df$gender <- as.factor(df$gender)
df$gender_numeric <- as.integer(df$gender)

df$season <- as.factor(df$season)
df$season_numeric <- as.integer(df$season)

# Convert period to a binary treatment indicator
df$treatment <- ifelse(df$period == "D", 1, 0)

# Ensure sunshine_duration is numeric
df$sunshine_numeric <- as.numeric(df$sunshine_duration)

# Run causal forest model with sunshine hours as an additional covariate
causal_forest_model_sunshine <- causal_forest(
  X = df[c("age_group_numeric", "gender_numeric", "season_numeric", "sunshine_numeric")], 
  Y = df$first_value, 
  W = df$treatment, 
  num.trees = 2000
)

# Estimating the average treatment effect (ATE) with confidence intervals
ate_result_sunshine <- average_treatment_effect(causal_forest_model_sunshine, target.sample = "overlap") # using target.sample = "all" resulted in NaN because the sunshine column produced too extreme propensity scores

# Extract the point estimate and standard error
ate_estimate_sunshine <- ate_result_sunshine[1]
ate_std_err_sunshine <- ate_result_sunshine[2]

# Print the results
print(paste("ATE Estimate (with sunshine):", round(ate_estimate_sunshine, 4)))

# Confidence intervals
z_99 <- 2.576
z_95 <- 1.960  

ci_99_lower_ate_sunshine <- ate_estimate_sunshine - z_99 * ate_std_err_sunshine
ci_99_upper_ate_sunshine <- ate_estimate_sunshine + z_99 * ate_std_err_sunshine

ci_95_lower_ate_sunshine <- ate_estimate_sunshine - z_95 * ate_std_err_sunshine
ci_95_upper_ate_sunshine <- ate_estimate_sunshine + z_95 * ate_std_err_sunshine

cat("99% CI for ATE (with sunshine):", ci_99_lower_ate_sunshine, "to", ci_99_upper_ate_sunshine, "\n")
cat("95% CI for ATE (with sunshine):", ci_95_lower_ate_sunshine, "to", ci_95_upper_ate_sunshine, "\n")

# Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate.

```

## 8. Negative Controls Outcome Analysis 4: Randomly shuffling the Vit D results

```{r}
# Create a copy of df with shuffled Vitamin D values
df_shuffled <- df %>%
  mutate(shuffled_value = sample(first_value, replace = FALSE))

# Ensure there are no missing values in the shuffled data
df_shuffled <- df_shuffled %>%
  filter(!is.na(shuffled_value))
```

### 8.1 Descriptive
```{r}
# Compute statistics for the shuffled Vitamin D values
vitamin_d_stats_shuffled <- df_shuffled %>%
  group_by(cohort_year, test_month) %>%
  summarise(mean_vit_d = mean(shuffled_value, na.rm = TRUE),
            sd_vit_d = sd(shuffled_value, na.rm = TRUE),
            deficiency_count = sum(shuffled_value < thresholds$VD3, na.rm = TRUE),
            total_count = n(),
            deficiency_rate = (sum(shuffled_value < thresholds$VD3, na.rm = TRUE) / n()) * 100,
            .groups = "drop") %>%
  arrange(test_month, cohort_year)

print("Shuffled Vitamin D statistics by year and month:")
print(vitamin_d_stats_shuffled)

# Mean Vitamin D levels before and during the pandemic
mean_vit_d_period_shuffled <- df_shuffled %>%
  group_by(period) %>%
  summarise(mean_vit_d = mean(shuffled_value, na.rm = TRUE),
            sd_vit_d = sd(shuffled_value, na.rm = TRUE),
            .groups = "drop")

print("Mean Shuffled Vitamin D levels before and during the pandemic:")
print(mean_vit_d_period_shuffled)

# Deficiency rates before and during the pandemic
deficiency_rate_period_shuffled <- df_shuffled %>%
  group_by(period) %>%
  summarise(deficiency_count = sum(shuffled_value < thresholds$VD3, na.rm = TRUE),
            total_count = n(),
            deficiency_rate = (sum(shuffled_value < thresholds$VD3, na.rm = TRUE) / n()) * 100,
            .groups = "drop")

print("Shuffled Deficiency rates before and during the pandemic:")
print(deficiency_rate_period_shuffled)

# ANOVA test for mean Vitamin D levels on shuffled data
vitamin_d_anova_shuffled <- aov(shuffled_value ~ factor(cohort_year) * factor(test_month), data = df_shuffled)
summary(vitamin_d_anova_shuffled)

# Chi-square test for deficiency rate in shuffled data
deficiency_contingency_shuffled <- with(df_shuffled, table(cohort_year, test_month, shuffled_value < thresholds$VD3))
deficiency_p_value_shuffled <- chisq.test(rowSums(deficiency_contingency_shuffled))$p.value
print("P-value for Shuffled Vitamin D deficiency rate differences across years and months:")
print(deficiency_p_value_shuffled)
```

### 8.2 PSM
```{r}
test_match_obj_shuffled <- matchit(period ~ age_group + gender + test_month, data = df_shuffled, 
                                   method = "nearest", distance = "glm", 
                                   exact = c("gender", "age_group", "test_month"))
matched_data_shuffled <- match.data(test_match_obj_shuffled)

# T-test for Vitamin D levels in matched shuffled data
vitamin_d_t_test_matched_shuffled <- t.test(shuffled_value ~ period, data = matched_data_shuffled)
print("T-test results for Shuffled Vitamin D levels in matched data:")
print(vitamin_d_t_test_matched_shuffled)

# Deficiency rates in matched shuffled data
deficiency_contingency_matched_shuffled <- with(matched_data_shuffled, table(period, shuffled_value < thresholds$VD3))
deficiency_p_value_matched_shuffled <- chisq.test(deficiency_contingency_matched_shuffled)$p.value
print("P-value for Shuffled Vitamin D deficiency rate differences in matched data:")
print(deficiency_p_value_matched_shuffled)

deficiency_rates_matched_shuffled <- prop.table(deficiency_contingency_matched_shuffled, margin = 1) * 100
print("Shuffled Deficiency rates in matched sample (before and during the pandemic):")
print(deficiency_rates_matched_shuffled)
```


### 8.3 ML
```{r}
df_shuffled$age_group_numeric <- as.integer(factor(df_shuffled$age_group))
df_shuffled$gender_numeric <- as.integer(factor(df_shuffled$gender))
df_shuffled$season_numeric <- as.integer(factor(df_shuffled$season))
df_shuffled$treatment <- ifelse(df_shuffled$period == "D", 1, 0)

causal_forest_model_shuffled <- causal_forest(
  X = df_shuffled[c("age_group_numeric", "gender_numeric", "season_numeric")], 
  Y = df_shuffled$shuffled_value, 
  W = df_shuffled$treatment, 
  num.trees = 2000
)

ate_result_shuffled <- average_treatment_effect(causal_forest_model_shuffled, target.sample = "all")
ate_estimate_shuffled <- ate_result_shuffled[1]
ate_std_err_shuffled <- ate_result_shuffled[2]

print(paste("Shuffled ATE Estimate:", round(ate_estimate_shuffled, 4)))

average_first_value_shuffled <- mean(df_shuffled$shuffled_value, na.rm = TRUE)
percentage_change_shuffled <- (ate_estimate_shuffled / average_first_value_shuffled) * 100
print(paste("Shuffled Percentage change due to pandemic:", round(percentage_change_shuffled, 2), "%"))

# Confidence intervals
ci_99_lower_ate_shuffled <- ate_estimate_shuffled - z_99 * ate_std_err_shuffled
ci_99_upper_ate_shuffled <- ate_estimate_shuffled + z_99 * ate_std_err_shuffled
ci_95_lower_ate_shuffled <- ate_estimate_shuffled - z_95 * ate_std_err_shuffled
ci_95_upper_ate_shuffled <- ate_estimate_shuffled + z_95 * ate_std_err_shuffled

cat("99% Confidence Interval for Shuffled ATE:", ci_99_lower_ate_shuffled, "to", ci_99_upper_ate_shuffled, "\n")
cat("95% Confidence Interval for Shuffled ATE:", ci_95_lower_ate_shuffled, "to", ci_95_upper_ate_shuffled, "\n")

# Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate.
```