--- title: "Monitoring changes in vitamin D levels during the COVID-19 pandemic with routinely-collected laboratory data: Data Analysis" output: html_document date: "2025-01-30" author: "Lea Skapetze" --- ```{r setup, include=FALSE} # Set chunk options knitr::opts_chunk$set(echo = TRUE) # Load required libraries library(lubridate) library(ggplot2) library(rlang) library(tidyr) library(stringr) library(stats) library(purrr) library(ggsignif) library(MatchIt) library(lmtest) library(sandwich) library(parallel) library(broom) library(cobalt) library(optmatch) library(lm.beta) library(grf) library(tidyverse) library(mediation) library(dplyr) library(sensemakr) library(ggrepel) library(tableone) library(cobalt) library(readr) library(xtable) # Turn off scientific notations options(scipen = 999) # Check versions of packages R.version.string packageVersion("grf") packageVersion("MatchIt") packageVersion("sensemakr") ``` ## 0. Load the data and filter for only Vitamin + filter out undefined gender ```{r} # Read the data df <- read.csv("data.csv") # Filter for Vitamin D measurements only vd3_df <- df %>% filter(param_short == "VD3") # Count and calculate percentage of gender "X" before removing it later gender_x_summary <- vd3_df %>% filter(gender == "X") %>% summarise( count_gender_x = n(), percentage_gender_x = (n() / nrow(vd3_df)) * 100 ) # Print the summary of gender "X" print(gender_x_summary) # Filter df accordingly df <- vd3_df %>% filter(gender != "X") %>% mutate( full_name = "Vitamin D", unit = "µg/l" ) head(df) ``` ## 1. Prepare the data ```{r} # AGE GROUPS df <- df %>% mutate(age_group = case_when( age_at_first_measurement %in% c("[18, 30[", "[30, 40[") ~ "[18, 39]", age_at_first_measurement %in% c("[40, 50[", "[50, 60[") ~ "[40, 59]", age_at_first_measurement == ">=60" ~ "60+", TRUE ~ "Unknown" # In case there are entries that don't fit the above categories )) # DATES, PERIODS, SEASONS # Convert 'cohort_year' and 'test_month' to date format df$test_date <- with(df, paste(cohort_year, test_month, "01", sep="-")) df$test_date <- as.Date(df$test_date, format="%Y-%m-%d") # Create a new column `year_month` combining `cohort_year` and `test_month` df <- df %>% mutate( year_month = as.Date(paste(cohort_year, test_month, "01", sep = "-"), format = "%Y-%m-%d") ) # Define periods “B” for before the pandemic and “D” for during the pandemic period1 <- c(as.Date("2018-03-01"), as.Date("2020-02-29")) period2 <- c(as.Date("2020-03-01"), as.Date("2022-02-28")) # Assign each entry to a period assign_period <- function(date) { if (date >= period1[1] && date <= period1[2]) { return("B") } else if (date >= period2[1] && date <= period2[2]) { return("D") } else { return(NA) } } # Create a new column for the period df$period <- as.character(sapply(df$test_date, assign_period)) # Filter out entries that do not belong to either period df <- df[!is.na(df$period),] # Define a function to convert test month to season and adjust cohort_year # The get_season() function adjusts the cohort_year for December by incrementing the year (year + 1), ensuring that December 2019 belongs to Winter 2020 get_season <- function(month, year) { if (month %in% c(12)) { return(list(season = "Winter", cohort_year = year + 1)) # December moves to the next year's Winter } else if (month %in% c(1, 2)) { return(list(season = "Winter", cohort_year = year)) # January and February are Winter of the current year } else if (month %in% c(3, 4, 5)) { return(list(season = "Spring", cohort_year = year)) } else if (month %in% c(6, 7, 8)) { return(list(season = "Summer", cohort_year = year)) } else if (month %in% c(9, 10, 11)) { return(list(season = "Fall", cohort_year = year)) } else { return(list(season = NA, cohort_year = year)) # Handle unexpected values } } # For December measurements: In the end the column cohort_year is the reassigned year, and the column year is the original year # Apply the function to adjust both season and cohort_year adjusted_season <- mapply(get_season, df$test_month, df$cohort_year, SIMPLIFY = FALSE) df$season <- factor(sapply(adjusted_season, function(x) x$season), levels = c("Winter", "Spring", "Summer", "Fall")) df$cohort_year <- sapply(adjusted_season, function(x) x$cohort_year) # Create ordered season-year by sorting year and season df <- df %>% mutate( season_year = paste(cohort_year, season) ) %>% arrange(cohort_year, season) %>% mutate( season_year = factor(season_year, levels = unique(season_year), ordered = TRUE) ) # VIT D DEFICIENCY STATUS # Determining deficiency status thresholds <- list( VD3 = 20 # to be changed for a different param ) # Function to check deficiency for Vitamin D get_deficiency_status <- function(value) { threshold <- thresholds$VD3 # to be changed for a different param if (!is.na(value) && value < threshold) { return(1) # Deficient } else { return(0) # Not deficient } } # Calculation of deficiency status based on numeric first_value column df <- df %>% rowwise() %>% mutate(deficient = get_deficiency_status(first_value)) %>% ungroup() # Make sure there are not NAs df <- df %>% filter(!is.na(first_value), !is.na(period), !is.na(gender), !is.na(age_group)) # Factorize relevant columns for Propensity-Score Matching and Machine Learning df$gender <- factor(df$gender) df$age_group <- factor(df$age_group) df$period <- factor(df$period) df$test_month <- factor(df$test_month) df$season <- factor(df$season) head(df) # STATISTICS # Define Z Scores z_99 <- 2.576 z_95 <- 1.960 ``` ## 2. Descriptive Statistics ### 2.1 Basic comparison #### 2.1.1 Sample sizes, counts, mean levels and deficiency rates (unstratified) ```{r} # Count sample size in period "B" and period "D" sample_size_period <- df %>% count(period) sample_size_contingency <- with(df, table(period)) sample_size_p_value <- chisq.test(sample_size_contingency)$p.value # Count how many entries for each age group in period "B" and "D" (absolute nr and percentage) age_group_counts <- df %>% group_by(period, age_group) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() age_group_contingency <- with(df, table(period, age_group)) age_group_p_value <- chisq.test(age_group_contingency)$p.value # Count how many entries for each gender group in period "B" and "D" (absolute nr and percentage) gender_group_counts <- df %>% group_by(period, gender) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() gender_group_contingency <- with(df, table(period, gender)) gender_group_p_value <- chisq.test(gender_group_contingency)$p.value # Count how many entries for each season group in period "B" and "D" (absolute nr and percentage) season_counts <- df %>% group_by(period, season) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() season_contingency <- with(df, table(period, season)) season_counts_p_value <- chisq.test(season_contingency)$p.value # The mean vitamin D level in period B and D (with standard deviation) mean_vitamin_d_levels <- df %>% group_by(period) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) # Vitamin D deficiency rate for period B and D (in percent) vitamin_d_deficiency_rate <- df %>% group_by(period) %>% summarise( deficiency_count = sum(deficient), total_count = n(), deficiency_rate = sum(deficient) / n() * 100, .groups = 'drop') # Printing results print(sample_size_period) print(sample_size_p_value) print(age_group_counts) print(age_group_p_value) print(gender_group_counts) print(gender_group_p_value) print(season_counts) print(season_counts_p_value) print(mean_vitamin_d_levels) print(vitamin_d_deficiency_rate) ``` #### 2.1.2 Effect size (Cohen's d) - Mean vitamin D levels ```{r} # Means and standard deviations mean_b <- mean(df$first_value[df$period == "B"]) mean_d <- mean(df$first_value[df$period == "D"]) sd_b <- sd(df$first_value[df$period == "B"]) sd_d <- sd(df$first_value[df$period == "D"]) # Pooled SD n1 <- sum(df$period == "B") n2 <- sum(df$period == "D") sd_pooled <- sqrt(((n1 - 1) * sd_b^2 + (n2 - 1) * sd_d^2) / (n1 + n2 - 2)) # Cohen's d (flipped sign for "During - Before") cohens_d <- (mean_d - mean_b) / sd_pooled # Flips the direction print(paste("Cohen's d:", round(cohens_d, 4))) # Convert sample sizes to numeric to avoid integer overflow n1 <- as.numeric(n1) # Before pandemic n2 <- as.numeric(n2) # During pandemic # Standard error of Cohen's d se_d <- sqrt((n1 + n2) / (n1 * n2) + (cohens_d^2) / (2 * (n1 + n2))) # Calculate CIs ci_99_lower_d <- cohens_d - z_99 * se_d ci_99_upper_d <- cohens_d + z_99 * se_d ci_95_lower_d <- cohens_d - z_95 * se_d ci_95_upper_d <- cohens_d + z_95 * se_d # Print results cat("99% CI for Cohen's d:", ci_99_lower_d, "to", ci_99_upper_d, "\n") cat("95% CI for Cohen's d:", ci_95_lower_d, "to", ci_95_upper_d, "\n") # Note: Confidence intervals for Cohen's d can be calculated using bootstrapping for more precise estimates, but this script uses an analytical approximation based on the standard error of d and Z-scores (valid for sufficiently large sample sizes and normal effect size distribution). ``` #### 2.1.3 Effect size (Cohen's d) - Vitamin D deficiency rates ```{r} # Ensure sample sizes are numeric to avoid integer overflow n1_def <- as.numeric(sum(df$period == "B")) n2_def <- as.numeric(sum(df$period == "D")) # Compute means and standard deviations for deficiency rates mean_b_def <- mean(df$deficient[df$period == "B"]) mean_d_def <- mean(df$deficient[df$period == "D"]) sd_b_def <- sd(df$deficient[df$period == "B"]) sd_d_def <- sd(df$deficient[df$period == "D"]) # Pooled SD for deficiency sd_pooled_def <- sqrt(((n1_def - 1) * sd_b_def^2 + (n2_def - 1) * sd_d_def^2) / (n1_def + n2_def - 2)) # Cohen's d (flipped sign for "During - Before") cohens_d_def <- (mean_d_def - mean_b_def) / sd_pooled_def print(paste("Cohen's d for deficiency:", round(cohens_d_def, 4))) # Standard error for Cohen's d (now using numeric sample sizes) se_d_def <- sqrt((n1_def + n2_def) / (n1_def * n2_def) + (cohens_d_def^2) / (2 * (n1_def + n2_def))) # Confidence intervals for Cohen's d ci_99_lower_d_def <- cohens_d_def - z_99 * se_d_def ci_99_upper_d_def <- cohens_d_def + z_99 * se_d_def ci_95_lower_d_def <- cohens_d_def - z_95 * se_d_def ci_95_upper_d_def <- cohens_d_def + z_95 * se_d_def # Print results cat("99% CI for Cohen's d:", ci_99_lower_d_def, "to", ci_99_upper_d_def, "\n") cat("95% CI for Cohen's d:", ci_95_lower_d_def, "to", ci_95_upper_d_def, "\n") ``` ### 2.2 Mean levels and deficiency rates, stratified by age, gender and season ```{r} # Compare mean vitamin D levels between periods using t-test vitamin_d3_levels_B <- df %>% filter(period == "B") %>% pull(first_value) vitamin_d3_levels_D <- df %>% filter(period == "D") %>% pull(first_value) t_test_results <- t.test(vitamin_d3_levels_B, vitamin_d3_levels_D, na.rm = TRUE) mean_comparison_p_value <- t_test_results$p.value print(mean_comparison_p_value) # Construct a contingency table for deficiencies in periods B and D deficiency_table <- df %>% group_by(period) %>% summarise(deficient = sum(deficient), not_deficient = n() - sum(deficient)) %>% dplyr::select(-period) %>% as.matrix() chi_square_test_result <- chisq.test(deficiency_table) deficiency_comparison_p_value <- chi_square_test_result$p.value print(deficiency_comparison_p_value) # Mean Vitamin D levels by age group vitamin_d3_age_means <- df %>% group_by(period, age_group) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), .groups = 'drop' ) age_group_stats <- df %>% group_by(period, age_group) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) # Mean Vitamin D Levels by gender group vitamin_d3_gender_means <- df %>% group_by(period, gender) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), .groups = 'drop' ) gender_group_stats <- df %>% filter(gender %in% c("M", "F")) %>% group_by(period, gender) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) # Mean Vitamin D Levels by season vitamin_d3_season_means <- df %>% group_by(period, season) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) season_group_stats <- df %>% group_by(period, season) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) print(age_group_stats) print(gender_group_stats) print(season_group_stats) ``` ### 2.4 Statistical testing for age, gender and season ```{r} # Statistical Testing within each sub-group # Function to perform a t-test and return p-value perform_t_test <- function(data_group_1, data_group_2) { test_result <- t.test(data_group_1, data_group_2, na.rm = TRUE) return(test_result$p.value) } # Calculate p-values for age groups p_values_age_groups <- df %>% group_by(age_group) %>% summarise( t_test_p_value = perform_t_test( first_value[period == "B"], first_value[period == "D"] ), .groups = 'drop' ) # Calculate p-values for gender groups p_values_gender_groups <- df %>% filter(gender %in% c("M", "F")) %>% group_by(gender) %>% summarise( t_test_p_value = perform_t_test( first_value[period == "B"], first_value[period == "D"] ), .groups = 'drop' ) # Calculate p-values for gender groups p_values_season_groups <- df %>% group_by(season) %>% summarise( t_test_p_value = t.test( first_value[period == "B"], first_value[period == "D"], )$p.value, .groups = 'drop' ) # Print p-values print(p_values_age_groups) print(p_values_gender_groups) print(p_values_season_groups) ``` ### 2.5 Deficiency rates for gender, age and season groups ```{r} # Calculate the deficiency rates for each gender group deficiency_gender_groups <- df %>% group_by(gender, period) %>% summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>% pivot_wider( names_from = period, values_from = c(deficient_count, total_count), values_fill = list(deficient_count = 0, total_count = 0) ) %>% mutate( rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0), rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0) ) %>% # Calculate p-values rowwise() %>% mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>% ungroup() print(deficiency_gender_groups) ``` ```{r} # Calculate the deficiency rates for each age group deficiency_age_groups <- df %>% group_by(age_group, period) %>% summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>% pivot_wider( names_from = period, values_from = c(deficient_count, total_count), values_fill = list(deficient_count = 0, total_count = 0) ) %>% mutate( rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0), rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0) ) %>% rowwise() %>% mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>% ungroup() print(deficiency_age_groups) ``` ```{r} # Calculate the deficiency rates for each season group deficiency_season_groups <- df %>% group_by(season, period) %>% summarise(deficient_count = sum(deficient), total_count = n(), .groups = 'drop') %>% pivot_wider( names_from = period, values_from = c(deficient_count, total_count), values_fill = list(deficient_count = 0, total_count = 0) ) %>% mutate( rate_B = if_else(total_count_B > 0, deficient_count_B / total_count_B * 100, 0), rate_D = if_else(total_count_D > 0, deficient_count_D / total_count_D * 100, 0) ) %>% # Calculate p-values rowwise() %>% mutate(p_value = chisq.test(rbind(c(deficient_count_B, total_count_B - deficient_count_B), c(deficient_count_D, total_count_D - deficient_count_D)))$p.value) %>% ungroup() print(deficiency_season_groups) ``` ### 2.6 Plots: Vit D levels over time ```{r} # Vit D Levels Over Time (stratified by age and gender) - with Standard Error # Calculate mean and standard error for Vitamin D levels by gender and season-year gender_stats <- df %>% group_by(season_year, gender) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), se_value = sd(first_value, na.rm = TRUE) / sqrt(n()), # Standard error calculation .groups = 'drop' ) # Plot Vitamin D levels over time by gender group (with standard error) ggplot(gender_stats, aes(x = season_year, y = mean_value, color = gender, fill = gender, group = gender)) + geom_line(linewidth = 1) + geom_point(size = 2) + geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)" ) + ylim(0, 60) + theme_minimal() + scale_fill_manual(values = c("M" = "blue", "F" = "red")) + scale_color_manual(values = c("M" = "blue", "F" = "red")) + theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) # Calculate mean and standard error for Vitamin D levels by age group and season-year age_group_stats <- df %>% group_by(season_year, age_group) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), se_value = sd(first_value, na.rm = TRUE) / sqrt(n()), # Standard error calculation .groups = 'drop' ) # Plot Vitamin D levels over time by age group (with standard error) ggplot(age_group_stats, aes(x = season_year, y = mean_value, color = age_group, fill = age_group, group = age_group)) + geom_line(linewidth = 1) + geom_point(size = 2) + geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)" ) + ylim(0, 60) + theme_minimal() + scale_fill_brewer(palette = "Set2") + scale_color_brewer(palette = "Set2") + theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) ``` ```{r} # Vit D Levels Over Time (stratified by age and gender) - with Standard Deviation # Calculate mean and standard deviation for Vitamin D levels by gender and season-year gender_stats <- df %>% group_by(season_year, gender) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) # Plot Vitamin D levels over time by gender group (with standard deviation) ggplot(gender_stats, aes(x = season_year, y = mean_value, color = gender, fill = gender, group = gender)) + geom_line(linewidth = 1) + geom_point(size = 2) + geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)" ) + ylim(0, 60) + theme_minimal() + scale_fill_manual(values = c("M" = "blue", "F" = "red")) + scale_color_manual(values = c("M" = "blue", "F" = "red")) + theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) # Calculate mean and standard deviation for Vitamin D levels by age group and season-year age_group_stats <- df %>% group_by(season_year, age_group) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), # Standard deviation calculation .groups = 'drop' ) # Plot Vitamin D levels over time by age group (with standard deviation) ggplot(age_group_stats, aes(x = season_year, y = mean_value, color = age_group, fill = age_group, group = age_group)) + geom_line(linewidth = 1) + geom_point(size = 2) + geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)" ) + ylim(0, 60) + theme_minimal() + scale_fill_brewer(palette = "Set2") + scale_color_brewer(palette = "Set2") + theme(legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) ``` ```{r} # Vit D Levels Over Time (not stratified by age and gender) # Calculate mean and standard error for Vitamin D levels for the entire cohort by season-year cohort_stats <- df %>% group_by(season_year) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), se_value = sd(first_value, na.rm = TRUE) / sqrt(n()), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop' ) print(cohort_stats) # Plot Vitamin D levels over time for the entire cohort with standard error ribbon ggplot(cohort_stats, aes(x = season_year, y = mean_value, group = 1)) + geom_line(color = "#1f77b4", linewidth = 1) + geom_point(color = "#ff7f0e", size = 2) + geom_ribbon(aes(ymin = mean_value - se_value, ymax = mean_value + se_value), fill = "#ff7f0e", alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)", ) + ylim(0, 60) + theme_minimal() + theme( legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = 16, face = "bold") ) # Plot Vitamin D levels over time for the entire cohort with standard deviation ribbon ggplot(cohort_stats, aes(x = season_year, y = mean_value, group = 1)) + geom_line(color = "#1f77b4", linewidth = 1) + geom_point(color = "#ff7f0e", size = 2) + geom_ribbon(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), fill = "#ff7f0e", alpha = 0.2) + labs( x = "Season", y = "Vitamin D Level (µg/l)", ) + ylim(0, 60) + theme_minimal() + theme( legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = 16, face = "bold") ) ``` ### 2.7 Plots: Comparison Vit D levels before and during the pandemic ```{r} # Remove outliers # Count values below 0 and above 70 below_0 <- sum(df$first_value < 0, na.rm = TRUE) above_70 <- sum(df$first_value > 70, na.rm = TRUE) # Print the counts print(paste("Number of values removed below 0:", below_0)) print(paste("Number of values removed above 70:", above_70)) # Filter data to remove outliers (values below 0 or above 70) df_filtered <- df %>% filter(first_value >= 0 & first_value <= 70) # Create a new column for combined gender and age group for specific grouping df_filtered <- df_filtered %>% mutate(gender_age_group = paste(gender, age_group, sep = " - ")) # Calculate statistics for each gender-age group and period combination gender_age_group_stats <- df_filtered %>% group_by(gender_age_group, period) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE) ) %>% ungroup() # Perform t-test comparing Vitamin D levels before and during the pandemic for each gender-age group t_test_gender_age_group <- df_filtered %>% group_by(gender_age_group) %>% summarise( p_value = t.test(first_value[period == "B"], first_value[period == "D"], na.rm = TRUE)$p.value ) # Add significance stars based on p-values t_test_gender_age_group <- t_test_gender_age_group %>% mutate(signif_label = case_when( p_value < 0.001 ~ "***", p_value < 0.01 ~ "**", p_value < 0.05 ~ "*", TRUE ~ "ns" )) # Print p-values and significance labels print(t_test_gender_age_group) # Calculate statistics for absolute and percentage change gender_age_group_changes <- gender_age_group_stats %>% pivot_wider(names_from = period, values_from = c(mean_value, sd_value)) %>% rename( mean_value_Pre_Pandemic = `mean_value_B`, mean_value_Pandemic = `mean_value_D` ) %>% mutate( absolute_change = mean_value_Pandemic - mean_value_Pre_Pandemic, percentage_change = ((mean_value_Pandemic - mean_value_Pre_Pandemic) / mean_value_Pre_Pandemic) * 100 ) # Create updated facet labels with absolute and percentage changes facet_labels <- setNames( paste0( gender_age_group_changes$gender_age_group, "\nAbs. Change: ", round(gender_age_group_changes$absolute_change, 2), " µg/l (", round(gender_age_group_changes$percentage_change, 1), "%)" ), gender_age_group_changes$gender_age_group) # Violin plot with updated facet labels p_gender_age_violin_box <- ggplot(df_filtered, aes(x = period, y = first_value, fill = period)) + geom_violin(position = position_dodge(width = 0.8), width = 0.6, trim = FALSE) + # Violin plot geom_boxplot(position = position_dodge(width = 0.8), width = 0.15, alpha = 0.5, outlier.shape = NA) + # Boxplot without outliers stat_summary( fun.data = "mean_sdl", geom = "pointrange", position = position_dodge(width = 0.8) ) + labs( x = "Period", y = "Vitamin D Level (µg/l)" ) + theme_minimal() + scale_fill_manual( values = c("B" = "#1f77b4", "D" = "#ff7f0e"), name = "Period", labels = c("B" = "Pre-Pandemic", "D" = "Pandemic") ) + theme( legend.title = element_blank(), strip.text = element_text(size = 7), axis.text.x = element_blank(), # Remove x-axis labels axis.ticks.length = unit(0.2, "cm") ) + facet_wrap(~ gender_age_group, scales = "free", labeller = as_labeller(facet_labels)) # Add significance stars for each gender-age group p_gender_age_violin_box <- p_gender_age_violin_box + geom_text(data = t_test_gender_age_group, aes(x = 1.5, y = 65, label = signif_label), size = 5, inherit.aes = FALSE) print(p_gender_age_violin_box) ``` ### 2.8 Plots: Seasonality ```{r} # Filter data to remove outliers (values below 0 or above 70) df_filtered <- df %>% filter(first_value >= 0 & first_value <= 70) # Prepare the data for the updated plot season_year_stats <- df_filtered %>% filter(!(cohort_year == 2018 & season == "Winter")) %>% # Exclude Winter 2018 mutate( cohort_year = factor(cohort_year), period_label = case_when( period == "B" ~ "Pre-Pandemic", period == "D" ~ "Pandemic", TRUE ~ NA_character_ ), period_label = factor(period_label, levels = c("Pre-Pandemic", "Pandemic")) ) # Calculate mean and median differences between pre-pandemic and pandemic periods for each season seasonal_changes <- season_year_stats %>% group_by(season, period_label) %>% summarise( mean_value = mean(first_value, na.rm = TRUE), median_value = median(first_value, na.rm = TRUE), .groups = "drop" ) %>% pivot_wider( names_from = period_label, values_from = c(mean_value, median_value) ) %>% rename( mean_value_Pre_Pandemic = `mean_value_Pre-Pandemic`, mean_value_Pandemic = `mean_value_Pandemic`, median_value_Pre_Pandemic = `median_value_Pre-Pandemic`, median_value_Pandemic = `median_value_Pandemic` ) %>% mutate( mean_diff = mean_value_Pandemic - mean_value_Pre_Pandemic, median_diff = median_value_Pandemic - median_value_Pre_Pandemic ) seasonal_changes <- seasonal_changes %>% dplyr::select(season, mean_diff) %>% mutate(mean_diff = round(mean_diff, 2)) # Create facet labels with mean difference included facet_labels <- setNames( paste0(c("Winter", "Spring", "Summer", "Fall"), "\nMean Diff: ", seasonal_changes$mean_diff), c("Winter", "Spring", "Summer", "Fall") ) # Updated plot with mean difference in facet labels p_seasonal_violin_box <- ggplot(season_year_stats, aes(x = cohort_year, y = first_value, fill = period_label)) + geom_violin(alpha = 0.8, trim = TRUE, scale = "width", position = position_dodge(width = 0.8)) + geom_boxplot(width = 0.2, position = position_dodge(width = 0.8), alpha = 0.6, outlier.shape = NA) + facet_wrap(~ season, ncol = 2, labeller = as_labeller(facet_labels)) + geom_vline( xintercept = seq_along(unique(season_year_stats$cohort_year)) - 0.5, color = "grey85", linetype = "dashed", size = 0.3 ) + # Labels and colors labs( x = "Year", y = "Vitamin D Level (µg/l)", fill = "Period" ) + scale_fill_manual( values = c("Pre-Pandemic" = "#1f77b4", "Pandemic" = "#ff7f0e"), aesthetics = c("fill", "color") ) + # Adjust theme theme_classic() + theme( text = element_text(size = 10), legend.text = element_text(size = 8), panel.grid.major.y = element_line(color = "grey90", size = 0.2), panel.grid.minor.y = element_line(color = "grey95", size = 0.1), legend.position = "top", axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(face = "bold", size = 12), plot.title = element_text(size = 14, hjust = 0.5, face = "bold") ) print(p_seasonal_violin_box) ``` ## 3. Propensity score matching (PSM) ### 3.1 Matching ```{r} # performing exact matching parameters on the entire sample test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df, method = "nearest", distance = "glm", exact = c("gender", "age_group", "test_month")) matched_data <- match.data(test_match_obj) # Calculate the sample sizes original_size <- nrow(df) # Total original sample size matched_size <- nrow(matched_data) # Total matched sample size # Calculate the number and percentage of observations lost num_lost <- original_size - matched_size percent_lost <- (num_lost / original_size) * 100 # Print results cat("Original Sample Size: ", original_size, "\n") cat("Matched Sample Size: ", matched_size, "\n") cat("Number of Observations Lost: ", num_lost, "\n") cat("Percentage Lost: ", percent_lost, "%\n") # Summary summary(test_match_obj) ``` #### 3.1.1 Statistical testing ```{r} # Calculate summaries for Age with Chi-squared Test for Independence age_summary <- matched_data %>% group_by(period, age_group) %>% summarise(count = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = count / sum(count) * 100) # Conduct Chi-squared Test and append the result age_table <- xtabs(count ~ period + age_group, data = age_summary) age_chi_sq_test <- tryCatch(chisq.test(age_table), error = function(e) e) age_p_value <- ifelse(class(age_chi_sq_test) == "try-error", NA, age_chi_sq_test$p.value) # Calculate summaries for Gender with Chi-squared Test gender_summary <- matched_data %>% group_by(period, gender) %>% summarise(count = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = count / sum(count) * 100) # Conduct Chi-squared Test and append the result gender_table <- xtabs(count ~ period + gender, data = gender_summary) gender_chi_sq_test <- tryCatch(chisq.test(gender_table), error = function(e) e) gender_p_value <- ifelse(class(gender_chi_sq_test) == "try-error", NA, gender_chi_sq_test$p.value) # Calculate summaries for Test month with Chi-squared Test test_month_summary <- matched_data %>% group_by(period, test_month) %>% summarise(count = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = count / sum(count) * 100) # Conduct Chi-squared Test and append the result test_month_table <- xtabs(count ~ period + test_month, data = test_month_summary) test_month_chi_sq_test <- tryCatch(chisq.test(test_month_table), error = function(e) e) test_month_p_value <- ifelse(class(test_month_chi_sq_test) == "try-error", NA, test_month_chi_sq_test$p.value) # Calculate summaries for Season with Chi-squared Test season_summary <- matched_data %>% group_by(period, season) %>% summarise(count = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = count / sum(count) * 100) # Conduct Chi-squared Test and append the result season_table <- xtabs(count ~ period + season, data = season_summary) season_chi_sq_test <- tryCatch(chisq.test(season_table), error = function(e) e) season_p_value <- ifelse(class(season_chi_sq_test) == "try-error", NA, season_chi_sq_test$p.value) # Calculate mean and standard deviation for Vitamin D levels and perform t-test vitamin_d_summary <- matched_data %>% group_by(period) %>% summarise( mean_vitamin_d = mean(first_value, na.rm = TRUE), sd_vitamin_d = sd(first_value, na.rm = TRUE), .groups = 'drop' ) vitamin_d_t_test <- t.test(first_value ~ period, data = matched_data) vitamin_d_p_value <- vitamin_d_t_test$p.value # Calculate the percentage of deficiencies for each period and perform Chi-squared Test deficiency_summary <- matched_data %>% group_by(period) %>% summarise( total = n(), deficient_count = sum(deficient == 1, na.rm = TRUE), non_deficient_count = sum(deficient == 0, na.rm = TRUE), deficiency_rate = deficient_count / total * 100, .groups = 'drop' ) # Create a contingency table for the Chi-squared test deficiency_table <- xtabs(~ period + deficient, data = matched_data) # Conduct Chi-squared Test and append the result deficiency_chi_sq_test <- tryCatch(chisq.test(deficiency_table), error = function(e) e) deficiency_p_value <- ifelse(class(deficiency_chi_sq_test) == "try-error", NA, deficiency_chi_sq_test$p.value) # Print the summaries and p-values print(age_summary) print(paste("Age Group P-value:", age_p_value)) print(gender_summary) print(paste("Gender P-value:", gender_p_value)) print(test_month_summary) print(paste("Test Month P-value:", test_month_p_value)) print(season_summary) print(paste("Season P-value:", season_p_value)) print(vitamin_d_summary) print(paste("Vitamin D Levels P-value:", vitamin_d_p_value)) print(deficiency_summary) print(paste("Deficiency Rate P-value:", deficiency_p_value)) ``` #### 3.1.2 Effect size (SMD) - Mean vitamin D levels ```{r} # Effect Size # Calculate means and SDs for matched data mean_matched_b <- mean(matched_data$first_value[matched_data$period == "B"]) mean_matched_d <- mean(matched_data$first_value[matched_data$period == "D"]) sd_matched_b <- sd(matched_data$first_value[matched_data$period == "B"]) sd_matched_d <- sd(matched_data$first_value[matched_data$period == "D"]) # Pooled SD for matched data n1_matched <- nrow(matched_data[matched_data$period == "B", ]) n2_matched <- nrow(matched_data[matched_data$period == "D", ]) sd_matched_pooled <- sqrt(((n1_matched - 1) * sd_matched_b^2 + (n2_matched - 1) * sd_matched_d^2) / (n1_matched + n2_matched - 2)) # SMD (flipped sign for "During - Before") smd <- (mean_matched_d - mean_matched_b) / sd_matched_pooled # Flips the direction print(paste("Standardized Mean Difference (SMD):", round(smd, 4))) # Standard error for SMD (same formula as Cohen's d) se_smd <- sqrt((n1 + n2) / (n1 * n2) + (smd^2) / (2 * (n1 + n2))) # Confidence intervals ci_99_lower_smd <- smd - z_99 * se_smd ci_99_upper_smd <- smd + z_99 * se_smd ci_95_lower_smd <- smd - z_95 * se_smd ci_95_upper_smd <- smd + z_95 * se_smd # Print results cat("99% CI for SMD:", ci_99_lower_smd, "to", ci_99_upper_smd, "\n") cat("95% CI for SMD:", ci_95_lower_smd, "to", ci_95_upper_smd, "\n") # Note: Confidence intervals for the Standardized Mean Difference (SMD) are calculated using the formula:CI = SMD ± Z * SE_SMD where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_SMD is the standard error of the SMD estimate. ``` #### 3.1.3 Effect size (SMD) - vitamin D deficiency rate ```{r} # Ensure sample sizes are numeric to avoid integer overflow n1_matched_def <- as.numeric(nrow(matched_data[matched_data$period == "B", ])) n2_matched_def <- as.numeric(nrow(matched_data[matched_data$period == "D", ])) # Compute means and standard deviations for matched deficiency rates mean_matched_b_def <- mean(matched_data$deficient[matched_data$period == "B"]) mean_matched_d_def <- mean(matched_data$deficient[matched_data$period == "D"]) sd_matched_b_def <- sd(matched_data$deficient[matched_data$period == "B"]) sd_matched_d_def <- sd(matched_data$deficient[matched_data$period == "D"]) # Pooled SD for matched deficiency data sd_matched_pooled_def <- sqrt(((n1_matched_def - 1) * sd_matched_b_def^2 + (n2_matched_def - 1) * sd_matched_d_def^2) / (n1_matched_def + n2_matched_def - 2)) # SMD for deficiency (flipped sign for "During - Before") smd_def <- (mean_matched_d_def - mean_matched_b_def) / sd_matched_pooled_def print(paste("SMD for deficiency:", round(smd_def, 4))) # Standard error for SMD se_smd_def <- sqrt((n1_matched_def + n2_matched_def) / (n1_matched_def * n2_matched_def) + (smd_def^2) / (2 * (n1_matched_def + n2_matched_def))) # Confidence intervals for SMD ci_99_lower_smd_def <- smd_def - z_99 * se_smd_def ci_99_upper_smd_def <- smd_def + z_99 * se_smd_def ci_95_lower_smd_def <- smd_def - z_95 * se_smd_def ci_95_upper_smd_def <- smd_def + z_95 * se_smd_def # Print results cat("99% CI for SMD:", ci_99_lower_smd_def, "to", ci_99_upper_smd_def, "\n") cat("95% CI for SMD:", ci_95_lower_smd_def, "to", ci_95_upper_smd_def, "\n") # Note: Confidence intervals for the Standardized Mean Difference (SMD) are calculated using the formula:CI = SMD ± Z * SE_SMD where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_SMD is the standard error of the SMD estimate. ``` ## 4.Causal Machine Learning: Random Forest ON UNMATCHED DATA ### 4.1 Change in mean vitamin D levels ```{r} # Convert categorical variables to numeric for causal forest df$age_group_numeric <- as.integer(df$age_group) df$gender_numeric <- as.integer(df$gender) df$season_numeric <- as.integer(df$season) # Convert period to a binary treatment indicator df$treatment <- ifelse(df$period == "D", 1, 0) # Calculate the pre-pandemic average combined value directly from the dataset average_first_value <- df %>% filter(period == "B") %>% summarise(avg_value = mean(first_value, na.rm = TRUE)) %>% pull(avg_value) print(paste("Pre-pandemic average combined value:", round(average_first_value, 4))) # Run causal forest model causal_forest_model <- causal_forest( X = df[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = df$first_value, W = df$treatment, num.trees = 2000 ) # Estimating the average treatment effect (ATE) with confidence intervals ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all") # Extract the point estimate and standard error ate_estimate <- ate_result[1] ate_std_err <- ate_result[2] # Print the results print(paste("ATE Estimate:", round(ate_estimate, 4))) # Calculate percentage change percentage_change <- (ate_estimate / average_first_value) * 100 # Print the percentage change print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%")) # Confidence intervals ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err cat("99% CI for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n") cat("95% CI for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n") # Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate. ``` #### 4.1.1 Feature importance ```{r} # Calculate variable importance and convert to percentages importance <- variable_importance(causal_forest_model) importance_percentage <- (importance / sum(importance)) * 100 # Create a data frame for plotting importance_df <- data.frame( covariate = c("Age Group", "Gender", "Season"), importance = importance, percentage = importance_percentage ) # Print the numeric importance values and percentages print(importance_df) # Plot ggplot(importance_df, aes(x = reorder(covariate, importance), y = percentage, fill = covariate)) + geom_bar(stat = "identity", color = "black", width = 0.7) + geom_text(aes(label = sprintf("%.1f%%", percentage)), hjust = -0.2, size = 5) + labs( x = "Covariate", y = "Feature importance (explained variance in %)" ) + scale_fill_manual(values = c("Age Group" = "#1f77b4", "Gender" = "#1f77b4", "Season" = "#1f77b4")) + coord_flip() + ylim(0, 80) + theme_classic() + theme( plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14), legend.position = "none" ) ``` #### 4.1.2 Subgroup Analysis - for each subgroup ```{r} # Define the subgroups for age_group, gender, and season age_groups <- unique(df$age_group) gender_groups <- unique(df$gender) season_groups <- unique(df$season) # Function to run the causal forest model on a given subset and return the ATE and CI run_causal_forest <- function(subset_df) { causal_forest_model <- causal_forest( X = subset_df[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = subset_df$first_value, W = subset_df$treatment, num.trees = 2000 ) # Estimate ATE ate_result <- average_treatment_effect(causal_forest_model) ate_estimate <- ate_result[1] ate_std_err <- ate_result[2] # Calculate the 99% confidence interval z_score_99 <- 2.576 ate_ci_lower_99 <- ate_estimate - z_score_99 * ate_std_err ate_ci_upper_99 <- ate_estimate + z_score_99 * ate_std_err # Calculate the 95% confidence interval z_score_95 <- 1.960 ate_ci_lower_95 <- ate_estimate - z_score_95 * ate_std_err ate_ci_upper_95 <- ate_estimate + z_score_95 * ate_std_err # Calculate percentage change average_first_value <- 26.30 percentage_change <- (ate_estimate / average_first_value) * 100 return(data.frame( Category = "", ATE_Estimate = round(ate_estimate, 4), CI_Lower_99 = round(ate_ci_lower_99, 4), CI_Upper_99 = round(ate_ci_upper_99, 4), CI_Lower_95 = round(ate_ci_lower_95, 4), CI_Upper_95 = round(ate_ci_upper_95, 4), Percentage_Change = round(percentage_change, 2) )) } # Run the analysis for each age group, gender group, and season group individually age_group_results <- lapply(age_groups, function(age) { subset_df <- df[df$age_group == age, ] result <- run_causal_forest(subset_df) result$Category <- paste("Age Group:", age) return(result) }) gender_group_results <- lapply(gender_groups, function(gender) { subset_df <- df[df$gender == gender, ] result <- run_causal_forest(subset_df) result$Category <- paste("Gender:", gender) return(result) }) season_group_results <- lapply(season_groups, function(season) { subset_df <- df[df$season == season, ] result <- run_causal_forest(subset_df) result$Category <- paste("Season:", season) return(result) }) # Combine all results into a single data frame final_results <- do.call(rbind, c(age_group_results, gender_group_results, season_group_results)) # Display the final results as a data frame print(final_results) ``` ```{r} # Adjust the order for seasons season_order <- c("Season: Fall", "Season: Summer", "Season: Spring", "Season: Winter") # Set factor levels for the Category column final_results$Category <- factor( final_results$Category, levels = c(season_order, setdiff(unique(final_results$Category), season_order)) ) forest_plot <- ggplot(final_results, aes(x = Category, y = ATE_Estimate)) + # Add the points for ATE estimates geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) + # Add 99% CI as the longest bars geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") + # Add 95% CI as shorter bars geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") + # Add a dashed line at 0 for reference geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Flip coordinates for horizontal plot coord_flip() + # Labels and theme labs( x = "Subgroup category", y = "Estimated reduction (in µg/l)" ) + theme_classic(base_size = 16) + # Increased base font size theme( axis.text.x = element_text(hjust = 0.5, size = 14), axis.text.y = element_text(size = 14), plot.title = element_text(hjust = 0.5, size = 18, face = "bold"), panel.grid.major.y = element_line(color = "grey80"), panel.grid.minor.y = element_blank() ) # Display the updated plot print(forest_plot) ``` #### 4.1.3 Comparison of effect sizes across all 3 methods used ```{r} # Cohens d and SMD are standardized, transform back by multiplying with pooled SD # Calculate pooled SD for original data s_pooled <- sqrt(((n1 - 1) * sd_b^2 + (n2 - 1) * sd_d^2) / (n1 + n2 - 2)) # ATE from Cohen's d ate_from_cohens_d <- cohens_d * s_pooled # ATE confidence intervals for Cohen's d (95% instead of 90%) ci_99_lower_ate_d <- ci_99_lower_d * s_pooled ci_99_upper_ate_d <- ci_99_upper_d * s_pooled ci_95_lower_ate_d <- ci_95_lower_d * s_pooled ci_95_upper_ate_d <- ci_95_upper_d * s_pooled # ATE from SMD ate_from_smd <- smd * s_pooled # ATE confidence intervals for SMD (95% instead of 90%) ci_99_lower_ate_smd <- ci_99_lower_smd * s_pooled ci_99_upper_ate_smd <- ci_99_upper_smd * s_pooled ci_95_lower_ate_smd <- ci_95_lower_smd * s_pooled ci_95_upper_ate_smd <- ci_95_upper_smd * s_pooled # Create updated data frame with recalculated ATE values effect_sizes_updated <- data.frame( Category = c("Cohen's d", "SMD (PSM)", "ATE (ML)"), Estimate = c(ate_from_cohens_d, ate_from_smd, ate_estimate), CI_Lower_99 = c(ci_99_lower_ate_d, ci_99_lower_ate_smd, ci_99_lower_ate), CI_Upper_99 = c(ci_99_upper_ate_d, ci_99_upper_ate_smd, ci_99_upper_ate), CI_Lower_95 = c(ci_95_lower_ate_d, ci_95_lower_ate_smd, ci_95_lower_ate), CI_Upper_95 = c(ci_95_upper_ate_d, ci_95_upper_ate_smd, ci_95_upper_ate) ) # Reorder the Category levels to match the new desired order (Cohen’s d on top, ATE ML at bottom) effect_sizes_updated$Category <- factor(effect_sizes_updated$Category, levels = c("Cohen's d", "SMD (PSM)", "ATE (ML)")) # Create the forest plot with overlapping whiskers for CIs forest_plot_updated <- ggplot(effect_sizes_updated, aes(x = Category, y = Estimate)) + # Add points for the ATE estimates geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) + # Add 99% CI as the longest bars geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") + # Add 95% CI as shorter bars geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") + # Add a dashed line at 0 for reference geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Flip coordinates for horizontal orientation coord_flip() + # Labels and theme labs( y = "Estimated effect size (in µg/l)", # Updated y-axis label x = NULL # Remove x-axis label ) + theme_classic(base_size = 16) + # Set a clean, classic theme theme( axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14), plot.title = element_text(size = 18, face = "bold"), panel.grid.major.y = element_line(color = "grey80"), panel.grid.minor.y = element_blank() ) # Display the updated plot print(forest_plot_updated) ``` #### 4.1.4 Sensitivity analysis with sensemakr ```{r} # since sensemakr works directly with linear regression models, we need to approximate the causal forest results in a linear regression framework # Prepare data for linear regression df_linear <- df # Fit a linear regression model linear_model <- lm( first_value ~ treatment + age_group_numeric + gender_numeric + season_numeric, data = df_linear ) # Summarize the model summary(linear_model) ``` ```{r} # Run sensitivity analysis using sensemakr sensitivity_analysis <- sensemakr( model = linear_model, treatment = "treatment", benchmark_covariates = c("age_group_numeric", "gender_numeric", "season_numeric"), kd = 1 # Specify robustness threshold (default is 1) ) summary(sensitivity_analysis) sensitivity_summary <- summary(sensitivity_analysis) # Generate minimal reporting in LaTeX format latex_table <- ovb_minimal_reporting(sensitivity_analysis, format = "latex") ``` #### 4.1.5 Weighted ASMD Analysis ```{r} df$period <- as.factor(df$period) # Convert categorical variables to numeric for mean calculation df <- df %>% mutate( age_group_numeric = as.numeric(as.factor(age_group)), # Encodes age groups as 1, 2, 3 gender_numeric = as.numeric(as.factor(gender)), # Encodes gender (F=1, M=2) test_month_numeric = as.numeric(as.factor(test_month)) # Encodes months as 1-12 ) # Estimate Propensity Scores ps_model <- glm(period ~ age_group + gender + test_month, data = df, family = binomial) df$pscore <- predict(ps_model, type = "response") # Compute Inverse Probability of Treatment Weights (IPTW) df$iptw <- ifelse(df$period == "D", 1 / df$pscore, 1 / (1 - df$pscore)) # Compute Balance Statistics Using Weighted Data balance_stats <- bal.tab(ps_model, weights = df$iptw, covs = df[, c("age_group", "gender", "test_month")], estimand = "ATE", thresholds = c(m = 0.1)) # Convert to Data Frame asmd_results <- as.data.frame(balance_stats$Balance) # Compute Mean ASMD for Each Category asmd_summary <- data.frame( Covariate = c("Age Group", "Gender", "Test Month"), Weighted_ASMD = c( # Fix column name issue mean(asmd_results[grep("age_group", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE), mean(asmd_results[grep("gender", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE), mean(asmd_results[grep("test_month", rownames(asmd_results)), "Diff.Adj"], na.rm = TRUE) ), M.Threshold = "Balanced, <0.1" ) # Fix Covariate Naming to Match `asmd_results` means_table <- df %>% group_by(period) %>% summarise( `Age Group` = mean(age_group_numeric, na.rm = TRUE), `Gender` = mean(gender_numeric, na.rm = TRUE), `Test Month` = mean(test_month_numeric, na.rm = TRUE) ) %>% pivot_longer(cols = -period, names_to = "Covariate", values_to = "Mean") # Reshape Table to Wide Format (Pre-Pandemic & Pandemic Means) means_table_wide <- means_table %>% pivot_wider(names_from = period, values_from = Mean) %>% rename(`Pre-Pandemic Mean` = `B`, `Pandemic Mean` = `D`) # Merge Means with ASMD Data final_balance_table <- asmd_summary %>% rename(`Weighted ASMD` = Weighted_ASMD) %>% # Ensure correct column name left_join(means_table_wide, by = "Covariate") %>% dplyr::select(Covariate, `Pre-Pandemic Mean`, `Pandemic Mean`, `Weighted ASMD`, M.Threshold) # Print Final Table for Review print(final_balance_table) ``` ### 4.2 Change in vitamin D deficiency rates ```{r} # Convert categorical variables to numeric for causal forest df$age_group_numeric <- as.integer(df$age_group) df$gender_numeric <- as.integer(df$gender) df$season_numeric <- as.integer(df$season) # Convert period to a binary treatment indicator df$treatment <- ifelse(df$period == "D", 1, 0) # Ensure the deficiency column is numeric (Binary: 1 = Deficient, 0 = Not Deficient) df$deficient <- as.numeric(df$deficient) # Calculate the pre-pandemic average deficiency rate directly from the dataset pre_pandemic_deficiency_rate <- df %>% filter(period == "B") %>% summarise(avg_deficiency = mean(deficient, na.rm = TRUE)) %>% pull(avg_deficiency) print(paste("Pre-pandemic average deficiency rate:", round(pre_pandemic_deficiency_rate, 4))) # Run causal forest model for deficiency rates causal_forest_deficiency <- causal_forest( X = df[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = df$deficient, W = df$treatment, num.trees = 2000 ) # Estimating the average treatment effect (ATE) with confidence intervals ate_result_deficiency <- average_treatment_effect(causal_forest_deficiency, target.sample = "all") # Extract the point estimate and standard error ate_estimate_deficiency <- ate_result_deficiency[1] ate_std_err_deficiency <- ate_result_deficiency[2] # Print the results print(paste("ATE Estimate (Effect on Deficiency Rate):", round(ate_estimate_deficiency, 4))) # Calculate percentage change based on the actual pre-pandemic deficiency rate percentage_change_deficiency <- (ate_estimate_deficiency / pre_pandemic_deficiency_rate) * 100 # Print the percentage change print(paste("Percentage change due to pandemic:", round(percentage_change_deficiency, 2), "%")) # Confidence intervals z_99 <- 2.576 z_95 <- 1.960 ci_99_lower_ate_deficiency <- ate_estimate_deficiency - z_99 * ate_std_err_deficiency ci_99_upper_ate_deficiency <- ate_estimate_deficiency + z_99 * ate_std_err_deficiency ci_95_lower_ate_deficiency <- ate_estimate_deficiency - z_95 * ate_std_err_deficiency ci_95_upper_ate_deficiency <- ate_estimate_deficiency + z_95 * ate_std_err_deficiency cat("99% CI for ATE (Deficiency Rate):", ci_99_lower_ate_deficiency, "to", ci_99_upper_ate_deficiency, "\n") cat("95% CI for ATE (Deficiency Rate):", ci_95_lower_ate_deficiency, "to", ci_95_upper_ate_deficiency, "\n") ``` #### 4.2.1 Comparison of effect sizes across all 3 methods used ```{r} # Extracting ATE estimate and standard error from causal forest model ate_estimate_def <- ate_result_deficiency[1] ate_std_err_def <- ate_result_deficiency[2] # Confidence intervals for ATE ci_99_lower_ate_def <- ate_estimate_def - z_99 * ate_std_err_def ci_99_upper_ate_def <- ate_estimate_def + z_99 * ate_std_err_def ci_95_lower_ate_def <- ate_estimate_def - z_95 * ate_std_err_def ci_95_upper_ate_def <- ate_estimate_def + z_95 * ate_std_err_def # Bring all effect size metrics together in one df effect_sizes_deficiency <- data.frame( Category = c("Cohen's d", "SMD (PSM)", "ATE (ML)"), Estimate = c(cohens_d_def * sd_pooled_def, smd_def * sd_matched_pooled_def, ate_estimate_def), CI_Lower_99 = c(ci_99_lower_d_def * sd_pooled_def, ci_99_lower_smd_def * sd_matched_pooled_def, ci_99_lower_ate_def), CI_Upper_99 = c(ci_99_upper_d_def * sd_pooled_def, ci_99_upper_smd_def * sd_matched_pooled_def, ci_99_upper_ate_def), CI_Lower_95 = c(ci_95_lower_d_def * sd_pooled_def, ci_95_lower_smd_def * sd_matched_pooled_def, ci_95_lower_ate_def), CI_Upper_95 = c(ci_95_upper_d_def * sd_pooled_def, ci_95_upper_smd_def * sd_matched_pooled_def, ci_95_upper_ate_def) ) # Reorder factor levels for display order effect_sizes_deficiency$Category <- factor(effect_sizes_deficiency$Category, levels = c("Cohen's d", "SMD (PSM)", "ATE (ML)")) forest_plot_deficiency <- ggplot(effect_sizes_deficiency, aes(x = Category, y = Estimate)) + # Add points for the ATE estimates geom_point(size = 5, shape = 21, fill = "black", color = "black", stroke = 0.5) + # Add 99% CI as the longest bars geom_linerange(aes(ymin = CI_Lower_99, ymax = CI_Upper_99), size = 1.0, color = "black") + # Add 95% CI as shorter bars geom_linerange(aes(ymin = CI_Lower_95, ymax = CI_Upper_95), size = 2.0, color = "firebrick") + # Add a dashed line at 0 for reference geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Flip coordinates for horizontal orientation coord_flip() + # Labels and theme labs( y = "Estimated effect size (percentage points)", x = NULL ) + theme_classic(base_size = 16) + theme( axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14), plot.title = element_text(size = 18, face = "bold"), panel.grid.major.y = element_line(color = "grey80"), panel.grid.minor.y = element_blank() ) # Display the updated plot print(forest_plot_deficiency) ``` ## 5. Negative Controls Outcome Analysis 1: Entire analysis on a different time frame ```{r} # Define the new periods - uncomment the desired time frame to be tested # Time Frame 1 period1 <- c(as.Date("2018-03-01"), as.Date("2020-01-31")) period2 <- c(as.Date("2020-02-01"), as.Date("2022-02-28")) # Time Frame 2 period1 <- c(as.Date("2018-03-01"), as.Date("2020-03-31")) period2 <- c(as.Date("2020-04-01"), as.Date("2022-02-28")) # Create a new data frame for robustness check df_robust <- df # Assign each entry to a period assign_period <- function(date) { if (date >= period1[1] && date <= period1[2]) { return("B") } else if (date >= period2[1] && date <= period2[2]) { return("D") } else { return(NA) } } df_robust$period <- as.character(sapply(df_robust$test_date, assign_period)) df_robust <- df_robust[!is.na(df_robust$period),] ``` ### 5.1 Descriptive ```{r} sample_size_period <- df_robust %>% count(period) print("Sample size per period:") print(sample_size_period) sample_size_contingency <- with(df_robust, table(period)) sample_size_p_value <- chisq.test(sample_size_contingency)$p.value print("P-value for sample size distribution:") print(sample_size_p_value) age_group_counts <- df_robust %>% group_by(period, age_group) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() print("Age group counts per period:") print(age_group_counts) age_group_p_value <- chisq.test(with(df_robust, table(period, age_group)))$p.value print("P-value for age group distribution:") print(age_group_p_value) gender_group_counts <- df_robust %>% group_by(period, gender) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() print("Gender group counts per period:") print(gender_group_counts) gender_group_p_value <- chisq.test(with(df_robust, table(period, gender)))$p.value print("P-value for gender distribution:") print(gender_group_p_value) season_counts <- df_robust %>% group_by(period, season) %>% summarise(n = n(), .groups = 'drop') %>% group_by(period) %>% mutate(percentage = n / sum(n) * 100) %>% ungroup() print("Seasonal distribution per period:") print(season_counts) season_counts_p_value <- chisq.test(with(df_robust, table(period, season)))$p.value print("P-value for seasonal distribution:") print(season_counts_p_value) mean_vitamin_d_levels <- df_robust %>% group_by(period) %>% summarise(mean_value = mean(first_value, na.rm = TRUE), sd_value = sd(first_value, na.rm = TRUE), .groups = 'drop') print("Mean and standard deviation of Vitamin D levels per period:") print(mean_vitamin_d_levels) vitamin_d_t_test <- t.test(first_value ~ period, data = df_robust) print("T-test results for Vitamin D levels:") print(vitamin_d_t_test) vitamin_d_deficiency_rate <- df_robust %>% group_by(period) %>% summarise(deficiency_count = sum(deficient), total_count = n(), deficiency_rate = sum(deficient) / n() * 100, .groups = 'drop') print("Vitamin D deficiency rates per period:") print(vitamin_d_deficiency_rate) deficiency_contingency <- with(df_robust, table(period, deficient)) deficiency_p_value <- chisq.test(deficiency_contingency)$p.value print("P-value for Vitamin D deficiency rate differences:") print(deficiency_p_value) ``` ### 5.2 PSM ```{r} df_robust$gender <- factor(df_robust$gender) df_robust$age_group <- factor(df_robust$age_group) df_robust$period <- factor(df_robust$period) df_robust$test_month <- factor(df_robust$test_month) test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df_robust, method = "nearest", distance = "glm", exact = c("gender", "age_group", "test_month")) matched_data <- match.data(test_match_obj) # Statistical testing for matched data vitamin_d_t_test_matched <- t.test(first_value ~ period, data = matched_data) print("T-test results for Vitamin D levels in matched data:") print(vitamin_d_t_test_matched) # Contingency table for deficiency rates deficiency_contingency_matched <- with(matched_data, table(period, deficient)) deficiency_p_value_matched <- chisq.test(deficiency_contingency_matched)$p.value print("P-value for Vitamin D deficiency rate differences in matched data:") print(deficiency_p_value_matched) # Calculate deficiency rates before and during the pandemic deficiency_rates_matched <- prop.table(deficiency_contingency_matched, margin = 1) * 100 print("Deficiency rates in matched sample (before and during the pandemic):") print(deficiency_rates_matched) ``` ### 5.3 ML ```{r} df_robust$age_group_numeric <- as.integer(factor(df_robust$age_group)) df_robust$gender_numeric <- as.integer(factor(df_robust$gender)) df_robust$season_numeric <- as.integer(factor(df_robust$season)) df_robust$treatment <- ifelse(df_robust$period == "D", 1, 0) causal_forest_model <- causal_forest( X = df_robust[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = df_robust$first_value, W = df_robust$treatment, num.trees = 2000 ) # Estimating the average treatment effect (ATE) with confidence intervals ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all") # Extract the point estimate and standard error ate_estimate <- ate_result[1] ate_std_err <- ate_result[2] # Print the results print(paste("ATE Estimate:", round(ate_estimate, 4))) # Calculate percentage change average_first_value <- mean(df_robust$first_value, na.rm = TRUE) percentage_change <- (ate_estimate / average_first_value) * 100 # Print the percentage change print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%")) # Confidence intervals ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err cat("99% Confidence Interval for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n") cat("95% Confidence Interval for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n") # Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate. ``` ## 6. Negative Controls Outcome Analysis 2: Compare different months of different years ### 6.1 Descriptive ```{r} # Filter data for the months of March, April, and May for the years 2018, 2019, 2020, and 2021 df_spring <- df %>% filter(year(test_date) %in% c(2018, 2019, 2020, 2021) & month(test_date) %in% c(3, 4, 5)) # Ensure factors df_spring$period <- factor(df_spring$period) df_spring$gender <- factor(df_spring$gender) df_spring$age_group <- factor(df_spring$age_group) df_spring$season <- factor(df_spring$season) df_spring$test_month <- factor(month(df_spring$test_date)) # 1. Descriptive Statistics spring_vitamin_d_stats <- df_spring %>% group_by(cohort_year, test_month) %>% summarise(mean_vit_d = mean(first_value, na.rm = TRUE), sd_vit_d = sd(first_value, na.rm = TRUE), deficiency_count = sum(deficient, na.rm = TRUE), total_count = n(), deficiency_rate = (sum(deficient, na.rm = TRUE) / n()) * 100, .groups = "drop") %>% arrange(test_month, cohort_year) print("Vitamin D statistics by year and month:") print(spring_vitamin_d_stats) # Mean Vitamin D levels before and during the pandemic mean_vit_d_period <- df_spring %>% group_by(period) %>% summarise(mean_vit_d = mean(first_value, na.rm = TRUE), sd_vit_d = sd(first_value, na.rm = TRUE), .groups = "drop") print("Mean Vitamin D levels before and during the pandemic:") print(mean_vit_d_period) # Deficiency rates before and during the pandemic deficiency_rate_period <- df_spring %>% group_by(period) %>% summarise(deficiency_count = sum(deficient, na.rm = TRUE), total_count = n(), deficiency_rate = (sum(deficient, na.rm = TRUE) / n()) * 100, .groups = "drop") print("Deficiency rates before and during the pandemic:") print(deficiency_rate_period) # ANOVA test for mean Vitamin D levels vitamin_d_anova <- aov(first_value ~ factor(cohort_year) * factor(test_month), data = df_spring) summary(vitamin_d_anova) # Chi-square test for deficiency rate deficiency_contingency <- with(df_spring, table(cohort_year, test_month, deficient)) deficiency_p_value <- chisq.test(rowSums(deficiency_contingency))$p.value print("P-value for Vitamin D deficiency rate differences across years and months:") print(deficiency_p_value) ``` ### 6.2 PSM ```{r} test_match_obj <- matchit(period ~ age_group + gender + test_month, data = df_spring, method = "nearest", distance = "glm", exact = c("gender", "age_group", "test_month")) matched_data <- match.data(test_match_obj) # T-test for Vitamin D levels in matched data vitamin_d_t_test_matched <- t.test(first_value ~ period, data = matched_data) print("T-test results for Vitamin D levels in matched data:") print(vitamin_d_t_test_matched) # Deficiency rates in matched data deficiency_contingency_matched <- with(matched_data, table(period, deficient)) deficiency_p_value_matched <- chisq.test(deficiency_contingency_matched)$p.value print("P-value for Vitamin D deficiency rate differences in matched data:") print(deficiency_p_value_matched) deficiency_rates_matched <- prop.table(deficiency_contingency_matched, margin = 1) * 100 print("Deficiency rates in matched sample (before and during the pandemic):") print(deficiency_rates_matched) ``` ### 6.3 ML ```{r} df_spring$age_group_numeric <- as.integer(factor(df_spring$age_group)) df_spring$gender_numeric <- as.integer(factor(df_spring$gender)) df_spring$season_numeric <- as.integer(factor(df_spring$season)) df_spring$treatment <- ifelse(df_spring$period == "D", 1, 0) causal_forest_model <- causal_forest( X = df_spring[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = df_spring$first_value, W = df_spring$treatment, num.trees = 2000 ) ate_result <- average_treatment_effect(causal_forest_model, target.sample = "all") ate_estimate <- ate_result[1] ate_std_err <- ate_result[2] print(paste("ATE Estimate:", round(ate_estimate, 4))) average_first_value <- mean(df_spring$first_value, na.rm = TRUE) percentage_change <- (ate_estimate / average_first_value) * 100 print(paste("Percentage change due to pandemic:", round(percentage_change, 2), "%")) # Confidence intervals ci_99_lower_ate <- ate_estimate - z_99 * ate_std_err ci_99_upper_ate <- ate_estimate + z_99 * ate_std_err ci_95_lower_ate <- ate_estimate - z_95 * ate_std_err ci_95_upper_ate <- ate_estimate + z_95 * ate_std_err cat("99% Confidence Interval for ATE:", ci_99_lower_ate, "to", ci_99_upper_ate, "\n") cat("95% Confidence Interval for ATE:", ci_95_lower_ate, "to", ci_95_upper_ate, "\n") # Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate. ``` ## 7. Negative Controls Outcome Analysis 3: Hours of sunshine ```{r} # Define the base URL base_url <- "https://opendata.dwd.de/climate_environment/CDC/regional_averages_DE/monthly/sunshine_duration/" # Initialize an empty data frame to store sunshine data sunshine_data <- data.frame() # Loop through each month to read and combine the data for (i in 1:12) { # Format the file name file_name <- sprintf("regional_averages_sd_%02d.txt", i) file_url <- paste0(base_url, file_name) # Read the data, skipping the first row to use the second row as headers monthly_data <- read_delim(file_url, delim = ";", col_types = cols(), skip = 1) # Select relevant columns (Jahr = year, Monat = month, Bayern = sunshine duration) monthly_data <- monthly_data %>% dplyr::select(Jahr, Monat, Bayern) %>% rename(sunshine_duration = Bayern) %>% mutate(Monat = as.numeric(Monat)) # Ensure month is numeric # Append the monthly data to the combined data frame sunshine_data <- bind_rows(sunshine_data, monthly_data) } # Ensuring year and month are numeric for merging df <- df %>% mutate( cohort_year = as.numeric(cohort_year), test_month = as.numeric(test_month) ) # Merge the sunshine data with your existing data frame df <- df %>% left_join(sunshine_data, by = c("cohort_year" = "Jahr", "test_month" = "Monat")) # Check for any unmatched entries unmatched_entries <- df %>% filter(is.na(sunshine_duration)) # Print the number of unmatched entries, if any cat("Number of entries with missing sunshine duration data:", nrow(unmatched_entries), "\n") # View the first few rows of the updated data frame head(df) ``` ```{r} # Convert categorical variables to numeric for causal forest df$age_group <- as.factor(df$age_group) df$age_group_numeric <- as.integer(df$age_group) df$gender <- as.factor(df$gender) df$gender_numeric <- as.integer(df$gender) df$season <- as.factor(df$season) df$season_numeric <- as.integer(df$season) # Convert period to a binary treatment indicator df$treatment <- ifelse(df$period == "D", 1, 0) # Ensure sunshine_duration is numeric df$sunshine_numeric <- as.numeric(df$sunshine_duration) # Run causal forest model with sunshine hours as an additional covariate causal_forest_model_sunshine <- causal_forest( X = df[c("age_group_numeric", "gender_numeric", "season_numeric", "sunshine_numeric")], Y = df$first_value, W = df$treatment, num.trees = 2000 ) # Estimating the average treatment effect (ATE) with confidence intervals ate_result_sunshine <- average_treatment_effect(causal_forest_model_sunshine, target.sample = "overlap") # using target.sample = "all" resulted in NaN because the sunshine column produced too extreme propensity scores # Extract the point estimate and standard error ate_estimate_sunshine <- ate_result_sunshine[1] ate_std_err_sunshine <- ate_result_sunshine[2] # Print the results print(paste("ATE Estimate (with sunshine):", round(ate_estimate_sunshine, 4))) # Confidence intervals z_99 <- 2.576 z_95 <- 1.960 ci_99_lower_ate_sunshine <- ate_estimate_sunshine - z_99 * ate_std_err_sunshine ci_99_upper_ate_sunshine <- ate_estimate_sunshine + z_99 * ate_std_err_sunshine ci_95_lower_ate_sunshine <- ate_estimate_sunshine - z_95 * ate_std_err_sunshine ci_95_upper_ate_sunshine <- ate_estimate_sunshine + z_95 * ate_std_err_sunshine cat("99% CI for ATE (with sunshine):", ci_99_lower_ate_sunshine, "to", ci_99_upper_ate_sunshine, "\n") cat("95% CI for ATE (with sunshine):", ci_95_lower_ate_sunshine, "to", ci_95_upper_ate_sunshine, "\n") # Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate. ``` ## 8. Negative Controls Outcome Analysis 4: Randomly shuffling the Vit D results ```{r} # Create a copy of df with shuffled Vitamin D values df_shuffled <- df %>% mutate(shuffled_value = sample(first_value, replace = FALSE)) # Ensure there are no missing values in the shuffled data df_shuffled <- df_shuffled %>% filter(!is.na(shuffled_value)) ``` ### 8.1 Descriptive ```{r} # Compute statistics for the shuffled Vitamin D values vitamin_d_stats_shuffled <- df_shuffled %>% group_by(cohort_year, test_month) %>% summarise(mean_vit_d = mean(shuffled_value, na.rm = TRUE), sd_vit_d = sd(shuffled_value, na.rm = TRUE), deficiency_count = sum(shuffled_value < thresholds$VD3, na.rm = TRUE), total_count = n(), deficiency_rate = (sum(shuffled_value < thresholds$VD3, na.rm = TRUE) / n()) * 100, .groups = "drop") %>% arrange(test_month, cohort_year) print("Shuffled Vitamin D statistics by year and month:") print(vitamin_d_stats_shuffled) # Mean Vitamin D levels before and during the pandemic mean_vit_d_period_shuffled <- df_shuffled %>% group_by(period) %>% summarise(mean_vit_d = mean(shuffled_value, na.rm = TRUE), sd_vit_d = sd(shuffled_value, na.rm = TRUE), .groups = "drop") print("Mean Shuffled Vitamin D levels before and during the pandemic:") print(mean_vit_d_period_shuffled) # Deficiency rates before and during the pandemic deficiency_rate_period_shuffled <- df_shuffled %>% group_by(period) %>% summarise(deficiency_count = sum(shuffled_value < thresholds$VD3, na.rm = TRUE), total_count = n(), deficiency_rate = (sum(shuffled_value < thresholds$VD3, na.rm = TRUE) / n()) * 100, .groups = "drop") print("Shuffled Deficiency rates before and during the pandemic:") print(deficiency_rate_period_shuffled) # ANOVA test for mean Vitamin D levels on shuffled data vitamin_d_anova_shuffled <- aov(shuffled_value ~ factor(cohort_year) * factor(test_month), data = df_shuffled) summary(vitamin_d_anova_shuffled) # Chi-square test for deficiency rate in shuffled data deficiency_contingency_shuffled <- with(df_shuffled, table(cohort_year, test_month, shuffled_value < thresholds$VD3)) deficiency_p_value_shuffled <- chisq.test(rowSums(deficiency_contingency_shuffled))$p.value print("P-value for Shuffled Vitamin D deficiency rate differences across years and months:") print(deficiency_p_value_shuffled) ``` ### 8.2 PSM ```{r} test_match_obj_shuffled <- matchit(period ~ age_group + gender + test_month, data = df_shuffled, method = "nearest", distance = "glm", exact = c("gender", "age_group", "test_month")) matched_data_shuffled <- match.data(test_match_obj_shuffled) # T-test for Vitamin D levels in matched shuffled data vitamin_d_t_test_matched_shuffled <- t.test(shuffled_value ~ period, data = matched_data_shuffled) print("T-test results for Shuffled Vitamin D levels in matched data:") print(vitamin_d_t_test_matched_shuffled) # Deficiency rates in matched shuffled data deficiency_contingency_matched_shuffled <- with(matched_data_shuffled, table(period, shuffled_value < thresholds$VD3)) deficiency_p_value_matched_shuffled <- chisq.test(deficiency_contingency_matched_shuffled)$p.value print("P-value for Shuffled Vitamin D deficiency rate differences in matched data:") print(deficiency_p_value_matched_shuffled) deficiency_rates_matched_shuffled <- prop.table(deficiency_contingency_matched_shuffled, margin = 1) * 100 print("Shuffled Deficiency rates in matched sample (before and during the pandemic):") print(deficiency_rates_matched_shuffled) ``` ### 8.3 ML ```{r} df_shuffled$age_group_numeric <- as.integer(factor(df_shuffled$age_group)) df_shuffled$gender_numeric <- as.integer(factor(df_shuffled$gender)) df_shuffled$season_numeric <- as.integer(factor(df_shuffled$season)) df_shuffled$treatment <- ifelse(df_shuffled$period == "D", 1, 0) causal_forest_model_shuffled <- causal_forest( X = df_shuffled[c("age_group_numeric", "gender_numeric", "season_numeric")], Y = df_shuffled$shuffled_value, W = df_shuffled$treatment, num.trees = 2000 ) ate_result_shuffled <- average_treatment_effect(causal_forest_model_shuffled, target.sample = "all") ate_estimate_shuffled <- ate_result_shuffled[1] ate_std_err_shuffled <- ate_result_shuffled[2] print(paste("Shuffled ATE Estimate:", round(ate_estimate_shuffled, 4))) average_first_value_shuffled <- mean(df_shuffled$shuffled_value, na.rm = TRUE) percentage_change_shuffled <- (ate_estimate_shuffled / average_first_value_shuffled) * 100 print(paste("Shuffled Percentage change due to pandemic:", round(percentage_change_shuffled, 2), "%")) # Confidence intervals ci_99_lower_ate_shuffled <- ate_estimate_shuffled - z_99 * ate_std_err_shuffled ci_99_upper_ate_shuffled <- ate_estimate_shuffled + z_99 * ate_std_err_shuffled ci_95_lower_ate_shuffled <- ate_estimate_shuffled - z_95 * ate_std_err_shuffled ci_95_upper_ate_shuffled <- ate_estimate_shuffled + z_95 * ate_std_err_shuffled cat("99% Confidence Interval for Shuffled ATE:", ci_99_lower_ate_shuffled, "to", ci_99_upper_ate_shuffled, "\n") cat("95% Confidence Interval for Shuffled ATE:", ci_95_lower_ate_shuffled, "to", ci_95_upper_ate_shuffled, "\n") # Note: Confidence intervals for the ATE estimate are calculated using the formula: CI = ATE ± Z * SE_ATE where Z is the Z-score for the desired confidence level (e.g., 1.960 for 95%, 2.576 for 99%), and SE_ATE is the standard error of the ATE estimate. ```