# Load required libraries (consolidated)
library(tidyverse) # Includes dplyr, ggplot2, tidyr, etc.
library(lavaan) # For CFA and SEM
library(semTools) # For measurement invariance testing
library(psych) # For psychometric analyses
library(knitr) # For tables
library(kableExtra) # For enhanced tables
library(emmeans) # For marginal means in ANOVA
library(effectsize) # For effect size calculations
library(purrr) # For functional programming tools
library(openxlsx) # For Excel export
library(paletteer) # For color palettes
library(ggraph) # For graph visualization
library(igraph) # For network analysis
library(patchwork) # For combining plots
library(stringr) # For string manipulation
library(ggridges) # For ridge plots
library(corrplot) # For correlation plots
library(reshape2) # For data reshaping
library(naniar) # For missing data visualization
library(visdat) # For data visualization
library(gtsummary) # For summary tables
library(ragg) # For high-quality graphic device
library(janitor) # For cleaning data
# Load the dataset
load("zmsp.RData")
# Define indicator variables by dimension
dimensions <- list(
bond_class = c('k6_1803', 'k6_1806', 'k6_1809'),
bond_teacher = c('k6_1802', 'k6_1805', 'k6_1808'),
future_orientation = c('k6_1811', 'k6_1813', 'k6_1815'),
school_diff_r = c('k6_1810_r', 'k6_1812_r', 'k6_1814_r'),
school_commit_r = c('k6_1801', 'k6_1804', 'k6_1807_r')
)
# Flatten the list for convenience
bond_class <- dimensions$bond_class
bond_teacher <- dimensions$bond_teacher
future_orientation <- dimensions$future_orientation
school_diff_r <- dimensions$school_diff_r
school_commit_r <- dimensions$school_commit_r
# All items for the 15-item scale
school_exp_r <- c(bond_class, bond_teacher, school_commit_r, school_diff_r, future_orientation)
# Items for the 12-item scale (without school commitment)
school_exp_12 <- c(bond_class, bond_teacher, school_diff_r, future_orientation)
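# Consistency check (added sketch): the full and reduced item sets should
# contain 15 and 12 unique indicators, respectively
stopifnot(
  length(unique(school_exp_r)) == 15,
  length(unique(school_exp_12)) == 12
)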
# Define labels for plots and tables
labels <- c(
experience = "School Experience",
class = "Bond to Class",
teacher = "Bond to Teacher",
future = "Future Orientation",
diffic = "School Difficulties",
k6_1803 = "Sense of community in class",
k6_1806 = "Get along with classmates",
k6_1809 = "Classmates are nice to me",
k6_1802 = "Teacher treats me fairly",
k6_1805 = "Get along with teacher",
k6_1808 = "Teacher helps when needed",
k6_1811 = "Working towards interesting job",
k6_1813 = "Try hard at school for future job",
k6_1815 = "Doing well at school is important",
k6_1801 = "Likes going to school",
k6_1804 = "Likes doing homework",
k6_1807_r = "Finds school useless (r)",
k6_1810_r = "Often has bad grades (r)",
k6_1812_r = "Makes mistakes in homework (r)",
k6_1814_r = "Struggles to follow lessons (r)"
)
# Create a subset for latent variable names
labels_latents <- labels[c("experience", "class", "teacher", "future", "diffic")]
# Define the four-factor model with second-order factor
model <- '
class =~ k6_1803 + k6_1806 + k6_1809
teacher =~ k6_1802 + k6_1805 + k6_1808
diffic =~ k6_1810_r + k6_1812_r + k6_1814_r
future =~ k6_1811 + k6_1813 + k6_1815
# 2nd order factor
experience =~ class + teacher + diffic + future
'
# Create subsets for each city
city_data <- list(
all = zmsp,
zproso = zmsp %>% filter(city == "Zurich"),
mproso = zmsp %>% filter(city == "Montevideo"),
spproso = zmsp %>% filter(city == "São Paulo")
)
# Get city names for iteration
cities <- unique(zmsp$city)
# Define latent variables list for analyses
latent_vars <- c("future", "diffic", "teacher", "class", "experience")
# Define a common theme for visualizations
theme_common <- theme_minimal() +
theme(
text = element_text(family = "Helvetica"),
axis.text.x = element_text(angle = 0, hjust = 0.5),
strip.text = element_text(face = "bold"),
strip.background = element_rect(fill = "gray95"),
panel.spacing = unit(1, "lines"),
legend.position = "bottom"
)

# Function to create a demographic summary table by city
create_demographic_summary <- function(data) {
# Ensure gender is properly formatted
data$k6_102 <- factor(data$k6_102,
levels = c("male", "female"),
labels = c("Male", "Female"))
# Create summary statistics
tbl <- data %>%
tbl_summary(
by = city,
include = c(k6_102, k6_ageyears),
missing = "no",
statistic = list(all_continuous() ~ "{mean} ({sd})"),
label = list(k6_ageyears = "Age (years)", k6_102 = "Sex")
) %>%
add_p() %>%
modify_spanning_header(c("stat_1", "stat_2", "stat_3") ~ "**City**")
return(tbl)
}
# Function to visualize age by gender and city
plot_age_by_gender_city <- function(data) {
# Ensure gender is properly formatted
data$k6_102 <- factor(data$k6_102,
levels = c("male", "female"),
labels = c("Male", "Female"))
# Calculate summary statistics
summary <- data %>%
filter(!is.na(k6_102)) %>%
group_by(city, k6_102) %>%
summarize(
mean_age = mean(k6_ageyears, na.rm = TRUE),
median_age = median(k6_ageyears, na.rm = TRUE),
min_age = min(k6_ageyears, na.rm = TRUE),
max_age = max(k6_ageyears, na.rm = TRUE),
count = n(),
sd_age = sd(k6_ageyears, na.rm = TRUE),
se_age = sd_age / sqrt(count),
ci_lower = mean_age - 1.96 * se_age,
ci_upper = mean_age + 1.96 * se_age,
.groups = "drop"
)
# Create plot
p <- data %>%
filter(!is.na(k6_102)) %>%
ggplot(aes(x = k6_ageyears)) +
geom_histogram(fill = "#FC8D62FF", alpha = 0.7, binwidth = 1) +
geom_vline(data = summary,
aes(xintercept = mean_age),
linetype = "solid",
color = "black",
linewidth = 0.5) +
geom_text(data = summary,
aes(x = mean_age + 0.3, y = 750,
label = paste("Mean =", round(mean_age, 2))),
color = "grey30",
size = 3,
hjust = 0,
family = "Helvetica") +
facet_grid(k6_102 ~ city) +
scale_x_continuous(breaks = scales::breaks_width(1)) +
labs(x = "Age (years)", y = "Frequency") +
ylim(0,800) +
theme_common
return(p)
}
# Execute
demo_summary <- create_demographic_summary(zmsp)
age_plot <- plot_age_by_gender_city(zmsp)
# View
demo_summary

| Characteristic | São Paulo, N = 2,680¹ | Zurich, N = 1,447¹ | Montevideo, N = 2,204¹ | p-value² |
|---|---|---|---|---|
| Sex |  |  |  | 0.088 |
| Male | 1,363 (52%) | 750 (52%) | 1,074 (49%) |  |
| Female | 1,247 (48%) | 697 (48%) | 1,110 (51%) |  |
| Age (years) | 14.88 (0.69) | 15.44 (0.36) | 15.15 (0.91) | <0.001 |

¹ n (%); Mean (SD)
² Pearson's Chi-squared test; Kruskal-Wallis rank sum test
# Load required libraries
library(tidyverse)
library(patchwork)
library(ggridges)
library(paletteer)
# Prepare São Paulo data
sp_data <- zmsp %>%
filter(city == "São Paulo") %>%
mutate(
sex = factor(k6_102,
levels = c("male", "female"),
labels = c("Male", "Female")),
race = factor(q103,
levels = c("preta", "branca", "pardo", "amarela", "indigena"),
labels = c("Black", "White", "Mixed (Pardo)", "Yellow", "Indigenous")),
ses = escore_final_pond,
age = k6_ageyears
) %>%
select(sex, age, race, ses)
# Plot
psp1 <- sp_data %>%
pivot_longer(cols = c(age, ses), names_to = "variable", values_to = "value") %>%
mutate(variable = factor(variable,
levels = c("age", "ses"),
labels = c("Age (years)", "SES Score"))) %>%
drop_na(race, sex, value) %>%
ggplot(aes(x = value, y = race, fill = sex)) +
geom_density_ridges(alpha = 0.7, scale = 0.9) +
facet_wrap(~ variable, scales = "free_x", strip.position = "bottom") +
scale_fill_manual(values = c("Male" = "#FFC000", "Female" = "#4169E1")) +
labs(
x = "",
y = "Race / Skin color",
fill = "Sex"
) +
theme_common
print(psp1)

library(gtsummary)
# gtsummary table of demographic variables
demo_sp <- sp_data %>%
tbl_summary(
by = race,
include = c(sex, age, ses),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
label = list(
sex ~ "Sex",
age ~ "Age (years)",
ses ~ "SES Score"
),
missing = "no"
) %>%
add_n() %>%
add_p() %>%
italicize_levels() %>%
bold_labels()
# Print the table
demo_sp

| Characteristic | N | Black, N = 340¹ | White, N = 1,191¹ | Mixed (Pardo), N = 955¹ | Yellow, N = 101¹ | Indigenous, N = 70¹ | p-value² |
|---|---|---|---|---|---|---|---|
| Sex | 2,588 |  |  |  |  |  | 0.039 |
| Male |  | 185 (57%) | 591 (51%) | 503 (54%) | 43 (43%) | 31 (46%) |  |
| Female |  | 138 (43%) | 574 (49%) | 429 (46%) | 57 (57%) | 37 (54%) |  |
| Age (years) | 2,595 | 15.1 (0.8) | 14.8 (0.7) | 14.9 (0.6) | 14.8 (0.6) | 14.9 (0.7) | <0.001 |
| SES Score | 2,520 | 4.6 (3.3) | 5.4 (3.1) | 4.7 (3.2) | 5.1 (3.4) | 4.5 (3.0) | <0.001 |

¹ n (%); Mean (SD)
² Pearson's Chi-squared test; Kruskal-Wallis rank sum test
z_data <- zmsp %>%
filter(city == "Zurich") %>%
mutate(
education_group_ses = case_when(
k5_6_edumax == "incomplete compulsory school" ~ "1_Did Not Complete Compulsory School",
# 2. Upper Secondary Vocational
k5_6_edumax %in% c(
"compulsory school, elementary vocational training",
"domestic science course, 1-year school of commerce",
"apprenticeship",
"full time vocational school"
) ~ "2_Upper Secondary Vocational",
# 3. Upper Secondary Academic (Matura)
k5_6_edumax == "A-levels" ~ "3_Upper Secondary Academic (Matura)",
# 4. Tertiary Vocational/Professional
k5_6_edumax %in% c(
"vocational high education",
"technical school or vocational college",
"vocational high school, higher specialized school"
) ~ "4_Tertiary Vocational/Professional",
# 5. Tertiary Academic/University
k5_6_edumax == "university, Swiss Federal Institute of Technology\x94" ~ "5_Tertiary Academic/University",
# Handle "missing" level and true NAs
k5_6_edumax == "missing" ~ NA_character_,
TRUE ~ NA_character_
),
education_group_ses = factor(
education_group_ses,
levels = c(
"1_Did Not Complete Compulsory School",
"2_Upper Secondary Vocational",
"3_Upper Secondary Academic (Matura)",
"4_Tertiary Vocational/Professional",
"5_Tertiary Academic/University"
),
labels = c(
"Did Not Complete\nCompulsory School",
"Upper Secondary\nVocational",
"Upper Secondary\nAcademic (Matura)",
"Tertiary\nVocational/Professional",
"Tertiary\nAcademic/University"
)
)
) %>%
mutate(
sex = factor(k6_102, levels = c("male", "female"), labels = c("Male", "Female")),
migrationback = factor(k5_6_mighh2,
labels = c("At least one parent born in Switzerland", "Both parents born abroad")),
ses = education_group_ses,
age = k6_ageyears
)
z_data_for_plot <- z_data %>%
select(age, ses, sex, migrationback) %>%
drop_na()
# Plot
pz1 <- z_data_for_plot %>%
ggplot(aes(x = age, fill = sex)) +
geom_density(alpha = 0.7) +
facet_grid(ses ~ migrationback,
scales = "fixed",
switch = "y") + # Rows by SES, Cols by Migration
scale_fill_manual(values = c("Male" = "red", "Female" = "grey90")) +
labs(
x = "Age (years)",
y = "Density",
fill = "Sex"
) +
theme_common
print(pz1)

library(gtsummary)
# gtsummary table of demographic variables
demo_z <- z_data %>%
select(sex, age, migrationback, ses) %>%
tbl_summary(
by = migrationback,
include = c(sex, age, ses),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 2,
label = list(
sex ~ "Sex",
age ~ "Age (years)",
ses ~ "Family education level"
),
missing = "no"
) %>%
add_n() %>%
add_p() %>%
italicize_levels() %>%
bold_labels()
# Print the table
demo_z

| Characteristic | N | At least one parent born in Switzerland, N = 714¹ | Both parents born abroad, N = 697¹ | p-value² |
|---|---|---|---|---|
| Sex | 1,411 |  |  | 0.3 |
| Male |  | 379 (53%) | 351 (50%) |  |
| Female |  | 335 (47%) | 346 (50%) |  |
| Age (years) | 1,411 | 15.43 (0.37) | 15.46 (0.36) | 0.2 |
| Family education level | 1,340 |  |  | <0.001 |
| Did Not Complete Compulsory School |  | 10 (1.5%) | 78 (12%) |  |
| Upper Secondary Vocational |  | 295 (43%) | 350 (53%) |  |
| Upper Secondary Academic (Matura) |  | 105 (15%) | 76 (12%) |  |
| Tertiary Vocational/Professional |  | 83 (12%) | 60 (9.2%) |  |
| Tertiary Academic/University |  | 192 (28%) | 91 (14%) |  |

¹ n (%); Mean (SD)
² Pearson's Chi-squared test; Wilcoxon rank sum test
zmsp <- zmsp %>%
mutate(
education_level_3cat = case_when(
# Primary education: "Ninguno" (1) and "Escuela Primaria" (2)
max_educ %in% c(1, 2) ~ "1. Primary",
# Secondary education: "Curso Técnico de la UTU" (3), "Ciclo Básico" (4), "Bachillerato Secundaria" (5)
max_educ %in% c(3, 4, 5) ~ "2. Secondary",
# Tertiary education: "Escuela Departamental" (6) through "Posgrados Universitarios" (10)
max_educ %in% c(6, 7, 8, 9, 10) ~ "3. Tertiary",
TRUE ~ NA_character_ # Handles any unexpected numeric values, assigning NA
)
)
# Table
zmsp$education_level_3cat <- factor(
zmsp$education_level_3cat,
levels = c("1. Primary", "2. Secundary", "3. Terciary"),
labels = c("Primary", "Secondary", "Terciary"),
ordered = TRUE
)
demo_m <- zmsp %>%
filter(city == "Montevideo") %>%
mutate(k6_ageyears = as.numeric(k6_ageyears)) %>%
select(k6_102, k6_ageyears, education_level_3cat) %>%
tbl_summary(
by = k6_102,
include = c(k6_ageyears, education_level_3cat),
type = list(k6_ageyears ~ "continuous"),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
label = list(
k6_ageyears ~ "Age (years)",
education_level_3cat ~ "Family education level"
),
missing = "no"
) %>%
add_n() %>%
add_p() %>%
italicize_levels() %>%
bold_labels()
# Print the table
demo_m

| Characteristic | N | male, N = 1,074¹ | female, N = 1,110¹ | p-value² |
|---|---|---|---|---|
| Age (years) | 2,146 | 15.2 (0.9) | 15.1 (0.9) | 0.011 |
| Family education level | 2,155 |  |  | 0.060 |
| Primary |  | 102 (9.6%) | 125 (11%) |  |
| Secondary |  | 642 (61%) | 609 (56%) |  |
| Tertiary |  | 316 (30%) | 361 (33%) |  |

¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson's Chi-squared test
create_long_format_responses <- function(data, vars) {
# Identify all factor levels across selected columns
all_levels <- unique(unlist(lapply(data[vars], levels)))
# Create long format data
long_data <- data %>%
select(city, all_of(vars)) %>%
mutate(across(all_of(vars), ~ factor(., levels = all_levels, ordered = TRUE))) %>%
pivot_longer(
cols = all_of(vars),
names_to = "item",
values_to = "response"
) %>%
mutate(response = factor(response, levels = all_levels, ordered = TRUE))
# Summarize percentages
result <- long_data %>%
group_by(city, item, response) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(city, item) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup()
return(result)
}
# Function to visualize item responses
plot_item_responses <- function(long_data) {
ggplot(na.omit(long_data),
aes(x = city, y = percent, fill = response)) +
geom_col(position = "fill") +
scale_fill_paletteer_d(
palette = "nationalparkcolors::Badlands",
labels = c("False", "More false than true", "More true than false", "True")
) +
facet_wrap(~ item, labeller = labeller(item = labels), ncol = 5) +
scale_y_continuous(labels = scales::percent_format(scale = 100)) +
labs(x = "", y = "", fill = "Response") +
theme_common +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
}
# Execute
responses_long <- create_long_format_responses(zmsp, school_exp_r)
responses_plot <- plot_item_responses(responses_long)
# View
responses_plot

# 1. Basic missing data summary
get_missing_summary <- function(data, variables) {
miss_var_summary(data[variables])
}
# 2. Missing data visualization
plot_missing_pattern <- function(data, variables, labels) {
vis_data <- data[variables]
names(vis_data) <- labels[names(vis_data)]
vis_miss(vis_data, cluster = TRUE) +
scale_fill_manual(
values = c("TRUE" = "#FC8D62FF", "FALSE" = "grey90"),
name = "Missing",
labels = c("TRUE" = "Yes", "FALSE" = "No")
) +
theme_common +
theme(
axis.text.x = element_text(angle = 90, hjust = 0, size = 8),
axis.text.y = element_text(size = 8)
) +
labs(y = "Students")
}
# 3. Participant-level missing data
calculate_participant_missing <- function(data, variables) {
missing_by_participant <- data.frame(
participant_id = 1:nrow(data),
missing_count = rowSums(is.na(data[variables])),
missing_percent = rowSums(is.na(data[variables])) / length(variables) * 100
)
# Summary statistics
missing_stats <- missing_by_participant %>%
summarise(
min_missing = min(missing_count),
max_missing = max(missing_count),
mean_missing = mean(missing_count),
median_missing = median(missing_count),
participants_complete = sum(missing_count == 0),
participants_complete_pct = participants_complete / n() * 100,
participants_over_50pct = sum(missing_percent > 50)
)
list(
by_participant = missing_by_participant,
summary_stats = missing_stats
)
}
# 4. Missing data correlation analysis
calculate_missing_correlations <- function(data, variables) {
shadow_matrix <- data %>%
select(all_of(variables)) %>%
is.na()
cor(as.data.frame(shadow_matrix))
}
# 5. Missing correlation heatmap
plot_missing_correlations <- function(miss_corr_matrix, labels) {
miss_corr_df <- as.data.frame(as.table(miss_corr_matrix)) %>%
rename(x = Var1, y = Var2, correlation = Freq) %>%
filter(x != y) %>%
mutate(
x_label = labels[as.character(x)],
y_label = labels[as.character(y)]
)
ggplot(miss_corr_df, aes(x = x_label, y = y_label, fill = correlation)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(
aes(label = sprintf("%.2f", correlation)),
color = ifelse(abs(miss_corr_df$correlation) > 0.5, "black", "grey30"),
size = 2.5
) +
scale_fill_gradient2(
low = "#E78AC3FF",
high = "#8DA0CBFF",
midpoint = 0.5,
limits = c(0, 1),
name = "Correlation"
) +
theme_common +
theme(
axis.text.y = element_text(face = "bold", size = 8),
axis.text.x = element_text(angle = 60, hjust = 1, size = 6),
panel.grid = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank()
)
}
# Execute analysis step by step
missing_summary <- get_missing_summary(zmsp, school_exp_r)
missing_pattern_plot <- plot_missing_pattern(zmsp, school_exp_r, labels)
participant_missing <- calculate_participant_missing(zmsp, school_exp_r)
missing_correlations <- calculate_missing_correlations(zmsp, school_exp_r)
missing_corr_plot <- plot_missing_correlations(missing_correlations, labels)
# View results
missing_pattern_plot

# Function to calculate correlation matrix
calculate_correlation_matrix <- function(data, variables, method = "spearman") {
corr_matrix <- cor(as.data.frame(lapply(data[, variables], as.numeric)),
method = method, use = "pairwise.complete.obs")
return(corr_matrix)
}
# Function to visualize correlation matrix
plot_correlation_matrix <- function(corr_matrix, var_labels) {
# Prepare matrix with proper labels
labeled_matrix <- corr_matrix
rownames(labeled_matrix) <- var_labels[rownames(corr_matrix)]
colnames(labeled_matrix) <- var_labels[colnames(corr_matrix)]
# Function to truncate long names
smart_truncate <- function(names, max_len = 12) {
sapply(names, function(n) {
if (nchar(n) <= max_len) return(n)
paste0(substr(n, 1, max_len - 3), "...")
})
}
colnames(labeled_matrix) <- smart_truncate(colnames(labeled_matrix), max_len = 10)
# Create correlation plot
corrplot(labeled_matrix,
method = "shade",
order = "original",
tl.col = "black",
addgrid.col = 'white',
tl.srt = 45,
tl.cex = 0.7,
col = COL2("PuOr", 20),
col.lim = c(-0.1, 1),
diag = FALSE)
}
# Function to calculate Cronbach's alpha for subscales
calculate_reliability <- function(data, subscales) {
# Calculate correlation matrix
corr_matrix <- calculate_correlation_matrix(data, unlist(subscales))
# Calculate alpha for each subscale
alphas <- sapply(subscales, function(scale) {
result <- alpha(corr_matrix[scale, scale])
return(result$total$raw_alpha)
})
# Calculate overall alpha
overall_alpha <- alpha(corr_matrix)$total$raw_alpha
# Create summary dataframe
alpha_df <- data.frame(
Subscale = c(names(alphas), "Full Scale"),
Alpha = c(alphas, overall_alpha)
)
return(alpha_df)
}
# Function to calculate reliability by city
calculate_reliability_by_city <- function(data, subscales, cities) {
# Initialize results list
results <- list()
# Get overall reliability
overall_corr <- calculate_correlation_matrix(data, unlist(subscales))
# Calculate overall alphas
overall_alphas <- sapply(subscales, function(scale) {
alpha(overall_corr[scale, scale])$total$raw_alpha
})
overall_total_alpha <- alpha(overall_corr)$total$raw_alpha
# Create base dataframe
alpha_df <- data.frame(
Subscale = c(names(subscales), "Full Scale"),
Overall = c(overall_alphas, overall_total_alpha)
)
# Add city-specific alphas
for (city_name in cities) {
city_data <- data[data$city == city_name, ]
city_corr <- calculate_correlation_matrix(city_data, unlist(subscales))
city_alphas <- sapply(subscales, function(scale) {
alpha(city_corr[scale, scale])$total$raw_alpha
})
city_total_alpha <- alpha(city_corr)$total$raw_alpha
# Add to dataframe
alpha_df[[city_name]] <- c(city_alphas, city_total_alpha)
}
return(alpha_df)
}
# Function to analyze discriminant validity
analyze_discriminant_validity <- function(data, subscales) {
# Calculate correlation matrix
corr_matrix <- calculate_correlation_matrix(data, unlist(subscales))
# Calculate item-total correlations
item_total_cors <- list()
for (scale_name in names(subscales)) {
scale_items <- subscales[[scale_name]]
item_total <- data.frame(
Item = scale_items,
Label = labels[scale_items],
ItemTotal = NA
)
for (item in scale_items) {
# Calculate correlation between this item and sum of other items in subscale
other_items <- setdiff(scale_items, item)
item_cor <- sapply(other_items, function(other) corr_matrix[item, other])
item_total$ItemTotal[item_total$Item == item] <- mean(item_cor, na.rm = TRUE)
}
item_total_cors[[scale_name]] <- item_total
}
# Calculate cross-loadings
cross_loadings <- list()
for (scale_name in names(subscales)) {
scale_items <- subscales[[scale_name]]
cross_load_df <- data.frame(
Item = scale_items,
Label = labels[scale_items]
)
# For each other subscale, calculate average correlation
for (other_scale in setdiff(names(subscales), scale_name)) {
other_items <- subscales[[other_scale]]
cross_load_df[[other_scale]] <- sapply(scale_items, function(item) {
mean(corr_matrix[item, other_items], na.rm = TRUE)
})
}
cross_loadings[[scale_name]] <- cross_load_df
}
# Combine item-total correlations and cross-loadings
item_vs_cross <- data.frame()
for (scale_name in names(subscales)) {
scale_items <- subscales[[scale_name]]
for (item in scale_items) {
# Get item-total correlation
item_total <- item_total_cors[[scale_name]]$ItemTotal[item_total_cors[[scale_name]]$Item == item]
# Get cross-loadings for this item
cross_loads <- cross_loadings[[scale_name]][cross_loadings[[scale_name]]$Item == item,
setdiff(names(cross_loadings[[scale_name]]), c("Item", "Label"))]
# Get highest cross-loading
max_cross <- max(unlist(cross_loads), na.rm = TRUE)
max_cross_scale <- names(cross_loads)[which.max(unlist(cross_loads))]
# Add to data frame
item_vs_cross <- rbind(item_vs_cross, data.frame(
Item = item,
Label = labels[item],
Subscale = scale_name,
ItemTotal = item_total,
MaxCross = max_cross,
MaxCrossScale = max_cross_scale,
Difference = item_total - max_cross
))
}
}
# Create visualization
p <- ggplot(item_vs_cross, aes(x = ItemTotal,
y = MaxCross,
color = Subscale,
label = Item)) +
geom_point(size = 6, alpha = 0.7) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
xlim(0, 0.8) +
ylim(0, 0.8) +
scale_color_brewer(palette = "Set2") +
labs(x = "Item-Total Correlation",
y = "Highest Cross-Loading") +
theme_common
return(list(
item_total = item_total_cors,
cross_loadings = cross_loadings,
combined = item_vs_cross,
plot = p
))
}
# Define subscales for correlation and reliability analysis
subscales <- list(
"Bond to Class" = bond_class,
"Bond to Teacher" = bond_teacher,
"School Difficulties" = school_diff_r,
"Future Orientation" = future_orientation,
"School Commitment" = school_commit_r
)
# Execute correlation and reliability analyses
corr_matrix <- calculate_correlation_matrix(zmsp, school_exp_r)
reliability_all <- calculate_reliability(zmsp, subscales)
reliability_by_city <- calculate_reliability_by_city(zmsp, subscales, cities)
discriminant_validity <- analyze_discriminant_validity(zmsp, subscales)
# Create visualizations
corr_plot <- plot_correlation_matrix(corr_matrix, labels)

# Function to run EFA with different factor solutions
run_efa <- function(data, model_spec, variables, n_factors = 1:6) {
efa_fit <- efa(model_spec,
data = data,
nfactors = n_factors,
ov.names = variables,
output = "efa",
std.ov = TRUE,
ordered = TRUE)
return(efa_fit)
}
# Function to create scree plot for EFA results
create_scree_plot <- function(eigenvalues_15, eigenvalues_12) {
factor_num_15 <- 1:length(eigenvalues_15)
factor_num_12 <- 1:length(eigenvalues_12)
# Combine data
scree_data <- bind_rows(
tibble(Eigenvalue = eigenvalues_15, Factor = factor_num_15, Scale = "15-item"),
tibble(Eigenvalue = eigenvalues_12, Factor = factor_num_12, Scale = "12-item")
)
# Create scree plot
set2_colors <- RColorBrewer::brewer.pal(8, "Set2")[1:2]
p <- ggplot(scree_data, aes(x = Factor, y = Eigenvalue, color = Scale, group = Scale)) +
geom_line(linewidth = 1,
alpha = 0.7) +
geom_point(size = 3) +
geom_hline(yintercept = 1, linetype = "dashed", color = "darkgray") +
theme_common +
labs(x = "Factor Number", y = "Eigenvalue") +
scale_x_continuous(breaks = 1:15) +
scale_color_manual(values = set2_colors)
return(p)
}
# Function to visualize factor loadings heatmap
create_factor_loadings_heatmap <- function(loadings_data) {
# Convert to long format
loadings_long <- loadings_data %>%
pivot_longer(cols = -c(Item, Description),
names_to = "Factor",
values_to = "Loading") %>%
mutate(Loading = ifelse(is.na(Loading) | Loading < 0.3, NA, Loading))
# Group items by theoretical dimension
loadings_long <- loadings_long %>%
mutate(ItemGroup = case_when(
Item %in% c("k6_1803", "k6_1806", "k6_1809") ~ "Bond to Class",
Item %in% c("k6_1802", "k6_1805", "k6_1808") ~ "Bond to Teacher",
Item %in% c("k6_1801", "k6_1804", "k6_1807_r") ~ "School Commitment",
Item %in% c("k6_1810_r", "k6_1812_r", "k6_1814_r") ~ "School Difficulties",
Item %in% c("k6_1811", "k6_1813", "k6_1815") ~ "Future Orientation"
))
# Create model groups
loadings_long <- loadings_long %>%
mutate(Model = case_when(
grepl("12-4F", Factor) ~ "12-item 4-factor",
grepl("15-4F", Factor) ~ "15-item 4-factor",
grepl("15-5F", Factor) ~ "15-item 5-factor"
))
# Order factors and items
loadings_long$ItemGroup <- factor(loadings_long$ItemGroup,
levels = c("Bond to Class", "Bond to Teacher",
"School Commitment", "School Difficulties",
"Future Orientation"))
# Create heatmap
p <- ggplot(loadings_long,
aes(x = Factor, y = Description, fill = Loading)) +
geom_tile(color = "white") +
geom_text(aes(label = ifelse(!is.na(Loading),
sprintf("%.2f", Loading),
"")),
family = "Helvetica") +
scale_fill_gradientn(colors = RColorBrewer::brewer.pal(8, "Set2"),
na.value = "white",
limits = c(0, 1.02)) +
facet_grid(ItemGroup ~ Model, scales = "free", space = "free") +
theme_common +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(face = "bold")
) +
labs(x = "", y = "", fill = "Factor Loading")
return(p)
}
# Specify EFA models
efa_model_15 <- paste(school_exp_r, collapse = " + ")
efa_model_12 <- paste(school_exp_12, collapse = " + ")
# Run EFA
efa_fit_15 <- run_efa(zmsp, efa_model_15, school_exp_r)
efa_fit_12 <- run_efa(zmsp, efa_model_12, school_exp_12)
# Use hardcoded eigenvalues
eigenvalues_15 <- c(4.723, 2.013, 1.676, 1.149, 0.896, 0.726, 0.633, 0.539, 0.492, 0.453, 0.410, 0.387, 0.349, 0.319, 0.235)
eigenvalues_12 <- c(4.053, 1.897, 1.499, 1.119, 0.630, 0.547, 0.498, 0.426, 0.413, 0.357, 0.325, 0.238)
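# Note: the eigenvalues above were transcribed from the EFA output. A hedged
# alternative (sketch; assumes psych::polychoric suits these ordinal items and
# may differ slightly from lavaan's internal values) derives them from the
# polychoric correlation matrix so they stay tied to the data:
# poly_15 <- psych::polychoric(as.data.frame(lapply(zmsp[school_exp_r], as.numeric)))$rho
# eigenvalues_15 <- eigen(poly_15)$values
# poly_12 <- psych::polychoric(as.data.frame(lapply(zmsp[school_exp_12], as.numeric)))$rho
# eigenvalues_12 <- eigen(poly_12)$values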
# Create scree plot using these values
scree_plot <- create_scree_plot(eigenvalues_15, eigenvalues_12)
# Factor loadings data
loadings_data <- tibble(
Item = c("k6_1803", "k6_1806", "k6_1809", "k6_1802", "k6_1805", "k6_1808",
"k6_1801", "k6_1804", "k6_1807_r", "k6_1810_r", "k6_1812_r", "k6_1814_r",
"k6_1811", "k6_1813", "k6_1815"),
Description = c("Sense of community in class",
"Get along with classmates",
"Classmates are nice to me",
"Teacher treats me fairly",
"Get along with teacher",
"Teacher helps when needed",
"Likes going to school",
"Likes doing homework",
"Finds school useless (r)",
"Often has bad grades (r)",
"Makes mistakes in homework (r)",
"Struggles to follow lessons (r)",
"Working towards interesting job",
"Try hard at school for future job",
"Doing well at school is important"),
# Add factor loadings for different models
"12-4F: Bond to Class" = c(0.681, 0.912, 0.795, 0, 0, 0, NA, NA, NA, 0, 0, 0, 0, 0, 0),
"12-4F: Bond to Teacher" = c(0, 0, 0, 0.784, 0.783, 0.540, NA, NA, NA, 0, 0, 0, 0, 0, 0),
"12-4F: School Difficulties" = c(0, 0, 0, 0, 0, 0, NA, NA, NA, 0.618, 0.719, 0.749, 0, 0, 0),
"12-4F: Future Orientation" = c(0, 0, 0, 0, 0, 0, NA, NA, NA, 0, 0, 0, 0.675, 0.864, 0.655),
# Additional model loadings
"15-4F: Bond to Class" = c(0.667, 0.899, 0.802, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
"15-4F: Bond to Teacher" = c(0, 0, 0, 0.798, 0.770, 0.528, 0.388, 0, 0, 0, 0, 0, 0, 0, 0),
"15-4F: School Difficulties" = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.616, 0.721, 0.749, 0, 0, 0),
"15-4F: Future/Commitment" = c(0, 0, 0, 0, 0, 0, 0.307, 0.378, 0.394, 0, 0, 0, 0.668, 0.813, 0.683),
"15-5F: Bond to Class" = c(0.683, 0.906, 0.803, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
"15-5F: Bond to Teacher" = c(0, 0, 0, 0.732, 0.757, 0.540, 0, 0, 0, 0, 0, 0, 0, 0, 0),
"15-5F: School Commitment" = c(0, 0, 0, 0, 0, 0, 1.012, 0.322, 0, 0, 0, 0, 0, 0, 0),
"15-5F: School Difficulties" = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.614, 0.725, 0.767, 0, 0, 0),
"15-5F: Future Orientation" = c(0, 0, 0, 0, 0, 0, 0, 0, 0.368, 0, 0, 0, 0.686, 0.830, 0.700)
)
# Create factor loadings heatmap
loadings_heatmap <- create_factor_loadings_heatmap(loadings_data)
# View
loadings_heatmap

# Function to run CFA and extract fit measures
run_cfa <- function(data, model_spec, ordered = TRUE) {
fit <- cfa(model_spec, data = data, ordered = ordered)
return(fit)
}
# Function to extract and format fit measures
extract_fit_measures <- function(fit) {
summary(fit, fit.measures = TRUE, standardized = TRUE)
}
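# Hedged companion sketch (helper name is ours, not wired into the workflow
# below): when only the headline indices are needed, lavaan::fitMeasures()
# returns them as a named vector; the scaled index names assume the
# WLSMV-type estimator used for ordered indicators
get_fit_indices <- function(fit) {
  fitMeasures(fit, c("chisq.scaled", "df.scaled", "cfi.scaled",
                     "tli.scaled", "rmsea.scaled", "srmr"))
}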
# Run CFA for each dataset
cfa_fits <- list(
all = run_cfa(city_data$all, model),
zproso = run_cfa(city_data$zproso, model),
mproso = run_cfa(city_data$mproso, model),
spproso = run_cfa(city_data$spproso, model)
)
# Get fit measures for each model
fit_summaries <- lapply(cfa_fits, extract_fit_measures)
# Function to create factor loadings visualization
plot_factor_loadings <- function(fit_model) {
# Extract standardized loadings
std_est <- standardizedSolution(fit_model) %>%
filter(op == "=~", !grepl("^experience$", lhs)) # Only first-order factor loadings
# Check if this is a multi-group model
is_multigroup <- "group" %in% names(std_est)
# Process differently based on whether it's a multi-group model
if (is_multigroup) {
# Add group labels for multi-group model
group_labels <- lavInspect(fit_model, "group.label")
std_est_labeled <- std_est %>%
mutate(
group = factor(group, labels = group_labels),
item_label = labels[rhs],
factor_label = labels[lhs],
# Format for better display
item_label = stringr::str_replace_all(labels[rhs], "(\\w+\\s+\\w+)\\s+", "\\1\n"),
group = factor(group, levels = c("São Paulo", "Zurich", "Montevideo"), ordered = TRUE)
)
# Create heatmap for multi-group model
p <- ggplot(std_est_labeled, aes(x = group,
y = item_label,
fill = est.std)) +
geom_tile() +
geom_text(aes(label = sprintf("%.2f", est.std)), color = "black", size = 3) +
facet_wrap(~ factor_label, scales = "free_y") +
scale_fill_gradient2(
low = "#E5C494FF", mid = "white", high = "#E78AC3FF",
midpoint = 0.75, name = "Standardized\nFactor Loading", limits = c(0.5, 1)
) +
labs(x = "", y = "") +
theme_common
} else {
# For single-group model, create a different visualization
std_est_labeled <- std_est %>%
mutate(
item_label = labels[rhs],
factor_label = labels[lhs],
# Format for better display
item_label = stringr::str_replace_all(labels[rhs], "(\\w+\\s+\\w+)\\s+", "\\1\n")
)
# Create barplot for single-group model
p <- ggplot(std_est_labeled, aes(x = item_label,
y = est.std)) +
geom_bar(stat = "identity",
fill = "#E78AC3FF") +
geom_text(aes(label = sprintf("%.2f", est.std)),
position = position_stack(vjust = 0.5),
color = "black", size = 3) +
facet_wrap(~ factor_label, scales = "free_x") +
labs(x = "", y = "Standardized Factor Loading") +
theme_common +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
}
return(p)
}
# Create visualizations
loadings_plot <- plot_factor_loadings(cfa_fits$all)
# View
loadings_plot

# Fit measurement invariance models across cities
invariance_models <- list(
configural = cfa(model, data = zmsp, ordered = TRUE, group = "city"),
metric = cfa(model, data = zmsp, ordered = TRUE, group = "city",
group.equal = "loadings"),
scalar = cfa(model, data = zmsp, ordered = TRUE, group = "city",
group.equal = c("intercepts", "loadings")),
thresholds = cfa(model, data = zmsp, ordered = TRUE, group = "city",
group.equal = c("thresholds", "loadings"))
)

# Compare nested invariance models
invariance_comparisons <- list(
metric_vs_configural = compareFit(invariance_models$metric,
invariance_models$configural),
scalar_vs_metric = compareFit(invariance_models$scalar,
invariance_models$metric),
full = compareFit(invariance_models$thresholds,
invariance_models$scalar,
invariance_models$metric,
invariance_models$configural)
)

# Extract fit measures for all models
invariance_fit_measures <- lapply(invariance_models, fitMeasures)
# Process fit measures for table and plotting
fit_measures_df <- do.call(rbind, invariance_fit_measures) %>%
as.data.frame() %>%
mutate(Model = rownames(.)) %>%
select(Model, chisq, df, cfi.scaled, tli.scaled, rmsea.scaled, srmr) %>%
setNames(c("Model", "Chi_Square", "df", "CFI", "TLI", "RMSEA", "SRMR")) %>%
mutate(
Delta_ChiSq = c(NA, diff(Chi_Square)),
Delta_CFI = c(NA, diff(CFI)),
Delta_TLI = c(NA, diff(TLI)),
Delta_RMSEA = c(NA, diff(RMSEA)),
Delta_SRMR = c(NA, diff(SRMR))
)
# Function to create measurement invariance table
create_mi_table <- function(fit_df) {
kable(fit_df %>%
mutate(across(c(CFI, TLI, RMSEA, SRMR, Delta_ChiSq, Delta_CFI, Delta_TLI, Delta_RMSEA, Delta_SRMR),
~ sprintf("%.3f", .x))),
caption = "Measurement Invariance Results: 12 items, 2nd order model",
col.names = c("Model", "χ²", "df", "CFI", "TLI", "RMSEA", "SRMR",
"Δχ²", "ΔCFI", "ΔTLI", "ΔRMSEA", "ΔSRMR")) %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
}
# Function to plot fit measure deltas
plot_fit_deltas <- function(fit_df) {
delta_fit <- fit_df %>%
select(Model, Delta_CFI, Delta_TLI, Delta_RMSEA, Delta_SRMR) %>%
tidyr::pivot_longer(-Model, names_to = "Fit_Metric", values_to = "Delta_Value") %>%
filter(!is.na(Delta_Value))
ggplot(delta_fit, aes(x = Delta_Value, y = Model, fill = Fit_Metric)) +
geom_bar(stat = "identity", position = "dodge") +
geom_vline(xintercept = c(-0.01, 0.01, 0.015),
linetype = "dashed", color = "black", linewidth = 0.8) +
scale_fill_paletteer_d("nationalparkcolors::Badlands") +
scale_x_continuous(limits = c(-0.015, 0.015),
breaks = seq(-0.015, 0.015, by = 0.005)) +
labs(x = "Model Comparison", y = "Change in Fit Index", fill = "Fit Index") +
theme_common
}
# Function to plot thresholds
plot_thresholds <- function(fit_model) {
# Extract threshold parameters
thresholds_config <- parameterEstimates(fit_model) %>%
filter(op == "|") %>%
select(lhs, rhs, group, est) %>%
mutate(
lhs = factor(lhs),
rhs = factor(rhs, levels = c("t1", "t2", "t3")),
group = factor(group, labels = lavInspect(fit_model, "group.label"))
)
# Add labels
levels(thresholds_config$lhs) <- labels[levels(thresholds_config$lhs)]
# Plot
ggplot(thresholds_config, aes(x = rhs, y = est, color = group, group = group)) +
geom_point(size = 3) +
geom_line() +
facet_wrap(~ lhs, scales = "fixed", labeller = labeller(lhs = labels)) +
scale_color_brewer(palette = "Set2", name = "City") +
labs(x = "Threshold", y = "Estimate") +
theme_common
}

# Visualizations and tables
mi_table <- create_mi_table(fit_measures_df)
fit_deltas_plot <- plot_fit_deltas(fit_measures_df)
thresholds_plot <- plot_thresholds(invariance_models$configural)
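# Optional check (added sketch): summary() on the semTools::compareFit object
# built above prints the nested chi-square difference tests alongside the
# fit-index changes
summary(invariance_comparisons$full)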
# Create model structure visualizations
plot_model_structure <- function(invariance_model) {
# Extract standardized factor loadings by group
loadings_by_group <- standardizedSolution(invariance_model, type = "std.all") %>%
filter(op == "=~")
# Create custom node and edge dataframes for network visualization
nodes <- data.frame(
id = unique(c(loadings_by_group$lhs, loadings_by_group$rhs)),
label = sapply(unique(c(loadings_by_group$lhs, loadings_by_group$rhs)),
function(n) ifelse(n %in% names(labels), labels[n], n)),
type = case_when(
unique(c(loadings_by_group$lhs, loadings_by_group$rhs)) == "experience" ~ "second_order",
unique(c(loadings_by_group$lhs, loadings_by_group$rhs)) %in% c("class", "teacher", "diffic", "future") ~ "first_order",
TRUE ~ "indicator"
)
)
edges <- loadings_by_group %>%
select(from = lhs, to = rhs, weight = est.std, group)
# Create a list to store the individual plots
city_plots <- list()
# Colors for node types
node_colors <- c("second_order" = "black",
"first_order" = "grey60",
"indicator" = "white")
# Create separate plots for each city
for (g in unique(edges$group)) {
g_edges <- edges %>% filter(group == g)
g_graph <- graph_from_data_frame(g_edges, vertices = nodes)
city_name <- c("São Paulo", "Zurich", "Montevideo")[g]
# Create a new dataframe with wrapped labels
node_labels <- nodes$label
# Shorten and wrap indicator labels
wrapped_labels <- sapply(1:length(node_labels), function(i) {
label <- node_labels[i]
node_id <- nodes$id[i]
node_type <- nodes$type[i]
if (node_type == "indicator") {
# Abbreviate long labels for indicators
if (nchar(label) > 20) {
# First try splitting into multiple lines
wrapped <- stringr::str_wrap(label, width = 15)
# If still too long, truncate
if (nchar(wrapped) > 40) {
wrapped <- paste0(substr(label, 1, 17), "...")
}
return(wrapped)
} else {
return(label)
}
} else {
# For factor nodes, keep as is but bold
return(label)
}
})
# Add the wrapped labels to the graph
V(g_graph)$wrapped_label <- wrapped_labels[match(V(g_graph)$name, nodes$id)]
# Set a seed for reproducible layout
set.seed(42)
# Create the plot
p <- ggraph(g_graph, layout = "stress", weights = E(g_graph)$weight) +
geom_edge_link(aes(width = weight,
alpha = weight,
color = weight,
label = sprintf("%.2f", weight)),
arrow = arrow(length = unit(3, "mm"), type = "closed"),
end_cap = circle(4, "mm"),
start_cap = circle(2, "mm"),
angle_calc = "along",
label_dodge = unit(2.5, "mm"),
label_size = 2,
label_colour = "black") +
geom_node_point(aes(fill = type, size = type),
shape = 21,
color = "black",
stroke = 1,
show.legend = FALSE) +
geom_node_text(aes(label = wrapped_label),
repel = TRUE,
size = 3,
bg.color = "white",
bg.r = 0.15,
segment.color = "gray50",
segment.size = 0.4,
segment.alpha = 0.6,
fontface = ifelse(V(g_graph)$type == "indicator", "plain", "bold"),
max.overlaps = 20) +
scale_edge_width(range = c(0.5, 3), guide = "none") +
scale_edge_alpha(range = c(0.6, 1), guide = "none") +
scale_edge_color_steps2(low = "white",
mid = "#E5C494FF",
high = "#E78AC3FF",
name = "Factor Loading",
midpoint = 0.5,
limits = c(0, 1),
guide = if(g < 3) "none" else "legend") +
scale_fill_manual(values = node_colors, guide = "none") +
scale_size_manual(values = c("second_order" = 15,
"first_order" = 10,
"indicator" = 6),
guide = "none") +
labs(title = paste(city_name)) +
theme_graph(base_family = "Helvetica") +
theme(
text = element_text(family = "Helvetica"),
legend.position = if(g < 3) "none" else "bottom",
plot.title = element_text(hjust = 0, size = 14, face = "bold"),
plot.margin = margin(5, 5, 5, 5)
) +
coord_cartesian(clip = "off") +
expand_limits(x = c(-1.5, 1.5), y = c(-1.5, 1.5))
# Store the plot
city_plots[[g]] <- p
}
# Combine the plots
combined_plot <- city_plots[[1]] + city_plots[[2]] + city_plots[[3]] +
plot_layout(ncol = 3) +
plot_annotation(
theme = theme(plot.margin = margin(5, 5, 5, 5))
)
return(combined_plot)
}
# Execute
model_structure_plot <- plot_model_structure(invariance_models$scalar)
# View
model_structure_plot

# Function to extract latent scores
extract_latent_scores <- function(fit_model) {
# Get the list of EBM scores (one matrix per city)
fs_list <- lavPredict(fit_model, type = "lv", method = "EBM")
# Turn each group's matrix into a data frame that includes ID
fs_dfs <- lapply(seq_along(fs_list), function(g) {
mat <- fs_list[[g]]
idx <- fit_model@Data@case.idx[[g]]
data.frame(
ID = zmsp$ID[idx],
mat,
check.names = FALSE
)
})
# Stack them into one data frame
fs_df <- bind_rows(fs_dfs)
return(fs_df)
}
# Function to prepare long-format data for latent scores
prepare_latent_scores_long <- function(data) {
data %>%
select(ID, city, class, teacher, diffic, future, experience) %>%
pivot_longer(
cols = c(class, teacher, diffic, future, experience),
names_to = "factor",
values_to = "score"
) %>%
mutate(
factor_type = ifelse(factor == "experience", "Second-Order", "First-Order"),
factor = factor(factor, levels = c("experience", "class", "teacher", "diffic", "future")),
factor_label = case_when(
factor == "experience" ~ "School Experience",
factor == "class" ~ "Bond to Class",
factor == "teacher" ~ "Bond to Teacher",
factor == "diffic" ~ "School Difficulties (reversed)",
factor == "future" ~ "Future Orientation"
)
)
}
# Function to calculate summary statistics
calculate_factor_summaries <- function(long_data) {
long_data %>%
group_by(city, factor, factor_label, factor_type) %>%
summarize(
mean_score = mean(score, na.rm = TRUE),
median_score = median(score, na.rm = TRUE),
sd_score = sd(score, na.rm = TRUE),
q25 = quantile(score, 0.25, na.rm = TRUE),
q75 = quantile(score, 0.75, na.rm = TRUE),
n = n(),
# Calculate standard error
se = sd_score / sqrt(n),
# Calculate confidence intervals
ci_lower = mean_score - 1.96 * se,
ci_upper = mean_score + 1.96 * se,
.groups = "drop"
)
}
# Function to run ANOVA and post-hoc tests
analyze_latent_var <- function(var_name, data) {
# Fit the one-way ANOVA
fit <- lm(as.formula(paste0(var_name, " ~ city")), data = data)
# ANOVA table
aov_tab <- anova(fit)
# Compute eta-squared
es_tab <- eta_squared(fit, partial = TRUE)
# Get the right column name
es_col <- intersect(c("Eta2_partial", "Eta2"), names(es_tab))[1]
# Extract city effect size
city_eta2 <- es_tab %>%
filter(Parameter == "city") %>%
pull(!!sym(es_col))
# Estimated marginal means and Tukey post-hoc
ems <- emmeans(fit, ~ city)
tukey <- pairs(ems, adjust = "tukey")
# Return all results in a list
list(
anova = aov_tab,
eta2 = city_eta2,
emmeans = ems,
contrasts = tukey
)
}
# Function to plot latent means
plot_latent_means <- function(summaries) {
ggplot(summaries, aes(x = factor_label, y = mean_score, fill = city)) +
geom_col(position = position_dodge(width = 0.9), width = 0.8, alpha = 0.7) +
geom_errorbar(
aes(ymin = ci_lower, ymax = ci_upper),
position = position_dodge(width = 0.9),
width = 0.25
) +
facet_grid(. ~ factor_type, space = "free", scales = "free_x") +
scale_fill_brewer(palette = "Set2") +
scale_x_discrete(labels = function(x) {
str_replace_all(x, "(\\w+\\s+\\w+)\\s+", "\\1\n")
}) +
labs(x = NULL, y = "Mean Latent Score", fill = "City") +
theme_common
}
# Function to plot latent score distributions
plot_latent_distributions <- function(long_data) {
ggplot(long_data, aes(x = score, y = factor_label, fill = city)) +
geom_density_ridges(alpha = 0.7, scale = 1) +
facet_grid(factor_type ~ ., scales = "free_y", space = "free") +
scale_fill_brewer(palette = "Set2") +
scale_color_brewer(palette = "Set2") +
guides(color = "none") +
labs(x = "Latent Score", y = NULL, fill = "City") +
theme_common
}
# Function to calculate effect sizes between cities
calculate_effect_sizes <- function(long_data) {
# Group data
grouped_data <- long_data %>%
group_by(factor, factor_label, factor_type)
# Get unique groups
groups <- grouped_data %>%
group_keys()
# Calculate effect sizes for each group
effect_sizes <- map_dfr(1:nrow(groups), function(i) {
# Get current group data
grp_data <- grouped_data %>%
filter(factor == groups$factor[i])
# Get data for each city
city_data <- split(grp_data$score, grp_data$city)
cities <- names(city_data)
# Calculate Cohen's d for each pair of cities
result <- map_dfr(1:(length(cities)-1), function(j) {
map_dfr((j+1):length(cities), function(k) {
city1 <- cities[j]
city2 <- cities[k]
x <- city_data[[city1]]
y <- city_data[[city2]]
# Calculate means and SDs
mean_x <- mean(x, na.rm = TRUE)
mean_y <- mean(y, na.rm = TRUE)
sd_x <- sd(x, na.rm = TRUE)
sd_y <- sd(y, na.rm = TRUE)
# Calculate pooled SD
pooled_sd <- sqrt(((length(x) - 1) * sd_x^2 + (length(y) - 1) * sd_y^2) /
(length(x) + length(y) - 2))
# Calculate Cohen's d
cohens_d <- (mean_x - mean_y) / pooled_sd
# Return results
tibble(
factor = groups$factor[i],
factor_label = groups$factor_label[i],
factor_type = groups$factor_type[i],
comparison = paste(city1, "vs", city2),
city1 = city1,
city2 = city2,
mean_diff = mean_x - mean_y,
cohens_d = cohens_d
)
})
})
return(result)
})
# Create effect size heatmap
effect_size_heatmap <- effect_sizes %>%
select(factor_label, factor_type, comparison, cohens_d) %>%
mutate(effect_magnitude = case_when(
abs(cohens_d) < 0.2 ~ "Negligible",
abs(cohens_d) < 0.5 ~ "Small",
abs(cohens_d) < 0.8 ~ "Medium",
TRUE ~ "Large"
))
# Plot effect sizes
p <- ggplot(effect_size_heatmap, aes(x = comparison, y = factor_label, fill = cohens_d)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = sprintf("%.2f", cohens_d)), color = "black", size = 3) +
scale_fill_gradient2(
low = "#E5C494FF",
mid = "white",
high = "#E78AC3FF",
midpoint = 0,
name = "Cohen's d"
) +
facet_grid(factor_type ~ ., scales = "free_y", space = "free") +
labs(x = NULL, y = NULL) +
theme_common
return(list(
effect_sizes = effect_sizes,
heatmap = effect_size_heatmap,
plot = p
))
}

# Extract latent scores from the scalar invariance model and merge into the data
latent_scores <- extract_latent_scores(invariance_models$scalar)
zmsp_with_scores <- zmsp %>% left_join(latent_scores, by = "ID")
latent_scores_long <- prepare_latent_scores_long(zmsp_with_scores)
factor_summaries <- calculate_factor_summaries(latent_scores_long)
# Run ANOVA and post-hoc tests for each latent variable
infer_results <- lapply(latent_vars, analyze_latent_var, data = zmsp_with_scores)
names(infer_results) <- latent_vars
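# Example (added illustration): each element of infer_results holds the ANOVA
# table, eta-squared, estimated marginal means, and Tukey contrasts; e.g. for
# the second-order factor:
infer_results$experience$anova
infer_results$experience$contrasts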
# Calculate effect sizes
effect_size_results <- calculate_effect_sizes(latent_scores_long)

AI Assistance Disclosure: We used Claude (Anthropic)
as a technical programming assistant primarily for R code development
and implementation. The assistance focused on: (1) debugging complex R
workflows using lavaan and semTools packages,
(2) creating comprehensive data visualization scripts using
ggplot2 and related packages, and (3) structuring
reproducible analysis workflows and HTML report compilation. All
research design decisions, theoretical interpretations, model
specifications, and substantive conclusions remain entirely our
intellectual contribution, with AI assistance serving as a technical
programming tool analogous to advanced statistical software
documentation with interactive problem-solving capabilities.