## Load packages ################################################################
list.of.packages <- c("rstatix", "dplyr", "tidyr","tidyverse", "effectsize", "ggalluvial", "ggpubr", "readxl", "gganimate", "paletteer", "Cairo")
#################################################################################
## Load scan_type data with header info
kari_raw <- read_csv(paste0(inPATH,"KARI_scan_mapping_DF25.csv"))

## Trim Kari
kari <- kari_raw %>% 
  mutate(AcquisitionDateTime = as.POSIXct(AcquisitionDateTime, format = "%Y-%m-%d %H:%M:%S")) %>% 
  mutate(MRdate = as.Date(AcquisitionDateTime), Time = format(AcquisitionDateTime, format = "%H:%M:%S")) %>%
  filter(grepl("MR", Modality)) %>%
  drop_na(MRdate) %>%
  select(MAPID=subject_label, experiment_label, scan_type, MRdate) 

# ## Identify mismatches between experiment_label and MRdate
# kari_inconsistent <- kari %>%
#   drop_na() %>%
#   group_by(experiment_label) %>%
#   filter(n_distinct(MRdate) > 1) %>%
#   ungroup()

## Collapse across experiment_label while retaining the MRdate column
mri <- kari %>%
  group_by(experiment_label) %>%
  distinct(experiment_label, .keep_all = TRUE) %>%
  select(MAPID, experiment_label, MRdate) %>%
  rename(ID = MAPID, Session = experiment_label, Date = MRdate) %>%
  ungroup()

## Add PET data
## Amyloid
av45 <- read_excel(paste0(dfPATH,"av45.xlsx"))
av45 <- av45 %>%
  mutate(PET_Date = as.Date(PET_Date)) %>%
  group_by(PET_Session) %>%
  distinct(PET_Session, .keep_all = TRUE) %>%
  select(ID, PET_Session, PET_Date) %>%
  rename(Session = PET_Session, Date = PET_Date) %>%
  ungroup()

fbb <- read_excel(paste0(dfPATH,"fbb.xlsx"))
fbb <- fbb %>%
  mutate(PET_Date = as.Date(PET_Date)) %>%
  group_by(PET_Session) %>%
  distinct(PET_Session, .keep_all = TRUE) %>%
  select(ID, PET_Session, PET_Date) %>%
  rename(Session = PET_Session, Date = PET_Date) %>%
  ungroup()

pib <- read_excel(paste0(dfPATH,"pib.xlsx"))
pib <- pib %>%
  mutate(PET_Date = as.Date(PET_Date)) %>%
  group_by(PET_Session) %>%
  distinct(PET_Session, .keep_all = TRUE) %>%
  select(ID, PET_Session, PET_Date) %>%
  rename(Session = PET_Session, Date = PET_Date) %>%
  ungroup()

## Combine ID and PET_Session for each av45, fbb, and pib
amy <- bind_rows(
  av45 %>% select(ID, Session, Date),
  pib %>% select(ID, Session, Date),
  fbb %>% select(ID, Session, Date)
)

## Tau
tau <- read_excel(paste0(dfPATH,"tau.xlsx"))
tau <- tau %>%
  mutate(PET_Date = as.Date(PET_Date)) %>%
  group_by(PET_Session) %>%
  distinct(PET_Session, .keep_all = TRUE) %>%
  select(ID, PET_Session, PET_Date) %>%
  rename(Session = PET_Session, Date = PET_Date) %>%
  ungroup()

## FDG
fdg <- read_excel(paste0(dfPATH,"fdg.xlsx"))
fdg <- fdg %>%
  mutate(PET_Date = as.Date(PET_Date)) %>%
  group_by(PET_Session) %>%
  distinct(PET_Session, .keep_all = TRUE) %>%
  select(ID, PET_Session, PET_Date) %>%
  rename(Session = PET_Session, Date = PET_Date) %>%
  ungroup()

## Combine all rows from mri, amy, tau, and fdg 
combined_df <- bind_rows(mri, amy, tau, fdg)

# Remove duplicate Session values, keeping only the first occurrence
combined_df <- combined_df %>% distinct(Session, .keep_all = TRUE)

## Demographics
dem <- read_excel(paste0(clinPATH,"demographics.xlsx")) %>%
  select(ID, SES, BIRTH, DEATH, ENTDATE, EDUC, HAND, grantc, race, sex)

## Clinical
cdr <- read_excel(paste0(clinPATH,"b4_cdr.xlsx")) %>%
  select(ID, TESTDATE, cdr)%>%
  mutate(CDR_Date = as.Date(TESTDATE, origin = "1899-12-30"))%>%
  select(-TESTDATE)

## APOE
apoe <- read_excel(paste0(genPATH,"apoe.xlsx"))%>%
  select(ID=id, apoe)
############################################################################
############################################################################
## Add CDR data from clinical
## Ensure Date columns are in Date format
cdr$CDR_Date <- as.Date(cdr$CDR_Date)

## Create an empty column for the closest Date and CDR
combined_df_cdr <- combined_df
combined_df_cdr$ClosestCDR <- NA
combined_df_cdr$CDR <- NA

# Loop through each row in combined_df_cdr
for (i in 1:nrow(combined_df_cdr)) {
  img_ID <- combined_df_cdr$ID[i]
  ## Test if img_ID is in clin$ID
  if (img_ID %in% cdr$ID) {
  } else {
    next
  }
  # Subset clin for the current participant
  subset_cdr <- cdr[cdr$ID == combined_df_cdr$ID[i], ]
  
  # Find the index of the closest Date
  closest_index <- which.min(abs(subset_cdr$CDR_Date - combined_df_cdr$Date[i]))
  
  # Assign the closest Date and CDR to the data frame
  combined_df_cdr$ClosestCDR[i] <- subset_cdr$CDR_Date[closest_index]
  combined_df_cdr$CDR[i] <- subset_cdr$cdr[closest_index]
}

combined_df_cdr$ClosestCDR <- as.Date(combined_df_cdr$ClosestCDR)
combined_df_cdr <- combined_df_cdr %>% drop_na(CDR)

## Calculate time between CDR date and scan date
combined_df_cdr$CDR_gap <- as.numeric(difftime(combined_df_cdr$Date, combined_df_cdr$ClosestCDR, units = "days") / 365.25)

## Remove rows where CDR is more than 2 years from MR date
combined_df_cdr <- combined_df_cdr %>% filter(abs(CDR_gap) <= 2)

############################################################################
############################################################################
## Pull Closest CDR for each ID
combined_df_cdr_max <- combined_df_cdr %>%
  group_by(ID) %>%
  summarize(CDR = max(CDR, na.rm = TRUE), across(everything(), first)) %>%
  select(ID, Session, Date, ClosestCDR, CDR_gap, CDR) %>%
  mutate(CDR_factor = case_when(CDR == 0 ~ "Unimpaired", CDR >= 0.5 ~ "Impaired")) %>%
  ungroup()

## Combine demographic data
combined_df_cdr_max_dem <- combined_df_cdr_max %>%
    left_join(dem, by = c("ID" = "ID"))

## Calculate age at scan
combined_df_cdr_max_dem$Scan_Age <- as.numeric(difftime(combined_df_cdr_max_dem$Date, combined_df_cdr_max_dem$BIRTH, units = "days")/365.25)

## Calculate the min, max, and standard deviation for Scan_Age
age_range <- range(combined_df_cdr_max_dem$Scan_Age)
age_sd <- sd(combined_df_cdr_max_dem$Scan_Age)

#################################################################################
#################################################################################
## Alluvial plot (age, sex, race, CDR)
## Create age group break points for 3 groups
breaks <- quantile(combined_df_cdr_max_dem$Scan_Age, probs = c(0, 1/3, 2/3, 1),names=FALSE)
agemin <- breaks[1]
agelower <- breaks[2]
ageupper <- breaks[3]
agemax <- breaks[4]

strlow <- paste0(floor(agemin),'-',floor(agelower))
strmid <- paste0(floor(agelower),'-',floor(ageupper))
strhigh <- paste0(floor(ageupper),'-',floor(agemax))

## Build data frame for alluvial plot
alluvial.df <- combined_df_cdr_max_dem %>%
  select(CDR, Scan_Age, sex, race)%>%
  mutate(Age = factor(case_when(Scan_Age >= agemin & Scan_Age <= agelower ~ strlow, Scan_Age >= agelower & Scan_Age <= ageupper ~ strmid, Scan_Age >= ageupper & Scan_Age <= agemax ~ strhigh)), Sex = factor(str_replace_all(sex, c("M" = "Male", "F" = "Female"))), CDR = factor(case_when(CDR == 0 ~ "Unimpaired", CDR >= 0.5 ~ "Impaired")), Race = if_else(race %in% c("White", "Black", "ASIAN", "more than one"), race,"Non-White")) %>%
  mutate(Race = factor(str_replace_all(Race, c("Black" = "Non-White", "ASIAN" = "Non-White", "more than one" = "Non-White"))))

  ## Reorder factors for CDR so that Unimpaired is first
alluvial.df$Race <- factor(alluvial.df$Race, levels = c("White", "Non-White"))
alluvial.df$CDR <- factor(alluvial.df$CDR, levels = c("Impaired","Unimpaired"))
alluvial.df$Age <- factor(alluvial.df$Age, levels = c(strhigh, strmid, strlow))

colorfill <- c("#6F4070FF", "#EDAD08FF")

## Attempt 2
ggplot(alluvial.df, aes(axis1 = Age, axis2 = Sex, axis3 = Race, axis4 = CDR)) +
  scale_x_discrete(limits = c("Age", "Sex", "Race", "CDR")) +
  geom_alluvium(aes(fill = CDR), stat = "alluvium", aes.bind = "alluvia") +
  geom_stratum(aes(fill = CDR), alpha = 1) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 4) +
  labs(title = "Demographics According to CDR", y = "Number of Participants") + 
  scale_fill_manual(values = colorfill, na.translate = FALSE) +
  theme(panel.border = element_blank(), axis.line = element_line(),  
        plot.title = element_text(hjust = 0.5),  
        panel.background = element_rect(fill = "transparent", colour = NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        axis.text = element_text(size = 12), axis.title = element_text(size=12), legend.position = "none")