###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# ANALYSIS FOR STUDY ####
# "The gut microbiome, single nucleotide polymorphisms, and differentially expressed genes promote aggression in an ant" ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## ARTICLE AUTHORS:
#Patrick Krapf, Francesco Cicconardi, Martin Schilling, Gerhard P. Aigner, Thomas Klammsteiner, Manfred Ayasse, Wolfgang Arthofer, Sasha (Alexander) Mikheyev, Birgit C. Schlick-Steiner, Florian M. Steiner


###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Install and load all packages needed####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
packages_needed <- c("ExactMultinom",
                     "readxl",
                     "rnaturalearth",
                     "rnaturalearthdata",
                     "ggspatial",
                     "sf",
                     "ggmap",
                     "ggplot2",
                     "stringr",
                     "ggsn",
                     "dplyr",
                     "forcats",
                     "ggfortify",
                     "factoextra",
                     "cluster",
                     "qqman",
                     "data.table",
                     "tidyverse",
                     "ggpubr",
                     "vegan",
                     "MASS",
                     "edgeR",
                     "DESeq2",
                     "Glimma",
                     "ggvenn",
                     "raster",
                     "rasterVis",
                     "rworldxtra",
                     "foreign",
                     "nnet",
                     "reshape2",
                     "emmeans",
                     "lmtest",
                     "AICcmodavg",
                     "DescTools",
                     "tidyr",
                     "ggrepel",
                     "broom"
)

#The code below, checks which packages are needed, whether they are installed. If not they are downloaded.
pk_to_install <- packages_needed [!( packages_needed %in% rownames(installed.packages())  )]
if(length(pk_to_install)>0 ){
  install.packages(pk_to_install,repos="http://cran.r-project.org")
}

#Now, all packages are loaded. Not necessarily needed, as packages will be loaded specifically for each analysis as well.
lapply(packages_needed, require, character.only = TRUE)
#Now, all libraries should be loaded. You can check this, by using the search() function
search()



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Blind bioassays / Recognition assays ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## Below, we analyse the results of the blind bioassays using a Multinomial Goodness-of-Fit Tests
## Note, we don't need to load this data as it is count data added below.
library(ExactMultinom)

observed_counts <- c(502, 423, 454) # observed counts for staying on own, alien, control areas on filter plates
expected_counts <- rep(sum(observed_counts)/3, 3) # expected counts assuming equal probabilities

# Perform the multinomial test
result <-  ExactMultinom::multinom.test(x = observed_counts, p = expected_counts)
# Print the results
print(result)



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Fig 1 - Map + inset ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###

###---###---###---###---###---###---###
## Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(readxl); library("rnaturalearth"); library("rnaturalearthdata"); library("ggspatial"); library("sf"); library("ggmap"); library(ggplot2); library("stringr"); library("ggsn") 

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
dataset <- read_xlsx("ForR_DNA_samples.xlsx", 
                     sheet="EnvironmentalVars") #
head(dataset)

###---###---###---###---###---###---###
## Data tidying ####
###---###---###---###---###---###---###
dataset_map <- matrix(NA, nrow=9, ncol=1)
dataset_map <- as.data.frame(dataset_map)
dataset_map$lat <- unique(dataset$lat)
dataset_map$lon <- unique(dataset$lon)
dataset_map$V1 <- unique(dataset$nest)
dataset_map

###---###---###---###---###---###---###
## Load Europe tiles with stadia maps ####
###---###---###---###---###---###---###
##Europe
area <- c(left = 0, bottom = 35, right = 30 , top = 57) #all Europe  
map2 <- get_stadiamap(area, zoom = 5, maptype = "stamen_toner_lite", color="color")  #, force = T


###---###---###---###---###---###---###
### Save pdf ####
###---###---###---###---###---###---###
pdf("Fig1A_Europe_Plot_MS.pdf")
#Europe_PW_20240610 former map
ggmap(map2) +
  ylab("Latitude")+
  xlab("Longitude")+ 
  geom_point(aes(y=dataset$lat[1], x=dataset$lon[1]), size=4, fill="#7b7bc0ff", colour="black",pch=21) +  
  geom_point(aes(y=dataset$lat[28], x=dataset$lon[28]) , size=4, fill="#32cd32ff", colour="black",pch=21) + 
  geom_point(aes(y=dataset$lat[91], x=dataset$lon[91]) , size=4, fill="#ff6820ff", colour="black",pch=21) + 
  scalebar(dataset, dist = 50, dist_unit = "km", transform = TRUE, model = "WGS84", location="bottomright", height=0.1)+
  theme(plot.title = element_text(size=25),
        axis.title.x = element_text(size=17),
        axis.text.x  = element_text(size=13, color="black"),
        axis.title.y = element_text(size=17),
        axis.text.y  = element_text(size=13, color="black"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.grid.minor.x = element_blank())
dev.off()


###---###---###---###---###---###---###
## Create inset with detailed locations ####
###---###---###---###---###---###---###
area <- c(left = 10.5, bottom = 46.50, right = 13.50 , top = 47.5) #Austria inset
map3 <- get_stadiamap(area, zoom = 9, maptype = "stamen_terrain_background", color="color")  

###---###---###---###---###---###---###
### Save pdf ####
###---###---###---###---###---###---###
pdf("Fig1A_Inset_PlotAustria_MS.pdf")
ggmap(map3)+
  ylab("Latitude")+
  xlab("Longitude")+
  geom_point(aes(y=dataset_map$lat[1], x=dataset_map$lon[1]), size=4, fill="#7b7bc0ff", colour="black",pch=21) +  #MQ-N
  geom_point(aes(y=dataset_map$lat[4], x=dataset_map$lon[4]), size=4, fill="#32cd32ff", colour="black",pch=21) + #SQ-N
  geom_point(aes(y=dataset_map$lat[9], x=dataset_map$lon[9]), size=4, fill="#f6820ff6", colour="black",pch=21) + #SQ-A
  geom_segment(aes(x = dataset_map$lon[4], y = dataset_map$lat[4], xend = dataset_map$lon[9], yend = dataset_map$lat[9])) +
  geom_segment(aes(x = dataset_map$lon[4], y = dataset_map$lat[4], xend = dataset_map$lon[3], yend = dataset_map$lat[3])) +
  geom_segment(aes(x = dataset_map$lon[3], y = dataset_map$lat[3], xend = dataset_map$lon[9], yend = dataset_map$lat[9])) +
  geom_point(aes(y= 47.259659, x=11.400375) , size=4, fill="black", colour="black",pch=21) +  #Innsbruck
  annotate("text", y=47.259659, x=11.400375+0.3, label="Innsbruck", size=6, colour="black") + 
  annotate("text", x = 11.25, y = 47.0, label = "40 km") + 
  annotate("text", x = 12.0, y = 47.0, label = "140 km") + 
  annotate("text", x = 12.25, y = 46.7, label = "125 km") + 
  scalebar(dataset, dist = 50, dist_unit = "km", transform = TRUE, model = "WGS84", location="bottomright", height=0.1)+
  theme(plot.title = element_text(size=25),
        axis.title.x = element_text(size=17),
        axis.text.x  = element_text(size=13, color="black"),
        axis.title.y = element_text(size=17),
        axis.text.y  = element_text(size=13, color="black"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.grid.minor.x = element_blank())
dev.off()

## Note: Fig1 B (schematic overview) was crated in BioRender and was added manually to Fig1.
#END Fig1



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Aggression analysis ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
##Below, we will analyse the data from the aggression assays. 

###---###---###---###---###---###---###
## Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(readxl); library(dplyr); library(ggplot2); library(forcats)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
dataset <- read_xlsx("ForR_DNA_samples.xlsx", 
                     sheet="samples109_") #
head(dataset)

#reduced dataset
dataset_red <- dataset %>%
  filter(seq == "part1")  %>%
  as_tibble()


###---###---###---###---###---###---###
## Create Figure for detailed aggression analysis ####
###---###---###---###---###---###---###

#Create the plot labels
labelsPlot <- c("Reacted aggressively", "Reacted peacefully", "Started aggression")
names(labelsPlot) <- c("react_agg", "react_pcf", "start_agg")
dataset_red$beh  <-  fct_relevel(dataset_red$beh,
                                 "start_agg", "react_agg", "react_pcf")  # custom order for facet_grid

###---###---###---###---###---###---###
## Save pdf ####
###---###---###---###---###---###---###
table(dataset_red$pop, dataset_red$beh)
pdf("Fig2A_PW_beh_red_MS.pdf")
ggplot(dataset_red, aes(x=pop, y=AI_worker, fill=pop))+
  geom_boxplot() + 
  geom_violin(alpha=0.2)+
  geom_jitter(alpha=0.2)+
  facet_grid(~beh, labeller=labeller(beh = labelsPlot)) +
  ylab("Behaviour Index (AI)")+
  xlab("Populations")+
  theme_bw()+
  scale_y_continuous(breaks=seq(-4,5,1), limits = c(-4.2, 5.2))+
  scale_fill_manual(values=c("#ffad5b", "#32cd32", "#7d9ec0"),
                    breaks = c("SQ-A", "SQ-N", "MQ-N")) +  # custom legend order
  scale_x_discrete(limits=c("SQ-A", "SQ-N", "MQ-N")) +
  theme(plot.title = element_text(size=20),
        axis.title.x = element_text(size=15),
        axis.text.x  = element_text(size=13, color="black"),
        axis.title.y = element_text(size=15),
        axis.text.y  = element_text(size=13, color="black"),
        strip.text.x = element_text(size = 13, colour = "black"),
        legend.position="none",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())
dev.off()


###---###---###---###---###---###---###
## ANOVA: Test if aggression is significant different among populations ####
###---###---###---###---###---###---###
mod1 <- aov(dataset_red$AI_worker ~ dataset_red$beh)
summary(mod1)
TukeyHSD(mod1)

#END aggression assays



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# CHC PCA and hierarchical cluster analysis ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###

###---###---###---###---###---###---###---###---###---###
## CHC PCA ####
###---###---###---###---###---###---###---###---###---###

###---###---###---###---###---###---###
### Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(ggfortify)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

CHC_data <- rio::import("./ForR_CHC_results.xlsx", which="values")
head(CHC_data)
length(CHC_data)
test2<- prcomp(CHC_data[3:65])
summary(test2)


###---###---###---###---###---###---###
### Use ggfortify's autoplot to create PCA ####
###---###---###---###---###---###---###
#Plot the PCA
a_plot <- autoplot(prcomp(CHC_data[3:65]), data=CHC_data, frame = TRUE,
                   frame.type = 'norm', colour = "pop", size = 3)+ theme_bw() 

# Save the PDF
pdf("Fig2B_PCA_CHC_MS.pdf")
a_plot + 
  scale_color_manual(values = c("#ffad5b", "#32cd32", "#7d9ec0"),
                     limits=c("SQ-A", "SQ-N", "MQ-N"),
                     breaks=c("SQ-A", "SQ-N", "MQ-N"),
                     labels=c("Pop SQ-A", "Pop SQ-N", "Pop MQ-N"),
                     name="Populations") +
  scale_fill_manual(values = c("#ffad5b", "#32cd32", "#7d9ec0"),
                    limits=c("SQ-A", "SQ-N", "MQ-N"),
                    breaks=c("SQ-A", "SQ-N", "MQ-N"),
                    labels=c("Pop SQ-A", "Pop SQ-N", "Pop MQ-N"),  #"Pop A", "Pop N", "Pop S"
                    name="Populations") +
  scale_x_continuous(breaks = c(seq(-0.4,0.4,0.2))) 
dev.off()
#END PCA CHCs

##NOTE: Fig2 C-D were created using the VCF files and were added manually to Fig2.


###---###---###---###---###---###---###---###---###---###
## CHC HIERARCHICAL CLUSTER ANALYSIS ####
###---###---###---###---###---###---###---###---###---###
## Below, we will conduct a hierarchichal cluster analysis using the CHC data.

###---###---###---###---###---###---###
### Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(factoextra); library(cluster)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

# load data
CHC_data <- rio::import("./ForR_CHC_results.xlsx", which="values")
head(CHC_data)


###---###---###---###---###---###---###
### Find linkage method and conduct clustering ####
###---###---###---###---###---###---###
#Find the Linkage Method to Use
#define linkage methods
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

#function to compute agglomerative coefficient
ac <- function(x) {
  agnes(CHC_data[3:65], method = x)$ac
}

#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac)
#We can see that Ward’s minimum variance method produces the highest agglomerative coefficient, thus we’ll use that as the method for our final hierarchical clustering:

#perform hierarchical clustering using Ward's minimum variance
clust <- agnes(CHC_data[3:65], method = "ward")

###---###---###---###---###---###---###
#### Save pdf ####
###---###---###---###---###---###---###
pdf("FigS1_CHC_Hierarch_Clust_Anal_MS_supp.pdf")
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram", labels=CHC_data$col)
dev.off()
#END Hierarchical cluster analysis



###---###---###---###---###---###---###
# SNPs GEMMA (Genome Wide association analysis), PCA, Fisher's test ####
###---###---###---###---###---###---###
## Below, we load the results of the GEMMA analysis and create a Manhattan plot for the SNPs

###---###---###---###---###---###---###
## Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(qqman); library(data.table)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
paramsAgg <- read.table("ForR_result.assoc.DNA_agg.txt", header = T) #is same as ldp_enq_lmm
head(paramsAgg)

###---###---###---###---###---###---###
## Plot GEMMA aggression results  ####
###---###---###---###---###---###---##ä
chr1 <- as.numeric(unlist(lapply(strsplit(unlist(lapply(strsplit(paramsAgg$rs, "_"), '[[', 2)), ":"), "[[", 1)))
ps1 <- unlist(lapply(strsplit(paramsAgg$rs, ":"), '[[', 3))
paramsAgg["chr"] <- chr1
paramsAgg$BP <- as.numeric(ps1)
paramsAgg$CHR <- as.integer(paramsAgg$chr)
paramsAgg$p_wald <- as.numeric(paramsAgg$p_wald)
paramsAgg$SNP <- paramsAgg$rs

## NOTE: this may take some time, depending on your local machine
AggManhattan <- manhattan(paramsAgg, p= 'p_wald')
pdf("FigS2_ManhattanPlot_MS_Supp.pdf")
manhattan(paramsAgg, p= 'p_wald')
dev.off()
#END Manhattan plot


###---###---###---###---###---###---###
## Plot allelic states for each of the SNPs  ####
###---###---###---###---###---###---###

###---###---###---###---###---###---###
### Load packages, setwd, and load data  ####
###---###---###---###---###---###---###
library(ggplot2); library(tidyverse); library(ggpubr)

#Setwd and load data
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R2") #alternatively use "Data_R"
head(data)

data1 <- data #
table(data1$scaffold_11_2457277_, data1$beh)
table(data1$scaffold_163_28302_, data1$beh)
table(data1$scaffold_185_110741_, data1$beh)

#SNP1
p1 <- ggplot(data, aes(x=seq(1:109), y=scaffold_11_2457277, col=beh)) +
  geom_point(size=2) + 
  xlab("Samples")+
  ylab("SNP1 state")+
  theme_bw()+
  scale_color_manual(name="Behavioural\nstates",
                     limits=c("start_agg","react_agg","react_pcf"),
                     breaks=c("start_agg", "react_agg", "react_pcf"),
                     labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"),
                     values=c("#CC6677", "#DDCC77", "#332288")) +
  scale_x_continuous(breaks=c(seq(1:109)),
                     labels=c(data$sample)) +
  theme(axis.text.x  = element_text(angle=45,  size=12)) #vjust=0.5,
p1


#SNP2
p2 <- ggplot(data, aes(x=seq(1:109), y=scaffold_163_28302, col=beh)) +
  geom_point(size=2) + 
  xlab("Samples")+
  ylab("SNP2 state")+
  theme_bw()+
  scale_color_manual(name="Behavioural\nstates",
                     limits=c("start_agg","react_agg","react_pcf"),
                     breaks=c("start_agg", "react_agg", "react_pcf"),
                     labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"),
                     values=c("#CC6677", "#DDCC77", "#332288")) +
  scale_x_continuous(breaks=c(seq(1:109)),
                     labels=c(data$sample)) +
  theme(axis.text.x  = element_text(angle=45,  size=12)) #

p2

#SNP3
p3 <- ggplot(data, aes(x=seq(1:109), y=scaffold_185_110741, col=beh)) +  #
  geom_point(size=2) + 
  xlab("Samples")+
  ylab("SNP3 state")+
  scale_x_continuous(breaks=c(seq(1:109)),
                     labels=c(data$sample)) +
  scale_color_manual(name="Behavioural\nstates",
                     limits=c("start_agg","react_agg","react_pcf"),
                     breaks=c("start_agg", "react_agg", "react_pcf"),
                     labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"),
                     values=c("#CC6677", "#DDCC77", "#332288")) +
  theme_bw()+
  scale_x_continuous(breaks=seq(1, 109, by=2))+
  theme(axis.text.x  = element_text(angle=45,  size=12)) #
p3


###---###---###---###---###---###---###
### Save pdf ####
###---###---###---###---###---###---###
pdf("FigS3_SNP_states_MS_supp.pdf", width = 9, height=7)
ggarrange(p1, p2, p3, 
          labels = c("Mediator of RNA polymerase II transcription subunit 26, scaffold_11_2457277",
                     "Unkown, scaffold_163_28302",
                     "Gastrulation-defective, scaffold_185_110741"),
          ncol = 1, nrow = 3) 
dev.off()
#END plot allelic states


###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# PCA with SNP allelic states  ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Below, we create a PCA with SNPs allelic states + behavioural states

###---###---###---###---###---###---###
### Load packages, setwd, and load data  ####
###---###---###---###---###---###---###
library(vegan); library(MASS); library(ggfortify)

# set wd and load data
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R")

#Now, we create dummy variables: code homozygotes for ref as "1", heterozygotes as "2", homozygotes for alt were not found, so not "3"
data$scaffold_185_110741_coded <- ifelse(data$scaffold_185_110741 =="0/1", 2, 1)
data$scaffold_163_28302_coded <- ifelse(data$scaffold_163_28302 =="0/1", 2, 1)
data$scaffold_11_2457277_coded <- ifelse(data$scaffold_11_2457277 =="0/1", 2, 1)
data


###---###---###---###---###---###---###
## Subset data and plot data ####
###---###---###---###---###---###---###
#Subset data
data_subset <- subset(data, select = c("scaffold_185_110741_coded", "scaffold_163_28302_coded", "scaffold_11_2457277_coded", "beh"))

#Use ggfortify and autoplot to plot the data
a_plot <- autoplot(prcomp(data_subset[1:3]), data=data,
                   frame = TRUE, frame.type = 'norm', 
                   colour = "beh", size = 3) + theme_bw()

#Plot PCA
pdf("Fig3A_SNPs_PCA_MS.pdf")
a_plot +
  scale_color_manual(values = c("#CC6677", "#DDCC77", "#332288"),
                     limits=c("start_agg","react_agg","react_pcf"),
                     breaks=c("start_agg", "react_agg", "react_pcf"),
                     labels=c("started\naggression", 
                              "reacted\naggressively",
                              "reacted\npeacefully"),
                     name="Behavioural\nstates") +
  scale_fill_manual(values = c("#CC6677", "#DDCC77", "#332288"),
                    limits=c("start_agg","react_agg","react_pcf"),
                    breaks=c("start_agg", "react_agg", "react_pcf"),
                    labels=c("started\naggression",
                             "reacted\naggressively",
                             "reacted\npeacefully"),
                    name="Behavioural\nstates")
dev.off()
#END plot


###---###---###---###---###---###---###
## Run Pearson's Chi-squared Test with SNP count data on behavioural states  ####
###---###---###---###---###---###---###

## Setwd and load data
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") 
data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R")

## Check data 
table(data$scaffold_11_2457277, data$beh)
table(data$scaffold_163_28302, data$beh)
table(data$scaffold_185_110741, data$beh)

###---###---###---###---###---###---###
### Run Pearson's Chi-squared Test ####
###---###---###---###---###---###---###

sink("Pearson_Chi_squared-test_SNP_count_data.txt")
table(data$scaffold_11_2457277, data$beh)
table(data$scaffold_163_28302, data$beh)
table(data$scaffold_185_110741, data$beh)
cat("\n")
cat("\n")
print("count SNP states #SNP scaffold 11")
cat("\n")

#count SNP states #SNP scaffold 11
M <- as.table(rbind(c(36, 5),
                    c(35, 4),
                    c(26, 3)))
dimnames(M) <- list(behavioural_state = c("started_aggression",
                                          "reacted_aggressivelly",
                                          "reacted_peacefully"),
                    allele = c("reference","alternative"))
M
(Xsq <- chisq.test(M, simulate.p.value = T, B = 2000))  # Prints test summary

#Pairwise comparisons of the resuls
pairwise_tests <- function(M) {
  rows <- combn(nrow(M), 2, simplify = FALSE)
  
  results <- lapply(rows, function(r) {
    tab <- M[r, ]
    test <- fisher.test(tab)
    data.frame(
      comparison = paste("Row", r[1], "vs Row", r[2]),
      p.value = test$p.value
    )
  })
  
  do.call(rbind, results)
}

posthoc <- pairwise_tests(M)

# Holm correction for multiple testing
posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm")

#Result
posthoc

#Cermér's V Effect size
chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000)

n <- sum(M)
r <- nrow(M)
c <- ncol(M)

cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1)))
cramers_v

cat("\n")
cat("\n")
print("count SNP states #SNP scaffold 163")

#count SNP states #SNP scaffold 163
M <- as.table(rbind(c(18, 23), c(2, 37), c(2, 27)))
dimnames(M) <- list(behavioural_state = c("started_aggression",
                                          "reacted_aggressivelly",
                                          "reacted_peacefully"),
                    allele = c("reference","alternative"))
M
(Xsq <- chisq.test(M, simulate.p.value = T, B = 2000))  # Prints test summary


## Pairwise comparison
posthoc <- pairwise_tests(M)

# Holm correction for multiple testing
posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm")

#Result
posthoc

#Cramér's V Effect size
chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000)

n <- sum(M)
r <- nrow(M)
c <- ncol(M)

cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1)))
cramers_v

cat("\n")
cat("\n")
print("count SNP states #SNP scaffold 185")

#count SNP states #SNP scaffold 185
M <- as.table(rbind(c(11, 30), c(29, 10), c(18, 11)))
dimnames(M) <- list(behavioural_state = c("started_aggression",
                                          "reacted_aggressivelly",
                                          "reacted_peacefully"),
                    allele = c("reference","alternative"))
M
(Xsq <- chisq.test(M, simulate.p.value = T, B = 2000))  # Prints test summary


## Pairwise comparison
posthoc <- pairwise_tests(M)

# Holm correction for multiple testing
posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm")

#Result
posthoc

#Cermér's V Effect size
chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000)

n <- sum(M)
r <- nrow(M)
c <- ncol(M)

cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1)))
cramers_v

sink()
#END Pearson's Chi-squared test



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Differential gene expression  ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## NOTE: We only use 82 individuals here, which is filtered using the file "ForR_RNA_82Samples.xlsx"

###---###---###---###---###---###---###
## Load packages, setwd, and load data  ####
###---###---###---###---###---###---###
library(edgeR); library(DESeq2); library(Glimma)


###---###---###---###---###---###---###
## Load HTseq count data  ####
###---###---###---###---###---###---###
## Note: Using a custom script, we merged all HTseq count data files together into one tvs-file, "ForR_htseq_all_sample_count.tsv", for the DGE analysis

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
countData <- read.csv("ForR_htseq_all_sample_count.tsv", sep = "\t",header=TRUE)
#Note, here are still samples present but in the next step, only the selected 82 individuals will be used.


###---###---###---###---###---###---###
### Set column and row names in file ####
###---###---###---###---###---###---###
IdList <- read.csv("indList_pops.txt", sep = "\t",header=T)
head(IdList)
colnames(countData) <- t(IdList)
rownames(countData) <- countData$gene_id
countData
head(countData)
tail(countData)


###---###---###---###---###---###---###
### Delete ambigious gene names, load metadata ####
###---###---###---###---###---###---###
##Delete ambiguous genes names and exclude things such as "no feature", etc. 
countData <- countData[-20857:-20861,] #20859
tail(countData)


# Load metadata ###
##Metadata is information of the samples
metaData <- readxl::read_xlsx("ForR_RNA_82samples.xlsx", sheet="RNA_82samples")

#Select comparison levels for the behavioural states, either started aggression vs reacted aggressively (strAgg_rctAgg), started aggression vs reacted peacefully (strAgg_rctPcf), reacted aggressively  vs reacted peacefully (rctAgg_rctPcf)
StartAgg_vs_RctAgg <- subset(metaData, strAgg_rctAgg=="x") 
StartAgg_vs_RctPcf <- subset(metaData, strAgg_rctPcf=="x") 
RctAgg_vs_RctPcf <- subset(metaData, rctAgg_rctPcf=="x")  

#List worker ID including population and colony ID
StartAgg_vs_RctAgg$id_
StartAgg_vs_RctPcf$id_
RctAgg_vs_RctPcf$id_


###---###---###---###---###---###---###
### Prepare data for DGE analysis ####
###---###---###---###---###---###---###
#### Select data for behavioral comparisons ####
#Started aggression vs reacted aggressively
#StartAgg_vs_RctAgg #
inds_StAgg_RctAgg <- c("gene_id", StartAgg_vs_RctAgg$id_)
counData_Start_RctAgg <- subset(countData, select=inds_StAgg_RctAgg)
counData_Start_RctAgg
length(counData_Start_RctAgg) #59+1 (geneID)
length(StartAgg_vs_RctAgg$sample_bam) 
StartAgg_vs_RctAgg$id_

#Started aggression vs reacted peacefully
#StartAgg_vs_RctPcf #
inds_StAgg_RctPcf <- c("gene_id", StartAgg_vs_RctPcf$id_)
counData_Start_RctPcf <- subset(countData, select=inds_StAgg_RctPcf)
counData_Start_RctPcf
length(counData_Start_RctPcf) #55+1 (geneID)
length(StartAgg_vs_RctPcf$sample_bam) 
StartAgg_vs_RctPcf$id_

#Reacted aggressively vs reacted aggressively
#RctAgg_vs_RctPcf #
inds_RctAgg_RctPcf <- c("gene_id", RctAgg_vs_RctPcf$id_)
counData_RctAgg_RctPcf <- subset(countData, select=inds_RctAgg_RctPcf)
counData_RctAgg_RctPcf
length(counData_RctAgg_RctPcf) #52+1 (geneID)
length(RctAgg_vs_RctPcf$sample_bam) 
RctAgg_vs_RctPcf$id_


###---###---###---###---###---###---###
#### Create DESeq data object for behavioral comparisons ####
###---###---###---###---###---###---###

## Started aggression VS Reacted aggressively ##
dds_St_vs_RctAgg <- DESeqDataSetFromMatrix(countData=counData_Start_RctAgg, 
                                           colData=StartAgg_vs_RctAgg, 
                                           design=~beh, tidy = TRUE)
length(StartAgg_vs_RctAgg$pop) #
length(counData_Start_RctAgg) #
ncol(counData_Start_RctAgg)-1 == nrow(StartAgg_vs_RctAgg) #control, if same amount of 

## Started aggression VS Reacted peacefully ##
dds_St_vs_RctPcf <- DESeqDataSetFromMatrix(countData=counData_Start_RctPcf, 
                                           colData=StartAgg_vs_RctPcf, 
                                           design=~beh, tidy = TRUE)
length(StartAgg_vs_RctPcf$pop) #
length(counData_Start_RctPcf) #
ncol(counData_Start_RctPcf)-1 == nrow(StartAgg_vs_RctPcf) #control, if same amount of 


#Reacted aggressively VS Reacted peacefully ##
dds_Rctagg_vs_RctPcf <- DESeqDataSetFromMatrix(countData=counData_RctAgg_RctPcf,
                                               colData=RctAgg_vs_RctPcf, 
                                               design=~beh, tidy = TRUE)
length(RctAgg_vs_RctPcf$pop) #
length(counData_RctAgg_RctPcf) #
ncol(counData_RctAgg_RctPcf)-1 == nrow(RctAgg_vs_RctPcf) #control, if same amount of 


###---###---###---###---###---###---###
#### Pre-filtering ####
###---###---###---###---###---###---###

#While it is not necessary to pre-filter low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful: by removing rows in which there are very few reads, we reduce the memory size of the dds data object, and we increase the speed of the transformation and testing functions within DESeq2. Here we perform a minimal pre-filtering to keep only rows that have at least 10 reads total. Note that more strict filtering to increase power is automatically applied via independent filtering on the mean of normalized counts within the results function.

#Started aggression vs reacted aggressively
#dds_St_vs_RctAgg  #
keep <- rowSums(counts(dds_St_vs_RctAgg)) >= 10 #prefilter to counts found more often than 10
dds_St_vs_RctAgg <- dds_St_vs_RctAgg[keep,]
dds_St_vs_RctAgg <- DESeq(dds_St_vs_RctAgg)
dds_St_vs_RctAgg$strAgg_rctAgg

#Started aggression vs reacted peacefully
#dds_St_vs_RctPcf  #
keep <- rowSums(counts(dds_St_vs_RctPcf)) >= 10 #prefilter to counts found more often than 10
dds_St_vs_RctPcf <- dds_St_vs_RctPcf[keep,]
dds_St_vs_RctPcf <- DESeq(dds_St_vs_RctPcf)
dds_St_vs_RctPcf$strAgg_rctPcf

#Reacted aggressively vs reacted peacefully
#dds_Rctagg_vs_RctPcf #
keep <- rowSums(counts(dds_Rctagg_vs_RctPcf)) >= 10 #prefilter to counts found more often than 10
dds_Rctagg_vs_RctPcf <- dds_Rctagg_vs_RctPcf[keep,]
dds_Rctagg_vs_RctPcf <- DESeq(dds_Rctagg_vs_RctPcf)
dds_Rctagg_vs_RctPcf$rctAgg_rctPcf


###---###---###---###---###---###---###
### Differential gene expression analysis ####
###---###---###---###---###---###---###

#Select behavioural comparisons
dds <- dds_St_vs_RctAgg 
dds <- dds_St_vs_RctPcf 
dds <- dds_Rctagg_vs_RctPcf 
#Note: You have to run these separately or rename the df "dds" and then change the script below.

#Check if correct one is loaded
dds
dds$name
resultsNames(dds)
#estimateSizeFactors --> This calculates the relative library depth of each sample 
#estimateDispersions --> estimates the dispersion of counts for each gene 
#nbinomWaldTest --> calculates the significance of coefficients in a Negative Binomial GLM using the size and dispersion outputs

#Details about the comparison are printed to the console, directly above the results table. Take a look at the results table
res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")
res
head(results(dds, tidy=TRUE)) #


###---###---###---###---###---###---###
### Summary of differential gene expression analysis ####
###---###---###---###---###---###---###
#Below, in "res" are ther resutls stored
summary(res)

# Note that we could have specified the coefficient or contrast we want to build a results table for, using either of the following equivalent commands:
#res <- results(dds, contrast=c("condition","treated","untreated"))
res_A <- results(dds, name="beh_start_agg_vs_react_agg")
res_B <- results(dds, name="beh_start_agg_vs_react_pcf") 
res_C <- results(dds, name="beh_react_agg_vs_react_pcf") 
##Note: You need to run the code from the lines 744 onwards with the different behavioural comparisons 

#Display results
summary(res_A)
summary(res_B)
summary(res_C)



###---###---###---###---###---###---###
#### Volcano plots ####
###---###---###---###---###---###---###
#Set par to defualt valus, in case it has changed
par(mfrow=c(1,1))

###---###---###---###---###---###---###
##### Select behavioural comparisons ####
###---###---###---###---###---###---###

## Started aggression vs reacted aggressively ##
dds <- dds_St_vs_RctAgg 

#Calculate how many genes are significanly up/down-regulated for each behavioural comparison
res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")
summary(res)
tail(res)
CountGenes <- subset(res, padj<.05 )
summary(CountGenes)
length((CountGenes$padj))  #90

pdf("FigS4A_volcanoPlot_StartAgg_vs_RctAgg_pval.05.pdf")
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))
# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
dev.off()


## Started aggression vs reacted peacefully ##
dds <- dds_St_vs_RctPcf 
res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")
summary(res)
tail(res)
CountGenes <- subset(res, padj<.05 )
summary(CountGenes)
length((CountGenes$padj))  #90

pdf("FigS4B_volcanoPlot_StartAgg_vs_RctPcf_pval.05.pdf")
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))
# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
dev.off()


## Reacted aggressively vs reacted peacefully ##
dds <- dds_Rctagg_vs_RctPcf
res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")
summary(res)
tail(res)
CountGenes <- subset(res, padj<.05 )
summary(CountGenes)
length((CountGenes$padj))  #90

pdf("FigS4C_volcanoPlot_RctAggvsRctPcf_pval.05.pdf")
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))
# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
dev.off()



###---###---###---###---###---###---###
## Prepare data for saving and to be used in g:Profiler ####
###---###---###---###---###---###---###
# Now, we save all genes without p-adjusted values for Gene ontology analysis in g:Profiler
library(tidyverse)

#Select behavioural comparison
dds <- dds_St_vs_RctAgg 
resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")

#Select behavioural comparison
dds <- dds_St_vs_RctPcf
resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")

#Select behavioural comparison
dds <- dds_Rctagg_vs_RctPcf 
resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr")

#order data based on p-value and select data
res2 <- resLFC[order(resLFC$pvalue),]
res2_ <- cbind(res2$baseMean, res2$log2FoldChange, res2$lfcSE, res2$stat, res2$pvalue, res2$padj) #if res2 not log shrinkage
rownames(res2_) <- rownames(res2)
colnames(res2_) <- colnames(res2)

res2_ <- as_tibble(res2_)
res2_$geneNames <- rownames(res2)
res2_ <- res2_[,-4]
res2_$geneNames <- rownames(res2)
res2_pos <- res2_ %>% filter(log2FoldChange >0)
length(res2_pos$pvalue) #7810
res2_neg <- res2_ %>% filter(log2FoldChange <0)
length(res2_neg$pvalue) #9339

###---###---###---###---###---###---###
### Save dete data from above ####
###---###---###---###---###---###---###
#Note, 
write.csv(res2_neg, "res_stAgg_ReacAgg_Pref10_neg.txt")
write.csv(res2_pos, "res_stAgg_ReacAgg_Pref10_pos.tx")
write.csv(res2_neg, "res_StAgg_ReacPcf_Pref10_neg.txt")
write.csv(res2_pos, "res_StAgg_ReacPcf_Pref10_pos.txt")
write.csv(res2_neg, "res_ReacAgg_ReacPcf_Pref10_neg.txt")
write.csv(res2_pos, "res_ReacAgg_ReacPcf_Pref10_pos.txt")
#This will yield files for significantly up- and down-regulated genes
#Note: Using a customised python script, we extracted gene names from the gene txt.-files created above (e.g. res_StAgg_ReacPcf_Pref10_pos.txt) and searched for each of the genes in the gtf file (Talp_Annotation_Merged_v2_RefGen_m150.gtf) for the corresponding gene name. The script then writes this name, together with log2fold change, etc in a new file.



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## VENN diagram ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###

###---###---###---###---###---###---###
### Load packages, setwd, and load data ####
###---###---###---###---###---###---###

library("ggvenn")
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
VennDiaDataAll <- readxl::read_xlsx("ForR_Gene_table.xlsx", sheet="Venn")

#Now the same again BUT we only choose genes with a known gene name
VennDiaDataAll$FB_name
#Down-regulated genes
started_aggression_vs_reacted_aggressively_down_red <- subset(VennDiaDataAll,beh_comparison=="started aggression vs reacted aggressively" & expression=="downregulated" & FB_name != "NA")

started_aggression_vs_reacted_peacefully_down_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted peacefully" & expression=="downregulated" & FB_name != "NA")

reacted_aggressively_vs_reacted_aggressivey_down_red <- subset(VennDiaDataAll, beh_comparison=="reacted aggressively vs reacted peacefully" & expression=="downregulated" & FB_name != "NA")

#Up-regulated genes
started_aggression_vs_reacted_aggressively_up_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted aggressively" & expression=="upregulated" & FB_name != "NA")

started_aggression_vs_reacted_peacefully_up_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted peacefully" & expression=="upregulated" & FB_name != "NA")

reacted_aggressively_vs_reacted_aggressivey_up_red <- subset(VennDiaDataAll, beh_comparison=="reacted aggressively vs reacted peacefully" & expression=="upregulated" & FB_name != "NA")

#Combine all down-regulated genes in a list
down_reg_list_red <- list(
  "Started aggression vs reacted aggressively" = started_aggression_vs_reacted_aggressively_down_red$geneNames,
  "Started aggression vs reacted peacefully" = started_aggression_vs_reacted_peacefully_down_red$geneNames,
  "Reacted peacefully vs reacted peacefully" = reacted_aggressively_vs_reacted_aggressivey_down_red$geneNames)
down_reg_list_red

#Combine all up-regulated genes in a list
up_reg_list_red <- list(
  "Started aggression vs reacted aggressively" = started_aggression_vs_reacted_aggressively_up_red$geneNames,
  "Started aggression vs reacted peacefully" = started_aggression_vs_reacted_peacefully_up_red$geneNames,
  "Reacted peacefully vs reacted peacefully" = reacted_aggressively_vs_reacted_aggressivey_up_red$geneNames)
up_reg_list_red


###---###---###---###---###---###---###
### Plot pdfs ####
###---###---###---###---###---###---###
pdf("Fig3B_Venn_up_down_knownGeneName_MS.pdf")
ggvenn(
  down_reg_list_red, 
  fill_color = c("#7336c2a9", "#fcfa18ec", "#868686FF"), #
  stroke_size = 0.5, set_name_size = 4
)
ggvenn(
  up_reg_list_red, 
  fill_color = c("#7336c25f", "#fcfa9aec", "#cccccc80"), #NOTE: The colors need to be made slightly transparent to match the colors in the main document
  stroke_size = 0.5, set_name_size = 4
)
dev.off()
#END venn diagram


###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Download/calculate environmental variables ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
##Below, we download and/or calculate the environmental variables for the multinomial logistic regression

###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## Nitrogen and other variables ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
#First, we start with soil nitrogen 

###---###---###---###---###---###---###
### Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(raster); library(rasterVis); library(rworldxtra); data(countriesHigh)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
freq <- read.csv("ForR_allEnvironVars.csv", sep = ";",header=TRUE)

###---###---###---###---###---###---###
### Extract latitude and longitude from each colony to extract data ####
###---###---###---###---###---###---###
lats <- freq$lat
lons <- freq$long
coords <- data.frame(x=lons,y=lats)
freq$nest
#NOTE: in this file, all environmental data are already present. Below, is only for consistency on how to extract the data.

###---###---###---###---###---###---###
### Extract nitrogen variables for analysis ####
###---###---###---###---###---###---### 
##Note, we use the "N.zip" file from LUCAS topsoil data from the European Commission, which can be downloaded online
r <- raster('N.tif')
#from LUCAS topsoil data 2012


###---###---###---###---###---###---###
#### Create a raster object and extract data ####
###---###---###---###---###---###---###
#Create raster object
crs(r)
wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
raster_wgs84 = projectRaster(r, crs = wgs84, method = "ngb") #change raster GRS80 to WG84

#Extract data points for each colony and save as "Nitrogen-values" file
points <- SpatialPoints(coords, proj4string = raster_wgs84@crs)
values <- extract(raster_wgs84, points)
df1 <- cbind.data.frame(coordinates(points), values)
colnames(df1) <- c("long", "lat", "nitrogen")
write.csv(df1, "Nitrogen-values_P&W.csv")


###---###---###---###---###---###---###
## Bioclim variables from WorldClim ####
###---###---###---###---###---###---###

###---###---###---###---###---###---###
### Download Bioclim environmental variables for analysis ####
###---###---###---###---###---###---###
#from https://worldclim.org/

###---###---###---###---###---###---###
### Set working directory, load data, and select Bioclim variables ####
###---###---###---###---###---###---###

#setwd("C:/Users/krapf/OneDrive/Desktop/")
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

# load data
rNew <- raster::getData("worldclim",var="bio", res=0.5, lon=9.75, lat=46.17) #if already downloaded, it will just load it

#Now, we select the BioClim variables
r1_12 <- rNew[[c(1,4,5,6,12, 16, 17, 18, 19)]]
names(r1_12) <- c("Temp","TempSeason", "TempWarmestMonth",
                  "TempColdestMonth", "Prec", "Precip_WettestQuarter",
                  "Precip_DriestQuarter", "Precip_WarmestQuarter", "Precip_ColdestQuarter" )
# BIO1 = Annual Mean Temperature
# BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))
# BIO3 = Isothermality (BIO2/BIO7) (?100)
# BIO4 = Temperature Seasonality (standard deviation ?100)
# BIO5 = Max Temperature of Warmest Month
# BIO6 = Min Temperature of Coldest Month
# BIO7 = Temperature Annual Range (BIO5-BIO6)
# BIO8 = Mean Temperature of Wettest Quarter
# BIO9 = Mean Temperature of Driest Quarter
# BIO10 = Mean Temperature of Warmest Quarter
# BIO11 = Mean Temperature of Coldest Quarter
# BIO12 = Annual Precipitation
# BIO13 = Precipitation of Wettest Month
# BIO14 = Precipitation of Driest Month
# BIO15 = Precipitation Seasonality (Coefficient of Variation)
# BIO16 = Precipitation of Wettest Quarter
# BIO17 = Precipitation of Driest Quarter
# BIO18 = Precipitation of Warmest Quarter
# BIO19 = Precipitation of Coldest Quarter


###---###---###---###---###---###---###
### Extract data for each colony and save data ####
###---###---###---###---###---###---###
#Extract BioClim data from the colony coordinates
points <- SpatialPoints(coords, proj4string = r1_12@crs)
values <- extract(r1_12,points)

#Below, we save the extracted data as csv-file
write.csv(values, "PrecipTemp_values_P&W.csv") 


###---###---###---###---###---###---###
### Monthly averaged precipitation data ####
###---###---###---###---###---###---###

#Get additional precipitation data for each month
setwd("C:/Users/c7701110/Desktop")
rNews <- getData("worldclim",var="prec",res=0.5, lon=9.75, lat=46.17) #0.5
rNews

#Select data
rPrecip <- rNews[[c(1,2,3,4,5,6,7,8,9,10,11,12)]]
names(rPrecip) <- c("Jan","Feb","Mar","Apr", "May","Jun","Jul", "Aug","Sep", "Oct","Nov","Dec")
rPrecip

#Extract data for each colony
points <- SpatialPoints(coords, proj4string = rPrecip@crs)
values <- extract(rPrecip,points)
setwd("C:/Users/c7701110/Desktop/MS3")
write.csv(values, "Precip_values_P&W.csv") 

##NOTE: We then combined all values into the files "ForR_allEnvironVars.csv"


###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Assess Bacterial OTUs association with behavioural states using a multinomial logistic regression ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
#OTUs are, among others, Pseudomonas, Bacteroides, etc.

###---###---###---###---###---###---###
## Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library(AICcmodavg); library(DescTools)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

Microbiome_sorted <- readxl::read_xlsx("ForR_otu_metadata_rarefied.xlsx",
                                       sheet="OTU_transposed_Count>100")
Microbiome_sorted_red <- subset(Microbiome_sorted, Video_id!="exclude" & beh!="ctrl_freeze") #excludes the control samples, termed "ctrl_freeze", and the summaries created in the file and named with "exclude", so that they can be excluded
tail(Microbiome_sorted_red)
head(Microbiome_sorted_red)


###---###---###---###---###---###---###
## Prepare data ####
###---###---###---###---###---###---###
#characters to numeric
#identify all character columns
length(Microbiome_sorted_red) ##all variables, including 119 OTUs
chars <- sapply(Microbiome_sorted_red[,7:125], is.character) # tot, 119 OTUs, first 6 variables are pop names, etc

#convert all character columns to numeric
Microbiome_sorted_red[,7:125][ , chars] <- as.data.frame(apply(Microbiome_sorted_red[,7:125][ , chars], 2, as.numeric))
as.data.frame(Microbiome_sorted_red)
Microbiome_sorted_red
Microbiome_sorted_red$beh <- relevel(as.factor(Microbiome_sorted_red$beh), ref = "start_agg")


###---###---###---###---###---###---###---###---###
## For loop to analyse the OTUs ####
###---###---###---###---###---###---###---###---###
# Number of variables
num_vars <- 119

# Sliding window size
window_size <- 21

#Matrix for emmeans results
#EmmeansRes
Emmeans_Results_matrix <- matrix(NA, nrow=3, ncol=4)
colnames(Emmeans_Results_matrix) <- c("model", "contrast", "t-value", "p-value")

# Iterate through each position of the sliding window
for (i in 1:(ncol(Microbiome_sorted_red) - (window_size)+1) ) {
  #####setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") #!!!!!! CHANGE !!!!!!!
  # Extract the data within the sliding window
  window_data <- Microbiome_sorted_red[, 7:(i+5 + window_size-1) ]
  
  # Iterate through each variable
  for (j in 1:num_vars) {
    sink(paste("OTU_models_", j,"_20250329.txt", sep=""))
    print(paste("LoopModel_", j, sep=""))
    # Formulate the formula for the linear model
    variable_name <- colnames(Microbiome_sorted_red[,7:125])
    formula <- as.formula(paste("beh ~", variable_name[j], "+", variable_name[j+1], "+", variable_name[j+2], "+", variable_name[j+3], "+", 
                                variable_name[j+4], "+",
                                variable_name[j+5], "+",
                                variable_name[j+6], "+", 
                                variable_name[j+7], "+",
                                variable_name[j+8], "+",
                                variable_name[j+9], "+", 
                                variable_name[j+10], "+",
                                variable_name[j+11], "+",
                                variable_name[j+12], "+", 
                                variable_name[j+13], "+",
                                variable_name[j+14], "+",
                                variable_name[j+15], "+", 
                                variable_name[j+16], "+",
                                variable_name[j+17], "+",
                                variable_name[j+18], "+", 
                                variable_name[j+19] )) #paste0("OTU_", j)))
    
    # Fit the linear model
    lm_model <- multinom(formula, data = Microbiome_sorted_red, model=T)
    cat("\n");  cat("\n")
    
    # Print the results or store them as required
    print(lm_model)
    cat("\n");  cat("\n")
    
    print("Goodness_of_fit_Mod2")
    # Test the goodness of fit
    print(chisq.test(Microbiome_sorted_red$beh, predict(lm_model)))
    cat("\n");  cat("\n")
    
    #Calculate and p-value using z-wald test
    print("summary")
    print(summary(lm_model))
    cat("\n")
    print("z-value")
    z <- summary(lm_model)$coefficients/summary(lm_model)$standard.errors
    print(z)
    
    cat("\n")
    # 2-tailed z test to calculate p-values using Wald Z test
    print("p-value")
    p <- (1 - pnorm(abs(z), 0, 1)) * 2
    print(p)
    #this code below stores p-values in a excel file and saves it
    print_p <- as.data.frame(p)
    sigOTUs <- as.data.frame(ifelse(print_p <= 0.05, "1", "0"))
    rio::export(sigOTUs, paste("sigOTUs_", j,".xlsx", sep=""), format = "xlsx") #
    cat("\n")
    
    #EMMEANS contrasts
    tryCatch({  #emmeans may cause an error
      print("Emmeans")
      emms1 <- emmeans(lm_model, ~ beh, ) #
      con1 <- contrast(emms1)
      EmmeansRes <- summary(pairs(con1, by = NULL))
      print(EmmeansRes)
      
      Emmeans_Results_matrix[1:3, 1] <- print(paste("LoopModel_", j, sep=""))
      Emmeans_Results_matrix[1:3, 2] <- EmmeansRes$contrast
      Emmeans_Results_matrix[1:3, 3] <- EmmeansRes$t
      Emmeans_Results_matrix[1:3, 4] <- EmmeansRes$p.value
      sink(paste("Emmeans_Results_matrix_", j,".txt", sep="") )
      print(Emmeans_Results_matrix)
      sink()
      
    }, error= function(e) {
      cat("\n"); cat("\n") })
    
    
    #Calculate the Pseudo R-Square
    print("PseudpR2")
    print(PseudoR2(lm_model, which = c("CoxSnell","Nagelkerke","McFadden")))
    sink()
  }
  
}


###---###---###---###---###---###---###---###---###
## Manually add missing models ####
###---###---###---###---###---###---###---###---###
#In the for loop, not all variables could be added due to the nature of the for-loop. So below, we add the last few models manually
#with 119 variables
OTU_models_101 <- multinom(beh ~ OTU_7293 +OTU_8257 +OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2,
                           data = Microbiome_sorted_red, model=T)
OTU_models_102 <- multinom(beh ~ OTU_8257 +OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7,
                           data = Microbiome_sorted_red, model=T)
OTU_models_103 <- multinom(beh ~ OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13,
                           data = Microbiome_sorted_red, model=T)
OTU_models_104 <- multinom(beh ~ OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58,
                           data = Microbiome_sorted_red, model=T)
OTU_models_105 <- multinom(beh ~ OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123,
                           data = Microbiome_sorted_red, model=T)
OTU_models_106 <- multinom(beh ~ OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366,
                           data = Microbiome_sorted_red, model=T)
OTU_models_107 <- multinom(beh ~ OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370,
                           data = Microbiome_sorted_red, model=T)
OTU_models_108 <- multinom(beh ~ OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732,
                           data = Microbiome_sorted_red, model=T)
OTU_models_109 <- multinom(beh ~ OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805,
                           data = Microbiome_sorted_red, model=T)
OTU_models_110 <- multinom(beh ~ OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251,
                           data = Microbiome_sorted_red, model=T)
OTU_models_111 <- multinom(beh ~ OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420,
                           data = Microbiome_sorted_red, model=T)
OTU_models_112 <- multinom(beh ~ OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442,
                           data = Microbiome_sorted_red, model=T)
OTU_models_113 <- multinom(beh ~ OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581,
                           data = Microbiome_sorted_red, model=T)
OTU_models_114 <- multinom(beh ~ OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806,
                           data = Microbiome_sorted_red, model=T)
OTU_models_115 <- multinom(beh ~ OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011,
                           data = Microbiome_sorted_red, model=T)
OTU_models_116 <- multinom(beh ~ OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292,
                           data = Microbiome_sorted_red, model=T)
OTU_models_117 <- multinom(beh ~ OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545,
                           data = Microbiome_sorted_red, model=T)
OTU_models_118 <- multinom(beh ~ OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545 +OTU_8711,
                           data = Microbiome_sorted_red, model=T)
OTU_models_119 <- multinom(beh ~ OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545 +OTU_8711 + OTU_11261,
                           data = Microbiome_sorted_red, model=T)

## Great, now we had also added all final models and can continue below


###---###---###---###---###---###---###---###---###
## For loop to extract OTUs that sig influence behavioural states and saves it as an output ####
###---###---###---###---###---###---###---###---###

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
sink("ALL_OTUs_count_119_mod_20250330_.txt", append=T)

files <- list.files(path=".", pattern="sigOTUs") #!#!#! keep this but change it if you have changed the output file names above
files <- files[1:100] 
for(i in 1:length(files)) { #change to all files
  ColumNames_SigOTUs <- readxl::read_xlsx(paste("sigOTUs_", i, ".xlsx", sep="")) #this I need
  ColumNames_SigOTUs <- as.data.frame(ColumNames_SigOTUs)
  rownames(ColumNames_SigOTUs) <- c("react_agg", "react_pcf")
  
  for(j in 1:length(ColumNames_SigOTUs)) {
    tryCatch({ 
      if(colnames(ColumNames_SigOTUs[j]) == paste("(Intercept)") ) {
        ""
      }
      if(ColumNames_SigOTUs[1, j] == "1") {
        #ifelse(ColumNames_SigOTUs[1,j] == "1")
        print(paste(files[i], rownames(ColumNames_SigOTUs[1,]), colnames(ColumNames_SigOTUs[j]), sep="\t"))
      }
      if(ColumNames_SigOTUs[1, j] == "0") {
        #sink("OTUs_count.txt", append=T)
        #print(rownames(ColumNames_SigOTUs[2,]))
        #print()
        ""
      }
      if(ColumNames_SigOTUs[2, j] == "1") {
        print(paste(files[i], rownames(ColumNames_SigOTUs[2,]), colnames(ColumNames_SigOTUs[j]), sep="\t"))
      }
      if(ColumNames_SigOTUs[2, j] == "0") {
        ""
      }
      else ""
    }, error = function(e) {
      cat("Error: ", e$message, "\nSkipping file ", files[i], "\n\n")
    })
  }
}
sink()


#Now, we add additional values with "sigOTUs_OTU_models_" as name
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")  #CHANGE
sink("ALL_OTUs_count_119_mod.txt", append=T)
#i<-1
#j<-1
files <- list.files(path=".", pattern="sigOTUs") #!#!#!keep this but change it if you have changed the output file names above

files <- files[100:119] 
for(i in 1:length(files)) { #change to all files
  ColumNames_SigOTUs <- readxl::read_xlsx(paste("sigOTUs_OTU_models_", i+100, ".xlsx", sep="")) #this I need
  ColumNames_SigOTUs <- as.data.frame(ColumNames_SigOTUs)
  rownames(ColumNames_SigOTUs) <- c("react_agg", "react_pcf")
  #ColumnNames <- colnames(ColumNames_SigOTUs)
  for(j in 1:length(ColumNames_SigOTUs)) {
    tryCatch({ 
      if(colnames(ColumNames_SigOTUs[j]) == paste("(Intercept)") ) {
        ""
      }
      if(ColumNames_SigOTUs[1, j] == "1") {
        #ifelse(ColumNames_SigOTUs[1,j] == "1")
        print(paste(files[i], rownames(ColumNames_SigOTUs[1,]), colnames(ColumNames_SigOTUs[j]), sep="\t"))
      }
      if(ColumNames_SigOTUs[1, j] == "0") {
        #sink("OTUs_count.txt", append=T)
        #print(rownames(ColumNames_SigOTUs[2,]))
        #print()
        ""
      }
      if(ColumNames_SigOTUs[2, j] == "1") {
        print(paste(files[i], rownames(ColumNames_SigOTUs[2,]), colnames(ColumNames_SigOTUs[j]), sep="\t"))
      }
      if(ColumNames_SigOTUs[2, j] == "0") {
        ""
      }
      else ""
    }, error = function(e) {
      cat("Error: ", e$message, "\nSkipping file ", files[i], "\n\n")
    })
  }
}
sink()


## In the next step, we read in all the created files
#First, open "ALL_OTUs_count_119_mod.txt" and copied the data to an excel file. Then with "replace" function, replace the "\t" with only "\" and run the "text to culumns" function. I copied this data to a new txt file "ALL_OTUs_count_119_mod_fin.txt", which is read in below.
#For the file below, I also deleted the " [1] " at the beginning, the " at the end and the \t with a real tab

#setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts/OTU_models/") 
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
file_path <- "ALL_OTUs_count_119_mod_fin.txt"
lines <- readLines(file_path)
lines_split <- strsplit(lines, "\t", fixed=T)
lines_split


# Initialize a counter for OTU occurrences
otu_counts <- table(gsub('.*OTU_', "", lines_split))
sorted_counts <- sort(otu_counts, decreasing = TRUE)

## Print the counts and export the file
#Note: Do this only once and check the file manually
#In the excel file, you then order the file names and count
##### rio::export(as.data.frame(sorted_counts), "ALL_OTU_Counts_119mod_20250330.xlsx", format = "xlsx")



###---###---###---###---###---###---###---###
## Binomial test of the OTU counts after rarefaction and after the multinomial logistic regression ####
###---###---###---###---###---###---###---###
library(emmeans)

## Below, we run a binomial test and add the values manually
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

sink("OTUcounts_Comparing_control_and_BehavStates_Supp.txt")
print("PSEUDOMONAS - OTU 366")
count <- c(33, 134, 102, 45)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)  
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states

print("PSEUDOMONAS - OTU 2442")
count <- c(33, 95, 30, 34)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("PSEUDOMONAS - OTU 4806")
count <- c(34, 16, 52, 25)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Exclude because there are no significant differences between control and behavioural states


print("BACTEROIDES - OTU 1598")
count <- c(24, 67, 30, 12)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("BACTEROIDES - OTU 5256")
count <- c(24, 48, 29, 55)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Bacteroides - OTU 22324")
count <- c(335, 131, 278, 274)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 348")
count <- c(142, 22, 201, 39)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states

print("Prevotella - OTU 377")
count <- c(4, 333, 19, 51)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states

print("Prevotella - OTU 667")
count <- c(42, 54, 106, 34)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 837")
count <- c(29, 71, 74, 53)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 1887")
count <- c(55, 754, 106, 140)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 2975")
count <- c(20, 57, 41, 52)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 9014")
count <- c(21, 25, 178, 21)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states


print("Prevotella - OTU 17264")
count <- c(96, 15, 102, 52)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states 


print("Prevotella - OTU 20448")
count <- c(408, 395, 127, 188)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states 


print("Lactobacillus - OTU 813")
count <- c(56, 73, 70, 34)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep even though no significant differences between control and behavioural states 


print("Lactobacillus - OTU 7014")
count <- c(16, 43, 73, 51)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states 


print("Lactobacillus - OTU 21141")
count <- c(84, 87, 78, 125)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states 


print("Rhizobiales - OTU 369")
count <- c(63, 102, 110, 94)
beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Keep because there are significant differences between control and behavioural states 

sink()
## Based on these results, we exclude a Lactobacillus sp. (OTU 813) and one Pseudomonas sp. (OTU 4806) because they did not differ between control and the behavioural states.



####################################
# #!!
# print("PSEUDOMONAS - OTU 805")
# count <- c(139, 4, 50, 6)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh)
# con1 <- contrast(emms1)
# pairs(con1, by = NULL)
# cat("\n"); cat("\n"); cat("\n"); cat("\n")

# print("PSEUDOMONAS - OTU 1420")
# count <- c(13, 73, 3, 12)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")

# #!!
# print("PSEUDOMONAS - OTU 436")
# count <- c(278, 12, 194, 17)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")
# 
# #!!
# print("Prevotella - OTU 326")
# count <- c(236, 1, 89, 105)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")
# 
# #!!
# print("Prevotella - OTU 18107")
# count <- c(12, 17, 83, 7)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")

# #!!
# print("Lactobacillus - OTU 121")
# count <- c(486, 188, 357, 342)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")
# 
# #!!
# print("Lactobacillus - OTU 5416")
# count <- c(591, 45, 352, 44)
# beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully")
# df_micro <- data.frame(beh, count)
# mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
# summary(mod1)
# emms1 <- emmeans(mod1, ~ beh) 
# con1 <- contrast(emms1)
# pairs(con1, by = NULL) 
# cat("\n"); cat("\n"); cat("\n"); cat("\n")
#######################################



###---###---###---###---###---###---###
## Select OTUs manually and calculate Poisson tests ####
###---###---###---###---###---###---###
library("emmeans")
ResultTable <- matrix(nrow = 27, ncol=8)
colnames(ResultTable) <- c("Genus", "Species", "Comparison", "Estimate", "SE", "df", "Z-Ratio", "p-value")

## Now, we selected OTUs with a count of more than 100 in each for subsequent analysis. We now calculate Poisson tests with count data
sink("OTU_countdata_MS.txt")

print("BACTEROIDES ")
print("Bacteroides sp. - OTU 1598")
count <- c(67, 30, 12)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL)
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[1:3,1] <- "Bacteroides"
ResultTable[1:3,2] <- "Bacteroides sp. - OTU 1598"
ResultTable[1:3,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[1:3,4] <- EmmRes_df$estimate
ResultTable[1:3,5] <- EmmRes_df$SE
ResultTable[1:3,6] <- EmmRes_df$df
ResultTable[1:3,7] <- EmmRes_df$z.ratio
ResultTable[1:3,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
##Linked to started agg and aggression


print("Bacteroides sp. - OTU 5256")
count <- c(48, 29, 55)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded because of inconsistent results (react agg vs react pcf difference, but rest not)


print("Bacteroides sp. - OTU 22324")
count <- c(131, 278, 274)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[4:6,1] <- "Bacteroides"
ResultTable[4:6,2] <- "Bacteroides sp. - OTU 22324"
ResultTable[4:6,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[4:6,4] <- EmmRes_df$estimate
ResultTable[4:6,4] <- EmmRes_df$estimate
ResultTable[4:6,5] <- EmmRes_df$SE
ResultTable[4:6,6] <- EmmRes_df$df
ResultTable[4:6,7] <- EmmRes_df$z.ratio
ResultTable[4:6,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Linked to started aggression


print("LACTOBACILLUS ")
print("Lactobacillus mucosae - OTU 813")
count <- c(73, 70, 34)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[7:9,1] <- "Lactobacillus"
ResultTable[7:9,2] <- "Lactobacillus mucosae - OTU 813"
ResultTable[7:9,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[7:9,4] <- EmmRes_df$estimate
ResultTable[7:9,4] <- EmmRes_df$estimate
ResultTable[7:9,5] <- EmmRes_df$SE
ResultTable[7:9,6] <- EmmRes_df$df
ResultTable[7:9,7] <- EmmRes_df$z.ratio
ResultTable[7:9,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Link to aggression


print("Lactobacillus fructivorans - OTU 7014")
count <- c(43, 73, 51)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded due to mixed results


print("Lactobacillus sp. - OTU 21141")
count <- c(87, 78, 125)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[10:12,1] <- "Lactobacillus"
ResultTable[10:12,2] <- "Lactobacillus sp. - OTU 21141"
ResultTable[10:12,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[10:12,4] <- EmmRes_df$estimate
ResultTable[10:12,4] <- EmmRes_df$estimate
ResultTable[10:12,5] <- EmmRes_df$SE
ResultTable[10:12,6] <- EmmRes_df$df
ResultTable[10:12,7] <- EmmRes_df$z.ratio
ResultTable[10:12,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Linked to peacefulness


print("PREVOTELLA ")
print("Prevotella sp. - OTU 348")
count <- c(22, 201, 39)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded, mixed results


print("Prevotella sp. - OTU 377")
count <- c(333, 19, 51)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[13:15,1] <- "Prevotella"
ResultTable[13:15,2] <- "Prevotella sp. - OTU 377"
ResultTable[13:15,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[13:15,4] <- EmmRes_df$estimate
ResultTable[13:15,4] <- EmmRes_df$estimate
ResultTable[13:15,5] <- EmmRes_df$SE
ResultTable[13:15,6] <- EmmRes_df$df
ResultTable[13:15,7] <- EmmRes_df$z.ratio
ResultTable[13:15,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Link to started agg


print("Prevotella sp. - OTU 667")
count <- c(54, 106, 34)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded, mixed results - link to reacted agg


print("Prevotella sp. - OTU 837")
count <- c(71, 74, 53)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded, not sig


print("Prevotella sp. - OTU 1887")
count <- c(754, 106, 140)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[16:18,1] <- "Prevotella"
ResultTable[16:18,2] <- "Prevotella sp. - OTU 1887"
ResultTable[16:18,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[16:18,4] <- EmmRes_df$estimate
ResultTable[16:18,4] <- EmmRes_df$estimate
ResultTable[16:18,5] <- EmmRes_df$SE
ResultTable[16:18,6] <- EmmRes_df$df
ResultTable[16:18,7] <- EmmRes_df$z.ratio
ResultTable[16:18,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Linked to started agg


print("Prevotella sp. - OTU 2975")
count <- c(57, 41, 52)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Excluded, not sig


print("Prevotella sp. - OTU 9014")
count <- c(25, 178, 21)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
# Excluded due to mixed results


print("Prevotella sp. - OTU 17264")
count <- c(15, 102, 52)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")
# Excluded due to mixed results


print("Prevotella sp. - OTU 20448")
count <- c(395, 127, 188)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[19:21,1] <- "Prevotella"
ResultTable[19:21,2] <- "Prevotella sp. - OTU 20448"
ResultTable[19:21,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[19:21,4] <- EmmRes_df$estimate
ResultTable[19:21,4] <- EmmRes_df$estimate
ResultTable[19:21,5] <- EmmRes_df$SE
ResultTable[19:21,6] <- EmmRes_df$df
ResultTable[19:21,7] <- EmmRes_df$z.ratio
ResultTable[19:21,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Link to agg


#PSEUDOMONAS
print("PSEUDOMONAS ")
print("Pseudomonas psychrotolerans - OTU 366")
count <- c(134, 102, 45)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL)
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[22:24,1] <- "Pseudomonas"
ResultTable[22:24,2] <- "Pseudomonas psychrotolerans - OTU 366"
ResultTable[22:24,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[22:24,4] <- EmmRes_df$estimate
ResultTable[22:24,4] <- EmmRes_df$estimate
ResultTable[22:24,5] <- EmmRes_df$SE
ResultTable[22:24,6] <- EmmRes_df$df
ResultTable[22:24,7] <- EmmRes_df$z.ratio
ResultTable[22:24,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Linked to aggression


print("Pseudomonas sp. - OTU 1420")
count <- c(73, 3, 12)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh)
con1 <- contrast(emms1)
pairs(con1, by = NULL)
cat("\n"); cat("\n"); cat("\n"); cat("\n")


print("Pseudomonas sp. - OTU 2442")
count <- c(95, 30, 34)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
EmmRes <- pairs(con1, by = NULL)
EmmRes_df <- as.data.frame(summary(EmmRes))
ResultTable[25:27,1] <- "Pseudomonas"
ResultTable[25:27,2] <- "Pseudomonas sp. - OTU 2442"
ResultTable[25:27,3] <- c("started aggression vs reacted aggressively",
                        "started aggression vs reacted peacefully",
                        "reacted aggressively vs reacted peacefully")
ResultTable[25:27,4] <- EmmRes_df$estimate
ResultTable[25:27,4] <- EmmRes_df$estimate
ResultTable[25:27,5] <- EmmRes_df$SE
ResultTable[25:27,6] <- EmmRes_df$df
ResultTable[25:27,7] <- EmmRes_df$z.ratio
ResultTable[25:27,8] <- EmmRes_df$p.value
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Linked to started aggression


print("Rhizobiales sp. - OTU 588")
count <- c(102, 110, 94)
beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully")
df_micro <- data.frame(beh, count)
df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression")
mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson)
summary(mod1)
emms1 <- emmeans(mod1, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 
cat("\n"); cat("\n"); cat("\n"); cat("\n")
#Exclude, not sig


###---###---###---###---###---###---###
### Combine data and correct for multiple testing  ####
###---###---###---###---###---###---###
#Combine data
p_vals <- c(0.0947, 0.000001, 0.000001, #OTU 366
            0.000001, 0.000001, 0.0805, #OTU 1420
            0.000001, 0.000001, 0.8715,  #OTU 2442
            0.000001, 0.000001, 0.9841,  #OTU 22324
            0.000001, 0.1182, 0.000001,  #OTU 18107
            0.000001, 0.000001, 0.0018,  #OTU 20448
            0.9659, 0.0007, 0.0016,  #OTU 813
            0.7633, 0.0256, 0.0031)  #OTU 21141
p_vals_adj <- p.adjust(p_vals)

cbind(c(rep("Pseudomonas psychrotolerans - O_366",3),
        rep("Pseudomonas sp. - O_1420",3),
        rep("Pseudomonas sp. - O_2442",3),
        rep("Bacteroides - O_22324",3),
        rep("Prevotella sp. - O_18107",3), #
        rep("Prevotella sp. - O_20448",3),
        rep("Lactobacillus mucosae -  O_813",3),
        rep("Lactobacillus sp. - O_21141",3)),
      rep(c("start_agg vs react_agg", "start_agg vs react_pcf", "react_agg vs react_pcf"), 8),
      p_vals, p_vals_adj)
sink()

rio::export(ResultTable, "OTU_emmeans_Results.xlsx")

###---###---###---###---###
## Plot OTUs after binomial logistic regression MS####
###---###---###---###---###

###---###---###---###---###---###---###
### Load packages, setwd, and load data ####
###---###---###---###---###---###---###
library(ggplot2); library(ggpubr);
#setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts")
setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

#Same as above but with sorted OUTs based on RowSums of all inds tested, i.e. number of OTUs found in each inds summed up; only used the first 10% of all OTUs, i.e. 2243
#sorted OTUs based on rowSums
OTU_count_plot_data <- readxl::read_xlsx("ForR_otu_metadata_rarefied.xlsx",
                                         sheet="OTU_sum_plot") 


###---###---###---###---###---###---###
### Prepare and plot data and save as pdf ####
###---###---###---###---###---###---###
OTU_count_plot_data$keep
OTU_count_plot_data_red <- subset(OTU_count_plot_data, keep=="x")
OTU_count_plot_data_red

#plot data
level_order <- c('start_agg', 'react_agg', 'react_pcf') 

#Plot data
pdf("Fig3C_Plot_Counts_MS.pdf", width=8)
ggplot(OTU_count_plot_data_red, aes(x=factor(count_beh, level=level_order),
                                    y=counts, fill=count_beh)) +
  geom_bar(stat="identity") +
  facet_grid(~OTU_R) +
  scale_fill_manual(name="Behavioural states",
                    breaks=c("start_agg","react_agg","react_pcf"),
                    labels=c(expression(italic("Started aggression")),
                             expression(italic("Reacted aggressively")),
                             expression(italic("Reacted peacefully")) ),
                    values = c("#CC6677", "#DDCC77", "#332288")) +
  scale_x_discrete(guide = guide_axis(n.dodge=2))+
  ylab("OTU count") +
  xlab("Behavioural states") +
  theme_bw() +
  theme(plot.title = element_text(size=20),
        legend.position="bottom",
        axis.title.x = element_text(size=13),
        axis.text.x  = element_blank(), #element_text(size=11, color="black"),
        axis.title.y = element_text(size=13),
        axis.text.y  = element_text(size=11, color="black"),
        panel.grid.minor.x = element_blank())
dev.off()



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# Multinomial logistic regression - joint analysis using multiple data layer ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## Below, run the multinomial logistic regression

###---###---###---###---###---###---###---###
## Load packages, setwd, and load data ####
###---###---###---###---###---###---###---###
library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library("AICcmodavg"); library("DescTools")

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")
GeneCounts_MultiNom <- readxl::read_xlsx("ForR_GeneCounts.xlsx",
                                   sheet=1) 

#Multinomial logistic regression uses categories as dependent factors to conduct a logistic regression where the response variable is log-transformed, here "beh" with three levels. 
#we relevel start_agg as reference/baseline for models below
GeneCounts_MultiNom$beh <- relevel(as.factor(GeneCounts_MultiNom$beh), ref = "start_agg")


###---###---###---###---###---###---###---###---###
## UP- AND DOWN-REGULATED GENES ####
###---###---###---###---###---###---###---###---###
#Below, we only use genes that were found in both comparisons, started aggression vs reacted aggressively and started aggression vs reacted peacefully (i.e. Models1-8)

###---###---###---###---###---###---###
### Select variables in the models ####
###---###---###---###---###---###---###
Mod1 <- multinom(beh ~ 1, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod2 <- multinom(beh ~ scaffold_11_2457277_coded +
                   scaffold_163_28302_coded +
                   scaffold_185_110741_coded,
                 data = GeneCounts_MultiNom, model=TRUE)

Mod3 <- multinom(beh ~ MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod4 <- multinom(beh ~ scaffold_11_2457277_coded +
                   scaffold_163_28302_coded + 
                   scaffold_185_110741_coded + MSTRG.5450 + MSTRG.4267 +
                   MSTRG.5109 + MSTRG.6709, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod5 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + 
                   nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                   Temp_meanYear + TempMax_WarmestMonth, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod6 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                   scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + 
                   nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                   Temp_meanYear + TempMax_WarmestMonth, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod7 <- multinom(beh ~ MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709 + 
                   related_within_QG_prelim+TAS + CHC_PC1 + 
                   nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + 
                   nitrogen_soil + Precip_annual + Precip_WarmestQuarter + 
                   Temp_meanYear + TempMax_WarmestMonth, 
                 data = GeneCounts_MultiNom, model=TRUE)

Mod8 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                   scaffold_185_110741_coded + MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709 + 
                   related_within_QG_prelim + TAS+CHC_PC1 + 
                   nitrogen_soil + Precip_annual + Precip_WarmestQuarter + 
                   Temp_meanYear + TempMax_WarmestMonth, 
                 data = GeneCounts_MultiNom, model=TRUE)


###---###---###---###---###---###---###
### Save model fit information ####
###---###---###---###---###---###---###
sink("ModComp_Results_MS.txt")
print("Model comparison")
anova(Mod1, Mod2, Mod3, Mod4, Mod5, Mod6, Mod7, Mod8, test="Chisq")

#PLEASE NOTE that the model ranking is different to how we used it, e.g.
#anova(ZeroMod, Mod5, test="Chisq")
#LR (log-likelihood) is a measure of how much unexplained variability there is in the data. Therefore, the difference or change in log-likelihood indicates how much new variance has been explained by the model.
#The chi-square test tests the decrease in unexplained variance from the baseline model (408.1933) to the final model (333.9036), which is a difference of 408.1933 - 333.9036 = 74.29. This change is significant, which means that our final model explains a significant amount of the original variability.
#The likelihood ratio chi-square of 74.29 with a p-value < 0.001 tells us that our model as a whole fits significantly better than an empty or null model (i.e., a model with no predictors).

###---###---###---###---###---###---###
### Model comparisons, fit, and p-value testing ####
###---###---###---###---###---###---###
## Model comparison using AICc
print("AICc_models")
mod_list <- list(Mod2=Mod2, Mod3=Mod3, Mod4=Mod4, Mod5=Mod5, Mod6=Mod6, Mod7=Mod7, Mod8=Mod8) 
print(mod_list)
aictab(mod_list)


## Calculate the Goodness of fit
# Check the predicted probability for each program
print("Goodness_of_fit_Mod2")
head(Mod2$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod2),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod2))

print("Goodness_of_fit_Mod4")
head(Mod4$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod4),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod4))

## Calculate and p-value using z-wald test for Model 2
print("summary_Model2")
summary(Mod2)
print("z-value_Model2")
z <- summary(Mod2)$coefficients/summary(Mod2)$standard.errors
z
# 2-tailed z test to calculate p-values using Wald Z test
print("p-value_Model2")
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
#We see above that scaffold 2 and 3 (gene gd have an effect on the behaviours)

## Extract data for the variables for Model 2
summary(Mod2)
exp(-3.106913) #intercept
exp(-1.727618) #intercept
exp(0.23679451) #1.3 times higher SNP1
exp(-0.01633534) #1.00 times higher SNP1
exp(3.451271) #31.5 times higher SNP2
exp(2.466562) #11.8 times higher SNP2
exp(-2.457875) #0.1 times fewer SNP3
exp(-1.953764) #0.1 times fewer SNP3


## Calculate and p-value using z-wald test for Model 4
print("summary_Model4")
summary(Mod4)
print("z-value_Model4")
z <- summary(Mod4)$coefficients/summary(Mod4)$standard.errors
z
print("p-value_Model4")
# 2-tailed z test to calculate p-values using Wald Z test
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
#We see above that scaffold 2 and 3 (SNP3 = gastrulation effectice) have an effect on the behaviours

## Extract data for the variables for Model 4
summary(Mod4)
exp(-4.064435) #intercept
exp(-1.074275 ) #intercept
exp(0.1945963) #1.2 times higher SNP1
exp(-0.5279522) #0.6 times fewer SNP1
exp(3.668872) #39.2 times higher SNP2 sig
exp(2.464436) #11.8 times higher SNP2 sig
exp(-2.833251) #0.1 times fewer SNP3 sig
exp(-1.960591) #0.1 times fewer SNP3 sig
exp(0.0001688561) #1.00 times higher MSTRG.5450
exp(0.0002718880) #1.00 times higher MSTRG.5450
exp(0.0028886672) #1.00 times higher MSTRG.4267
exp(0.000503833) #1.00 times higher MSTRG.4267
exp(-0.005598530) #1.00 times fewer MSTRG.5109
exp(-0.003815167) #1.00 times fewer MSTRG.5109
exp(0.005739344) #1.00 times higher MSTRG.6709 sig
exp(0.006348842) #1.00 times higher MSTRG.6709 sig


###---###---###---###---###---###---###
### Calculate Pseudo-R2 and Log-likelihood ratio tests ####
###---###---###---###---###---###---### 
## Calculate the Pseudo R-Square
# Load the DescTools package for calculate the R square
# Calculate the R Square
print("PseudpR2_models")
PseudoR2(Mod2, which = c("Nagelkerke")) #only SNPs
PseudoR2(Mod3, which = c("Nagelkerke"))
PseudoR2(Mod4, which = c("Nagelkerke")) #only environment
PseudoR2(Mod5, which = c("Nagelkerke"))
PseudoR2(Mod6, which = c("Nagelkerke"))
PseudoR2(Mod7, which = c("Nagelkerke"))
PseudoR2(Mod8, which = c("Nagelkerke"))
#Interpretation of the R-Square:
##The Nagelkerke modification that does range from 0 to 1 is a more reliable measure of the relationship. Nagelkerke’s R2 will normally be higher than the Cox and Snell measure. If the result is 0.49, it indicates a relationship of 49% between the predictors and the prediction.


## Likelihood Ratio Tests
## Use the lmtest package to run Likelihood Ratio Tests
#library(lmtest)
#The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. 
print("lrtest Mod2")
lrtest(Mod2, "scaffold_11_2457277_coded") # 
lrtest(Mod2, "scaffold_163_28302_coded") # sig
lrtest(Mod2, "scaffold_185_110741_coded") #gd #sig
print("lrtest Mod4")
lrtest(Mod4, "scaffold_11_2457277_coded") # 
lrtest(Mod4, "scaffold_163_28302_coded") # sig
lrtest(Mod4, "scaffold_185_110741_coded") #gd #sig
lrtest(Mod4, "MSTRG.5450") #gene_CG3061
lrtest(Mod4, "MSTRG.4267") #gene_silver
lrtest(Mod4, "MSTRG.5109") #gene_Syt4
lrtest(Mod4, "MSTRG.6709") #gene_yellowd2  *** sig


#Interpretation of the Likelihood Ratio Tests
#It leaves the value used in parantheses out and compares the "full" model and this "leave-on-out" model. If it is significant, then this factor significantly explains part of the model. In our case, it tells us that two scaffolds had a significant main effect on the behaviours.
#These likelihood statistics can be seen as sorts of overall statistics that tell us which predictors significantly enable us to predict the outcome category, but they don’t really tell us specifically what the effect is. To see this we have to look at the individual parameter estimates.


###---###---###---###---###---###---###
###  Comparing the three behavioural states using emmeans and calculate odds ratios ####
###---###---###---###---###---###---###
#### Model 2
print("Mod2")
emms1 <- emmeans(Mod2, ~ beh) 
con1 <- contrast(emms1)
confint(emms1)
pairs(con1, by = NULL) #, interaction = "pairwise"
##What does the emmeans calculation above mean?: using the coded variables of scaffolds, there is a sig differences between reactagg and start agg and reactpcf and start agg. 

#### Model 4
print("Mod4")
emms1 <- emmeans(Mod4, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 


### Calculate odds ratio and data
#### Model 2
# First, we extract the  coefficients
print("Mod2")
coefs <- coef(Mod2)        # matrix: outcomes x predictors
vc <- vcov(Mod2)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod2, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit

#### Model 4
print("Mod4")
coefs <- coef(Mod4)        # matrix: outcomes x predictors
vc <- vcov(Mod4)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod4, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit
sink()

#Below, we save all information from Model 4 in a list and ...
lr_results <- list(
  scaffold_11_2457277_coded = lrtest(Mod4, update(Mod4, . ~ . - scaffold_11_2457277_coded)),
  scaffold_163_28302_coded = lrtest(Mod4, update(Mod4, . ~ . - scaffold_163_28302_coded)),
  scaffold_185_110741_coded  = lrtest(Mod4, update(Mod4, . ~ . - scaffold_185_110741_coded)),
  MSTRG.5450 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.5450)),
  MSTRG.4267   = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.4267 )),
  MSTRG.5109 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.5109)),
  MSTRG.6709  = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.6709))
)

# ... combine the information in a data frame ...
lr_df <- do.call(rbind, lapply(lr_results, function(x) {
  data.frame(
    Predictor = rownames(x)[2],
    Chisq     = x$Chisq[2],
    pvalue    = x$`Pr(>Chisq)`[2]
  )
}))
lr_df
lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity
lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data
lr_df_doubled

# ... which we save as an Escel file 
plot_df_doubled <- tidy_fit %>%
  left_join(lr_df_doubled, by = "term") %>%
  mutate(
    signif_label = case_when(
      pvalue < 0.001 ~ "***",
      pvalue < 0.01  ~ "**",
      pvalue < 0.05  ~ "*",
      TRUE ~ ""
    ),
    chisq_label = paste0("χ²=", round(Chisq, 1))
  )
plot_df_doubled
plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values
plot_df
rio::export(plot_df, "Mod4.xlsx")


###---###---###---###---###---###---###---###---###
## ONLY UP-REGULATED GENES ####
###---###---###---###---###---###---###---###---###
#Below, we only use up-regulated genes (i.e. Models9-16)

###---###---###---###---###---###---###
###  Select variables in the models ####
###---###---###---###---###---###---###
Mod9 <- multinom(beh ~ 1, 
                   data = GeneCounts_MultiNom, model=TRUE)
Mod10 <- multinom(beh ~ scaffold_11_2457277_coded +
                    scaffold_163_28302_coded +
                    scaffold_185_110741_coded,
                   data = GeneCounts_MultiNom, model=TRUE)

Mod11 <- multinom(beh ~ MSTRG.9806 + MSTRG.13575 + MSTRG.13555, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod12 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded +  MSTRG.9806 + MSTRG.13575 + MSTRG.13555,
                   data = GeneCounts_MultiNom, model=TRUE)

Mod13 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter + 
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod14 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod15 <- multinom(beh ~ MSTRG.9806 + MSTRG.13575 + MSTRG.13555 + 
                     related_within_QG_prelim+TAS + CHC_PC1 + 
                     nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod16 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded + MSTRG.9806 + MSTRG.13575 + MSTRG.13555 + 
                     related_within_QG_prelim + TAS+CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

###---###---###---###---###---###---###
### Save model fit information ####
###---###---###---###---###---###---###
sink("ModComp_Results_UPREG.GENE_MS.txt")
print("Model comparison")
anova(Mod9, Mod10, Mod11, Mod12, Mod13, Mod14, Mod15, Mod16, test="Chisq")


###---###---###---###---###---###---###
### Model comparisons, fit, and p-value testing ####
###---###---###---###---###---###---###
# Model comparison using AICc
mod_list <- list(Mod9=Mod9, Mod10=Mod10, Mod12=Mod12, Mod13=Mod13, Mod14=Mod14, Mod15=Mod15, Mod16=Mod16) 
print(mod_list)
aictab(mod_list)


## Calculate the Goodness of fit
# Check the predicted probability for each program
print("Goodness_of_fit_Mod10")
head(Mod10$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod10),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod10))

print("Goodness_of_fit_Mod12")
head(Mod12$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod12),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod12))

## Calculate and p-value using z-wald test
# Model 10
print("summary_Mod10")
summary(Mod10)
print("z-value_Mod10")
z <- summary(Mod10)$coefficients/summary(Mod10)$standard.errors
z
# 2-tailed z test to calculate p-values using Wald Z test
print("p-value_Mod10")
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p


# Model 12
print("summary_Mod12")
summary(Mod12)
print("z-value_Mod12")
z <- summary(Mod12)$coefficients/summary(Mod12)$standard.errors
z
print("p-valueMod12")
# 2-tailed z test to calculate p-values using Wald Z test
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p


# Extract data from the model fit from Model 12
summary(Mod12)
exp(1.21) #3.4 times higher SNP1
exp(1.27) #3.5 times higher SNP1
exp(3.47) #32.1 times higher SNP2
exp(2.43) #11.4 times higher SNP2
exp(0.79) #2.2 times fewer SNP3
exp(0.75) #2.1 times fewer SNP3


###---###---###---###---###---###---###
### Calculate Pseudo-R2 and Log-likelihood ratio tests ####
###---###---###---###---###---###---### 
#Calculate the Pseudo R-Square
print("PseudoR2_models")
PseudoR2(Mod10, which = c("Nagelkerke")) #only SNPs
PseudoR2(Mod11, which = c("Nagelkerke"))
PseudoR2(Mod12, which = c("Nagelkerke")) #only 
PseudoR2(Mod13, which = c("Nagelkerke"))
PseudoR2(Mod14, which = c("Nagelkerke"))
PseudoR2(Mod15, which = c("Nagelkerke"))
PseudoR2(Mod16, which = c("Nagelkerke"))


#Likelihood Ratio Tests
## Use the lmtest package to run Likelihood Ratio Tests
#library(lmtest)
#The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. 
#Model 10
print("lrtest Mod10")
lrtest(Mod10, "scaffold_11_2457277_coded") # 
lrtest(Mod10, "scaffold_163_28302_coded") # sig
lrtest(Mod10, "scaffold_185_110741_coded") #gd #sig

#Model 12
print("lrtest Mod12")
lrtest(Mod12, "scaffold_11_2457277_coded") # 
lrtest(Mod12, "scaffold_163_28302_coded") # sig
lrtest(Mod12, "scaffold_185_110741_coded") #gd #sig
lrtest(Mod12, "MSTRG.9806") #gene_CNBP
lrtest(Mod12, "MSTRG.13575") #gene_CG3902 
lrtest(Mod12, "MSTRG.13555") #gene_CG34367 #sig 


###---###---###---###---###---###---###
###  Comparing the three behavioural states using emmeans and calculate odds ratios ####
###---###---###---###---###---###---###
## Comparing the three behavioural states using emmeans
#Model 10
print("Mod10")
emms1 <- emmeans(Mod10, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 

# Model 12
print("Mod12")
emms1 <- emmeans(Mod12, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 


## Calculate odds ratio and data 
# Model 10
# First, we extract the  coefficients
print("Mod10")
coefs <- coef(Mod10)        # matrix: outcomes x predictors
vc <- vcov(Mod10)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod10, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit

# Model 12
print("Mod12")
coefs <- coef(Mod12)        # matrix: outcomes x predictors
vc <- vcov(Mod12)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod12, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit
sink()

#then, we combine everything from Model 12, the selected model, into one list
lr_results <- list(
  scaffold_11_2457277_coded = lrtest(Mod12, update(Mod12, . ~ . - scaffold_11_2457277_coded)),
  scaffold_163_28302_coded = lrtest(Mod12, update(Mod12, . ~ . - scaffold_163_28302_coded)),
  scaffold_185_110741_coded  = lrtest(Mod12, update(Mod12, . ~ . - scaffold_185_110741_coded)),
  MSTRG.9806 = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.9806)),
  MSTRG.13575   = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.13575 )),
  MSTRG.13555 = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.13555))
)


lr_df <- do.call(rbind, lapply(lr_results, function(x) {
  data.frame(
    LogLik = x$LogLik[2],
    Predictor = rownames(x)[2],
    Chisq     = x$Chisq[2],
    pvalue    = x$`Pr(>Chisq)`[2]
  )
}))
lr_df
lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity
lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data
lr_df_doubled


plot_df_doubled <- tidy_fit %>%
  left_join(lr_df_doubled, by = "term") %>%
  mutate(
    signif_label = case_when(
      pvalue < 0.001 ~ "***",
      pvalue < 0.01  ~ "**",
      pvalue < 0.05  ~ "*",
      TRUE ~ ""
    ),
    chisq_label = paste0("χ²=", round(Chisq, 1))
  )
plot_df_doubled
plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values
plot_df

#and save everything in an Excel-file
rio::export(plot_df, "Mod12.xlsx")


###---###---###---###---###---###---###---###---###
## ONLY DOWN-REGULATED GENES ####
###---###---###---###---###---###---###---###---###
## Below, we only use downregulated genes (i.e. Models17-24)

###---###---###---###---###---###---###
### Select variables for each model ####
###---###---###---###---###---###---###
Mod17 <- multinom(beh ~ 1, 
                   data = GeneCounts_MultiNom, model=TRUE)
Mod18 <- multinom(beh ~ scaffold_11_2457277_coded + 
                    scaffold_163_28302_coded + 
                    scaffold_185_110741_coded,
                   data = GeneCounts_MultiNom, model=TRUE)

Mod19 <- multinom(beh ~ MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + MSTRG.4186 +
                     MSTRG.14246 + MSTRG.12676, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod20 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded +  MSTRG.7840 + MSTRG.16119 + MSTRG.15669 +
                     MSTRG.15080 + MSTRG.4186 +  MSTRG.14246 + MSTRG.12676,
                   data = GeneCounts_MultiNom, model=TRUE)

Mod21 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter + 
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod22 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod23 <- multinom(beh ~ MSTRG.7840 + MSTRG.16119 + MSTRG.15669 +
                    MSTRG.15080 + MSTRG.4186 +  MSTRG.14246 +  
                     + MSTRG.12676 + related_within_QG_prelim+TAS + CHC_PC1 + 
                     nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth, 
                   data = GeneCounts_MultiNom, model=TRUE)

Mod24 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + 
                     scaffold_185_110741_coded + MSTRG.7840 + MSTRG.16119 + 
                    MSTRG.15669 + MSTRG.15080 + MSTRG.4186 +  MSTRG.14246 + 
                    MSTRG.12676 + related_within_QG_prelim + TAS+CHC_PC1 + 
                     nitrogen_soil + Precip_annual + Precip_WarmestQuarter +
                    Temp_meanYear + TempMax_WarmestMonth,
                   data = GeneCounts_MultiNom, model=TRUE)


###---###---###---###---###---###---###
### Save model fit information ####
###---###---###---###---###---###---###
#Extract model fit information
sink("ModComp_Results_DOWNREG.GENE_MS.txt")
print("Model comparison")
anova(Mod17, Mod18, Mod19, Mod20, Mod21, Mod22, Mod23, Mod24, test="Chisq")


###---###---###---###---###---###---###
### Model comparisons, fit, and p-value testing ####
###---###---###---###---###---###---###
# Model comparison using AICc
mod_list <- list(Mod18=Mod18, Mod19=Mod19, Mod20=Mod20, Mod21=Mod21, Mod22=Mod22, Mod23=Mod23, Mod24=Mod24) 
print(mod_list)
aictab(mod_list)

# Calculate the Goodness of fit
#Model 18
# Check the predicted probability for each program
print("Goodness_of_fit_Mod18")
head(Mod18$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod18),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod18))

#Model 20
print("Goodness_of_fit_Mod20")
head(Mod20$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod20),82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod20))

## Calculate and p-value using z-wald test
#Model 18
print("summary_Mod18")
summary(Mod18)
print("z-value_Mod18")
z <- summary(Mod18)$coefficients/summary(Mod18)$standard.errors
z
# 2-tailed z test to calculate p-values using Wald Z test
print("p-value_Mod18")
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
#We see above that scaffold 2 and 3 (gastrulcation effect have an effect on the behaviours)

#Model 20
print("summary_Mod20")
summary(Mod20)
print("z-value_Mod20")
z <- summary(Mod20)$coefficients/summary(Mod20)$standard.errors
z
print("p-value_Mod20")
# 2-tailed z test to calculate p-values using Wald Z test
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
#

#extract data from Model 20
exp(0.44) #1.6 times higher SNP1
exp(0.45) #1.6 times higher SNP1
exp(3.99) #54.1 times higher SNP2
exp(3.44) #31.2 times higher SNP2
exp(-4.94) #0.01 times fewer SNP3
exp(-4.17) #0.02 times fewer SNP3
exp(0.66) #1.9 times higher 7840
exp(0.66) #1.9 times higher 7840
exp(0.23) #1.3 times higher 16119
exp(-0.12) #0.9 times fewer 16119
exp(0.11) #1.3 times higher 4186 
exp(0.18) #0.9 times higher 4186 
exp(0.25) #1.3 times higher 12676 
exp(0.24) #1.3 times higher 12676 


###---###---###---###---###---###---###
### Calculate Pseudo-R2 and Log-likelihood ratio tests ####
###---###---###---###---###---###---### 
## Calculate the Pseudo R-Square
print("PseudoR2_models")
PseudoR2(Mod18, which = c("Nagelkerke")) #only SNPs
PseudoR2(Mod19, which = c("Nagelkerke"))
PseudoR2(Mod20, which = c("Nagelkerke")) #
PseudoR2(Mod21, which = c("Nagelkerke"))
PseudoR2(Mod22, which = c("Nagelkerke"))
PseudoR2(Mod23, which = c("Nagelkerke"))
PseudoR2(Mod24, which = c("Nagelkerke"))

## Calculate Likelihood Ratio Tests
## Use the lmtest package to run Likelihood Ratio Tests
#library(lmtest)
#The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. 
print("lrtest Mod18")
lrtest(Mod18, "scaffold_11_2457277_coded") # 
lrtest(Mod18, "scaffold_163_28302_coded") # sig
lrtest(Mod18, "scaffold_185_110741_coded") #gd #sig
print("lrtest Mod20")
lrtest(Mod20, "scaffold_11_2457277_coded") # 
lrtest(Mod20, "scaffold_163_28302_coded") # sig
lrtest(Mod20, "scaffold_185_110741_coded") #gd #sig
lrtest(Mod20, "MSTRG.7840") #gene_BiCc +sig
lrtest(Mod20, "MSTRG.16119") #gene_Pif1 +sig
lrtest(Mod20, "MSTRG.15669") #gene_Exo84  
lrtest(Mod20, "MSTRG.15080") #gene_PlexinA  
lrtest(Mod20, "MSTRG.4186") #gene_KaiE1d  
lrtest(Mod20, "MSTRG.14246") #gene_Rd 
lrtest(Mod20, "MSTRG.12676") #gene_RfC3  *sig


###---###---###---###---###---###---###
###  Comparing the three behavioural states using emmeans and calculate odds ratios ####
###---###---###---###---###---###---###
## comparing the three behavioural states using emmeans
#Model 18
print("Mod18")
emms1 <- emmeans(Mod18, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) #, interaction = "pairwise"
 

#Model 20
print("Mod20")
emms1 <- emmeans(Mod20, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) 


##Calculate odds ratio and data
#Model 18
# First, we extract the  coefficients
print("Mod18")
coefs <- coef(Mod18)        # matrix: outcomes x predictors
vc <- vcov(Mod18)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod18, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit

#Model 20
print("Mod20")
coefs <- coef(Mod20)        # matrix: outcomes x predictors
vc <- vcov(Mod20)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod20, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts
tidy_fit
sink()

#Combine all data from Model 20 into a data frame and ...
lr_results <- list(
  scaffold_11_2457277_coded = lrtest(Mod20, update(Mod20, . ~ . - scaffold_11_2457277_coded)),
  scaffold_163_28302_coded = lrtest(Mod20, update(Mod20, . ~ . - scaffold_163_28302_coded)),
  scaffold_185_110741_coded  = lrtest(Mod20, update(Mod20, . ~ . - scaffold_185_110741_coded)),
  MSTRG.7840 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.7840)),
  MSTRG.16119   = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.16119 )),
  MSTRG.15669 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.15669)),
  MSTRG.15080 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.15080)),
  MSTRG.4186   = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.4186 )),
  MSTRG.14246 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.14246)),
  MSTRG.12676  = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.12676))
)


lr_df <- do.call(rbind, lapply(lr_results, function(x) {
  data.frame(
    LogLik = x$LogLik[2],
    Predictor = rownames(x)[2],
    Chisq     = x$Chisq[2],
    pvalue    = x$`Pr(>Chisq)`[2]
  )
}))
lr_df
lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity
lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data
lr_df_doubled

plot_df_doubled <- tidy_fit %>%
  left_join(lr_df_doubled, by = "term") %>%
  mutate(
    signif_label = case_when(
      pvalue < 0.001 ~ "***",
      pvalue < 0.01  ~ "**",
      pvalue < 0.05  ~ "*",
      TRUE ~ ""
    ),
    chisq_label = paste0("χ²=", round(Chisq, 1))
  )
plot_df_doubled
plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values
plot_df

###save it as an Excel file
rio::export(plot_df, "Mod20.xlsx")



###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
# FINAL MODEL: only significant SNPs and DEGs ####
###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###
## This approach uses only the significant SNPs and differentially-expressed genes based on the results of the previous analyses

library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library("AICcmodavg"); library("DescTools"); library("tidyr"); library("dplyr"); library(ggrepel); library(broom)

setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/")

GeneCounts_MultiNom <- readxl::read_xlsx("ForR_GeneCounts_final.xlsx",
                                   sheet=1) ##THIS is the file that combines all information

#Multinomial logistic regression uses categories as dependent factors to conduct a logistic regression where the response variable is log-transformed, here "beh" with three levels. 
#we relevel start_agg as reference/baseline for models below
GeneCounts_MultiNom$beh <- relevel(as.factor(GeneCounts_MultiNom$beh), ref = "start_agg")

###---###---###---###---###---###---###
### Select variables ####
###---###---###---###---###---###---###
#Fit models
#Intercept only model
Mod_f1 <- multinom(beh ~ 1, 
                   data = GeneCounts_MultiNom, model=TRUE)

#Final model
Mod_f4_sig <- multinom(beh ~ scaffold_163_28302_coded +
                         scaffold_185_110741_coded +
                         MSTRG.6709 + #in both 
                         MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + #down-reg 
                         MSTRG.4186 + MSTRG.14246 + MSTRG.12676 ,  #down-reg 
                       data = GeneCounts_MultiNom, model=TRUE)

mod_list <- list(Mod_f1=Mod_f1, Mod_f4_sig=Mod_f4_sig) #, Mod9=Mod9
print(mod_list)
aictab(mod_list)
#best, Mod_f2 (only SNPs) followed by Modf3


###---###---###---###---###---###---###
### Model comparisons, fit, and p-value testing ####
###---###---###---###---###---###---###

#Calculate the Goodness of fit
# Check the predicted probability for each program
sink("FinalResults_MS.txt")
print("Goodness_of_fit_Mod_f4_sig")
head(Mod_f4_sig$fitted.values, 82)
# We can get the predicted result by use predict function
head(predict(Mod_f4_sig), 82)
# Test the goodness of fit
chisq.test(GeneCounts_MultiNom$beh, predict(Mod_f4_sig))


#Calculate and p-value using z-wald test
print("summary_Mod_f4_sig")
summary(Mod_f4_sig)
print("z-value_Mod_f4_sig")
z <- summary(Mod_f4_sig)$coefficients/summary(Mod_f4_sig)$standard.errors
z
# 2-tailed z test to calculate p-values using Wald Z test
print("p-value_Mod_f4_sig")
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p


#Extract data
summary(Mod_f4_sig)
exp(4.896033) #chance of finding SNP2 133.8 times higher in react_agg than start_agg
exp(3.797238) #chance of finding SNP2 44.6 times higher in react_pcf than start_agg
exp(-4.129057) #... SNP3 0.01 times lower in react_agg than start_agg
exp(-3.810066) #... SNP3 0.02 times lower in react_pcf than start_agg
exp(0.0258754) # MSTRG.6709 1.02 times higher
exp(0.02150042) #MSTRG.6709 1.022 times higher 
exp(0.5803252) # MSTRG.7840 1.8 times higher in ...
exp(0.5728261) #MSTRG.7840  1.8 times higher 
exp(0.3411982848) # MSTRG.16119 1.4 times higher 16119
exp(0.0007958548) #MSTRG.16119 1.0 times higher 16119  #NOT SIG
exp(-0.0002316689) #  MSTRG.15669 -1.0 times lower 4186 
exp(-0.0002322059) # MSTRG.15669 -1.0 times lower 4186 
exp(0.03221771) #MSTRG.15080 1.03 times higher   #NOT SIG
exp(-0.01099654 ) #MSTRG.15080 -1.0 times lower   #NOT SIG
exp(0.3068957) #  MSTRG.4186 1.4 times higher 
exp(0.3588560) # MSTRG.4186 1.4 times higher 
exp(-0.002389500 ) #  MSTRG.15669 -1.0 times lower  
exp(-0.00180066) # MSTRG.15669 -1.0 times lower  
exp(0.3159435) #  MSTRG.14246 1.4 times higher  
exp(0.2853418) # MSTRG.14246 1.3 times higher  


###---###---###---###---###---###---###
### Calculate Pseudo-R2 and Log-likelihood ratio tests ####
###---###---###---###---###---###---### 

#Calculate the Pseudo R-Square
print("PseudoR2_models")
PseudoR2(Mod_f4_sig, which = c("CoxSnell","Nagelkerke","McFadden")) #only SNPs

#Likelihood Ratio Tests
## Use the lmtest package to run Likelihood Ratio Tests
#library(lmtest)
#The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. 
print("lrtest Mod_f4_sig")
#lrtest(Mod_f4_sig, "scaffold_11_2457277_coded") # 
lrtest(Mod_f4_sig, "scaffold_163_28302_coded") # sig
lrtest(Mod_f4_sig, "scaffold_185_110741_coded") #gd #sig
lrtest(Mod_f4_sig, "MSTRG.6709") #gene_yellow-d2 #sig
lrtest(Mod_f4_sig, "MSTRG.7840") #gene_BiCc +sig
lrtest(Mod_f4_sig, "MSTRG.16119") #gene_Pif1 +sig
lrtest(Mod_f4_sig, "MSTRG.15669") #gene_Exo84  #sig
lrtest(Mod_f4_sig, "MSTRG.15080") #gene_PlexinA   #not sig
lrtest(Mod_f4_sig, "MSTRG.4186") #gene_KaiE1d  #sig
lrtest(Mod_f4_sig, "MSTRG.14246") #gene_Rd  #sig
lrtest(Mod_f4_sig, "MSTRG.12676") #gene_RfC3  *sig


###---###---###---###---###---###---###
###  Comparing the three behavioural states using emmeans and calculate odds ratios ####
###---###---###---###---###---###---###
#compares the three behaviours
print("Mod_f4_sig")
emms1 <- emmeans(Mod_f4_sig, ~ beh) 
con1 <- contrast(emms1)
pairs(con1, by = NULL) #, interaction = "pairwise"
confint(emms1)
sink()


#Plot odds ratio and data
# First, we extract the  coefficients
coefs <- coef(Mod_f4_sig)        # matrix: outcomes x predictors
vc <- vcov(Mod_f4_sig)           # variance-covariance matrix (flattened)

tidy_fit <- tidy(Mod_f4_sig, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)")   # drop intercepts

# tidy_fit has columns:
# outcome | term | estimate (OR) | conf.low | conf.high

#Likelihood ratio tests for each predictor
lr_results <- list(
  scaffold_163_28302_coded = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - scaffold_163_28302_coded)),
  scaffold_185_110741_coded  = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - scaffold_185_110741_coded)),
  MSTRG.6709 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.6709)),
  MSTRG.7840  = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.7840)),
  MSTRG.16119 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.16119)),
  MSTRG.15669  = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.15669)),
  MSTRG.15080 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.15080)),
  MSTRG.4186  = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.4186)),
  MSTRG.14246 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.14246)),
  MSTRG.12676  = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.12676))
)

lr_df <- do.call(rbind, lapply(lr_results, function(x) {
  data.frame(
    Predictor = rownames(x)[2],
    Chisq     = x$Chisq[2],
    pvalue    = x$`Pr(>Chisq)`[2]
  )
}))
lr_df
lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity
lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data
lr_df_doubled

plot_df_doubled <- tidy_fit %>%
  left_join(lr_df_doubled, by = "term") %>%
  mutate(
    signif_label = case_when(
      pvalue < 0.001 ~ "***",
      pvalue < 0.01  ~ "**",
      pvalue < 0.05  ~ "*",
      TRUE ~ ""
    ),
    chisq_label = paste0("χ²=", round(Chisq, 1))
  )
plot_df_doubled
plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values

#Re-order data set for plotting
plot_df$Behaviour <- ifelse(plot_df$y.level=="react_agg", "Reacted aggressively", 
                            ifelse(plot_df$y.level=="react_pcf", "Reacted peacefully", ""))
plot_df$SNPs_genes <- rep(c("SNP1", "SNP3", "yellow-d2", "BiCc", "Pif1", "Exo84", "PlexinA", "KaiR1d", "Rd", "RfC3"), 2)
plot_df$SNPs_genes <- factor(plot_df$SNPs_genes,
                                   levels = c("BiCc" , "Exo84", "KaiR1d", "Pif1", "PlexinA", "Rd",
                                              "RfC3",  "yellow-d2", "SNP3", "SNP1"))  # custom order

###---###---###---###---###---###---###
### Plot and save pdf ####
###---###---###---###---###---###---###
# Plot as forest-style dot plot
pdf("Fig3D_ForestPlot_wChiSquare_pval_MS_20250901.pdf")
ggplot(plot_df, aes(x = log(estimate), y = SNPs_genes,
                          color = Behaviour, fill=Behaviour)) +
  geom_point(size = 5, position = position_dodge(width = 0.7), pch=21) +
  geom_errorbarh(aes(xmin = log(conf.low), xmax = log(conf.high)),
                 height = 0.2, position = position_dodge(width = 0.7)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  #scale_x_log10() +
  scale_color_manual(values=c("black", "black")) +
  scale_fill_manual(values=c("#7336c2a9", "#fcfa18ec")) +
  labs(x = "Log-scale odds ratio", y = "SNPs or genes") +
  geom_text_repel(aes(label = signif_label),
                  nudge_x = 0.1, size = 6, color = "black") +
  geom_text_repel(aes(label = Chisq),
                  nudge_x = 2, size = 3, color = "black") +
  theme_bw(base_size = 14) +
  theme(plot.title = element_text(size=20),
        axis.title.x = element_text(size=15),
        axis.text.x  = element_text(size=13, color="black"),
        axis.title.y = element_text(size=15),
        axis.text.y  = element_text(size=13, color="black"),
        strip.text.x = element_text(size = 13, colour = "black"))
dev.off()

#Also, export the results of the final model
rio::export(plot_df, "FinalModel_results_MS.xlsx")


###---###---###---###---###---###---###---
##  Barplot for the up- and down-regulated genes  ####
###---###---###---###---###---###---###--
library(ggplot2)

setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts")

set2 <- readxl::read_xlsx("ForR_Gene_table.xlsx", sheet="barplot")
set1 <- subset(set2, use=="x")

pdf("Fig3E_log2fold_plot.pdf", height=5)
ggplot(set1, aes(x=log2FoldChange, y=geneNames, fill=function_plot))+  #
  geom_bar(stat="identity") +
  facet_grid(~behavioural_comp) +
  theme_bw() + 
  xlim(c(-3.5,1)) +
  ylab("Genes")+
  xlab("log2 fold change") +
  geom_vline(xintercept = 0, linetype=2) + #, color, size
  geom_text(aes(label=round(log2FoldChange, 2)), vjust=0) +
  scale_fill_manual(name="Function",
                    values=c("#2D708EFF", "#C2DF23FF")) +
  theme(plot.title = element_text(size=20),
        axis.title.x = element_text(size=15),
        axis.text.x  = element_text(size=13, color="black"),
        axis.title.y = element_text(size=15),
        axis.text.y  = element_text(size=13, color="black"),
        strip.text.x = element_text(size = 13, colour = "black"),
        legend.position="bottom",
  )
dev.off()

#END of script