###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # ANALYSIS FOR STUDY #### # "The gut microbiome, single nucleotide polymorphisms, and differentially expressed genes promote aggression in an ant" #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## ARTICLE AUTHORS: #Patrick Krapf, Francesco Cicconardi, Martin Schilling, Gerhard P. Aigner, Thomas Klammsteiner, Manfred Ayasse, Wolfgang Arthofer, Sasha (Alexander) Mikheyev, Birgit C. Schlick-Steiner, Florian M. Steiner ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Install and load all packages needed#### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### packages_needed <- c("ExactMultinom", "readxl", "rnaturalearth", "rnaturalearthdata", "ggspatial", "sf", "ggmap", "ggplot2", "stringr", "ggsn", "dplyr", "forcats", "ggfortify", "factoextra", "cluster", "qqman", "data.table", "tidyverse", "ggpubr", "vegan", "MASS", "edgeR", "DESeq2", "Glimma", "ggvenn", "raster", "rasterVis", "rworldxtra", "foreign", "nnet", "reshape2", "emmeans", "lmtest", "AICcmodavg", "DescTools", "tidyr", "ggrepel", "broom" ) #The code below, checks which packages are needed, whether they are installed. If not they are downloaded. pk_to_install <- packages_needed [!( packages_needed %in% rownames(installed.packages()) )] if(length(pk_to_install)>0 ){ install.packages(pk_to_install,repos="http://cran.r-project.org") } #Now, all packages are loaded. Not necessarily needed, as packages will be loaded specifically for each analysis as well. lapply(packages_needed, require, character.only = TRUE) #Now, all libraries should be loaded. You can check this, by using the search() function search() ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Blind bioassays / Recognition assays #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## Below, we analyse the results of the blind bioassays using a Multinomial Goodness-of-Fit Tests ## Note, we don't need to load this data as it is count data added below. library(ExactMultinom) observed_counts <- c(502, 423, 454) # observed counts for staying on own, alien, control areas on filter plates expected_counts <- rep(sum(observed_counts)/3, 3) # expected counts assuming equal probabilities # Perform the multinomial test result <- ExactMultinom::multinom.test(x = observed_counts, p = expected_counts) # Print the results print(result) ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Fig 1 - Map + inset #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(readxl); library("rnaturalearth"); library("rnaturalearthdata"); library("ggspatial"); library("sf"); library("ggmap"); library(ggplot2); library("stringr"); library("ggsn") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") dataset <- read_xlsx("ForR_DNA_samples.xlsx", sheet="EnvironmentalVars") # head(dataset) ###---###---###---###---###---###---### ## Data tidying #### ###---###---###---###---###---###---### dataset_map <- matrix(NA, nrow=9, ncol=1) dataset_map <- as.data.frame(dataset_map) dataset_map$lat <- unique(dataset$lat) dataset_map$lon <- unique(dataset$lon) dataset_map$V1 <- unique(dataset$nest) dataset_map ###---###---###---###---###---###---### ## Load Europe tiles with stadia maps #### ###---###---###---###---###---###---### ##Europe area <- c(left = 0, bottom = 35, right = 30 , top = 57) #all Europe map2 <- get_stadiamap(area, zoom = 5, maptype = "stamen_toner_lite", color="color") #, force = T ###---###---###---###---###---###---### ### Save pdf #### ###---###---###---###---###---###---### pdf("Fig1A_Europe_Plot_MS.pdf") #Europe_PW_20240610 former map ggmap(map2) + ylab("Latitude")+ xlab("Longitude")+ geom_point(aes(y=dataset$lat[1], x=dataset$lon[1]), size=4, fill="#7b7bc0ff", colour="black",pch=21) + geom_point(aes(y=dataset$lat[28], x=dataset$lon[28]) , size=4, fill="#32cd32ff", colour="black",pch=21) + geom_point(aes(y=dataset$lat[91], x=dataset$lon[91]) , size=4, fill="#ff6820ff", colour="black",pch=21) + scalebar(dataset, dist = 50, dist_unit = "km", transform = TRUE, model = "WGS84", location="bottomright", height=0.1)+ theme(plot.title = element_text(size=25), axis.title.x = element_text(size=17), axis.text.x = element_text(size=13, color="black"), axis.title.y = element_text(size=17), axis.text.y = element_text(size=13, color="black"), panel.border = element_rect(colour = "black", fill=NA, size=1), panel.grid.minor.x = element_blank()) dev.off() ###---###---###---###---###---###---### ## Create inset with detailed locations #### ###---###---###---###---###---###---### area <- c(left = 10.5, bottom = 46.50, right = 13.50 , top = 47.5) #Austria inset map3 <- get_stadiamap(area, zoom = 9, maptype = "stamen_terrain_background", color="color") ###---###---###---###---###---###---### ### Save pdf #### ###---###---###---###---###---###---### pdf("Fig1A_Inset_PlotAustria_MS.pdf") ggmap(map3)+ ylab("Latitude")+ xlab("Longitude")+ geom_point(aes(y=dataset_map$lat[1], x=dataset_map$lon[1]), size=4, fill="#7b7bc0ff", colour="black",pch=21) + #MQ-N geom_point(aes(y=dataset_map$lat[4], x=dataset_map$lon[4]), size=4, fill="#32cd32ff", colour="black",pch=21) + #SQ-N geom_point(aes(y=dataset_map$lat[9], x=dataset_map$lon[9]), size=4, fill="#f6820ff6", colour="black",pch=21) + #SQ-A geom_segment(aes(x = dataset_map$lon[4], y = dataset_map$lat[4], xend = dataset_map$lon[9], yend = dataset_map$lat[9])) + geom_segment(aes(x = dataset_map$lon[4], y = dataset_map$lat[4], xend = dataset_map$lon[3], yend = dataset_map$lat[3])) + geom_segment(aes(x = dataset_map$lon[3], y = dataset_map$lat[3], xend = dataset_map$lon[9], yend = dataset_map$lat[9])) + geom_point(aes(y= 47.259659, x=11.400375) , size=4, fill="black", colour="black",pch=21) + #Innsbruck annotate("text", y=47.259659, x=11.400375+0.3, label="Innsbruck", size=6, colour="black") + annotate("text", x = 11.25, y = 47.0, label = "40 km") + annotate("text", x = 12.0, y = 47.0, label = "140 km") + annotate("text", x = 12.25, y = 46.7, label = "125 km") + scalebar(dataset, dist = 50, dist_unit = "km", transform = TRUE, model = "WGS84", location="bottomright", height=0.1)+ theme(plot.title = element_text(size=25), axis.title.x = element_text(size=17), axis.text.x = element_text(size=13, color="black"), axis.title.y = element_text(size=17), axis.text.y = element_text(size=13, color="black"), panel.border = element_rect(colour = "black", fill=NA, size=1), panel.grid.minor.x = element_blank()) dev.off() ## Note: Fig1 B (schematic overview) was crated in BioRender and was added manually to Fig1. #END Fig1 ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Aggression analysis #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ##Below, we will analyse the data from the aggression assays. ###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(readxl); library(dplyr); library(ggplot2); library(forcats) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") dataset <- read_xlsx("ForR_DNA_samples.xlsx", sheet="samples109_") # head(dataset) #reduced dataset dataset_red <- dataset %>% filter(seq == "part1") %>% as_tibble() ###---###---###---###---###---###---### ## Create Figure for detailed aggression analysis #### ###---###---###---###---###---###---### #Create the plot labels labelsPlot <- c("Reacted aggressively", "Reacted peacefully", "Started aggression") names(labelsPlot) <- c("react_agg", "react_pcf", "start_agg") dataset_red$beh <- fct_relevel(dataset_red$beh, "start_agg", "react_agg", "react_pcf") # custom order for facet_grid ###---###---###---###---###---###---### ## Save pdf #### ###---###---###---###---###---###---### table(dataset_red$pop, dataset_red$beh) pdf("Fig2A_PW_beh_red_MS.pdf") ggplot(dataset_red, aes(x=pop, y=AI_worker, fill=pop))+ geom_boxplot() + geom_violin(alpha=0.2)+ geom_jitter(alpha=0.2)+ facet_grid(~beh, labeller=labeller(beh = labelsPlot)) + ylab("Behaviour Index (AI)")+ xlab("Populations")+ theme_bw()+ scale_y_continuous(breaks=seq(-4,5,1), limits = c(-4.2, 5.2))+ scale_fill_manual(values=c("#ffad5b", "#32cd32", "#7d9ec0"), breaks = c("SQ-A", "SQ-N", "MQ-N")) + # custom legend order scale_x_discrete(limits=c("SQ-A", "SQ-N", "MQ-N")) + theme(plot.title = element_text(size=20), axis.title.x = element_text(size=15), axis.text.x = element_text(size=13, color="black"), axis.title.y = element_text(size=15), axis.text.y = element_text(size=13, color="black"), strip.text.x = element_text(size = 13, colour = "black"), legend.position="none", panel.grid.minor.y = element_blank(), panel.grid.minor.x = element_blank()) dev.off() ###---###---###---###---###---###---### ## ANOVA: Test if aggression is significant different among populations #### ###---###---###---###---###---###---### mod1 <- aov(dataset_red$AI_worker ~ dataset_red$beh) summary(mod1) TukeyHSD(mod1) #END aggression assays ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # CHC PCA and hierarchical cluster analysis #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ###---###---###---###---###---###---###---###---###---### ## CHC PCA #### ###---###---###---###---###---###---###---###---###---### ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(ggfortify) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") CHC_data <- rio::import("./ForR_CHC_results.xlsx", which="values") head(CHC_data) length(CHC_data) test2<- prcomp(CHC_data[3:65]) summary(test2) ###---###---###---###---###---###---### ### Use ggfortify's autoplot to create PCA #### ###---###---###---###---###---###---### #Plot the PCA a_plot <- autoplot(prcomp(CHC_data[3:65]), data=CHC_data, frame = TRUE, frame.type = 'norm', colour = "pop", size = 3)+ theme_bw() # Save the PDF pdf("Fig2B_PCA_CHC_MS.pdf") a_plot + scale_color_manual(values = c("#ffad5b", "#32cd32", "#7d9ec0"), limits=c("SQ-A", "SQ-N", "MQ-N"), breaks=c("SQ-A", "SQ-N", "MQ-N"), labels=c("Pop SQ-A", "Pop SQ-N", "Pop MQ-N"), name="Populations") + scale_fill_manual(values = c("#ffad5b", "#32cd32", "#7d9ec0"), limits=c("SQ-A", "SQ-N", "MQ-N"), breaks=c("SQ-A", "SQ-N", "MQ-N"), labels=c("Pop SQ-A", "Pop SQ-N", "Pop MQ-N"), #"Pop A", "Pop N", "Pop S" name="Populations") + scale_x_continuous(breaks = c(seq(-0.4,0.4,0.2))) dev.off() #END PCA CHCs ##NOTE: Fig2 C-D were created using the VCF files and were added manually to Fig2. ###---###---###---###---###---###---###---###---###---### ## CHC HIERARCHICAL CLUSTER ANALYSIS #### ###---###---###---###---###---###---###---###---###---### ## Below, we will conduct a hierarchichal cluster analysis using the CHC data. ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(factoextra); library(cluster) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") # load data CHC_data <- rio::import("./ForR_CHC_results.xlsx", which="values") head(CHC_data) ###---###---###---###---###---###---### ### Find linkage method and conduct clustering #### ###---###---###---###---###---###---### #Find the Linkage Method to Use #define linkage methods m <- c( "average", "single", "complete", "ward") names(m) <- c( "average", "single", "complete", "ward") #function to compute agglomerative coefficient ac <- function(x) { agnes(CHC_data[3:65], method = x)$ac } #calculate agglomerative coefficient for each clustering linkage method sapply(m, ac) #We can see that Ward’s minimum variance method produces the highest agglomerative coefficient, thus we’ll use that as the method for our final hierarchical clustering: #perform hierarchical clustering using Ward's minimum variance clust <- agnes(CHC_data[3:65], method = "ward") ###---###---###---###---###---###---### #### Save pdf #### ###---###---###---###---###---###---### pdf("FigS1_CHC_Hierarch_Clust_Anal_MS_supp.pdf") pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram", labels=CHC_data$col) dev.off() #END Hierarchical cluster analysis ###---###---###---###---###---###---### # SNPs GEMMA (Genome Wide association analysis), PCA, Fisher's test #### ###---###---###---###---###---###---### ## Below, we load the results of the GEMMA analysis and create a Manhattan plot for the SNPs ###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(qqman); library(data.table) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") paramsAgg <- read.table("ForR_result.assoc.DNA_agg.txt", header = T) #is same as ldp_enq_lmm head(paramsAgg) ###---###---###---###---###---###---### ## Plot GEMMA aggression results #### ###---###---###---###---###---###---##ä chr1 <- as.numeric(unlist(lapply(strsplit(unlist(lapply(strsplit(paramsAgg$rs, "_"), '[[', 2)), ":"), "[[", 1))) ps1 <- unlist(lapply(strsplit(paramsAgg$rs, ":"), '[[', 3)) paramsAgg["chr"] <- chr1 paramsAgg$BP <- as.numeric(ps1) paramsAgg$CHR <- as.integer(paramsAgg$chr) paramsAgg$p_wald <- as.numeric(paramsAgg$p_wald) paramsAgg$SNP <- paramsAgg$rs ## NOTE: this may take some time, depending on your local machine AggManhattan <- manhattan(paramsAgg, p= 'p_wald') pdf("FigS2_ManhattanPlot_MS_Supp.pdf") manhattan(paramsAgg, p= 'p_wald') dev.off() #END Manhattan plot ###---###---###---###---###---###---### ## Plot allelic states for each of the SNPs #### ###---###---###---###---###---###---### ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(ggplot2); library(tidyverse); library(ggpubr) #Setwd and load data setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R2") #alternatively use "Data_R" head(data) data1 <- data # table(data1$scaffold_11_2457277_, data1$beh) table(data1$scaffold_163_28302_, data1$beh) table(data1$scaffold_185_110741_, data1$beh) #SNP1 p1 <- ggplot(data, aes(x=seq(1:109), y=scaffold_11_2457277, col=beh)) + geom_point(size=2) + xlab("Samples")+ ylab("SNP1 state")+ theme_bw()+ scale_color_manual(name="Behavioural\nstates", limits=c("start_agg","react_agg","react_pcf"), breaks=c("start_agg", "react_agg", "react_pcf"), labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"), values=c("#CC6677", "#DDCC77", "#332288")) + scale_x_continuous(breaks=c(seq(1:109)), labels=c(data$sample)) + theme(axis.text.x = element_text(angle=45, size=12)) #vjust=0.5, p1 #SNP2 p2 <- ggplot(data, aes(x=seq(1:109), y=scaffold_163_28302, col=beh)) + geom_point(size=2) + xlab("Samples")+ ylab("SNP2 state")+ theme_bw()+ scale_color_manual(name="Behavioural\nstates", limits=c("start_agg","react_agg","react_pcf"), breaks=c("start_agg", "react_agg", "react_pcf"), labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"), values=c("#CC6677", "#DDCC77", "#332288")) + scale_x_continuous(breaks=c(seq(1:109)), labels=c(data$sample)) + theme(axis.text.x = element_text(angle=45, size=12)) # p2 #SNP3 p3 <- ggplot(data, aes(x=seq(1:109), y=scaffold_185_110741, col=beh)) + # geom_point(size=2) + xlab("Samples")+ ylab("SNP3 state")+ scale_x_continuous(breaks=c(seq(1:109)), labels=c(data$sample)) + scale_color_manual(name="Behavioural\nstates", limits=c("start_agg","react_agg","react_pcf"), breaks=c("start_agg", "react_agg", "react_pcf"), labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"), values=c("#CC6677", "#DDCC77", "#332288")) + theme_bw()+ scale_x_continuous(breaks=seq(1, 109, by=2))+ theme(axis.text.x = element_text(angle=45, size=12)) # p3 ###---###---###---###---###---###---### ### Save pdf #### ###---###---###---###---###---###---### pdf("FigS3_SNP_states_MS_supp.pdf", width = 9, height=7) ggarrange(p1, p2, p3, labels = c("Mediator of RNA polymerase II transcription subunit 26, scaffold_11_2457277", "Unkown, scaffold_163_28302", "Gastrulation-defective, scaffold_185_110741"), ncol = 1, nrow = 3) dev.off() #END plot allelic states ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # PCA with SNP allelic states #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Below, we create a PCA with SNPs allelic states + behavioural states ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(vegan); library(MASS); library(ggfortify) # set wd and load data setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R") #Now, we create dummy variables: code homozygotes for ref as "1", heterozygotes as "2", homozygotes for alt were not found, so not "3" data$scaffold_185_110741_coded <- ifelse(data$scaffold_185_110741 =="0/1", 2, 1) data$scaffold_163_28302_coded <- ifelse(data$scaffold_163_28302 =="0/1", 2, 1) data$scaffold_11_2457277_coded <- ifelse(data$scaffold_11_2457277 =="0/1", 2, 1) data ###---###---###---###---###---###---### ## Subset data and plot data #### ###---###---###---###---###---###---### #Subset data data_subset <- subset(data, select = c("scaffold_185_110741_coded", "scaffold_163_28302_coded", "scaffold_11_2457277_coded", "beh")) #Use ggfortify and autoplot to plot the data a_plot <- autoplot(prcomp(data_subset[1:3]), data=data, frame = TRUE, frame.type = 'norm', colour = "beh", size = 3) + theme_bw() #Plot PCA pdf("Fig3A_SNPs_PCA_MS.pdf") a_plot + scale_color_manual(values = c("#CC6677", "#DDCC77", "#332288"), limits=c("start_agg","react_agg","react_pcf"), breaks=c("start_agg", "react_agg", "react_pcf"), labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"), name="Behavioural\nstates") + scale_fill_manual(values = c("#CC6677", "#DDCC77", "#332288"), limits=c("start_agg","react_agg","react_pcf"), breaks=c("start_agg", "react_agg", "react_pcf"), labels=c("started\naggression", "reacted\naggressively", "reacted\npeacefully"), name="Behavioural\nstates") dev.off() #END plot ###---###---###---###---###---###---### ## Run Pearson's Chi-squared Test with SNP count data on behavioural states #### ###---###---###---###---###---###---### ## Setwd and load data setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") data <- readxl::read_xlsx("ForR_SNPs_agg_SuggLine_GEMMA.xlsx", sheet="Data_R") ## Check data table(data$scaffold_11_2457277, data$beh) table(data$scaffold_163_28302, data$beh) table(data$scaffold_185_110741, data$beh) ###---###---###---###---###---###---### ### Run Pearson's Chi-squared Test #### ###---###---###---###---###---###---### sink("Pearson_Chi_squared-test_SNP_count_data.txt") table(data$scaffold_11_2457277, data$beh) table(data$scaffold_163_28302, data$beh) table(data$scaffold_185_110741, data$beh) cat("\n") cat("\n") print("count SNP states #SNP scaffold 11") cat("\n") #count SNP states #SNP scaffold 11 M <- as.table(rbind(c(36, 5), c(35, 4), c(26, 3))) dimnames(M) <- list(behavioural_state = c("started_aggression", "reacted_aggressivelly", "reacted_peacefully"), allele = c("reference","alternative")) M (Xsq <- chisq.test(M, simulate.p.value = T, B = 2000)) # Prints test summary #Pairwise comparisons of the resuls pairwise_tests <- function(M) { rows <- combn(nrow(M), 2, simplify = FALSE) results <- lapply(rows, function(r) { tab <- M[r, ] test <- fisher.test(tab) data.frame( comparison = paste("Row", r[1], "vs Row", r[2]), p.value = test$p.value ) }) do.call(rbind, results) } posthoc <- pairwise_tests(M) # Holm correction for multiple testing posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm") #Result posthoc #Cermér's V Effect size chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000) n <- sum(M) r <- nrow(M) c <- ncol(M) cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1))) cramers_v cat("\n") cat("\n") print("count SNP states #SNP scaffold 163") #count SNP states #SNP scaffold 163 M <- as.table(rbind(c(18, 23), c(2, 37), c(2, 27))) dimnames(M) <- list(behavioural_state = c("started_aggression", "reacted_aggressivelly", "reacted_peacefully"), allele = c("reference","alternative")) M (Xsq <- chisq.test(M, simulate.p.value = T, B = 2000)) # Prints test summary ## Pairwise comparison posthoc <- pairwise_tests(M) # Holm correction for multiple testing posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm") #Result posthoc #Cramér's V Effect size chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000) n <- sum(M) r <- nrow(M) c <- ncol(M) cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1))) cramers_v cat("\n") cat("\n") print("count SNP states #SNP scaffold 185") #count SNP states #SNP scaffold 185 M <- as.table(rbind(c(11, 30), c(29, 10), c(18, 11))) dimnames(M) <- list(behavioural_state = c("started_aggression", "reacted_aggressivelly", "reacted_peacefully"), allele = c("reference","alternative")) M (Xsq <- chisq.test(M, simulate.p.value = T, B = 2000)) # Prints test summary ## Pairwise comparison posthoc <- pairwise_tests(M) # Holm correction for multiple testing posthoc$p.adjusted <- p.adjust(posthoc$p.value, method = "holm") #Result posthoc #Cermér's V Effect size chisq <- chisq.test(M, simulate.p.value = TRUE, B = 2000) n <- sum(M) r <- nrow(M) c <- ncol(M) cramers_v <- sqrt(chisq$statistic / (n * min(r - 1, c - 1))) cramers_v sink() #END Pearson's Chi-squared test ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Differential gene expression #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## NOTE: We only use 82 individuals here, which is filtered using the file "ForR_RNA_82Samples.xlsx" ###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(edgeR); library(DESeq2); library(Glimma) ###---###---###---###---###---###---### ## Load HTseq count data #### ###---###---###---###---###---###---### ## Note: Using a custom script, we merged all HTseq count data files together into one tvs-file, "ForR_htseq_all_sample_count.tsv", for the DGE analysis setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") countData <- read.csv("ForR_htseq_all_sample_count.tsv", sep = "\t",header=TRUE) #Note, here are still samples present but in the next step, only the selected 82 individuals will be used. ###---###---###---###---###---###---### ### Set column and row names in file #### ###---###---###---###---###---###---### IdList <- read.csv("indList_pops.txt", sep = "\t",header=T) head(IdList) colnames(countData) <- t(IdList) rownames(countData) <- countData$gene_id countData head(countData) tail(countData) ###---###---###---###---###---###---### ### Delete ambigious gene names, load metadata #### ###---###---###---###---###---###---### ##Delete ambiguous genes names and exclude things such as "no feature", etc. countData <- countData[-20857:-20861,] #20859 tail(countData) # Load metadata ### ##Metadata is information of the samples metaData <- readxl::read_xlsx("ForR_RNA_82samples.xlsx", sheet="RNA_82samples") #Select comparison levels for the behavioural states, either started aggression vs reacted aggressively (strAgg_rctAgg), started aggression vs reacted peacefully (strAgg_rctPcf), reacted aggressively vs reacted peacefully (rctAgg_rctPcf) StartAgg_vs_RctAgg <- subset(metaData, strAgg_rctAgg=="x") StartAgg_vs_RctPcf <- subset(metaData, strAgg_rctPcf=="x") RctAgg_vs_RctPcf <- subset(metaData, rctAgg_rctPcf=="x") #List worker ID including population and colony ID StartAgg_vs_RctAgg$id_ StartAgg_vs_RctPcf$id_ RctAgg_vs_RctPcf$id_ ###---###---###---###---###---###---### ### Prepare data for DGE analysis #### ###---###---###---###---###---###---### #### Select data for behavioral comparisons #### #Started aggression vs reacted aggressively #StartAgg_vs_RctAgg # inds_StAgg_RctAgg <- c("gene_id", StartAgg_vs_RctAgg$id_) counData_Start_RctAgg <- subset(countData, select=inds_StAgg_RctAgg) counData_Start_RctAgg length(counData_Start_RctAgg) #59+1 (geneID) length(StartAgg_vs_RctAgg$sample_bam) StartAgg_vs_RctAgg$id_ #Started aggression vs reacted peacefully #StartAgg_vs_RctPcf # inds_StAgg_RctPcf <- c("gene_id", StartAgg_vs_RctPcf$id_) counData_Start_RctPcf <- subset(countData, select=inds_StAgg_RctPcf) counData_Start_RctPcf length(counData_Start_RctPcf) #55+1 (geneID) length(StartAgg_vs_RctPcf$sample_bam) StartAgg_vs_RctPcf$id_ #Reacted aggressively vs reacted aggressively #RctAgg_vs_RctPcf # inds_RctAgg_RctPcf <- c("gene_id", RctAgg_vs_RctPcf$id_) counData_RctAgg_RctPcf <- subset(countData, select=inds_RctAgg_RctPcf) counData_RctAgg_RctPcf length(counData_RctAgg_RctPcf) #52+1 (geneID) length(RctAgg_vs_RctPcf$sample_bam) RctAgg_vs_RctPcf$id_ ###---###---###---###---###---###---### #### Create DESeq data object for behavioral comparisons #### ###---###---###---###---###---###---### ## Started aggression VS Reacted aggressively ## dds_St_vs_RctAgg <- DESeqDataSetFromMatrix(countData=counData_Start_RctAgg, colData=StartAgg_vs_RctAgg, design=~beh, tidy = TRUE) length(StartAgg_vs_RctAgg$pop) # length(counData_Start_RctAgg) # ncol(counData_Start_RctAgg)-1 == nrow(StartAgg_vs_RctAgg) #control, if same amount of ## Started aggression VS Reacted peacefully ## dds_St_vs_RctPcf <- DESeqDataSetFromMatrix(countData=counData_Start_RctPcf, colData=StartAgg_vs_RctPcf, design=~beh, tidy = TRUE) length(StartAgg_vs_RctPcf$pop) # length(counData_Start_RctPcf) # ncol(counData_Start_RctPcf)-1 == nrow(StartAgg_vs_RctPcf) #control, if same amount of #Reacted aggressively VS Reacted peacefully ## dds_Rctagg_vs_RctPcf <- DESeqDataSetFromMatrix(countData=counData_RctAgg_RctPcf, colData=RctAgg_vs_RctPcf, design=~beh, tidy = TRUE) length(RctAgg_vs_RctPcf$pop) # length(counData_RctAgg_RctPcf) # ncol(counData_RctAgg_RctPcf)-1 == nrow(RctAgg_vs_RctPcf) #control, if same amount of ###---###---###---###---###---###---### #### Pre-filtering #### ###---###---###---###---###---###---### #While it is not necessary to pre-filter low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful: by removing rows in which there are very few reads, we reduce the memory size of the dds data object, and we increase the speed of the transformation and testing functions within DESeq2. Here we perform a minimal pre-filtering to keep only rows that have at least 10 reads total. Note that more strict filtering to increase power is automatically applied via independent filtering on the mean of normalized counts within the results function. #Started aggression vs reacted aggressively #dds_St_vs_RctAgg # keep <- rowSums(counts(dds_St_vs_RctAgg)) >= 10 #prefilter to counts found more often than 10 dds_St_vs_RctAgg <- dds_St_vs_RctAgg[keep,] dds_St_vs_RctAgg <- DESeq(dds_St_vs_RctAgg) dds_St_vs_RctAgg$strAgg_rctAgg #Started aggression vs reacted peacefully #dds_St_vs_RctPcf # keep <- rowSums(counts(dds_St_vs_RctPcf)) >= 10 #prefilter to counts found more often than 10 dds_St_vs_RctPcf <- dds_St_vs_RctPcf[keep,] dds_St_vs_RctPcf <- DESeq(dds_St_vs_RctPcf) dds_St_vs_RctPcf$strAgg_rctPcf #Reacted aggressively vs reacted peacefully #dds_Rctagg_vs_RctPcf # keep <- rowSums(counts(dds_Rctagg_vs_RctPcf)) >= 10 #prefilter to counts found more often than 10 dds_Rctagg_vs_RctPcf <- dds_Rctagg_vs_RctPcf[keep,] dds_Rctagg_vs_RctPcf <- DESeq(dds_Rctagg_vs_RctPcf) dds_Rctagg_vs_RctPcf$rctAgg_rctPcf ###---###---###---###---###---###---### ### Differential gene expression analysis #### ###---###---###---###---###---###---### #Select behavioural comparisons dds <- dds_St_vs_RctAgg dds <- dds_St_vs_RctPcf dds <- dds_Rctagg_vs_RctPcf #Note: You have to run these separately or rename the df "dds" and then change the script below. #Check if correct one is loaded dds dds$name resultsNames(dds) #estimateSizeFactors --> This calculates the relative library depth of each sample #estimateDispersions --> estimates the dispersion of counts for each gene #nbinomWaldTest --> calculates the significance of coefficients in a Negative Binomial GLM using the size and dispersion outputs #Details about the comparison are printed to the console, directly above the results table. Take a look at the results table res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") res head(results(dds, tidy=TRUE)) # ###---###---###---###---###---###---### ### Summary of differential gene expression analysis #### ###---###---###---###---###---###---### #Below, in "res" are ther resutls stored summary(res) # Note that we could have specified the coefficient or contrast we want to build a results table for, using either of the following equivalent commands: #res <- results(dds, contrast=c("condition","treated","untreated")) res_A <- results(dds, name="beh_start_agg_vs_react_agg") res_B <- results(dds, name="beh_start_agg_vs_react_pcf") res_C <- results(dds, name="beh_react_agg_vs_react_pcf") ##Note: You need to run the code from the lines 744 onwards with the different behavioural comparisons #Display results summary(res_A) summary(res_B) summary(res_C) ###---###---###---###---###---###---### #### Volcano plots #### ###---###---###---###---###---###---### #Set par to defualt valus, in case it has changed par(mfrow=c(1,1)) ###---###---###---###---###---###---### ##### Select behavioural comparisons #### ###---###---###---###---###---###---### ## Started aggression vs reacted aggressively ## dds <- dds_St_vs_RctAgg #Calculate how many genes are significanly up/down-regulated for each behavioural comparison res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") summary(res) tail(res) CountGenes <- subset(res, padj<.05 ) summary(CountGenes) length((CountGenes$padj)) #90 pdf("FigS4A_volcanoPlot_StartAgg_vs_RctAgg_pval.05.pdf") with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3))) # Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05) with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue")) with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red")) dev.off() ## Started aggression vs reacted peacefully ## dds <- dds_St_vs_RctPcf res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") summary(res) tail(res) CountGenes <- subset(res, padj<.05 ) summary(CountGenes) length((CountGenes$padj)) #90 pdf("FigS4B_volcanoPlot_StartAgg_vs_RctPcf_pval.05.pdf") with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3))) # Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05) with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue")) with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red")) dev.off() ## Reacted aggressively vs reacted peacefully ## dds <- dds_Rctagg_vs_RctPcf res <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") summary(res) tail(res) CountGenes <- subset(res, padj<.05 ) summary(CountGenes) length((CountGenes$padj)) #90 pdf("FigS4C_volcanoPlot_RctAggvsRctPcf_pval.05.pdf") with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3))) # Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05) with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue")) with(subset(res, padj<.02 & abs(log2FoldChange)> 1), points(log2FoldChange, -log10(pvalue), pch=20, col="red")) dev.off() ###---###---###---###---###---###---### ## Prepare data for saving and to be used in g:Profiler #### ###---###---###---###---###---###---### # Now, we save all genes without p-adjusted values for Gene ontology analysis in g:Profiler library(tidyverse) #Select behavioural comparison dds <- dds_St_vs_RctAgg resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") #Select behavioural comparison dds <- dds_St_vs_RctPcf resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") #Select behavioural comparison dds <- dds_Rctagg_vs_RctPcf resLFC <- results(dds, lfcThreshold = 0, pAdjustMethod = "fdr") #order data based on p-value and select data res2 <- resLFC[order(resLFC$pvalue),] res2_ <- cbind(res2$baseMean, res2$log2FoldChange, res2$lfcSE, res2$stat, res2$pvalue, res2$padj) #if res2 not log shrinkage rownames(res2_) <- rownames(res2) colnames(res2_) <- colnames(res2) res2_ <- as_tibble(res2_) res2_$geneNames <- rownames(res2) res2_ <- res2_[,-4] res2_$geneNames <- rownames(res2) res2_pos <- res2_ %>% filter(log2FoldChange >0) length(res2_pos$pvalue) #7810 res2_neg <- res2_ %>% filter(log2FoldChange <0) length(res2_neg$pvalue) #9339 ###---###---###---###---###---###---### ### Save dete data from above #### ###---###---###---###---###---###---### #Note, write.csv(res2_neg, "res_stAgg_ReacAgg_Pref10_neg.txt") write.csv(res2_pos, "res_stAgg_ReacAgg_Pref10_pos.tx") write.csv(res2_neg, "res_StAgg_ReacPcf_Pref10_neg.txt") write.csv(res2_pos, "res_StAgg_ReacPcf_Pref10_pos.txt") write.csv(res2_neg, "res_ReacAgg_ReacPcf_Pref10_neg.txt") write.csv(res2_pos, "res_ReacAgg_ReacPcf_Pref10_pos.txt") #This will yield files for significantly up- and down-regulated genes #Note: Using a customised python script, we extracted gene names from the gene txt.-files created above (e.g. res_StAgg_ReacPcf_Pref10_pos.txt) and searched for each of the genes in the gtf file (Talp_Annotation_Merged_v2_RefGen_m150.gtf) for the corresponding gene name. The script then writes this name, together with log2fold change, etc in a new file. ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## VENN diagram #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library("ggvenn") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") VennDiaDataAll <- readxl::read_xlsx("ForR_Gene_table.xlsx", sheet="Venn") #Now the same again BUT we only choose genes with a known gene name VennDiaDataAll$FB_name #Down-regulated genes started_aggression_vs_reacted_aggressively_down_red <- subset(VennDiaDataAll,beh_comparison=="started aggression vs reacted aggressively" & expression=="downregulated" & FB_name != "NA") started_aggression_vs_reacted_peacefully_down_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted peacefully" & expression=="downregulated" & FB_name != "NA") reacted_aggressively_vs_reacted_aggressivey_down_red <- subset(VennDiaDataAll, beh_comparison=="reacted aggressively vs reacted peacefully" & expression=="downregulated" & FB_name != "NA") #Up-regulated genes started_aggression_vs_reacted_aggressively_up_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted aggressively" & expression=="upregulated" & FB_name != "NA") started_aggression_vs_reacted_peacefully_up_red <- subset(VennDiaDataAll, beh_comparison=="started aggression vs reacted peacefully" & expression=="upregulated" & FB_name != "NA") reacted_aggressively_vs_reacted_aggressivey_up_red <- subset(VennDiaDataAll, beh_comparison=="reacted aggressively vs reacted peacefully" & expression=="upregulated" & FB_name != "NA") #Combine all down-regulated genes in a list down_reg_list_red <- list( "Started aggression vs reacted aggressively" = started_aggression_vs_reacted_aggressively_down_red$geneNames, "Started aggression vs reacted peacefully" = started_aggression_vs_reacted_peacefully_down_red$geneNames, "Reacted peacefully vs reacted peacefully" = reacted_aggressively_vs_reacted_aggressivey_down_red$geneNames) down_reg_list_red #Combine all up-regulated genes in a list up_reg_list_red <- list( "Started aggression vs reacted aggressively" = started_aggression_vs_reacted_aggressively_up_red$geneNames, "Started aggression vs reacted peacefully" = started_aggression_vs_reacted_peacefully_up_red$geneNames, "Reacted peacefully vs reacted peacefully" = reacted_aggressively_vs_reacted_aggressivey_up_red$geneNames) up_reg_list_red ###---###---###---###---###---###---### ### Plot pdfs #### ###---###---###---###---###---###---### pdf("Fig3B_Venn_up_down_knownGeneName_MS.pdf") ggvenn( down_reg_list_red, fill_color = c("#7336c2a9", "#fcfa18ec", "#868686FF"), # stroke_size = 0.5, set_name_size = 4 ) ggvenn( up_reg_list_red, fill_color = c("#7336c25f", "#fcfa9aec", "#cccccc80"), #NOTE: The colors need to be made slightly transparent to match the colors in the main document stroke_size = 0.5, set_name_size = 4 ) dev.off() #END venn diagram ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Download/calculate environmental variables #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ##Below, we download and/or calculate the environmental variables for the multinomial logistic regression ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## Nitrogen and other variables #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### #First, we start with soil nitrogen ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(raster); library(rasterVis); library(rworldxtra); data(countriesHigh) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") freq <- read.csv("ForR_allEnvironVars.csv", sep = ";",header=TRUE) ###---###---###---###---###---###---### ### Extract latitude and longitude from each colony to extract data #### ###---###---###---###---###---###---### lats <- freq$lat lons <- freq$long coords <- data.frame(x=lons,y=lats) freq$nest #NOTE: in this file, all environmental data are already present. Below, is only for consistency on how to extract the data. ###---###---###---###---###---###---### ### Extract nitrogen variables for analysis #### ###---###---###---###---###---###---### ##Note, we use the "N.zip" file from LUCAS topsoil data from the European Commission, which can be downloaded online r <- raster('N.tif') #from LUCAS topsoil data 2012 ###---###---###---###---###---###---### #### Create a raster object and extract data #### ###---###---###---###---###---###---### #Create raster object crs(r) wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" raster_wgs84 = projectRaster(r, crs = wgs84, method = "ngb") #change raster GRS80 to WG84 #Extract data points for each colony and save as "Nitrogen-values" file points <- SpatialPoints(coords, proj4string = raster_wgs84@crs) values <- extract(raster_wgs84, points) df1 <- cbind.data.frame(coordinates(points), values) colnames(df1) <- c("long", "lat", "nitrogen") write.csv(df1, "Nitrogen-values_P&W.csv") ###---###---###---###---###---###---### ## Bioclim variables from WorldClim #### ###---###---###---###---###---###---### ###---###---###---###---###---###---### ### Download Bioclim environmental variables for analysis #### ###---###---###---###---###---###---### #from https://worldclim.org/ ###---###---###---###---###---###---### ### Set working directory, load data, and select Bioclim variables #### ###---###---###---###---###---###---### #setwd("C:/Users/krapf/OneDrive/Desktop/") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") # load data rNew <- raster::getData("worldclim",var="bio", res=0.5, lon=9.75, lat=46.17) #if already downloaded, it will just load it #Now, we select the BioClim variables r1_12 <- rNew[[c(1,4,5,6,12, 16, 17, 18, 19)]] names(r1_12) <- c("Temp","TempSeason", "TempWarmestMonth", "TempColdestMonth", "Prec", "Precip_WettestQuarter", "Precip_DriestQuarter", "Precip_WarmestQuarter", "Precip_ColdestQuarter" ) # BIO1 = Annual Mean Temperature # BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp)) # BIO3 = Isothermality (BIO2/BIO7) (?100) # BIO4 = Temperature Seasonality (standard deviation ?100) # BIO5 = Max Temperature of Warmest Month # BIO6 = Min Temperature of Coldest Month # BIO7 = Temperature Annual Range (BIO5-BIO6) # BIO8 = Mean Temperature of Wettest Quarter # BIO9 = Mean Temperature of Driest Quarter # BIO10 = Mean Temperature of Warmest Quarter # BIO11 = Mean Temperature of Coldest Quarter # BIO12 = Annual Precipitation # BIO13 = Precipitation of Wettest Month # BIO14 = Precipitation of Driest Month # BIO15 = Precipitation Seasonality (Coefficient of Variation) # BIO16 = Precipitation of Wettest Quarter # BIO17 = Precipitation of Driest Quarter # BIO18 = Precipitation of Warmest Quarter # BIO19 = Precipitation of Coldest Quarter ###---###---###---###---###---###---### ### Extract data for each colony and save data #### ###---###---###---###---###---###---### #Extract BioClim data from the colony coordinates points <- SpatialPoints(coords, proj4string = r1_12@crs) values <- extract(r1_12,points) #Below, we save the extracted data as csv-file write.csv(values, "PrecipTemp_values_P&W.csv") ###---###---###---###---###---###---### ### Monthly averaged precipitation data #### ###---###---###---###---###---###---### #Get additional precipitation data for each month setwd("C:/Users/c7701110/Desktop") rNews <- getData("worldclim",var="prec",res=0.5, lon=9.75, lat=46.17) #0.5 rNews #Select data rPrecip <- rNews[[c(1,2,3,4,5,6,7,8,9,10,11,12)]] names(rPrecip) <- c("Jan","Feb","Mar","Apr", "May","Jun","Jul", "Aug","Sep", "Oct","Nov","Dec") rPrecip #Extract data for each colony points <- SpatialPoints(coords, proj4string = rPrecip@crs) values <- extract(rPrecip,points) setwd("C:/Users/c7701110/Desktop/MS3") write.csv(values, "Precip_values_P&W.csv") ##NOTE: We then combined all values into the files "ForR_allEnvironVars.csv" ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Assess Bacterial OTUs association with behavioural states using a multinomial logistic regression #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### #OTUs are, among others, Pseudomonas, Bacteroides, etc. ###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library(AICcmodavg); library(DescTools) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") Microbiome_sorted <- readxl::read_xlsx("ForR_otu_metadata_rarefied.xlsx", sheet="OTU_transposed_Count>100") Microbiome_sorted_red <- subset(Microbiome_sorted, Video_id!="exclude" & beh!="ctrl_freeze") #excludes the control samples, termed "ctrl_freeze", and the summaries created in the file and named with "exclude", so that they can be excluded tail(Microbiome_sorted_red) head(Microbiome_sorted_red) ###---###---###---###---###---###---### ## Prepare data #### ###---###---###---###---###---###---### #characters to numeric #identify all character columns length(Microbiome_sorted_red) ##all variables, including 119 OTUs chars <- sapply(Microbiome_sorted_red[,7:125], is.character) # tot, 119 OTUs, first 6 variables are pop names, etc #convert all character columns to numeric Microbiome_sorted_red[,7:125][ , chars] <- as.data.frame(apply(Microbiome_sorted_red[,7:125][ , chars], 2, as.numeric)) as.data.frame(Microbiome_sorted_red) Microbiome_sorted_red Microbiome_sorted_red$beh <- relevel(as.factor(Microbiome_sorted_red$beh), ref = "start_agg") ###---###---###---###---###---###---###---###---### ## For loop to analyse the OTUs #### ###---###---###---###---###---###---###---###---### # Number of variables num_vars <- 119 # Sliding window size window_size <- 21 #Matrix for emmeans results #EmmeansRes Emmeans_Results_matrix <- matrix(NA, nrow=3, ncol=4) colnames(Emmeans_Results_matrix) <- c("model", "contrast", "t-value", "p-value") # Iterate through each position of the sliding window for (i in 1:(ncol(Microbiome_sorted_red) - (window_size)+1) ) { #####setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") #!!!!!! CHANGE !!!!!!! # Extract the data within the sliding window window_data <- Microbiome_sorted_red[, 7:(i+5 + window_size-1) ] # Iterate through each variable for (j in 1:num_vars) { sink(paste("OTU_models_", j,"_20250329.txt", sep="")) print(paste("LoopModel_", j, sep="")) # Formulate the formula for the linear model variable_name <- colnames(Microbiome_sorted_red[,7:125]) formula <- as.formula(paste("beh ~", variable_name[j], "+", variable_name[j+1], "+", variable_name[j+2], "+", variable_name[j+3], "+", variable_name[j+4], "+", variable_name[j+5], "+", variable_name[j+6], "+", variable_name[j+7], "+", variable_name[j+8], "+", variable_name[j+9], "+", variable_name[j+10], "+", variable_name[j+11], "+", variable_name[j+12], "+", variable_name[j+13], "+", variable_name[j+14], "+", variable_name[j+15], "+", variable_name[j+16], "+", variable_name[j+17], "+", variable_name[j+18], "+", variable_name[j+19] )) #paste0("OTU_", j))) # Fit the linear model lm_model <- multinom(formula, data = Microbiome_sorted_red, model=T) cat("\n"); cat("\n") # Print the results or store them as required print(lm_model) cat("\n"); cat("\n") print("Goodness_of_fit_Mod2") # Test the goodness of fit print(chisq.test(Microbiome_sorted_red$beh, predict(lm_model))) cat("\n"); cat("\n") #Calculate and p-value using z-wald test print("summary") print(summary(lm_model)) cat("\n") print("z-value") z <- summary(lm_model)$coefficients/summary(lm_model)$standard.errors print(z) cat("\n") # 2-tailed z test to calculate p-values using Wald Z test print("p-value") p <- (1 - pnorm(abs(z), 0, 1)) * 2 print(p) #this code below stores p-values in a excel file and saves it print_p <- as.data.frame(p) sigOTUs <- as.data.frame(ifelse(print_p <= 0.05, "1", "0")) rio::export(sigOTUs, paste("sigOTUs_", j,".xlsx", sep=""), format = "xlsx") # cat("\n") #EMMEANS contrasts tryCatch({ #emmeans may cause an error print("Emmeans") emms1 <- emmeans(lm_model, ~ beh, ) # con1 <- contrast(emms1) EmmeansRes <- summary(pairs(con1, by = NULL)) print(EmmeansRes) Emmeans_Results_matrix[1:3, 1] <- print(paste("LoopModel_", j, sep="")) Emmeans_Results_matrix[1:3, 2] <- EmmeansRes$contrast Emmeans_Results_matrix[1:3, 3] <- EmmeansRes$t Emmeans_Results_matrix[1:3, 4] <- EmmeansRes$p.value sink(paste("Emmeans_Results_matrix_", j,".txt", sep="") ) print(Emmeans_Results_matrix) sink() }, error= function(e) { cat("\n"); cat("\n") }) #Calculate the Pseudo R-Square print("PseudpR2") print(PseudoR2(lm_model, which = c("CoxSnell","Nagelkerke","McFadden"))) sink() } } ###---###---###---###---###---###---###---###---### ## Manually add missing models #### ###---###---###---###---###---###---###---###---### #In the for loop, not all variables could be added due to the nature of the for-loop. So below, we add the last few models manually #with 119 variables OTU_models_101 <- multinom(beh ~ OTU_7293 +OTU_8257 +OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2, data = Microbiome_sorted_red, model=T) OTU_models_102 <- multinom(beh ~ OTU_8257 +OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7, data = Microbiome_sorted_red, model=T) OTU_models_103 <- multinom(beh ~ OTU_16490 +OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13, data = Microbiome_sorted_red, model=T) OTU_models_104 <- multinom(beh ~ OTU_17372 +OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58, data = Microbiome_sorted_red, model=T) OTU_models_105 <- multinom(beh ~ OTU_19291 +OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123, data = Microbiome_sorted_red, model=T) OTU_models_106 <- multinom(beh ~ OTU_20430 +OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366, data = Microbiome_sorted_red, model=T) OTU_models_107 <- multinom(beh ~ OTU_20640 +OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370, data = Microbiome_sorted_red, model=T) OTU_models_108 <- multinom(beh ~ OTU_20829 +OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732, data = Microbiome_sorted_red, model=T) OTU_models_109 <- multinom(beh ~ OTU_20999 +OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805, data = Microbiome_sorted_red, model=T) OTU_models_110 <- multinom(beh ~ OTU_21141 +OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251, data = Microbiome_sorted_red, model=T) OTU_models_111 <- multinom(beh ~ OTU_4 +OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420, data = Microbiome_sorted_red, model=T) OTU_models_112 <- multinom(beh ~ OTU_4778 +OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442, data = Microbiome_sorted_red, model=T) OTU_models_113 <- multinom(beh ~ OTU_269 +OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581, data = Microbiome_sorted_red, model=T) OTU_models_114 <- multinom(beh ~ OTU_358 +OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806, data = Microbiome_sorted_red, model=T) OTU_models_115 <- multinom(beh ~ OTU_5363 +OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011, data = Microbiome_sorted_red, model=T) OTU_models_116 <- multinom(beh ~ OTU_9 +OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292, data = Microbiome_sorted_red, model=T) OTU_models_117 <- multinom(beh ~ OTU_588 +OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545, data = Microbiome_sorted_red, model=T) OTU_models_118 <- multinom(beh ~ OTU_744 +OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545 +OTU_8711, data = Microbiome_sorted_red, model=T) OTU_models_119 <- multinom(beh ~ OTU_794 + OTU_2 + OTU_7 + OTU_13 + OTU_58 + OTU_123 + OTU_366 + OTU_370 + OTU_732 + OTU_805 + OTU_1251 + OTU_1420 + OTU_2442 + OTU_4581 + OTU_4806 + OTU_6011 + OTU_7292 + OTU_8545 +OTU_8711 + OTU_11261, data = Microbiome_sorted_red, model=T) ## Great, now we had also added all final models and can continue below ###---###---###---###---###---###---###---###---### ## For loop to extract OTUs that sig influence behavioural states and saves it as an output #### ###---###---###---###---###---###---###---###---### setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") sink("ALL_OTUs_count_119_mod_20250330_.txt", append=T) files <- list.files(path=".", pattern="sigOTUs") #!#!#! keep this but change it if you have changed the output file names above files <- files[1:100] for(i in 1:length(files)) { #change to all files ColumNames_SigOTUs <- readxl::read_xlsx(paste("sigOTUs_", i, ".xlsx", sep="")) #this I need ColumNames_SigOTUs <- as.data.frame(ColumNames_SigOTUs) rownames(ColumNames_SigOTUs) <- c("react_agg", "react_pcf") for(j in 1:length(ColumNames_SigOTUs)) { tryCatch({ if(colnames(ColumNames_SigOTUs[j]) == paste("(Intercept)") ) { "" } if(ColumNames_SigOTUs[1, j] == "1") { #ifelse(ColumNames_SigOTUs[1,j] == "1") print(paste(files[i], rownames(ColumNames_SigOTUs[1,]), colnames(ColumNames_SigOTUs[j]), sep="\t")) } if(ColumNames_SigOTUs[1, j] == "0") { #sink("OTUs_count.txt", append=T) #print(rownames(ColumNames_SigOTUs[2,])) #print() "" } if(ColumNames_SigOTUs[2, j] == "1") { print(paste(files[i], rownames(ColumNames_SigOTUs[2,]), colnames(ColumNames_SigOTUs[j]), sep="\t")) } if(ColumNames_SigOTUs[2, j] == "0") { "" } else "" }, error = function(e) { cat("Error: ", e$message, "\nSkipping file ", files[i], "\n\n") }) } } sink() #Now, we add additional values with "sigOTUs_OTU_models_" as name setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") #CHANGE sink("ALL_OTUs_count_119_mod.txt", append=T) #i<-1 #j<-1 files <- list.files(path=".", pattern="sigOTUs") #!#!#!keep this but change it if you have changed the output file names above files <- files[100:119] for(i in 1:length(files)) { #change to all files ColumNames_SigOTUs <- readxl::read_xlsx(paste("sigOTUs_OTU_models_", i+100, ".xlsx", sep="")) #this I need ColumNames_SigOTUs <- as.data.frame(ColumNames_SigOTUs) rownames(ColumNames_SigOTUs) <- c("react_agg", "react_pcf") #ColumnNames <- colnames(ColumNames_SigOTUs) for(j in 1:length(ColumNames_SigOTUs)) { tryCatch({ if(colnames(ColumNames_SigOTUs[j]) == paste("(Intercept)") ) { "" } if(ColumNames_SigOTUs[1, j] == "1") { #ifelse(ColumNames_SigOTUs[1,j] == "1") print(paste(files[i], rownames(ColumNames_SigOTUs[1,]), colnames(ColumNames_SigOTUs[j]), sep="\t")) } if(ColumNames_SigOTUs[1, j] == "0") { #sink("OTUs_count.txt", append=T) #print(rownames(ColumNames_SigOTUs[2,])) #print() "" } if(ColumNames_SigOTUs[2, j] == "1") { print(paste(files[i], rownames(ColumNames_SigOTUs[2,]), colnames(ColumNames_SigOTUs[j]), sep="\t")) } if(ColumNames_SigOTUs[2, j] == "0") { "" } else "" }, error = function(e) { cat("Error: ", e$message, "\nSkipping file ", files[i], "\n\n") }) } } sink() ## In the next step, we read in all the created files #First, open "ALL_OTUs_count_119_mod.txt" and copied the data to an excel file. Then with "replace" function, replace the "\t" with only "\" and run the "text to culumns" function. I copied this data to a new txt file "ALL_OTUs_count_119_mod_fin.txt", which is read in below. #For the file below, I also deleted the " [1] " at the beginning, the " at the end and the \t with a real tab #setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts/OTU_models/") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") file_path <- "ALL_OTUs_count_119_mod_fin.txt" lines <- readLines(file_path) lines_split <- strsplit(lines, "\t", fixed=T) lines_split # Initialize a counter for OTU occurrences otu_counts <- table(gsub('.*OTU_', "", lines_split)) sorted_counts <- sort(otu_counts, decreasing = TRUE) ## Print the counts and export the file #Note: Do this only once and check the file manually #In the excel file, you then order the file names and count ##### rio::export(as.data.frame(sorted_counts), "ALL_OTU_Counts_119mod_20250330.xlsx", format = "xlsx") ###---###---###---###---###---###---###---### ## Binomial test of the OTU counts after rarefaction and after the multinomial logistic regression #### ###---###---###---###---###---###---###---### library(emmeans) ## Below, we run a binomial test and add the values manually setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") sink("OTUcounts_Comparing_control_and_BehavStates_Supp.txt") print("PSEUDOMONAS - OTU 366") count <- c(33, 134, 102, 45) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("PSEUDOMONAS - OTU 2442") count <- c(33, 95, 30, 34) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("PSEUDOMONAS - OTU 4806") count <- c(34, 16, 52, 25) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Exclude because there are no significant differences between control and behavioural states print("BACTEROIDES - OTU 1598") count <- c(24, 67, 30, 12) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("BACTEROIDES - OTU 5256") count <- c(24, 48, 29, 55) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Bacteroides - OTU 22324") count <- c(335, 131, 278, 274) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 348") count <- c(142, 22, 201, 39) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 377") count <- c(4, 333, 19, 51) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 667") count <- c(42, 54, 106, 34) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 837") count <- c(29, 71, 74, 53) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 1887") count <- c(55, 754, 106, 140) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 2975") count <- c(20, 57, 41, 52) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 9014") count <- c(21, 25, 178, 21) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 17264") count <- c(96, 15, 102, 52) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Prevotella - OTU 20448") count <- c(408, 395, 127, 188) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Lactobacillus - OTU 813") count <- c(56, 73, 70, 34) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep even though no significant differences between control and behavioural states print("Lactobacillus - OTU 7014") count <- c(16, 43, 73, 51) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Lactobacillus - OTU 21141") count <- c(84, 87, 78, 125) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states print("Rhizobiales - OTU 369") count <- c(63, 102, 110, 94) beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Keep because there are significant differences between control and behavioural states sink() ## Based on these results, we exclude a Lactobacillus sp. (OTU 813) and one Pseudomonas sp. (OTU 4806) because they did not differ between control and the behavioural states. #################################### # #!! # print("PSEUDOMONAS - OTU 805") # count <- c(139, 4, 50, 6) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # print("PSEUDOMONAS - OTU 1420") # count <- c(13, 73, 3, 12) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # #!! # print("PSEUDOMONAS - OTU 436") # count <- c(278, 12, 194, 17) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # # #!! # print("Prevotella - OTU 326") # count <- c(236, 1, 89, 105) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # # #!! # print("Prevotella - OTU 18107") # count <- c(12, 17, 83, 7) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # #!! # print("Lactobacillus - OTU 121") # count <- c(486, 188, 357, 342) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") # # #!! # print("Lactobacillus - OTU 5416") # count <- c(591, 45, 352, 44) # beh <- c("control", "started_aggression", "reacted_aggressivelly", "reacted_peacefully") # df_micro <- data.frame(beh, count) # mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) # summary(mod1) # emms1 <- emmeans(mod1, ~ beh) # con1 <- contrast(emms1) # pairs(con1, by = NULL) # cat("\n"); cat("\n"); cat("\n"); cat("\n") ####################################### ###---###---###---###---###---###---### ## Select OTUs manually and calculate Poisson tests #### ###---###---###---###---###---###---### library("emmeans") ResultTable <- matrix(nrow = 27, ncol=8) colnames(ResultTable) <- c("Genus", "Species", "Comparison", "Estimate", "SE", "df", "Z-Ratio", "p-value") ## Now, we selected OTUs with a count of more than 100 in each for subsequent analysis. We now calculate Poisson tests with count data sink("OTU_countdata_MS.txt") print("BACTEROIDES ") print("Bacteroides sp. - OTU 1598") count <- c(67, 30, 12) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[1:3,1] <- "Bacteroides" ResultTable[1:3,2] <- "Bacteroides sp. - OTU 1598" ResultTable[1:3,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[1:3,4] <- EmmRes_df$estimate ResultTable[1:3,5] <- EmmRes_df$SE ResultTable[1:3,6] <- EmmRes_df$df ResultTable[1:3,7] <- EmmRes_df$z.ratio ResultTable[1:3,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") ##Linked to started agg and aggression print("Bacteroides sp. - OTU 5256") count <- c(48, 29, 55) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded because of inconsistent results (react agg vs react pcf difference, but rest not) print("Bacteroides sp. - OTU 22324") count <- c(131, 278, 274) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[4:6,1] <- "Bacteroides" ResultTable[4:6,2] <- "Bacteroides sp. - OTU 22324" ResultTable[4:6,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[4:6,4] <- EmmRes_df$estimate ResultTable[4:6,4] <- EmmRes_df$estimate ResultTable[4:6,5] <- EmmRes_df$SE ResultTable[4:6,6] <- EmmRes_df$df ResultTable[4:6,7] <- EmmRes_df$z.ratio ResultTable[4:6,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Linked to started aggression print("LACTOBACILLUS ") print("Lactobacillus mucosae - OTU 813") count <- c(73, 70, 34) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[7:9,1] <- "Lactobacillus" ResultTable[7:9,2] <- "Lactobacillus mucosae - OTU 813" ResultTable[7:9,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[7:9,4] <- EmmRes_df$estimate ResultTable[7:9,4] <- EmmRes_df$estimate ResultTable[7:9,5] <- EmmRes_df$SE ResultTable[7:9,6] <- EmmRes_df$df ResultTable[7:9,7] <- EmmRes_df$z.ratio ResultTable[7:9,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Link to aggression print("Lactobacillus fructivorans - OTU 7014") count <- c(43, 73, 51) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded due to mixed results print("Lactobacillus sp. - OTU 21141") count <- c(87, 78, 125) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[10:12,1] <- "Lactobacillus" ResultTable[10:12,2] <- "Lactobacillus sp. - OTU 21141" ResultTable[10:12,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[10:12,4] <- EmmRes_df$estimate ResultTable[10:12,4] <- EmmRes_df$estimate ResultTable[10:12,5] <- EmmRes_df$SE ResultTable[10:12,6] <- EmmRes_df$df ResultTable[10:12,7] <- EmmRes_df$z.ratio ResultTable[10:12,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Linked to peacefulness print("PREVOTELLA ") print("Prevotella sp. - OTU 348") count <- c(22, 201, 39) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded, mixed results print("Prevotella sp. - OTU 377") count <- c(333, 19, 51) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[13:15,1] <- "Prevotella" ResultTable[13:15,2] <- "Prevotella sp. - OTU 377" ResultTable[13:15,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[13:15,4] <- EmmRes_df$estimate ResultTable[13:15,4] <- EmmRes_df$estimate ResultTable[13:15,5] <- EmmRes_df$SE ResultTable[13:15,6] <- EmmRes_df$df ResultTable[13:15,7] <- EmmRes_df$z.ratio ResultTable[13:15,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Link to started agg print("Prevotella sp. - OTU 667") count <- c(54, 106, 34) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded, mixed results - link to reacted agg print("Prevotella sp. - OTU 837") count <- c(71, 74, 53) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded, not sig print("Prevotella sp. - OTU 1887") count <- c(754, 106, 140) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[16:18,1] <- "Prevotella" ResultTable[16:18,2] <- "Prevotella sp. - OTU 1887" ResultTable[16:18,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[16:18,4] <- EmmRes_df$estimate ResultTable[16:18,4] <- EmmRes_df$estimate ResultTable[16:18,5] <- EmmRes_df$SE ResultTable[16:18,6] <- EmmRes_df$df ResultTable[16:18,7] <- EmmRes_df$z.ratio ResultTable[16:18,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Linked to started agg print("Prevotella sp. - OTU 2975") count <- c(57, 41, 52) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Excluded, not sig print("Prevotella sp. - OTU 9014") count <- c(25, 178, 21) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") # Excluded due to mixed results print("Prevotella sp. - OTU 17264") count <- c(15, 102, 52) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") # Excluded due to mixed results print("Prevotella sp. - OTU 20448") count <- c(395, 127, 188) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[19:21,1] <- "Prevotella" ResultTable[19:21,2] <- "Prevotella sp. - OTU 20448" ResultTable[19:21,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[19:21,4] <- EmmRes_df$estimate ResultTable[19:21,4] <- EmmRes_df$estimate ResultTable[19:21,5] <- EmmRes_df$SE ResultTable[19:21,6] <- EmmRes_df$df ResultTable[19:21,7] <- EmmRes_df$z.ratio ResultTable[19:21,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Link to agg #PSEUDOMONAS print("PSEUDOMONAS ") print("Pseudomonas psychrotolerans - OTU 366") count <- c(134, 102, 45) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[22:24,1] <- "Pseudomonas" ResultTable[22:24,2] <- "Pseudomonas psychrotolerans - OTU 366" ResultTable[22:24,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[22:24,4] <- EmmRes_df$estimate ResultTable[22:24,4] <- EmmRes_df$estimate ResultTable[22:24,5] <- EmmRes_df$SE ResultTable[22:24,6] <- EmmRes_df$df ResultTable[22:24,7] <- EmmRes_df$z.ratio ResultTable[22:24,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Linked to aggression print("Pseudomonas sp. - OTU 1420") count <- c(73, 3, 12) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") print("Pseudomonas sp. - OTU 2442") count <- c(95, 30, 34) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) EmmRes <- pairs(con1, by = NULL) EmmRes_df <- as.data.frame(summary(EmmRes)) ResultTable[25:27,1] <- "Pseudomonas" ResultTable[25:27,2] <- "Pseudomonas sp. - OTU 2442" ResultTable[25:27,3] <- c("started aggression vs reacted aggressively", "started aggression vs reacted peacefully", "reacted aggressively vs reacted peacefully") ResultTable[25:27,4] <- EmmRes_df$estimate ResultTable[25:27,4] <- EmmRes_df$estimate ResultTable[25:27,5] <- EmmRes_df$SE ResultTable[25:27,6] <- EmmRes_df$df ResultTable[25:27,7] <- EmmRes_df$z.ratio ResultTable[25:27,8] <- EmmRes_df$p.value cat("\n"); cat("\n"); cat("\n"); cat("\n") #Linked to started aggression print("Rhizobiales sp. - OTU 588") count <- c(102, 110, 94) beh <- c("started_aggression", "reacted_aggressivelly", "reacted_peacefully") df_micro <- data.frame(beh, count) df_micro$beh <- relevel(as.factor(df_micro$beh), ref = "started_aggression") mod1 <- glm(df_micro$count ~ df_micro$beh, family = poisson) summary(mod1) emms1 <- emmeans(mod1, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) cat("\n"); cat("\n"); cat("\n"); cat("\n") #Exclude, not sig ###---###---###---###---###---###---### ### Combine data and correct for multiple testing #### ###---###---###---###---###---###---### #Combine data p_vals <- c(0.0947, 0.000001, 0.000001, #OTU 366 0.000001, 0.000001, 0.0805, #OTU 1420 0.000001, 0.000001, 0.8715, #OTU 2442 0.000001, 0.000001, 0.9841, #OTU 22324 0.000001, 0.1182, 0.000001, #OTU 18107 0.000001, 0.000001, 0.0018, #OTU 20448 0.9659, 0.0007, 0.0016, #OTU 813 0.7633, 0.0256, 0.0031) #OTU 21141 p_vals_adj <- p.adjust(p_vals) cbind(c(rep("Pseudomonas psychrotolerans - O_366",3), rep("Pseudomonas sp. - O_1420",3), rep("Pseudomonas sp. - O_2442",3), rep("Bacteroides - O_22324",3), rep("Prevotella sp. - O_18107",3), # rep("Prevotella sp. - O_20448",3), rep("Lactobacillus mucosae - O_813",3), rep("Lactobacillus sp. - O_21141",3)), rep(c("start_agg vs react_agg", "start_agg vs react_pcf", "react_agg vs react_pcf"), 8), p_vals, p_vals_adj) sink() rio::export(ResultTable, "OTU_emmeans_Results.xlsx") ###---###---###---###---### ## Plot OTUs after binomial logistic regression MS#### ###---###---###---###---### ###---###---###---###---###---###---### ### Load packages, setwd, and load data #### ###---###---###---###---###---###---### library(ggplot2); library(ggpubr); #setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") #Same as above but with sorted OUTs based on RowSums of all inds tested, i.e. number of OTUs found in each inds summed up; only used the first 10% of all OTUs, i.e. 2243 #sorted OTUs based on rowSums OTU_count_plot_data <- readxl::read_xlsx("ForR_otu_metadata_rarefied.xlsx", sheet="OTU_sum_plot") ###---###---###---###---###---###---### ### Prepare and plot data and save as pdf #### ###---###---###---###---###---###---### OTU_count_plot_data$keep OTU_count_plot_data_red <- subset(OTU_count_plot_data, keep=="x") OTU_count_plot_data_red #plot data level_order <- c('start_agg', 'react_agg', 'react_pcf') #Plot data pdf("Fig3C_Plot_Counts_MS.pdf", width=8) ggplot(OTU_count_plot_data_red, aes(x=factor(count_beh, level=level_order), y=counts, fill=count_beh)) + geom_bar(stat="identity") + facet_grid(~OTU_R) + scale_fill_manual(name="Behavioural states", breaks=c("start_agg","react_agg","react_pcf"), labels=c(expression(italic("Started aggression")), expression(italic("Reacted aggressively")), expression(italic("Reacted peacefully")) ), values = c("#CC6677", "#DDCC77", "#332288")) + scale_x_discrete(guide = guide_axis(n.dodge=2))+ ylab("OTU count") + xlab("Behavioural states") + theme_bw() + theme(plot.title = element_text(size=20), legend.position="bottom", axis.title.x = element_text(size=13), axis.text.x = element_blank(), #element_text(size=11, color="black"), axis.title.y = element_text(size=13), axis.text.y = element_text(size=11, color="black"), panel.grid.minor.x = element_blank()) dev.off() ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # Multinomial logistic regression - joint analysis using multiple data layer #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## Below, run the multinomial logistic regression ###---###---###---###---###---###---###---### ## Load packages, setwd, and load data #### ###---###---###---###---###---###---###---### library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library("AICcmodavg"); library("DescTools") setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") GeneCounts_MultiNom <- readxl::read_xlsx("ForR_GeneCounts.xlsx", sheet=1) #Multinomial logistic regression uses categories as dependent factors to conduct a logistic regression where the response variable is log-transformed, here "beh" with three levels. #we relevel start_agg as reference/baseline for models below GeneCounts_MultiNom$beh <- relevel(as.factor(GeneCounts_MultiNom$beh), ref = "start_agg") ###---###---###---###---###---###---###---###---### ## UP- AND DOWN-REGULATED GENES #### ###---###---###---###---###---###---###---###---### #Below, we only use genes that were found in both comparisons, started aggression vs reacted aggressively and started aggression vs reacted peacefully (i.e. Models1-8) ###---###---###---###---###---###---### ### Select variables in the models #### ###---###---###---###---###---###---### Mod1 <- multinom(beh ~ 1, data = GeneCounts_MultiNom, model=TRUE) Mod2 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded, data = GeneCounts_MultiNom, model=TRUE) Mod3 <- multinom(beh ~ MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709, data = GeneCounts_MultiNom, model=TRUE) Mod4 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709, data = GeneCounts_MultiNom, model=TRUE) Mod5 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod6 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod7 <- multinom(beh ~ MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709 + related_within_QG_prelim+TAS + CHC_PC1 + nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod8 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.5450 + MSTRG.4267 + MSTRG.5109 + MSTRG.6709 + related_within_QG_prelim + TAS+CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) ###---###---###---###---###---###---### ### Save model fit information #### ###---###---###---###---###---###---### sink("ModComp_Results_MS.txt") print("Model comparison") anova(Mod1, Mod2, Mod3, Mod4, Mod5, Mod6, Mod7, Mod8, test="Chisq") #PLEASE NOTE that the model ranking is different to how we used it, e.g. #anova(ZeroMod, Mod5, test="Chisq") #LR (log-likelihood) is a measure of how much unexplained variability there is in the data. Therefore, the difference or change in log-likelihood indicates how much new variance has been explained by the model. #The chi-square test tests the decrease in unexplained variance from the baseline model (408.1933) to the final model (333.9036), which is a difference of 408.1933 - 333.9036 = 74.29. This change is significant, which means that our final model explains a significant amount of the original variability. #The likelihood ratio chi-square of 74.29 with a p-value < 0.001 tells us that our model as a whole fits significantly better than an empty or null model (i.e., a model with no predictors). ###---###---###---###---###---###---### ### Model comparisons, fit, and p-value testing #### ###---###---###---###---###---###---### ## Model comparison using AICc print("AICc_models") mod_list <- list(Mod2=Mod2, Mod3=Mod3, Mod4=Mod4, Mod5=Mod5, Mod6=Mod6, Mod7=Mod7, Mod8=Mod8) print(mod_list) aictab(mod_list) ## Calculate the Goodness of fit # Check the predicted probability for each program print("Goodness_of_fit_Mod2") head(Mod2$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod2),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod2)) print("Goodness_of_fit_Mod4") head(Mod4$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod4),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod4)) ## Calculate and p-value using z-wald test for Model 2 print("summary_Model2") summary(Mod2) print("z-value_Model2") z <- summary(Mod2)$coefficients/summary(Mod2)$standard.errors z # 2-tailed z test to calculate p-values using Wald Z test print("p-value_Model2") p <- (1 - pnorm(abs(z), 0, 1)) * 2 p #We see above that scaffold 2 and 3 (gene gd have an effect on the behaviours) ## Extract data for the variables for Model 2 summary(Mod2) exp(-3.106913) #intercept exp(-1.727618) #intercept exp(0.23679451) #1.3 times higher SNP1 exp(-0.01633534) #1.00 times higher SNP1 exp(3.451271) #31.5 times higher SNP2 exp(2.466562) #11.8 times higher SNP2 exp(-2.457875) #0.1 times fewer SNP3 exp(-1.953764) #0.1 times fewer SNP3 ## Calculate and p-value using z-wald test for Model 4 print("summary_Model4") summary(Mod4) print("z-value_Model4") z <- summary(Mod4)$coefficients/summary(Mod4)$standard.errors z print("p-value_Model4") # 2-tailed z test to calculate p-values using Wald Z test p <- (1 - pnorm(abs(z), 0, 1)) * 2 p #We see above that scaffold 2 and 3 (SNP3 = gastrulation effectice) have an effect on the behaviours ## Extract data for the variables for Model 4 summary(Mod4) exp(-4.064435) #intercept exp(-1.074275 ) #intercept exp(0.1945963) #1.2 times higher SNP1 exp(-0.5279522) #0.6 times fewer SNP1 exp(3.668872) #39.2 times higher SNP2 sig exp(2.464436) #11.8 times higher SNP2 sig exp(-2.833251) #0.1 times fewer SNP3 sig exp(-1.960591) #0.1 times fewer SNP3 sig exp(0.0001688561) #1.00 times higher MSTRG.5450 exp(0.0002718880) #1.00 times higher MSTRG.5450 exp(0.0028886672) #1.00 times higher MSTRG.4267 exp(0.000503833) #1.00 times higher MSTRG.4267 exp(-0.005598530) #1.00 times fewer MSTRG.5109 exp(-0.003815167) #1.00 times fewer MSTRG.5109 exp(0.005739344) #1.00 times higher MSTRG.6709 sig exp(0.006348842) #1.00 times higher MSTRG.6709 sig ###---###---###---###---###---###---### ### Calculate Pseudo-R2 and Log-likelihood ratio tests #### ###---###---###---###---###---###---### ## Calculate the Pseudo R-Square # Load the DescTools package for calculate the R square # Calculate the R Square print("PseudpR2_models") PseudoR2(Mod2, which = c("Nagelkerke")) #only SNPs PseudoR2(Mod3, which = c("Nagelkerke")) PseudoR2(Mod4, which = c("Nagelkerke")) #only environment PseudoR2(Mod5, which = c("Nagelkerke")) PseudoR2(Mod6, which = c("Nagelkerke")) PseudoR2(Mod7, which = c("Nagelkerke")) PseudoR2(Mod8, which = c("Nagelkerke")) #Interpretation of the R-Square: ##The Nagelkerke modification that does range from 0 to 1 is a more reliable measure of the relationship. Nagelkerke’s R2 will normally be higher than the Cox and Snell measure. If the result is 0.49, it indicates a relationship of 49% between the predictors and the prediction. ## Likelihood Ratio Tests ## Use the lmtest package to run Likelihood Ratio Tests #library(lmtest) #The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. print("lrtest Mod2") lrtest(Mod2, "scaffold_11_2457277_coded") # lrtest(Mod2, "scaffold_163_28302_coded") # sig lrtest(Mod2, "scaffold_185_110741_coded") #gd #sig print("lrtest Mod4") lrtest(Mod4, "scaffold_11_2457277_coded") # lrtest(Mod4, "scaffold_163_28302_coded") # sig lrtest(Mod4, "scaffold_185_110741_coded") #gd #sig lrtest(Mod4, "MSTRG.5450") #gene_CG3061 lrtest(Mod4, "MSTRG.4267") #gene_silver lrtest(Mod4, "MSTRG.5109") #gene_Syt4 lrtest(Mod4, "MSTRG.6709") #gene_yellowd2 *** sig #Interpretation of the Likelihood Ratio Tests #It leaves the value used in parantheses out and compares the "full" model and this "leave-on-out" model. If it is significant, then this factor significantly explains part of the model. In our case, it tells us that two scaffolds had a significant main effect on the behaviours. #These likelihood statistics can be seen as sorts of overall statistics that tell us which predictors significantly enable us to predict the outcome category, but they don’t really tell us specifically what the effect is. To see this we have to look at the individual parameter estimates. ###---###---###---###---###---###---### ### Comparing the three behavioural states using emmeans and calculate odds ratios #### ###---###---###---###---###---###---### #### Model 2 print("Mod2") emms1 <- emmeans(Mod2, ~ beh) con1 <- contrast(emms1) confint(emms1) pairs(con1, by = NULL) #, interaction = "pairwise" ##What does the emmeans calculation above mean?: using the coded variables of scaffolds, there is a sig differences between reactagg and start agg and reactpcf and start agg. #### Model 4 print("Mod4") emms1 <- emmeans(Mod4, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) ### Calculate odds ratio and data #### Model 2 # First, we extract the coefficients print("Mod2") coefs <- coef(Mod2) # matrix: outcomes x predictors vc <- vcov(Mod2) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod2, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit #### Model 4 print("Mod4") coefs <- coef(Mod4) # matrix: outcomes x predictors vc <- vcov(Mod4) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod4, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit sink() #Below, we save all information from Model 4 in a list and ... lr_results <- list( scaffold_11_2457277_coded = lrtest(Mod4, update(Mod4, . ~ . - scaffold_11_2457277_coded)), scaffold_163_28302_coded = lrtest(Mod4, update(Mod4, . ~ . - scaffold_163_28302_coded)), scaffold_185_110741_coded = lrtest(Mod4, update(Mod4, . ~ . - scaffold_185_110741_coded)), MSTRG.5450 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.5450)), MSTRG.4267 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.4267 )), MSTRG.5109 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.5109)), MSTRG.6709 = lrtest(Mod4, update(Mod4, . ~ . - MSTRG.6709)) ) # ... combine the information in a data frame ... lr_df <- do.call(rbind, lapply(lr_results, function(x) { data.frame( Predictor = rownames(x)[2], Chisq = x$Chisq[2], pvalue = x$`Pr(>Chisq)`[2] ) })) lr_df lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data lr_df_doubled # ... which we save as an Escel file plot_df_doubled <- tidy_fit %>% left_join(lr_df_doubled, by = "term") %>% mutate( signif_label = case_when( pvalue < 0.001 ~ "***", pvalue < 0.01 ~ "**", pvalue < 0.05 ~ "*", TRUE ~ "" ), chisq_label = paste0("χ²=", round(Chisq, 1)) ) plot_df_doubled plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values plot_df rio::export(plot_df, "Mod4.xlsx") ###---###---###---###---###---###---###---###---### ## ONLY UP-REGULATED GENES #### ###---###---###---###---###---###---###---###---### #Below, we only use up-regulated genes (i.e. Models9-16) ###---###---###---###---###---###---### ### Select variables in the models #### ###---###---###---###---###---###---### Mod9 <- multinom(beh ~ 1, data = GeneCounts_MultiNom, model=TRUE) Mod10 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded, data = GeneCounts_MultiNom, model=TRUE) Mod11 <- multinom(beh ~ MSTRG.9806 + MSTRG.13575 + MSTRG.13555, data = GeneCounts_MultiNom, model=TRUE) Mod12 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.9806 + MSTRG.13575 + MSTRG.13555, data = GeneCounts_MultiNom, model=TRUE) Mod13 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod14 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod15 <- multinom(beh ~ MSTRG.9806 + MSTRG.13575 + MSTRG.13555 + related_within_QG_prelim+TAS + CHC_PC1 + nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod16 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.9806 + MSTRG.13575 + MSTRG.13555 + related_within_QG_prelim + TAS+CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) ###---###---###---###---###---###---### ### Save model fit information #### ###---###---###---###---###---###---### sink("ModComp_Results_UPREG.GENE_MS.txt") print("Model comparison") anova(Mod9, Mod10, Mod11, Mod12, Mod13, Mod14, Mod15, Mod16, test="Chisq") ###---###---###---###---###---###---### ### Model comparisons, fit, and p-value testing #### ###---###---###---###---###---###---### # Model comparison using AICc mod_list <- list(Mod9=Mod9, Mod10=Mod10, Mod12=Mod12, Mod13=Mod13, Mod14=Mod14, Mod15=Mod15, Mod16=Mod16) print(mod_list) aictab(mod_list) ## Calculate the Goodness of fit # Check the predicted probability for each program print("Goodness_of_fit_Mod10") head(Mod10$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod10),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod10)) print("Goodness_of_fit_Mod12") head(Mod12$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod12),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod12)) ## Calculate and p-value using z-wald test # Model 10 print("summary_Mod10") summary(Mod10) print("z-value_Mod10") z <- summary(Mod10)$coefficients/summary(Mod10)$standard.errors z # 2-tailed z test to calculate p-values using Wald Z test print("p-value_Mod10") p <- (1 - pnorm(abs(z), 0, 1)) * 2 p # Model 12 print("summary_Mod12") summary(Mod12) print("z-value_Mod12") z <- summary(Mod12)$coefficients/summary(Mod12)$standard.errors z print("p-valueMod12") # 2-tailed z test to calculate p-values using Wald Z test p <- (1 - pnorm(abs(z), 0, 1)) * 2 p # Extract data from the model fit from Model 12 summary(Mod12) exp(1.21) #3.4 times higher SNP1 exp(1.27) #3.5 times higher SNP1 exp(3.47) #32.1 times higher SNP2 exp(2.43) #11.4 times higher SNP2 exp(0.79) #2.2 times fewer SNP3 exp(0.75) #2.1 times fewer SNP3 ###---###---###---###---###---###---### ### Calculate Pseudo-R2 and Log-likelihood ratio tests #### ###---###---###---###---###---###---### #Calculate the Pseudo R-Square print("PseudoR2_models") PseudoR2(Mod10, which = c("Nagelkerke")) #only SNPs PseudoR2(Mod11, which = c("Nagelkerke")) PseudoR2(Mod12, which = c("Nagelkerke")) #only PseudoR2(Mod13, which = c("Nagelkerke")) PseudoR2(Mod14, which = c("Nagelkerke")) PseudoR2(Mod15, which = c("Nagelkerke")) PseudoR2(Mod16, which = c("Nagelkerke")) #Likelihood Ratio Tests ## Use the lmtest package to run Likelihood Ratio Tests #library(lmtest) #The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. #Model 10 print("lrtest Mod10") lrtest(Mod10, "scaffold_11_2457277_coded") # lrtest(Mod10, "scaffold_163_28302_coded") # sig lrtest(Mod10, "scaffold_185_110741_coded") #gd #sig #Model 12 print("lrtest Mod12") lrtest(Mod12, "scaffold_11_2457277_coded") # lrtest(Mod12, "scaffold_163_28302_coded") # sig lrtest(Mod12, "scaffold_185_110741_coded") #gd #sig lrtest(Mod12, "MSTRG.9806") #gene_CNBP lrtest(Mod12, "MSTRG.13575") #gene_CG3902 lrtest(Mod12, "MSTRG.13555") #gene_CG34367 #sig ###---###---###---###---###---###---### ### Comparing the three behavioural states using emmeans and calculate odds ratios #### ###---###---###---###---###---###---### ## Comparing the three behavioural states using emmeans #Model 10 print("Mod10") emms1 <- emmeans(Mod10, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) # Model 12 print("Mod12") emms1 <- emmeans(Mod12, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) ## Calculate odds ratio and data # Model 10 # First, we extract the coefficients print("Mod10") coefs <- coef(Mod10) # matrix: outcomes x predictors vc <- vcov(Mod10) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod10, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit # Model 12 print("Mod12") coefs <- coef(Mod12) # matrix: outcomes x predictors vc <- vcov(Mod12) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod12, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit sink() #then, we combine everything from Model 12, the selected model, into one list lr_results <- list( scaffold_11_2457277_coded = lrtest(Mod12, update(Mod12, . ~ . - scaffold_11_2457277_coded)), scaffold_163_28302_coded = lrtest(Mod12, update(Mod12, . ~ . - scaffold_163_28302_coded)), scaffold_185_110741_coded = lrtest(Mod12, update(Mod12, . ~ . - scaffold_185_110741_coded)), MSTRG.9806 = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.9806)), MSTRG.13575 = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.13575 )), MSTRG.13555 = lrtest(Mod12, update(Mod12, . ~ . - MSTRG.13555)) ) lr_df <- do.call(rbind, lapply(lr_results, function(x) { data.frame( LogLik = x$LogLik[2], Predictor = rownames(x)[2], Chisq = x$Chisq[2], pvalue = x$`Pr(>Chisq)`[2] ) })) lr_df lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data lr_df_doubled plot_df_doubled <- tidy_fit %>% left_join(lr_df_doubled, by = "term") %>% mutate( signif_label = case_when( pvalue < 0.001 ~ "***", pvalue < 0.01 ~ "**", pvalue < 0.05 ~ "*", TRUE ~ "" ), chisq_label = paste0("χ²=", round(Chisq, 1)) ) plot_df_doubled plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values plot_df #and save everything in an Excel-file rio::export(plot_df, "Mod12.xlsx") ###---###---###---###---###---###---###---###---### ## ONLY DOWN-REGULATED GENES #### ###---###---###---###---###---###---###---###---### ## Below, we only use downregulated genes (i.e. Models17-24) ###---###---###---###---###---###---### ### Select variables for each model #### ###---###---###---###---###---###---### Mod17 <- multinom(beh ~ 1, data = GeneCounts_MultiNom, model=TRUE) Mod18 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded, data = GeneCounts_MultiNom, model=TRUE) Mod19 <- multinom(beh ~ MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + MSTRG.4186 + MSTRG.14246 + MSTRG.12676, data = GeneCounts_MultiNom, model=TRUE) Mod20 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + MSTRG.4186 + MSTRG.14246 + MSTRG.12676, data = GeneCounts_MultiNom, model=TRUE) Mod21 <- multinom(beh ~ related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod22 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod23 <- multinom(beh ~ MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + MSTRG.4186 + MSTRG.14246 + + MSTRG.12676 + related_within_QG_prelim+TAS + CHC_PC1 + nitrogen_soil + related_within_QG_prelim + TAS + CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) Mod24 <- multinom(beh ~ scaffold_11_2457277_coded + scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + MSTRG.4186 + MSTRG.14246 + MSTRG.12676 + related_within_QG_prelim + TAS+CHC_PC1 + nitrogen_soil + Precip_annual + Precip_WarmestQuarter + Temp_meanYear + TempMax_WarmestMonth, data = GeneCounts_MultiNom, model=TRUE) ###---###---###---###---###---###---### ### Save model fit information #### ###---###---###---###---###---###---### #Extract model fit information sink("ModComp_Results_DOWNREG.GENE_MS.txt") print("Model comparison") anova(Mod17, Mod18, Mod19, Mod20, Mod21, Mod22, Mod23, Mod24, test="Chisq") ###---###---###---###---###---###---### ### Model comparisons, fit, and p-value testing #### ###---###---###---###---###---###---### # Model comparison using AICc mod_list <- list(Mod18=Mod18, Mod19=Mod19, Mod20=Mod20, Mod21=Mod21, Mod22=Mod22, Mod23=Mod23, Mod24=Mod24) print(mod_list) aictab(mod_list) # Calculate the Goodness of fit #Model 18 # Check the predicted probability for each program print("Goodness_of_fit_Mod18") head(Mod18$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod18),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod18)) #Model 20 print("Goodness_of_fit_Mod20") head(Mod20$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod20),82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod20)) ## Calculate and p-value using z-wald test #Model 18 print("summary_Mod18") summary(Mod18) print("z-value_Mod18") z <- summary(Mod18)$coefficients/summary(Mod18)$standard.errors z # 2-tailed z test to calculate p-values using Wald Z test print("p-value_Mod18") p <- (1 - pnorm(abs(z), 0, 1)) * 2 p #We see above that scaffold 2 and 3 (gastrulcation effect have an effect on the behaviours) #Model 20 print("summary_Mod20") summary(Mod20) print("z-value_Mod20") z <- summary(Mod20)$coefficients/summary(Mod20)$standard.errors z print("p-value_Mod20") # 2-tailed z test to calculate p-values using Wald Z test p <- (1 - pnorm(abs(z), 0, 1)) * 2 p # #extract data from Model 20 exp(0.44) #1.6 times higher SNP1 exp(0.45) #1.6 times higher SNP1 exp(3.99) #54.1 times higher SNP2 exp(3.44) #31.2 times higher SNP2 exp(-4.94) #0.01 times fewer SNP3 exp(-4.17) #0.02 times fewer SNP3 exp(0.66) #1.9 times higher 7840 exp(0.66) #1.9 times higher 7840 exp(0.23) #1.3 times higher 16119 exp(-0.12) #0.9 times fewer 16119 exp(0.11) #1.3 times higher 4186 exp(0.18) #0.9 times higher 4186 exp(0.25) #1.3 times higher 12676 exp(0.24) #1.3 times higher 12676 ###---###---###---###---###---###---### ### Calculate Pseudo-R2 and Log-likelihood ratio tests #### ###---###---###---###---###---###---### ## Calculate the Pseudo R-Square print("PseudoR2_models") PseudoR2(Mod18, which = c("Nagelkerke")) #only SNPs PseudoR2(Mod19, which = c("Nagelkerke")) PseudoR2(Mod20, which = c("Nagelkerke")) # PseudoR2(Mod21, which = c("Nagelkerke")) PseudoR2(Mod22, which = c("Nagelkerke")) PseudoR2(Mod23, which = c("Nagelkerke")) PseudoR2(Mod24, which = c("Nagelkerke")) ## Calculate Likelihood Ratio Tests ## Use the lmtest package to run Likelihood Ratio Tests #library(lmtest) #The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. print("lrtest Mod18") lrtest(Mod18, "scaffold_11_2457277_coded") # lrtest(Mod18, "scaffold_163_28302_coded") # sig lrtest(Mod18, "scaffold_185_110741_coded") #gd #sig print("lrtest Mod20") lrtest(Mod20, "scaffold_11_2457277_coded") # lrtest(Mod20, "scaffold_163_28302_coded") # sig lrtest(Mod20, "scaffold_185_110741_coded") #gd #sig lrtest(Mod20, "MSTRG.7840") #gene_BiCc +sig lrtest(Mod20, "MSTRG.16119") #gene_Pif1 +sig lrtest(Mod20, "MSTRG.15669") #gene_Exo84 lrtest(Mod20, "MSTRG.15080") #gene_PlexinA lrtest(Mod20, "MSTRG.4186") #gene_KaiE1d lrtest(Mod20, "MSTRG.14246") #gene_Rd lrtest(Mod20, "MSTRG.12676") #gene_RfC3 *sig ###---###---###---###---###---###---### ### Comparing the three behavioural states using emmeans and calculate odds ratios #### ###---###---###---###---###---###---### ## comparing the three behavioural states using emmeans #Model 18 print("Mod18") emms1 <- emmeans(Mod18, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) #, interaction = "pairwise" #Model 20 print("Mod20") emms1 <- emmeans(Mod20, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) ##Calculate odds ratio and data #Model 18 # First, we extract the coefficients print("Mod18") coefs <- coef(Mod18) # matrix: outcomes x predictors vc <- vcov(Mod18) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod18, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit #Model 20 print("Mod20") coefs <- coef(Mod20) # matrix: outcomes x predictors vc <- vcov(Mod20) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod20, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts tidy_fit sink() #Combine all data from Model 20 into a data frame and ... lr_results <- list( scaffold_11_2457277_coded = lrtest(Mod20, update(Mod20, . ~ . - scaffold_11_2457277_coded)), scaffold_163_28302_coded = lrtest(Mod20, update(Mod20, . ~ . - scaffold_163_28302_coded)), scaffold_185_110741_coded = lrtest(Mod20, update(Mod20, . ~ . - scaffold_185_110741_coded)), MSTRG.7840 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.7840)), MSTRG.16119 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.16119 )), MSTRG.15669 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.15669)), MSTRG.15080 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.15080)), MSTRG.4186 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.4186 )), MSTRG.14246 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.14246)), MSTRG.12676 = lrtest(Mod20, update(Mod20, . ~ . - MSTRG.12676)) ) lr_df <- do.call(rbind, lapply(lr_results, function(x) { data.frame( LogLik = x$LogLik[2], Predictor = rownames(x)[2], Chisq = x$Chisq[2], pvalue = x$`Pr(>Chisq)`[2] ) })) lr_df lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data lr_df_doubled plot_df_doubled <- tidy_fit %>% left_join(lr_df_doubled, by = "term") %>% mutate( signif_label = case_when( pvalue < 0.001 ~ "***", pvalue < 0.01 ~ "**", pvalue < 0.05 ~ "*", TRUE ~ "" ), chisq_label = paste0("χ²=", round(Chisq, 1)) ) plot_df_doubled plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values plot_df ###save it as an Excel file rio::export(plot_df, "Mod20.xlsx") ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### # FINAL MODEL: only significant SNPs and DEGs #### ###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---###---### ## This approach uses only the significant SNPs and differentially-expressed genes based on the results of the previous analyses library(foreign); library(nnet); library(ggplot2); library(reshape2); library(emmeans); library(lmtest); library("AICcmodavg"); library("DescTools"); library("tidyr"); library("dplyr"); library(ggrepel); library(broom) setwd("C:/Users/krapf/OneDrive/Desktop/P_W/A_MS_datasets/") GeneCounts_MultiNom <- readxl::read_xlsx("ForR_GeneCounts_final.xlsx", sheet=1) ##THIS is the file that combines all information #Multinomial logistic regression uses categories as dependent factors to conduct a logistic regression where the response variable is log-transformed, here "beh" with three levels. #we relevel start_agg as reference/baseline for models below GeneCounts_MultiNom$beh <- relevel(as.factor(GeneCounts_MultiNom$beh), ref = "start_agg") ###---###---###---###---###---###---### ### Select variables #### ###---###---###---###---###---###---### #Fit models #Intercept only model Mod_f1 <- multinom(beh ~ 1, data = GeneCounts_MultiNom, model=TRUE) #Final model Mod_f4_sig <- multinom(beh ~ scaffold_163_28302_coded + scaffold_185_110741_coded + MSTRG.6709 + #in both MSTRG.7840 + MSTRG.16119 + MSTRG.15669 + MSTRG.15080 + #down-reg MSTRG.4186 + MSTRG.14246 + MSTRG.12676 , #down-reg data = GeneCounts_MultiNom, model=TRUE) mod_list <- list(Mod_f1=Mod_f1, Mod_f4_sig=Mod_f4_sig) #, Mod9=Mod9 print(mod_list) aictab(mod_list) #best, Mod_f2 (only SNPs) followed by Modf3 ###---###---###---###---###---###---### ### Model comparisons, fit, and p-value testing #### ###---###---###---###---###---###---### #Calculate the Goodness of fit # Check the predicted probability for each program sink("FinalResults_MS.txt") print("Goodness_of_fit_Mod_f4_sig") head(Mod_f4_sig$fitted.values, 82) # We can get the predicted result by use predict function head(predict(Mod_f4_sig), 82) # Test the goodness of fit chisq.test(GeneCounts_MultiNom$beh, predict(Mod_f4_sig)) #Calculate and p-value using z-wald test print("summary_Mod_f4_sig") summary(Mod_f4_sig) print("z-value_Mod_f4_sig") z <- summary(Mod_f4_sig)$coefficients/summary(Mod_f4_sig)$standard.errors z # 2-tailed z test to calculate p-values using Wald Z test print("p-value_Mod_f4_sig") p <- (1 - pnorm(abs(z), 0, 1)) * 2 p #Extract data summary(Mod_f4_sig) exp(4.896033) #chance of finding SNP2 133.8 times higher in react_agg than start_agg exp(3.797238) #chance of finding SNP2 44.6 times higher in react_pcf than start_agg exp(-4.129057) #... SNP3 0.01 times lower in react_agg than start_agg exp(-3.810066) #... SNP3 0.02 times lower in react_pcf than start_agg exp(0.0258754) # MSTRG.6709 1.02 times higher exp(0.02150042) #MSTRG.6709 1.022 times higher exp(0.5803252) # MSTRG.7840 1.8 times higher in ... exp(0.5728261) #MSTRG.7840 1.8 times higher exp(0.3411982848) # MSTRG.16119 1.4 times higher 16119 exp(0.0007958548) #MSTRG.16119 1.0 times higher 16119 #NOT SIG exp(-0.0002316689) # MSTRG.15669 -1.0 times lower 4186 exp(-0.0002322059) # MSTRG.15669 -1.0 times lower 4186 exp(0.03221771) #MSTRG.15080 1.03 times higher #NOT SIG exp(-0.01099654 ) #MSTRG.15080 -1.0 times lower #NOT SIG exp(0.3068957) # MSTRG.4186 1.4 times higher exp(0.3588560) # MSTRG.4186 1.4 times higher exp(-0.002389500 ) # MSTRG.15669 -1.0 times lower exp(-0.00180066) # MSTRG.15669 -1.0 times lower exp(0.3159435) # MSTRG.14246 1.4 times higher exp(0.2853418) # MSTRG.14246 1.3 times higher ###---###---###---###---###---###---### ### Calculate Pseudo-R2 and Log-likelihood ratio tests #### ###---###---###---###---###---###---### #Calculate the Pseudo R-Square print("PseudoR2_models") PseudoR2(Mod_f4_sig, which = c("CoxSnell","Nagelkerke","McFadden")) #only SNPs #Likelihood Ratio Tests ## Use the lmtest package to run Likelihood Ratio Tests #library(lmtest) #The results of the likelihood ratio tests can be used to ascertain the significance of predictors to the model. print("lrtest Mod_f4_sig") #lrtest(Mod_f4_sig, "scaffold_11_2457277_coded") # lrtest(Mod_f4_sig, "scaffold_163_28302_coded") # sig lrtest(Mod_f4_sig, "scaffold_185_110741_coded") #gd #sig lrtest(Mod_f4_sig, "MSTRG.6709") #gene_yellow-d2 #sig lrtest(Mod_f4_sig, "MSTRG.7840") #gene_BiCc +sig lrtest(Mod_f4_sig, "MSTRG.16119") #gene_Pif1 +sig lrtest(Mod_f4_sig, "MSTRG.15669") #gene_Exo84 #sig lrtest(Mod_f4_sig, "MSTRG.15080") #gene_PlexinA #not sig lrtest(Mod_f4_sig, "MSTRG.4186") #gene_KaiE1d #sig lrtest(Mod_f4_sig, "MSTRG.14246") #gene_Rd #sig lrtest(Mod_f4_sig, "MSTRG.12676") #gene_RfC3 *sig ###---###---###---###---###---###---### ### Comparing the three behavioural states using emmeans and calculate odds ratios #### ###---###---###---###---###---###---### #compares the three behaviours print("Mod_f4_sig") emms1 <- emmeans(Mod_f4_sig, ~ beh) con1 <- contrast(emms1) pairs(con1, by = NULL) #, interaction = "pairwise" confint(emms1) sink() #Plot odds ratio and data # First, we extract the coefficients coefs <- coef(Mod_f4_sig) # matrix: outcomes x predictors vc <- vcov(Mod_f4_sig) # variance-covariance matrix (flattened) tidy_fit <- tidy(Mod_f4_sig, conf.int = TRUE, exponentiate = TRUE) %>% filter(term != "(Intercept)") # drop intercepts # tidy_fit has columns: # outcome | term | estimate (OR) | conf.low | conf.high #Likelihood ratio tests for each predictor lr_results <- list( scaffold_163_28302_coded = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - scaffold_163_28302_coded)), scaffold_185_110741_coded = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - scaffold_185_110741_coded)), MSTRG.6709 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.6709)), MSTRG.7840 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.7840)), MSTRG.16119 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.16119)), MSTRG.15669 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.15669)), MSTRG.15080 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.15080)), MSTRG.4186 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.4186)), MSTRG.14246 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.14246)), MSTRG.12676 = lrtest(Mod_f4_sig, update(Mod_f4_sig, . ~ . - MSTRG.12676)) ) lr_df <- do.call(rbind, lapply(lr_results, function(x) { data.frame( Predictor = rownames(x)[2], Chisq = x$Chisq[2], pvalue = x$`Pr(>Chisq)`[2] ) })) lr_df lr_df_doubled <- rbind(lr_df, lr_df) #here, we double the values for simplicity lr_df_doubled$term <- tidy_fit$term #now, we add the y-level term from tidy_fit above to merge the data lr_df_doubled plot_df_doubled <- tidy_fit %>% left_join(lr_df_doubled, by = "term") %>% mutate( signif_label = case_when( pvalue < 0.001 ~ "***", pvalue < 0.01 ~ "**", pvalue < 0.05 ~ "*", TRUE ~ "" ), chisq_label = paste0("χ²=", round(Chisq, 1)) ) plot_df_doubled plot_df <- unique(plot_df_doubled) #here, we get rid of double values and only use the unique values #Re-order data set for plotting plot_df$Behaviour <- ifelse(plot_df$y.level=="react_agg", "Reacted aggressively", ifelse(plot_df$y.level=="react_pcf", "Reacted peacefully", "")) plot_df$SNPs_genes <- rep(c("SNP1", "SNP3", "yellow-d2", "BiCc", "Pif1", "Exo84", "PlexinA", "KaiR1d", "Rd", "RfC3"), 2) plot_df$SNPs_genes <- factor(plot_df$SNPs_genes, levels = c("BiCc" , "Exo84", "KaiR1d", "Pif1", "PlexinA", "Rd", "RfC3", "yellow-d2", "SNP3", "SNP1")) # custom order ###---###---###---###---###---###---### ### Plot and save pdf #### ###---###---###---###---###---###---### # Plot as forest-style dot plot pdf("Fig3D_ForestPlot_wChiSquare_pval_MS_20250901.pdf") ggplot(plot_df, aes(x = log(estimate), y = SNPs_genes, color = Behaviour, fill=Behaviour)) + geom_point(size = 5, position = position_dodge(width = 0.7), pch=21) + geom_errorbarh(aes(xmin = log(conf.low), xmax = log(conf.high)), height = 0.2, position = position_dodge(width = 0.7)) + geom_vline(xintercept = 0, linetype = "dashed") + #scale_x_log10() + scale_color_manual(values=c("black", "black")) + scale_fill_manual(values=c("#7336c2a9", "#fcfa18ec")) + labs(x = "Log-scale odds ratio", y = "SNPs or genes") + geom_text_repel(aes(label = signif_label), nudge_x = 0.1, size = 6, color = "black") + geom_text_repel(aes(label = Chisq), nudge_x = 2, size = 3, color = "black") + theme_bw(base_size = 14) + theme(plot.title = element_text(size=20), axis.title.x = element_text(size=15), axis.text.x = element_text(size=13, color="black"), axis.title.y = element_text(size=15), axis.text.y = element_text(size=13, color="black"), strip.text.x = element_text(size = 13, colour = "black")) dev.off() #Also, export the results of the final model rio::export(plot_df, "FinalModel_results_MS.xlsx") ###---###---###---###---###---###---###--- ## Barplot for the up- and down-regulated genes #### ###---###---###---###---###---###---###-- library(ggplot2) setwd("C:/Users/krapf/OneDrive/Desktop/SubInDel/Workshop/ExpressionCounts") set2 <- readxl::read_xlsx("ForR_Gene_table.xlsx", sheet="barplot") set1 <- subset(set2, use=="x") pdf("Fig3E_log2fold_plot.pdf", height=5) ggplot(set1, aes(x=log2FoldChange, y=geneNames, fill=function_plot))+ # geom_bar(stat="identity") + facet_grid(~behavioural_comp) + theme_bw() + xlim(c(-3.5,1)) + ylab("Genes")+ xlab("log2 fold change") + geom_vline(xintercept = 0, linetype=2) + #, color, size geom_text(aes(label=round(log2FoldChange, 2)), vjust=0) + scale_fill_manual(name="Function", values=c("#2D708EFF", "#C2DF23FF")) + theme(plot.title = element_text(size=20), axis.title.x = element_text(size=15), axis.text.x = element_text(size=13, color="black"), axis.title.y = element_text(size=15), axis.text.y = element_text(size=13, color="black"), strip.text.x = element_text(size = 13, colour = "black"), legend.position="bottom", ) dev.off() #END of script