###Scripts### #####Statistics#### #DIRECTORY getwd() #################### #Load Data #################### data=read.table("comp.txt",head=TRUE) names(data) attach(data) detach(data) data$Genotype=as.factor(data$Genotype) ##################################################LENGTHS #boxplot(interest variable name ~ ind) ###PA windowsFonts() boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Aboveground length (cm)", xlab="Genotype",bty="1", family= "TT Times New Roman", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd = tapply(data$PA, data$Genotype, summary) dd #MEAN: sm: 16.43 ufes250: 14.33 #boxplot(interest variable name ~ ind) ###Foliolo boxplot(data$Foliolo ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Leaflet length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd2 = tapply(data$Foliolo, data$Genotype, summary) dd2 #MEAN: sm: 7.836 ufes250: 7.043 #boxplot(interest variable name ~ ind) ###Caule boxplot(data$Caule ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab=" Stalk length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd3 = tapply(data$Caule, data$Genotype, summary) dd3 #MEAN: sm: 7.802 ufes250: 6.824 #boxplot(interest variable name ~ ind) ###Raiz boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Roots length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd4 = tapply(data$Raiz, data$Genotype, summary) dd4 #MEAN: sm: 17.5 ufes250: 13.15 ###############################################MASSES data=read.table("massafresca_r.txt",head=TRUE) names(data) attach(data) detach(data) data$Genotype=as.factor(data$Genotype) #boxplot(interest variable name ~ ind) ###Fresh mass_roots boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Fresh weight (roots) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd5 = tapply(data$Raiz, data$Genotype, summary) dd5 #MEAN: sm: 0.5136 ufes250: 0.7569 data=read.table("massaseca_r.txt",head=TRUE) names(data) attach(data) detach(data) data$Genotype=as.factor(data$Genotype) #boxplot(interest variable name ~ ind) ###Dry mass_roots boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd6 = tapply(data$Raiz, data$Genotype, summary) dd6 #MEAN: sm: 0.1837 ufes250: 0.1948 data=read.table("massafresca_pa.txt",head=TRUE) names(data) attach(data) detach(data) data$Genotype=as.factor(data$Genotype) #boxplot(interest variable name ~ ind) ###Fresh mass_pa boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Fresh weight (Aboveground) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd5 = tapply(data$PA, data$Genotype, summary) dd5 #MEAN: sm: 1.372 ufes250: 1.155 data=read.table("massaseca_pa.txt",head=TRUE) names(data) attach(data) detach(data) data$Genotype=as.factor(data$Genotype) #boxplot(interest variable name ~ ind) ###Dry mass_pa boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Dry weight (roots) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4) dd6 = tapply(data$PA, data$Genotype, summary) dd6 #MEAN: sm: 0.3914 ufes250: 0.3119 boxplot(Individuo ~ Genotype, data=data) #plot(interest variable name ~ ind) plot(data$PA) #####Ancova#### ###Directory, package installation, file call setwd() ##install.packages("car") library(car) d2<- read.table("comp.txt", header = TRUE, sep= "\t") head(d2) names(d2) ###############FOLIOLO aov1 = aov(Foliolo ~ Genotype + Caule + PA + Raiz, d2) Anova(aov1 ,type="III") levels(d2$Genotype) contrast1 = c(1,-1)#sm x ufes-250 d2$Genotype <- as.factor(d2$Genotype) contrasts(d2$Genotype) = cbind(contrast1) contrasts(d2$Genotype) summary.lm(aov1) ###############ABOVEGROUND PART aov2 = aov(PA ~ Genotype + Caule + Foliolo + Raiz, d2) summary.lm(aov2) ###############STALK aov3 = aov(Caule ~ Genotype + PA + Foliolo + Raiz, d2) summary.lm(aov3) ###############ROOT aov4 = aov(Raiz ~ Genotype + PA + Foliolo + Caule, d2) summary.lm(aov4) #####Differential expression analysis##### #working directory getwd() #Packages #if (!requireNamespace("BiocManager", quietly = TRUE)) #install.packages("BiocManager") #BiocManager::install(c("edgeR", "limma", "statmod", "Glimma", "biomaRt", "gplots", "ggplot2", #"AnnotationDbi", "readxl", "yarrr", "fs", "RNAseqQC", "DESeq2", "NOISeq", #"ensembldb", "GenomicFeatures", "rtracklayer", "AnnotationHub", "RColorBrewer")) pacotes <- c("rmarkdown", "knitr", "data.table", "tidyverse", "janitor", "DescTools", "grid", "ggplot2", "gtable", "gridExtra", "plotly", "ggrepel", "PerformanceAnalytics", "reshape2", "FSA", "ggpubr", "rstatix", "viridis", "edgeR", "limma", "statmod", "Glimma", "biomaRt", "gplots", "ggplot2", "AnnotationDbi", "readxl", "yarrr", "fs", "RNAseqQC", "DESeq2", "NOISeq", "ensembldb", "GenomicFeatures", "rtracklayer", "AnnotationHub", "RColorBrewer") #if(sum(as.numeric(!pacotes %in% installed.packages())) != 0){ instalador <- pacotes[!pacotes %in% installed.packages()] for(i in 1:length(instalador)) { install.packages(instalador, dependencies = T) break()} sapply(pacotes, require, character = T) } else { sapply(pacotes, require, character = T) } ##### #install.packages("Glimma") library(Glimma) #install.packages("ggplot2") library ("ggplot2") #install.packages("ggpubr") library("ggpubr") ##samples file #Considering all samples (8) and also by tissues separately (roots and leaves) #considering all samples (8) amostras_total <- read.table("samples.txt",header=T,as.is=T) print(amostras_total) #considering only leaf samples linhas_selecionadas_folhas <- c(1, 2, 5, 6) # Indices of desired rows amostras_folhas <- amostras_total[linhas_selecionadas_folhas, ] print(amostras_folhas) #considering only root samples linhas_selecionadas_raiz <- c(3,4,7,8) # Indices of desired rows amostras_raiz <- amostras_total[linhas_selecionadas_raiz, ] print(amostras_raiz) #including file information of counts in the samples file_total <- c("EeLP1_Read_Count.txt", "EeLP3_Read_Count.txt", "EeRP1_Read_Count.txt", "EeRP2_Read_Count.txt", "EeSMLP1_Read_Count.txt", "EeSMLP3_Read_Count.txt", "EeSMRP1_Read_Count.txt", "EeSMRP2_Read_Count.txt") print(file_total) file_leaf <- c("EeLP1_Read_Count.txt", "EeLP3_Read_Count.txt", "EeSMLP1_Read_Count.txt", "EeSMLP3_Read_Count.txt") file_root <- c( "EeRP1_Read_Count.txt","EeRP2_Read_Count.txt", "EeSMRP1_Read_Count.txt","EeSMRP2_Read_Count.txt") #Joining the files of the samples total_file <- mutate(amostras_total, file_total) total_file #Joining the leaf files leaf_file <- mutate(amostras_folhas, file_leaf) leaf_file #Joining the root files root_file <- mutate(amostras_raiz, file_root) root_file # Install and load the library rentrez #install.packages("rentrez") library(rentrez) library(dplyr) ###Annotation##### # Set the email to access NCBI #email <- "francine.almeida@orcid" # Specify the search term for Euterpe edulis #search_term <- "Elaeis guineensis[Organism]" #db<- "gene" # Perform the search on NCBI #search_result <- entrez_search(db = "gene", term = search_term, retmax = 1000) # Extract the gene IDs from the search #gene_ids <- search_result$ids # Get the gene information #gene_info <- entrez_fetch(db = "gene", id = gene_ids, rettype = "docsum") # Create a dataframe with the gene information #genes_df <- read.delim(textConnection(gene_info), stringsAsFactors = FALSE) # Select the desired columns #selected_columns <- c('GeneID', 'Symbol', 'description', 'chromosome', 'start', 'end', 'type_of_gene', 'wikigene_name', 'HGNC', 'gc_content', 'biotype') #genes_df <- genes_df[, selected_columns] # Rename the columns to match the desired information #colnames(genes_df) <- c('ensembl_gene_id', 'external_gene_name', 'description', 'chromosome_name', 'start_position', 'end_position', 'gene_biotype', 'wikigene_name', 'hgnc_symbol', 'percentage_gene_gc_content', 'gene_biotype') # Print the dataframe #print(genes_df) ################### # ## # # # ##Prepare the object with readDGE to make a matrix with count of all samples #install.packages("edgeR") library("edgeR") #including file names in the dataframe: d <- readDGE(file_leaf, header = F) #header false because the first line is gene d$counts <- d$counts[order(rownames(d$counts)),] class(d) dim(d$counts) #d.full <- d # backup object, in case you want to go back to the original d ##create object containing annotated gene information getwd() file.exists("list_genes_rnaseq.csv") genes<- read.csv("list_genes_rnaseq.csv", header = TRUE, sep = ";") genes<- as.data.frame(genes) head(genes) #To see if the two dataframes match in the same order, because counts and annotation must be in the same order match(genes$Gene_ID,row.names(d$counts)) summary(match(genes$Gene_ID,row.names(d$counts))) ## 5. Creating objects to include in the DGE-list** #5.1. Creating new objects that will be included in the DGE-list later #total samplenames <- amostras_total$Sample_ID samplenames group <- as.factor(amostras_total$Treatment) group library(ggplot2) #leaf samplenames_leaf <- amostras_folhas$Sample_ID samplenames_leaf group_leaf <- as.factor(amostras_folhas$Treatment) group_leaf #root samplenames_root <- amostras_raiz$Sample_ID samplenames_root group_root <- as.factor(amostras_raiz$Treatment) group_root ## 6. Making DGE list object #6.1 for all samples d$genes$Symbol <- genes$Symbol d$genes$Gene_ID <- genes$Gene_ID d$genes$Gene_biotype <- genes$Gene_biotype design <- model.matrix(~0 + group) colnames(design) <- levels(group) #Estimate dispersion for all samples d <- estimateDisp(d, design, robust=TRUE) dispersion <- d$common.dispersion dispersion #Fit the model for all samples fit <- glmFit(d, design) #Fit the dispersion for all samples lRT <- glmLRT(fit, contrast = c(1,1)) #list of DEGs for all samples tab <- topTags(lRT, n=Inf, p.value=0.05) #Extracting significant genes deg.all <- rownames(tab$table)[which(tab$table$FDR<0.05)] write.csv(deg.all, file = "deg.all.csv") deg.all # Add metadata information of the samples colnames(design) <- c("Control", "Treated") design <- data.frame(design, row.names = colnames(design)) design DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames) #total DGEList DGEList$samples <- cbind(DGEList$samples, group) DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples)) head(DGEList$samples) #Differential analysis DGEList <- calcNormFactors(DGEList) counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6 rownames(counts.per.million) <- rownames(DGEList) DGEList <- estimateGLMCommonDisp(DGEList, design) DGEList <- estimateGLMTrendedDisp(DGEList, design) DGEList <- estimateGLMTagwiseDisp(DGEList, design) fit <- glmFit(DGEList, design) contrast.matrix <- makeContrasts(Control - Treated, levels = design) lR <- glmLRT(fit, contrast=contrast.matrix) #Differential expression analysis for total topTags(lR) #volcano plot volcano <- as.data.frame(topTags(lR)$table) colnames(volcano) # Calculate logFC volcano$logFC <- topTags(lR)$table$logFC # Calculate -log10(p-value) volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue) # Set threshold for significance sig_threshold <- 1.3 # Create plot ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) + geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) + geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") + geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") + scale_color_manual(values = c("black" = "black", "red" = "red")) + theme_minimal() + theme(legend.position = "none") + xlab("log2 Fold Change") + ylab("-log10(P-value)") + ggtitle("Volcano Plot for All Samples") dev.off() #Saving Volcano Plot ggsave("volcano_all.png", width = 10, height = 10, units = "cm") #####Test differentially expressed genes in leaves##### #6.2 for leaves d$genes$Symbol <- genes$Symbol d$genes$Gene_ID <- genes$Gene_ID d$genes$Gene_biotype <- genes$Gene_biotype design <- model.matrix(~0 + group_leaf) colnames(design) <- levels(group_leaf) #Estimate dispersion for leaves d <- estimateDisp(d, design, robust=TRUE) dispersion <- d$common.dispersion dispersion #Fit the model for leaves fit <- glmFit(d, design) #Fit the dispersion for leaves lRT <- glmLRT(fit, contrast = c(1,1)) #list of DEGs for leaves tab <- topTags(lRT, n=Inf, p.value=0.05) #Extracting significant genes deg.leaves <- rownames(tab$table)[which(tab$table$FDR<0.05)] write.csv(deg.leaves, file = "deg.leaves.csv") deg.leaves # Add metadata information of the samples colnames(design) <- c("Control", "Treated") design <- data.frame(design, row.names = colnames(design)) design DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames_leaf) #leaf DGEList DGEList$samples <- cbind(DGEList$samples, group_leaf) DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples)) head(DGEList$samples) #Differential analysis DGEList <- calcNormFactors(DGEList) counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6 rownames(counts.per.million) <- rownames(DGEList) DGEList <- estimateGLMCommonDisp(DGEList, design) DGEList <- estimateGLMTrendedDisp(DGEList, design) DGEList <- estimateGLMTagwiseDisp(DGEList, design) fit <- glmFit(DGEList, design) contrast.matrix <- makeContrasts(Control - Treated, levels = design) lR <- glmLRT(fit, contrast=contrast.matrix) #Differential expression analysis for leaves topTags(lR) #volcano plot volcano <- as.data.frame(topTags(lR)$table) colnames(volcano) # Calculate logFC volcano$logFC <- topTags(lR)$table$logFC # Calculate -log10(p-value) volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue) # Set threshold for significance sig_threshold <- 1.3 # Create plot ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) + geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) + geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") + geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") + scale_color_manual(values = c("black" = "black", "red" = "red")) + theme_minimal() + theme(legend.position = "none") + xlab("log2 Fold Change") + ylab("-log10(P-value)") + ggtitle("Volcano Plot for Leaves") dev.off() #Saving Volcano Plot ggsave("volcano_leaves.png", width = 10, height = 10, units = "cm") #####Test differentially expressed genes in roots##### #6.3 for roots d$genes$Symbol <- genes$Symbol d$genes$Gene_ID <- genes$Gene_ID d$genes$Gene_biotype <- genes$Gene_biotype design <- model.matrix(~0 + group_root) colnames(design) <- levels(group_root) #Estimate dispersion for roots d <- estimateDisp(d, design, robust=TRUE) dispersion <- d$common.dispersion dispersion #Fit the model for roots fit <- glmFit(d, design) #Fit the dispersion for roots lRT <- glmLRT(fit, contrast = c(1,1)) #list of DEGs for roots tab <- topTags(lRT, n=Inf, p.value=0.05) #Extracting significant genes deg.roots <- rownames(tab$table)[which(tab$table$FDR<0.05)] write.csv(deg.roots, file = "deg.roots.csv") deg.roots # Add metadata information of the samples colnames(design) <- c("Control", "Treated") design <- data.frame(design, row.names = colnames(design)) design DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames_root) #root DGEList DGEList$samples <- cbind(DGEList$samples, group_root) DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples)) head(DGEList$samples) #Differential analysis DGEList <- calcNormFactors(DGEList) counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6 rownames(counts.per.million) <- rownames(DGEList) DGEList <- estimateGLMCommonDisp(DGEList, design) DGEList <- estimateGLMTrendedDisp(DGEList, design) DGEList <- estimateGLMTagwiseDisp(DGEList, design) fit <- glmFit(DGEList, design) contrast.matrix <- makeContrasts(Control - Treated, levels = design) lR <- glmLRT(fit, contrast=contrast.matrix) #Differential expression analysis for roots topTags(lR) #volcano plot volcano <- as.data.frame(topTags(lR)$table) colnames(volcano) # Calculate logFC volcano$logFC <- topTags(lR)$table$logFC # Calculate -log10(p-value) volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue) # Set threshold for significance sig_threshold <- 1.3 # Create plot ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) + geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) + geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") + geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") + scale_color_manual(values = c("black" = "black", "red" = "red")) + theme_minimal() + theme(legend.position = "none") + xlab("log2 Fold Change") + ylab("-log10(P-value)") + ggtitle("Volcano Plot for Roots") dev.off() #Saving Volcano Plot ggsave("volcano_roots.png", width = 10, height = 10, units = "cm")