###Scripts###


#####Statistics####

#DIRECTORY
getwd()

####################
#Load Data
####################

data=read.table("comp.txt",head=TRUE)
names(data)
attach(data)
detach(data)

data$Genotype=as.factor(data$Genotype)

##################################################LENGTHS

#boxplot(interest variable name ~ ind) ###PA
windowsFonts()
boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Aboveground length (cm)", xlab="Genotype",bty="1", family= "TT Times New Roman", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)
dd = tapply(data$PA, data$Genotype, summary)

dd

#MEAN: sm: 16.43 ufes250: 14.33

#boxplot(interest variable name ~ ind) ###Foliolo
boxplot(data$Foliolo ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Leaflet length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd2 = tapply(data$Foliolo, data$Genotype, summary)

dd2

#MEAN: sm: 7.836 ufes250: 7.043

#boxplot(interest variable name ~ ind) ###Caule
boxplot(data$Caule ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab=" Stalk length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd3 = tapply(data$Caule, data$Genotype, summary)

dd3
#MEAN: sm: 7.802 ufes250: 6.824

#boxplot(interest variable name ~ ind) ###Raiz
boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Roots length (cm)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd4 = tapply(data$Raiz, data$Genotype, summary)

dd4
#MEAN: sm: 17.5 ufes250: 13.15

###############################################MASSES
data=read.table("massafresca_r.txt",head=TRUE)
names(data)
attach(data)
detach(data)

data$Genotype=as.factor(data$Genotype)

#boxplot(interest variable name ~ ind) ###Fresh mass_roots
boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Fresh weight (roots) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd5 = tapply(data$Raiz, data$Genotype, summary)

dd5
#MEAN: sm: 0.5136 ufes250: 0.7569

data=read.table("massaseca_r.txt",head=TRUE)
names(data)
attach(data)
detach(data)

data$Genotype=as.factor(data$Genotype)

#boxplot(interest variable name ~ ind) ###Dry mass_roots
boxplot(data$Raiz ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd6 = tapply(data$Raiz, data$Genotype, summary)

dd6
#MEAN: sm: 0.1837 ufes250: 0.1948

data=read.table("massafresca_pa.txt",head=TRUE)
names(data)
attach(data)
detach(data)

data$Genotype=as.factor(data$Genotype)

#boxplot(interest variable name ~ ind) ###Fresh mass_pa
boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Fresh weight (Aboveground) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd5 = tapply(data$PA, data$Genotype, summary)

dd5
#MEAN: sm: 1.372 ufes250: 1.155

data=read.table("massaseca_pa.txt",head=TRUE)
names(data)
attach(data)
detach(data)

data$Genotype=as.factor(data$Genotype)

#boxplot(interest variable name ~ ind) ###Dry mass_pa
boxplot(data$PA ~ data$Genotype,col=c('cyan3', 'magenta3'), las=1, ylab="Dry weight (roots) (g)", xlab="Genotype",bty="l", cex.lab= 1.4, cex.axis= 1.5, cex.main= 1.4)

dd6 = tapply(data$PA, data$Genotype, summary)

dd6
#MEAN: sm: 0.3914 ufes250: 0.3119

boxplot(Individuo ~ Genotype, data=data)
#plot(interest variable name ~ ind)
plot(data$PA)

#####Ancova####

###Directory, package installation, file call
setwd()

##install.packages("car")
library(car)

d2<- read.table("comp.txt", header = TRUE, sep= "\t")
head(d2)
names(d2)

###############FOLIOLO
aov1 = aov(Foliolo ~ Genotype + Caule + PA + Raiz, d2)
Anova(aov1 ,type="III")

levels(d2$Genotype)

contrast1 = c(1,-1)#sm x ufes-250

d2$Genotype <- as.factor(d2$Genotype)
contrasts(d2$Genotype) = cbind(contrast1)
contrasts(d2$Genotype)

summary.lm(aov1)

###############ABOVEGROUND PART
aov2 = aov(PA ~ Genotype + Caule + Foliolo + Raiz, d2)
summary.lm(aov2)

###############STALK
aov3 = aov(Caule ~ Genotype + PA + Foliolo + Raiz, d2)
summary.lm(aov3)

###############ROOT
aov4 = aov(Raiz ~ Genotype + PA + Foliolo + Caule, d2)
summary.lm(aov4)


#####Differential expression analysis#####

#working directory
getwd()


#Packages

#if (!requireNamespace("BiocManager", quietly = TRUE))
  #install.packages("BiocManager")

#BiocManager::install(c("edgeR", "limma", "statmod", "Glimma", "biomaRt", "gplots", "ggplot2", 
                       #"AnnotationDbi", "readxl", "yarrr", "fs", "RNAseqQC", "DESeq2", "NOISeq", 
                       #"ensembldb", "GenomicFeatures", "rtracklayer", "AnnotationHub", "RColorBrewer"))

pacotes <- c("rmarkdown", "knitr", "data.table", "tidyverse", "janitor", "DescTools",  "grid",
             "ggplot2", "gtable", "gridExtra", "plotly", "ggrepel", "PerformanceAnalytics", 
             "reshape2", "FSA", "ggpubr", "rstatix", "viridis", "edgeR", "limma", "statmod",
             "Glimma", "biomaRt", "gplots", "ggplot2", "AnnotationDbi", "readxl", "yarrr", "fs", "RNAseqQC",
             "DESeq2", "NOISeq", "ensembldb", "GenomicFeatures", "rtracklayer", "AnnotationHub", "RColorBrewer")

#if(sum(as.numeric(!pacotes %in% installed.packages())) != 0){
  instalador <- pacotes[!pacotes %in% installed.packages()]
  for(i in 1:length(instalador)) {
    install.packages(instalador, dependencies = T)
    break()}
  sapply(pacotes, require, character = T)
} else {
  sapply(pacotes, require, character = T)
}

#####
#install.packages("Glimma")
library(Glimma)

#install.packages("ggplot2")
library ("ggplot2")

#install.packages("ggpubr")
library("ggpubr")

##samples file
#Considering all samples (8) and also by tissues separately (roots and leaves)

#considering all samples (8)
amostras_total <- read.table("samples.txt",header=T,as.is=T)
print(amostras_total)

#considering only leaf samples
linhas_selecionadas_folhas <- c(1, 2, 5, 6)  # Indices of desired rows
amostras_folhas <- amostras_total[linhas_selecionadas_folhas, ]
print(amostras_folhas)

#considering only root samples
linhas_selecionadas_raiz <- c(3,4,7,8)  # Indices of desired rows
amostras_raiz <- amostras_total[linhas_selecionadas_raiz, ]
print(amostras_raiz)

#including file information of counts in the samples

file_total <- c("EeLP1_Read_Count.txt", "EeLP3_Read_Count.txt", "EeRP1_Read_Count.txt", "EeRP2_Read_Count.txt", "EeSMLP1_Read_Count.txt", "EeSMLP3_Read_Count.txt", "EeSMRP1_Read_Count.txt", "EeSMRP2_Read_Count.txt")
print(file_total)

file_leaf <- c("EeLP1_Read_Count.txt", "EeLP3_Read_Count.txt", "EeSMLP1_Read_Count.txt", "EeSMLP3_Read_Count.txt")

file_root <- c( "EeRP1_Read_Count.txt","EeRP2_Read_Count.txt", "EeSMRP1_Read_Count.txt","EeSMRP2_Read_Count.txt")

#Joining the files of the samples
total_file <- mutate(amostras_total, file_total)
total_file

#Joining the leaf files
leaf_file <- mutate(amostras_folhas, file_leaf)
leaf_file

#Joining the root files
root_file <- mutate(amostras_raiz, file_root)
root_file

# Install and load the library rentrez
#install.packages("rentrez")
library(rentrez)
library(dplyr)


###Annotation#####
# Set the email to access NCBI
#email <- "francine.almeida@orcid"
# Specify the search term for Euterpe edulis
#search_term <- "Elaeis guineensis[Organism]"
#db<- "gene"
# Perform the search on NCBI
#search_result <- entrez_search(db = "gene", term = search_term, retmax = 1000)

# Extract the gene IDs from the search
#gene_ids <- search_result$ids

# Get the gene information
#gene_info <- entrez_fetch(db = "gene", id = gene_ids, rettype = "docsum")


# Create a dataframe with the gene information
#genes_df <- read.delim(textConnection(gene_info), stringsAsFactors = FALSE)

# Select the desired columns
#selected_columns <- c('GeneID', 'Symbol', 'description', 'chromosome', 'start', 'end', 'type_of_gene', 'wikigene_name', 'HGNC', 'gc_content', 'biotype')
#genes_df <- genes_df[, selected_columns]

# Rename the columns to match the desired information
#colnames(genes_df) <- c('ensembl_gene_id', 'external_gene_name', 'description', 'chromosome_name', 'start_position', 'end_position', 'gene_biotype', 'wikigene_name', 'hgnc_symbol', 'percentage_gene_gc_content', 'gene_biotype')

# Print the dataframe
#print(genes_df)
###################

#
##
#
#
#

##Prepare the object with readDGE to make a matrix with count of all samples

#install.packages("edgeR")
library("edgeR")

#including file names in the dataframe:

d <- readDGE(file_leaf, header = F) #header false because the first line is gene
d$counts <- d$counts[order(rownames(d$counts)),]
class(d)
dim(d$counts)
#d.full <- d # backup object, in case you want to go back to the original
d

##create object containing annotated gene information
getwd()
file.exists("list_genes_rnaseq.csv")
genes<- read.csv("list_genes_rnaseq.csv", header = TRUE, sep = ";")
genes<- as.data.frame(genes)

head(genes)

#To see if the two dataframes match in the same order, because counts and annotation must be in the same order
match(genes$Gene_ID,row.names(d$counts))
summary(match(genes$Gene_ID,row.names(d$counts)))

## 5. Creating objects to include in the DGE-list**

#5.1. Creating new objects that will be included in the DGE-list later

#total
samplenames <- amostras_total$Sample_ID
samplenames

group <- as.factor(amostras_total$Treatment)
group
library(ggplot2)

#leaf
samplenames_leaf <- amostras_folhas$Sample_ID
samplenames_leaf

group_leaf <- as.factor(amostras_folhas$Treatment)
group_leaf

#root
samplenames_root <- amostras_raiz$Sample_ID
samplenames_root

group_root <- as.factor(amostras_raiz$Treatment)
group_root

## 6. Making DGE list object
#6.1 for all samples

d$genes$Symbol <- genes$Symbol
d$genes$Gene_ID <- genes$Gene_ID
d$genes$Gene_biotype <- genes$Gene_biotype

design <- model.matrix(~0 + group)
colnames(design) <- levels(group)

#Estimate dispersion for all samples
d <- estimateDisp(d, design, robust=TRUE)
dispersion <- d$common.dispersion
dispersion

#Fit the model for all samples
fit <- glmFit(d, design)

#Fit the dispersion for all samples
lRT <- glmLRT(fit, contrast = c(1,1))

#list of DEGs for all samples
tab <- topTags(lRT, n=Inf, p.value=0.05)

#Extracting significant genes
deg.all <- rownames(tab$table)[which(tab$table$FDR<0.05)]
write.csv(deg.all, file = "deg.all.csv")
deg.all


# Add metadata information of the samples

colnames(design) <- c("Control", "Treated")
design <- data.frame(design, row.names = colnames(design))
design

DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames)
#total DGEList
DGEList$samples <- cbind(DGEList$samples, group)
DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples))
head(DGEList$samples)

#Differential analysis
DGEList <- calcNormFactors(DGEList)
counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6
rownames(counts.per.million) <- rownames(DGEList)
DGEList <- estimateGLMCommonDisp(DGEList, design)
DGEList <- estimateGLMTrendedDisp(DGEList, design)
DGEList <- estimateGLMTagwiseDisp(DGEList, design)
fit <- glmFit(DGEList, design)
contrast.matrix <- makeContrasts(Control - Treated, levels = design)
lR <- glmLRT(fit, contrast=contrast.matrix)

#Differential expression analysis for total
topTags(lR)

#volcano plot
volcano <- as.data.frame(topTags(lR)$table)
colnames(volcano)

# Calculate logFC
volcano$logFC <- topTags(lR)$table$logFC

# Calculate -log10(p-value)
volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue)

# Set threshold for significance
sig_threshold <- 1.3

# Create plot
ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) +
  geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) +
  geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("black" = "black", "red" = "red")) +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("log2 Fold Change") +
  ylab("-log10(P-value)") +
  ggtitle("Volcano Plot for All Samples")


dev.off()

#Saving Volcano Plot
ggsave("volcano_all.png", width = 10, height = 10, units = "cm")


#####Test differentially expressed genes in leaves#####

#6.2 for leaves

d$genes$Symbol <- genes$Symbol
d$genes$Gene_ID <- genes$Gene_ID
d$genes$Gene_biotype <- genes$Gene_biotype

design <- model.matrix(~0 + group_leaf)
colnames(design) <- levels(group_leaf)

#Estimate dispersion for leaves
d <- estimateDisp(d, design, robust=TRUE)
dispersion <- d$common.dispersion
dispersion

#Fit the model for leaves
fit <- glmFit(d, design)

#Fit the dispersion for leaves
lRT <- glmLRT(fit, contrast = c(1,1))

#list of DEGs for leaves
tab <- topTags(lRT, n=Inf, p.value=0.05)

#Extracting significant genes
deg.leaves <- rownames(tab$table)[which(tab$table$FDR<0.05)]
write.csv(deg.leaves, file = "deg.leaves.csv")
deg.leaves

# Add metadata information of the samples

colnames(design) <- c("Control", "Treated")
design <- data.frame(design, row.names = colnames(design))
design

DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames_leaf)
#leaf DGEList
DGEList$samples <- cbind(DGEList$samples, group_leaf)
DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples))
head(DGEList$samples)

#Differential analysis
DGEList <- calcNormFactors(DGEList)
counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6
rownames(counts.per.million) <- rownames(DGEList)
DGEList <- estimateGLMCommonDisp(DGEList, design)
DGEList <- estimateGLMTrendedDisp(DGEList, design)
DGEList <- estimateGLMTagwiseDisp(DGEList, design)
fit <- glmFit(DGEList, design)
contrast.matrix <- makeContrasts(Control - Treated, levels = design)
lR <- glmLRT(fit, contrast=contrast.matrix)

#Differential expression analysis for leaves
topTags(lR)

#volcano plot
volcano <- as.data.frame(topTags(lR)$table)
colnames(volcano)

# Calculate logFC
volcano$logFC <- topTags(lR)$table$logFC

# Calculate -log10(p-value)
volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue)

# Set threshold for significance
sig_threshold <- 1.3

# Create plot
ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) +
  geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) +
  geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("black" = "black", "red" = "red")) +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("log2 Fold Change") +
  ylab("-log10(P-value)") +
  ggtitle("Volcano Plot for Leaves")


dev.off()

#Saving Volcano Plot
ggsave("volcano_leaves.png", width = 10, height = 10, units = "cm")


#####Test differentially expressed genes in roots#####

#6.3 for roots

d$genes$Symbol <- genes$Symbol
d$genes$Gene_ID <- genes$Gene_ID
d$genes$Gene_biotype <- genes$Gene_biotype

design <- model.matrix(~0 + group_root)
colnames(design) <- levels(group_root)

#Estimate dispersion for roots
d <- estimateDisp(d, design, robust=TRUE)
dispersion <- d$common.dispersion
dispersion

#Fit the model for roots
fit <- glmFit(d, design)

#Fit the dispersion for roots
lRT <- glmLRT(fit, contrast = c(1,1))

#list of DEGs for roots
tab <- topTags(lRT, n=Inf, p.value=0.05)

#Extracting significant genes
deg.roots <- rownames(tab$table)[which(tab$table$FDR<0.05)]
write.csv(deg.roots, file = "deg.roots.csv")
deg.roots

# Add metadata information of the samples

colnames(design) <- c("Control", "Treated")
design <- data.frame(design, row.names = colnames(design))
design

DGEList <- DGEList(counts=d$counts, genes=d$genes, samples=samplenames_root)
#root DGEList
DGEList$samples <- cbind(DGEList$samples, group_root)
DGEList$samples <- data.frame(DGEList$samples, row.names = colnames(DGEList$samples))
head(DGEList$samples)

#Differential analysis
DGEList <- calcNormFactors(DGEList)
counts.per.million <- DGEList$counts/DGEList$samples$lib.size*1e6
rownames(counts.per.million) <- rownames(DGEList)
DGEList <- estimateGLMCommonDisp(DGEList, design)
DGEList <- estimateGLMTrendedDisp(DGEList, design)
DGEList <- estimateGLMTagwiseDisp(DGEList, design)
fit <- glmFit(DGEList, design)
contrast.matrix <- makeContrasts(Control - Treated, levels = design)
lR <- glmLRT(fit, contrast=contrast.matrix)

#Differential expression analysis for roots
topTags(lR)

#volcano plot
volcano <- as.data.frame(topTags(lR)$table)
colnames(volcano)

# Calculate logFC
volcano$logFC <- topTags(lR)$table$logFC

# Calculate -log10(p-value)
volcano$minus_log10_p_value <- -log10(topTags(lR)$table$PValue)

# Set threshold for significance
sig_threshold <- 1.3

# Create plot
ggplot(data = volcano, aes(x = logFC, y = minus_log10_p_value)) +
  geom_point(aes(color = ifelse(abs(logFC) > sig_threshold & minus_log10_p_value > 1.3, "red", "black"))) +
  geom_hline(yintercept = 1.3, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("black" = "black", "red" = "red")) +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("log2 Fold Change") +
  ylab("-log10(P-value)") +
  ggtitle("Volcano Plot for Roots")


dev.off()

#Saving Volcano Plot
ggsave("volcano_roots.png", width = 10, height = 10, units = "cm")