#####
# R code for Figure 2
# Two plots: one for species with Sgs3X, one for species without Sgs3X.
# For each species, the x position is given by the gene (column 'gene') and the y position by the gene size (column 'Somme').
# Dots are drawn with geom_point, and the segments joining two dots with geom_segment.
######


# The global environment is deliberately left untouched: clearing it with
# rm(list = ls()) would delete the caller's objects whenever this script is
# sourced. Every variable used below is assigned before it is read, so run the
# script in a fresh R session if a clean state is wanted.

###

# Build `n` colours evenly spaced around the HCL colour wheel, starting at
# hue 15, with luminance 65 and chroma 100.
#
# n: number of colours wanted (a single non-negative number).
# Returns a character vector of `n` hex colour strings; character(0) when
# `n` is 0.
gg_color_hue <- function(n) {
  # n + 1 hues are generated because 15 and 375 are the same angle on the
  # wheel; the duplicated last hue is dropped below.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len() (rather than 1:n) so that n == 0 yields no colour at all.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

###

library(ggplot2)
library(ggrepel)

############################## species with sgs3x###########################

# Read the gene sizes of the species that carry Sgs3X. The three columns are
# renamed to Species / gene / Somme (Somme holds the value plotted on y).
allgeneswithsgs3x <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/gene_size_species_with_sgs3x.csv",
  header = TRUE, sep = ",", dec = "."
)
colnames(allgeneswithsgs3x) <- c("Species", "gene", "Somme")

# '+AF8-' and '+ACI-' are the UTF-7 escapes of '_' and '"'. Restore the
# underscores (every occurrence, not only the first one), drop the quotes and
# remove the spaces from the species names.
allgeneswithsgs3x$Species <- gsub(
  "+AF8-", "_", allgeneswithsgs3x$Species, fixed = TRUE
)
allgeneswithsgs3x$Species <- gsub(
  "+ACI-", "", allgeneswithsgs3x$Species, fixed = TRUE
)
allgeneswithsgs3x$Species <- gsub(
  " ", "", allgeneswithsgs3x$Species, fixed = TRUE
)
allgeneswithsgs3x$gene <- gsub(
  "+ACI-", "", allgeneswithsgs3x$gene, fixed = TRUE
)

# fix color: one colour per species, named by species so that it can be
# looked up both for the dots and for the segments.
pal <- gg_color_hue(length(unique(allgeneswithsgs3x$Species)))
names(pal) <- unique(allgeneswithsgs3x$Species)
color <- pal[allgeneswithsgs3x$Species]
allgeneswithsgs3x <- cbind(allgeneswithsgs3x, color)

# Left-to-right order of the genes on the x axis.
gene_order <- c("Sgs1", "Sgs3X", "Sgs3", "Sgs7")

# One segment per pair (row, row of the same species for the next gene in
# gene_order). Rows of the last gene, rows whose gene is not in gene_order and
# rows whose species has no entry for the next gene produce no segment.
segment_rows <- lapply(seq_len(nrow(allgeneswithsgs3x)), function(i) {
  x_start <- allgeneswithsgs3x[i, "gene"]
  species <- allgeneswithsgs3x[i, "Species"]

  # NA when x_start is the last gene or is absent from gene_order.
  x_stop <- gene_order[match(x_start, gene_order) + 1L]
  if (is.na(x_stop)) {
    return(NULL)
  }

  y_stop <- allgeneswithsgs3x[
    which(
      allgeneswithsgs3x$gene == x_stop &
        allgeneswithsgs3x$Species == species
    ),
    "Somme"
  ]
  if (length(y_stop) == 0L) {
    return(NULL)
  }

  # Scalars are recycled over y_stop: one row per matching end point.
  data.frame(
    x_start = x_start,
    y_start = allgeneswithsgs3x[i, "Somme"],
    x_stop = x_stop,
    y_stop = y_stop,
    color = unname(pal[species]),
    species = species,
    stringsAsFactors = FALSE
  )
})
segment_rows <- Filter(Negate(is.null), segment_rows)

if (length(segment_rows) > 0L) {
  data_segment <- do.call(rbind, segment_rows)
} else {
  # No pair to join: keep an empty table with the expected columns.
  data_segment <- data.frame(
    x_start = character(0),
    y_start = numeric(0),
    x_stop = character(0),
    y_stop = numeric(0),
    color = character(0),
    species = character(0),
    stringsAsFactors = FALSE
  )
}
rownames(data_segment) <- NULL

# Plot
p <- ggplot(
  data = allgeneswithsgs3x,
  aes(x = factor(gene, levels = gene_order), y = Somme)
)
p <- p + geom_point(color = color, size = 3, shape = 16)
p <- p + geom_text_repel(
  aes(label = Species),
  segment.color = "grey50", nudge_x = 0.2, hjust = 0, direction = "y"
)
if (nrow(data_segment) > 0L) {
  # The segment coordinates are given as fixed parameters rather than through
  # aes(), so the log10 of the y values is taken by hand to line the segments
  # up with the dots drawn on the log10 y scale.
  p <- p + geom_segment(
    data = data_segment,
    x = data_segment$x_start, y = log10(data_segment$y_start),
    xend = data_segment$x_stop, yend = log10(data_segment$y_stop),
    color = data_segment$color
  )
}
p <- p +
  scale_y_log10(
    limits = c(200, 6000),
    breaks = c(200, 500, 1000, 2000, 3000, 4000, 5000, 6000)
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(face = "italic", size = 25, colour = "black"),
    axis.text.y = element_text(size = 25, colour = "black")
  )
# ylab() takes the label only; the stray, undefined `size` argument that used
# to follow it made this call fail.
p <- p + ylab("nucleotide coding sequence length (bp)")
p

###################################species without sgs3x#################################
# Read the gene sizes of the species that lack Sgs3X. The three columns are
# renamed to Species / gene / Somme (Somme holds the value plotted on y).
allgeneswithoutsgs3x <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/gene_size_species_without_sgs3x.csv",
  header = TRUE, sep = ",", dec = "."
)
colnames(allgeneswithoutsgs3x) <- c("Species", "gene", "Somme")

# '+AF8-' and '+ACI-' are the UTF-7 escapes of '_' and '"'. Restore the
# underscores (every occurrence, not only the first one), drop the quotes and
# remove the spaces from the species names.
allgeneswithoutsgs3x$Species <- gsub(
  "+AF8-", "_", allgeneswithoutsgs3x$Species, fixed = TRUE
)
allgeneswithoutsgs3x$Species <- gsub(
  "+ACI-", "", allgeneswithoutsgs3x$Species, fixed = TRUE
)
allgeneswithoutsgs3x$Species <- gsub(
  " ", "", allgeneswithoutsgs3x$Species, fixed = TRUE
)
allgeneswithoutsgs3x$gene <- gsub(
  "+ACI-", "", allgeneswithoutsgs3x$gene, fixed = TRUE
)

# fix color: one colour per species, named by species so that it can be
# looked up both for the dots and for the segments.
pal <- gg_color_hue(length(unique(allgeneswithoutsgs3x$Species)))
names(pal) <- unique(allgeneswithoutsgs3x$Species)
color <- pal[allgeneswithoutsgs3x$Species]
allgeneswithoutsgs3x <- cbind(allgeneswithoutsgs3x, color)

# Left-to-right order of the genes on the x axis.
gene_order <- c("Sgs1", "Sgs3", "Sgs7")

# One segment per pair (row, row of the same species for the next gene in
# gene_order). Rows of the last gene, rows whose gene is not in gene_order and
# rows whose species has no entry for the next gene produce no segment.
segment_rows <- lapply(seq_len(nrow(allgeneswithoutsgs3x)), function(i) {
  x_start <- allgeneswithoutsgs3x[i, "gene"]
  species <- allgeneswithoutsgs3x[i, "Species"]

  # NA when x_start is the last gene or is absent from gene_order.
  x_stop <- gene_order[match(x_start, gene_order) + 1L]
  if (is.na(x_stop)) {
    return(NULL)
  }

  y_stop <- allgeneswithoutsgs3x[
    which(
      allgeneswithoutsgs3x$gene == x_stop &
        allgeneswithoutsgs3x$Species == species
    ),
    "Somme"
  ]
  if (length(y_stop) == 0L) {
    return(NULL)
  }

  # Scalars are recycled over y_stop: one row per matching end point.
  data.frame(
    x_start = x_start,
    y_start = allgeneswithoutsgs3x[i, "Somme"],
    x_stop = x_stop,
    y_stop = y_stop,
    color = unname(pal[species]),
    species = species,
    stringsAsFactors = FALSE
  )
})
segment_rows <- Filter(Negate(is.null), segment_rows)

if (length(segment_rows) > 0L) {
  data_segment <- do.call(rbind, segment_rows)
} else {
  # No pair to join: keep an empty table with the expected columns.
  data_segment <- data.frame(
    x_start = character(0),
    y_start = numeric(0),
    x_stop = character(0),
    y_stop = numeric(0),
    color = character(0),
    species = character(0),
    stringsAsFactors = FALSE
  )
}
rownames(data_segment) <- NULL

# Plot
p <- ggplot(
  data = allgeneswithoutsgs3x,
  aes(x = factor(gene, levels = gene_order), y = Somme)
)
p <- p + geom_point(color = color, size = 3, shape = 16)
p <- p + geom_text_repel(
  aes(label = Species),
  segment.color = "grey50", nudge_x = 0.2, hjust = 0, direction = "y"
)
if (nrow(data_segment) > 0L) {
  # The segment coordinates are given as fixed parameters rather than through
  # aes(), so the log10 of the y values is taken by hand to line the segments
  # up with the dots drawn on the log10 y scale.
  p <- p + geom_segment(
    data = data_segment,
    x = data_segment$x_start, y = log10(data_segment$y_start),
    xend = data_segment$x_stop, yend = log10(data_segment$y_stop),
    color = data_segment$color
  )
}
p <- p +
  scale_y_log10(breaks = c(200, 500, 1000, 2000, 3000, 4000, 5000, 6000)) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(face = "italic", size = 25, colour = "black"),
    axis.text.y = element_text(size = 25, colour = "black")
  )
p
