#####
#R code for Figure S7
#gg_data is a table with one row per nucleotide position covered by each repeat on a 129-kb sequence
#the script then counts, in consecutive non-overlapping 1-kb windows, the number of nucleotides annotated as a repeat (each nucleotide is counted once)
#####
library(ggplot2)
library(zoo)
library(scales)
library(ggpubr)


############################### sgs3 melanogaster mismatch5 ###############################

# Repeat table for the D. melanogaster Sgs3 region (mismatch 5).
# Columns used below: Name, Length, Direction, Minimum, Maximum.
repeats_sgs3_melano <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/D.melanogaster_sgs3_mismatch5.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs3_melano$Name
length_order <- repeats_sgs3_melano$Length
direction_order <- repeats_sgs3_melano$Direction
mini <- repeats_sgs3_melano$Minimum
maxi <- repeats_sgs3_melano$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    Name = rep(name_order[i], length_order[i]),
    Position = mini[i]:maxi[i],
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask over the 129,000 positions: 1 where at least one repeat
# covers the position, 0 elsewhere (unique() makes overlapping repeats count once).
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
# Text labels are placed at fixed coordinates (presumably the gene locations).
p14 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. melanogaster - mismatch 5") +
  ylab("Repeat content in 1-kb bins") +
  ylim(0, 1000) +
  annotate(geom = "text", x = 49500, y = 900, label = "Sgs3", hjust = "left") +
  annotate(geom = "text", x = 46000, y = 800, label = "Sgs8", hjust = "left") +
  annotate(geom = "text", x = 47000, y = 1000, label = "Sgs7", hjust = "left") +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p14

############################### sgs1 melanogaster mismatch 5 ###############################

# Repeat table for the D. melanogaster Sgs1 region (mismatch 5).
# Columns used below: Name, Length, Direction, Minimum, Maximum.
repeats_sgs1_melano_mismatch5 <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/D.melanogaster_repeats_sgs1_mismatch5.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs1_melano_mismatch5$Name
length_order <- repeats_sgs1_melano_mismatch5$Length
direction_order <- repeats_sgs1_melano_mismatch5$Direction
mini <- repeats_sgs1_melano_mismatch5$Minimum
maxi <- repeats_sgs1_melano_mismatch5$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    Name = rep(name_order[i], length_order[i]),
    Position = mini[i]:maxi[i],
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask over the 129,000 positions: 1 where at least one repeat
# covers the position, 0 elsewhere (unique() makes overlapping repeats count once).
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
# The text label is placed at fixed coordinates (presumably the gene location).
p15 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. melanogaster - Sgs1 - mismatch 5") +
  ylab("Repeat content in 1-kb bins") +
  ylim(0, 1000) +
  annotate(geom = "text", x = 90882, y = 900, label = "Sgs1", hjust = "left") +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p15


################## sgs3x melanogaster mismatch 5 #########################

# Repeat table for the D. melanogaster Sgs3x region (mismatch 5).
# Columns used below: Name, Length, Direction, Minimum, Maximum.
repeats_sgs3x_melano <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/D.melanogaster_Sgs3x_mismatch5.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs3x_melano$Name
length_order <- repeats_sgs3x_melano$Length
direction_order <- repeats_sgs3x_melano$Direction
mini <- repeats_sgs3x_melano$Minimum
maxi <- repeats_sgs3x_melano$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    Name = rep(name_order[i], length_order[i]),
    Position = mini[i]:maxi[i],
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask over the 129,000 positions: 1 where at least one repeat
# covers the position, 0 elsewhere (unique() makes overlapping repeats count once).
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
p3 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. melanogaster") +
  ylab("Repeat content in 1-kb bins") +
  ylim(0, 1000) +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p3

################## sgs3 teissieri #########################

# Repeat table for the D. teissieri Sgs3 region (mismatch 5).
# Columns used below: Name, Length, Direction, Minimum, Maximum.
# NOTE(review): the file name contains a space after "3L_" - confirm that it
# matches the file on disk.
repeats_sgs3_teissieri <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/D.teissieri_3L_ extraction_mismatch5.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs3_teissieri$Name
length_order <- repeats_sgs3_teissieri$Length
direction_order <- repeats_sgs3_teissieri$Direction
mini <- repeats_sgs3_teissieri$Minimum
maxi <- repeats_sgs3_teissieri$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    # the repeat name is repeated once per nucleotide of the repeat
    Name = rep(name_order[i], length_order[i]),
    # nucleotide position of every row of the repeat
    Position = mini[i]:maxi[i],
    # the repeat length is repeated once per nucleotide of the repeat
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask with one entry per position: 1 where at least one repeat
# covers the position, 0 elsewhere. Each position is kept only once, so a
# nucleotide covered by several repeats still counts as one.
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
# Text labels are placed at fixed coordinates (presumably the gene locations).
p4 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. teissieri") +
  ylab("Repeat content in 1-kb bins") +
  annotate(geom = "text", x = 47719, y = 1000, label = "Sgs7a", hjust = "left") +
  annotate(geom = "text", x = 48335, y = 900, label = "Sgs8", hjust = "left") +
  annotate(geom = "text", x = 50181, y = 800, label = "Sgs7b", hjust = "left") +
  annotate(geom = "text", x = 52434, y = 700, label = "Sgs3b", hjust = "left") +
  annotate(geom = "text", x = 113942, y = 950, label = "Sgs3f", hjust = "left") +
  annotate(geom = "text", x = 118788, y = 950, label = "Sgs3g", hjust = "left") +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p4

##################### subobscura sgs3 ################################

# Repeat table for the D. subobscura Sgs3 region (contig 115, mismatch 5).
# Columns used below: Name, Length, Direction, Minimum, Maximum.
repeats_sgs3_subobscura <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/D.subobscura_contig_115_sgs3_mismatch5.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs3_subobscura$Name
length_order <- repeats_sgs3_subobscura$Length
direction_order <- repeats_sgs3_subobscura$Direction
mini <- repeats_sgs3_subobscura$Minimum
maxi <- repeats_sgs3_subobscura$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    # the repeat name is repeated once per nucleotide of the repeat
    Name = rep(name_order[i], length_order[i]),
    # nucleotide position of every row of the repeat
    Position = mini[i]:maxi[i],
    # the repeat length is repeated once per nucleotide of the repeat
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask with one entry per position: 1 where at least one repeat
# covers the position, 0 elsewhere. Each position is kept only once, so a
# nucleotide covered by several repeats still counts as one.
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
# Text labels are placed at fixed coordinates (presumably the gene locations).
p5 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. subobscura") +
  ylab("Repeat content in 1-kb bins") +
  annotate(geom = "text", x = 53160, y = 900, label = "Sgs3b", hjust = "left") +
  annotate(geom = "text", x = 57411, y = 1000, label = "Sgs3c", hjust = "left") +
  annotate(geom = "text", x = 63171, y = 900, label = "Sgs3d", hjust = "left") +
  annotate(geom = "text", x = 70984, y = 700, label = "Sgs3e", hjust = "left") +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p5



################################ sgs3x eugracilis ######################

# Repeat table for the D. eugracilis Sgs3x region.
# Columns used below: Name, Length, Direction, Minimum, Maximum.
repeats_sgs3x_eugracilis <- read.table(
  "/perso/monier/Easyfig_2.2.2_linux/eugracilis_sgs3x.csv",
  header = TRUE, sep = ",", dec = "."
)
name_order <- repeats_sgs3x_eugracilis$Name
length_order <- repeats_sgs3x_eugracilis$Length
direction_order <- repeats_sgs3x_eugracilis$Direction
mini <- repeats_sgs3x_eugracilis$Minimum
maxi <- repeats_sgs3x_eugracilis$Maximum

# gg_data: one row per nucleotide position spanned by each repeat.
# The per-repeat tables are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop. seq_along() yields no
# iteration for an empty table, where 1:length() would iterate over c(1, 0).
# Assumes Length equals Maximum - Minimum + 1 for every repeat.
repeat_rows <- lapply(seq_along(name_order), function(i) {
  data.frame(
    # the repeat name is repeated once per nucleotide of the repeat
    Name = rep(name_order[i], length_order[i]),
    # nucleotide position of every row of the repeat
    Position = mini[i]:maxi[i],
    # the repeat length is repeated once per nucleotide of the repeat
    Length = rep(length_order[i], length_order[i]),
    Direction = rep(direction_order[i], length_order[i])
  )
})
gg_data <- if (length(repeat_rows) > 0) do.call(rbind, repeat_rows) else data.frame()

# data preparation
# Per-nucleotide mask with one entry per position: 1 where at least one repeat
# covers the position, 0 elsewhere. Each position is kept only once, so a
# nucleotide covered by several repeats still counts as one.
position <- 1:129000
repetition <- numeric(length(position))
repetition[unique(gg_data$Position)] <- 1

# bar plot
# Sum the mask in consecutive 1-kb windows (by = width, so windows do not overlap).
window_width_adj <- 1000
sliding_sum_adj <- rollapply(
  repetition,
  width = window_width_adj, by = window_width_adj,
  align = "center", FUN = sum
)
# x coordinates assigned to the bins: 500, 1500, ..., 128500.
gg_data_sliding_window_adj <- data.frame(
  position_window_adj = seq(
    window_width_adj / 2,
    length(position) - window_width_adj / 2,
    by = window_width_adj
  ),
  repetition_window_adj = sliding_sum_adj
)
# The text label is placed at fixed coordinates (presumably the gene location).
p6 <- ggplot(
  data = gg_data_sliding_window_adj,
  aes(x = position_window_adj, y = repetition_window_adj)
) +
  geom_bar(stat = "identity", position = "identity") +
  theme_classic() +
  ggtitle("D. eugracilis") +
  ylab("Repeat content in 1-kb bins") +
  ylim(0, 1000) +
  annotate(geom = "text", x = 78406, y = 800, label = "Sgs3X", hjust = "left") +
  # comma labels replace scientific notation on the x axis
  scale_x_continuous(expand = c(0, 0), labels = comma)
p6






# Stack the six panels in a single column, labelled A to F from top to bottom,
# then display the combined figure.
figure <- ggarrange(
  plotlist = list(p15, p3, p14, p4, p5, p6),
  labels = LETTERS[1:6],
  nrow = 6,
  ncol = 1
)
figure

