library(optparse)
# Command-line options: -i/--infile is the job description (JSON) and
# -o/--outfile is the folder that receives the results and the run log.
option_list <- list(
  make_option(
    c("-i", "--infile"),
    action = "store",
    type = "character",
    default = '/pub1/data/mg_projects/projects/web_script/tool_runing/7e47df1a231e741a8200d44a6bbae213/input.json',
    help = "Input a exp file path!"
  ),
  make_option(
    c("-o", "--outfile"),
    action = "store",
    type = "character",
    default = '/pub1/data/mg_projects/projects/web_script/tool_runing/b5d509f2ddaf74dd2ca07303a86e46a3',
    help = "Input a outfolder path!"
  )
)

# Log lines collected during the run; starts out empty (NULL).
logs <- NULL

# Path of the fst annotation table loaded with tidyfst::import_fst() by the
# conversion helpers (despite the name this is a file, not a folder).
baseFolder <- '/pub1/data/mg_projects/projects/web_script/source/ensg2genesymbol.fst'
# Collapse the rows of an expression table into groups and summarise every
# column within each group.
#
# datExpr: matrix or data.frame of numeric values whose row names are the IDs
#          to be grouped.
# anno:    two-column table (matrix or data.frame); column 1 holds row IDs and
#          column 2 the group label assigned to each ID. Extra columns are
#          ignored. When an ID is listed with several labels the first one is
#          used.
# method:  'mean', 'median', 'max' or 'min'; NA values are ignored when
#          summarising. Defaults to 'mean'.
#
# Returns a data.frame with one row per group (row names are the group labels,
# in sorted order) and the same columns as datExpr. Rows of datExpr whose ID is
# not listed in anno, or whose group label is NA, are left out. Returns NULL
# when method is not one of the supported names.
merge_data_by_group=function(datExpr,anno=NULL,method=c('mean','median','max','min')[1]){
  agg_fun=switch(as.character(method[1]),
                 mean=function(x) mean(x,na.rm = TRUE),
                 median=function(x) median(x,na.rm = TRUE),
                 max=function(x) max(x,na.rm = TRUE),
                 min=function(x) min(x,na.rm = TRUE))
  if(is.null(agg_fun)){
    return(NULL)
  }
  if(is.null(anno)){
    stop('anno must be a two-column table of row IDs and group labels')
  }

  anno=unique(anno[,1:2,drop = FALSE])
  ids=as.character(anno[,1])
  shared=intersect(ids,row.names(datExpr))
  # drop = FALSE keeps the table two-dimensional when a single row matches
  values=as.data.frame(datExpr[match(shared,row.names(datExpr)),,drop = FALSE])
  groups=factor(as.character(anno[match(shared,ids),2]))

  # Row positions of each group, computed once and reused for every column
  members=split(seq_along(groups),groups)
  merged=matrix(NA_real_,nrow = length(members),ncol = ncol(values),
                dimnames = list(names(members),colnames(values)))
  for(j in seq_len(ncol(values))){
    column=values[[j]]
    merged[,j]=vapply(members,function(idx){
      agg_fun(column[idx])
    },numeric(1))
  }
  as.data.frame(merged)
}

# Restrict an expression table to the genes of one biotype, optionally
# re-keying the rows through the annotation table.
#
# exp:    expression matrix/data.frame whose row names are gene IDs; anything
#         from the first '.' onwards (e.g. a version suffix) is stripped
#         before matching.
# Symbol: if TRUE, rows are grouped by column 2 of the annotation table
#         (presumably the gene symbol -- confirm against the fst file) and
#         each group is collapsed to its median via merge_data_by_group();
#         if FALSE, the matching rows are returned as they are.
# type:   value compared with column 3 of the annotation table
#         ('lncRNA', 'pseudogene' or 'protein_coding'; default 'lncRNA').
#
# Returns the filtered (and, for Symbol = TRUE, merged) expression table.
mg_convert_ByENSG=function(exp,Symbol=TRUE,type=c('lncRNA','pseudogene','protein_coding')[1]){
  ft = tidyfst::import_fst(baseFolder,as.data.table = FALSE)
  ft.type=ft[which(ft[,3]==type),]
  row.names(exp)=gsub('\\..*','',row.names(exp))
  if(Symbol){
    return(merge_data_by_group(exp,ft.type[,1:2],'median'))
  }
  # drop = FALSE: a single matching gene must still come back as a table,
  # otherwise callers get NULL from nrow()/ncol()
  exp[row.names(exp)%in%ft.type[,1],,drop = FALSE]
}

# Convert a raw count table to FPKM:
#   count / gene length / per-sample total count * 1e9.
#
# exp:  count matrix/data.frame with gene IDs as row names and one column per
#       sample.
# gff3: optional path to a GFF3 annotation. When NULL, gene lengths come from
#       the fst annotation table at `baseFolder`; otherwise they are computed
#       from the GFF3 as the summed width of each gene's merged exons.
#
# Returns a numeric matrix limited to the genes for which a length is known.
mg_count2FPKMs=function(exp,gff3=NULL){
  if(is.null(gff3)){
    ft = tidyfst::import_fst(baseFolder,as.data.table = FALSE)
    # Strip everything from the first '.' (version suffix) before matching
    row.names(exp)=gsub('\\..*','',row.names(exp))
    exp1=exp[row.names(exp)%in%ft[,1],,drop = FALSE]
    anno=ft[match(row.names(exp1),ft[,1]),]
    # Keep genes with a positive length; which() also discards NA lengths
    has_len=which(anno$GeneLenV22>0)
    exp1=exp1[has_len,,drop = FALSE]
    anno=anno[has_len,]
    # NOTE(review): presumably column 4 is the GeneLenV22 column used in the
    # filter above -- confirm against the fst file.
    lens=anno[,4]
    if(nrow(anno)==60483){
      # Exactly 60483 annotated genes (presumably the complete GENCODE v22
      # gene set -- confirm): the per-sample total is taken over
      # protein-coding genes only.
      ct=colSums(exp1[which(anno[,3]=='protein_coding'),,drop = FALSE])
    }else{
      ct=colSums(exp1)
    }
  }else{
    library(GenomicFeatures)
    txdb <- makeTxDbFromGFF(gff3,format="gff3")
    exons_gene <- exonsBy(txdb, by = "gene")
    # Non-overlapping exonic length per gene, computed on the whole list at
    # once instead of one gene at a time
    exons_gene_lens <- sum(width(reduce(exons_gene)))
    exp1=exp[row.names(exp)%in%names(exons_gene_lens),,drop = FALSE]
    ct=colSums(exp1)
    lens=exons_gene_lens[match(row.names(exp1),names(exons_gene_lens))]
  }
  # Rows are divided by gene length, columns by the per-sample total
  exp1=exp1/lens
  exp1=t(t(exp1)/ct)
  exp1*1e9
}

# Convert raw counts to TPM in two steps: counts -> FPKM (mg_count2FPKMs),
# then FPKM -> TPM (mg_FPKM2TPMs). `gff3` is passed through unchanged.
mg_count2TPMs=function(exp,gff3=NULL){
  mg_FPKM2TPMs(mg_count2FPKMs(exp,gff3))
}
#makeTxDbFromGFF('/pub1/data/mg_projects/projects/codes/source/gencode.v19.annotation.gff3.gz',format="gff3")

# Rescale an FPKM table to TPM: missing values are treated as 0 and every
# column is divided by its own total and multiplied by 1e6.
# Returns a matrix with the same dimnames as `exp`.
mg_FPKM2TPMs=function(exp){
  filled=replace(exp,is.na(exp),0)
  col_totals=apply(filled,2,sum)
  t(t(filled)/col_totals)*1e6
}

# Script entry point.
#   1. Read the job description (JSON) given by --infile.
#   2. Load the tab-separated expression table it points to (first column =
#      gene ID, remaining columns = samples).
#   3. Apply the requested conversion (`type`).
#   4. Re-key the rows as gene symbols or Ensembl IDs (`outType`), merging
#      rows that share a key by their median.
#   5. Write <outfile>/convert_exp.txt.
# All log lines, including the message of any error, are written to
# <outfile>/run.log.
tryCatch({
  opt = parse_args(OptionParser(option_list = option_list, usage = "Data press"))
  logs=c(logs,paste0('run tcga_rnaseq.R-',basename(opt$outfile)))
  library(jsonlite)
  data<-jsonlite::stream_in(file(opt$infile),pagesize = 1000)
  exp_path=unlist(data$exp_path)
  # type: fpkm2tpm, count2tpm, count2fpkm, extractlncRNA, extractPCG,
  #       extractPseudogene, symbol2ensg or ensg2symbol
  type=unlist(data$type)
  gff3=unlist(data$gff3)
  # outType: 'Symbol' or 'ENSG'
  outType=unlist(data$outType)

  # A missing/empty gff3 entry, or the stock v22 annotation path, selects the
  # built-in annotation table instead of parsing a GFF3 file.
  if(length(gff3)==0 || is.na(gff3[1]) || !nzchar(gff3[1]) ||
     gff3[1]=='/pub1/data/mg_projects/projects/web_script/source/gencode.v22.annotation.gff3.gz'){
    gff3=NULL
  }

  dat=data.table::fread(exp_path, sep = "\t",header = TRUE,stringsAsFactors = FALSE,check.names = FALSE
                        ,na.strings="NA",data.table = FALSE)
  # Keep the first occurrence of every gene ID and use the IDs as row names
  dat=dat[!duplicated(dat[,1]),,drop = FALSE]
  row.names(dat)=dat[,1]
  # drop = FALSE: a file with a single sample column must stay a table
  dat=dat[,-1,drop = FALSE]
  # Convert column by column; going through apply() would first turn the
  # whole data.frame into one character matrix.
  num_cols=lapply(dat,function(x){
    if(is.numeric(x)) as.numeric(x) else as.numeric(as.character(x))
  })
  exp=matrix(as.numeric(unlist(num_cols,use.names = FALSE)),
             nrow = nrow(dat),ncol = ncol(dat),
             dimnames = list(row.names(dat),colnames(dat)))
  logs=c(logs,paste0('read exp nrow:',nrow(exp),',ncol:',ncol(exp)))

  # Remove the "__..." summary rows that are not genes
  summary_rows=c("__alignment_not_unique","__not_aligned","__too_low_aQual","__ambiguous","__no_feature")
  exp=exp[which(!row.names(exp)%in%summary_rows),,drop = FALSE]
  logs=c(logs,paste0('read exp nrow:',nrow(exp),',ncol:',ncol(exp)))

  # Only when every row is an ENSG ID: strip the '.<version>' suffix
  if(all(grepl('^ENSG',row.names(exp)))){
    row.names(exp)=gsub('\\..*','',row.names(exp))
  }

  logs=c(logs,paste0('staring ',type))
  if(!is.null(gff3)){
    logs=c(logs,paste0('reading  gff3:',basename(gff3)))
  }
  if(type=='fpkm2tpm'){
    exp1=mg_FPKM2TPMs(exp)
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='count2tpm'){
    exp[is.na(exp)]=0
    exp1=mg_count2TPMs(exp,gff3 = gff3)
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='count2fpkm'){
    exp[is.na(exp)]=0
    exp1=mg_count2FPKMs(exp,gff3 = gff3)
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='extractlncRNA'){
    exp1=mg_convert_ByENSG(exp,Symbol=FALSE,'lncRNA')
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='extractPCG'){
    exp1=mg_convert_ByENSG(exp,Symbol=FALSE,'protein_coding')
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='extractPseudogene'){
    exp1=mg_convert_ByENSG(exp,Symbol=FALSE,'pseudogene')
    logs=c(logs,paste0('end ',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  }else if(type=='ensg2symbol'){
    exp1=exp
    # Must be an assignment: the ID conversion itself determines the output key
    outType='Symbol'
  }else if(type=='symbol2ensg'){
    exp1=exp
    outType='ENSG'
  }else{
    stop(paste0('unsupported type:',type))
  }
  logs=c(logs,paste0('exporting',type,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))

  # Lookup table: both columns 1 and 2 of the annotation (Ensembl ID and,
  # presumably, gene symbol) are accepted as input keys and mapped to the
  # requested output key.
  ft = tidyfst::import_fst(baseFolder,as.data.table = FALSE)
  if(outType=='Symbol'){
    anno=cbind(c(as.character(ft[,1]),as.character(ft[,2])),
               c(as.character(ft[,2]),as.character(ft[,2])))
  }else{
    anno=cbind(c(as.character(ft[,1]),as.character(ft[,2])),
               c(as.character(ft[,1]),as.character(ft[,1])))
  }
  exp1=merge_data_by_group(exp1,anno,'median')
  logs=c(logs,paste0('exported ',outType,',ncol=',ncol(exp1),',nrow=',nrow(exp1)))
  logs=c(logs,paste0('outputing'))
  write.table(cbind(Tag=row.names(exp1),exp1)
              ,file = paste0(opt$outfile,'/convert_exp.txt')
              ,quote = FALSE,row.names = FALSE,col.names = TRUE,sep = '\t')
  logs=c(logs,paste0('succ outputed'))
},error = function(e) {
  print(conditionMessage(e))
  # `<<-` is required: a plain assignment would only change a copy local to
  # this handler and the error would never reach run.log
  logs<<-c(logs,paste0('error:',conditionMessage(e)))
}, finally = {
  write.table(logs,file = paste0(opt$outfile,'/run.log'),quote = FALSE
              ,row.names = TRUE,col.names = TRUE,sep = '\t')
})

#exp1[which(row.names(exp1)=='ENSG00000276070'),]
#exp1[which(row.names(exp1)=='CCL4L2'),]
#exp[which(row.names(exp)=='ENSG00000276070'),]
#anno[which(anno[,1]=='ENSG00000276070'),]
#exp2=merge_data_by_group(exp1,anno,'median')
#head(exp1)

