diff --git a/R/assign_fasta_fun.R b/R/assign_fasta_fun.R index c536a809cf7fe1e3bcc81986fa51d012bce37582..969c30b4ab2a46ceda56cbaa5108c1c3ceef1636 100644 --- a/R/assign_fasta_fun.R +++ b/R/assign_fasta_fun.R @@ -18,10 +18,12 @@ #' @export idtaxa_assign_fasta_fun <- function(fasta, id_db, output = "./assign_fasta/", confidence = 50, verbose = 1, returnval = TRUE){ - + if(verbose == 3){ + flog.threshold(DEBUG) + } if(!dir.exists(output)){ flog.debug('Creating output directory...') - dir.create(output) + dir.create(output, recursive = TRUE) flog.debug('Done.') } @@ -45,57 +47,81 @@ idtaxa_assign_fasta_fun <- function(fasta, id_db, output = "./assign_fasta/", co } if(length(db_list)>1){ - flog.info('Merging both annotation...') + # Multiple references + flog.info('Merging all annotations...') + + Fannot = list() for (seq_name in names(dna)){ - #for each seq - if(length(taxid_list[[1]][[seq_name]]$taxon) == 0 & length(taxid_list[[2]][[seq_name]]$taxon) == 0){ - taxid_list[[3]][[seq_name]]$taxon = rep("unassigned", 8) - taxid_list[[3]][[seq_name]]$confidence = rep(0, 8) - } else { - if(length(taxid_list[[1]][[seq_name]]$taxon) == length(taxid_list[[2]][[seq_name]]$taxon)){ + flog.debug(seq_name) + + # create list + L1=list(); L2=list() + for (i in 1:length(db_list)){ + L1[[i]] = taxid_list[[i]][[seq_name]]$taxon + L2[[i]] = taxid_list[[i]][[seq_name]]$confidence + } + + depth1 = sapply(L1, length) + if ( all(depth1 == 0) ){ + flog.debug("unassigned") + Fannot[[seq_name]]$taxon = rep("unassigned", 8) + Fannot[[seq_name]]$confidence = rep(0, 8) + } + else { + if(length(unique(depth1)) == 1){ + flog.debug("best conf") #if same assignation depth, test on confidence last_rank <- length(taxid_list[[1]][[seq_name]]$taxon) - if(taxid_list[[1]][[seq_name]]$confidence[last_rank] > taxid_list[[2]][[seq_name]]$confidence[last_rank]){ - taxid_list[[3]][[seq_name]] <- taxid_list[[1]][[seq_name]] #DB1 - } else{ - taxid_list[[3]][[seq_name]] <- taxid_list[[2]][[seq_name]] #DB2 + conf1 = sapply(L2, function(x){x[last_rank]}) + Fannot[[seq_name]] <- taxid_list[[which(conf1 == max(conf1))[1]]][[seq_name]] # ..[1] in case of same depth / same conf. + } + else { + if( any( table(depth1) > 1 ) ){ + # if some equal depth + flog.debug("some equal depth, best conf") + last_rank <- max(sapply(L1, length)) + maxDepth_list = L2[which(depth1 == max(depth1))] + conf1 = sapply(L2, function(x){x[last_rank]}); conf1[is.na(conf1)] = 0 + Fannot[[seq_name]] <- taxid_list[[which(conf1 == max(conf1))]][[seq_name]] } - } else { - #if different assignation depth - if(length(taxid_list[[1]][[seq_name]]$taxon) > length(taxid_list[[2]][[seq_name]]$taxon)){ - taxid_list[[3]][[seq_name]] <- taxid_list[[1]][[seq_name]] #DB1 - } else{ - taxid_list[[3]][[seq_name]] <- taxid_list[[2]][[seq_name]] #DB2 + else{ + #if all different assignation depth + flog.debug("best depth") + Fannot[[seq_name]] <- taxid_list[[which(depth1 == max(depth1))]][[seq_name]] } } - } - + flog.debug(paste('length Fannot:',length(Fannot))) } flog.info('Done.') flog.info('Output table...') #Process output table with different assignations. all_annot_tab = data.frame(row.names=names(taxid_list[[1]])) - for (i in 1:3){ - if(i<3){print(db_list[i])}else{print("Final")} + for (i in 1:length(db_list)){ + ALLassign <- sapply(taxid_list[[i]],function (id) { paste(id$taxon," (",round(id$confidence, digits=1),"%)",sep="",collapse="; ") }) all_annot_tab[,i] <- ALLassign - if(verbose == 3){print(head(all_annot_tab))} + } + FINALassign <- sapply(Fannot,function (id) { + paste(id$taxon," (",round(id$confidence, digits=1),"%)",sep="",collapse="; ") + }) + + all_annot_tab$FINAL = FINALassign + seq_tab <- data.frame(sequences=as.character(dna)) row.names(seq_tab) <- names(dna) final_tax_table <- merge(all_annot_tab, seq_tab, by = "row.names") - - colnames(final_tax_table)=c("ASV",db_list[1], db_list[2], "FINAL", "sequences") + colnames(final_tax_table)=c("ASV",db_list, "FINAL", "sequences") write.table(final_tax_table, paste(output,"/allDB_tax_table.csv",sep=""), quote = FALSE, sep = "\t", row.names = FALSE) - # Keep merged assignment (taxid_list[[3]]) - taxid <- sapply(taxid_list[[3]], function(x) { + # Keep merged assignment (Fannot) + taxid <- sapply(Fannot, function(x) { taxa <- rep(NA,7) assign <- x$taxon # print(assign) @@ -108,7 +134,6 @@ idtaxa_assign_fasta_fun <- function(fasta, id_db, output = "./assign_fasta/", co return(taxa) }) - # taxid <- as.data.frame(taxid) }else{ # One DB assignment @@ -133,7 +158,7 @@ idtaxa_assign_fasta_fun <- function(fasta, id_db, output = "./assign_fasta/", co taxid[[nn]] = taxid[[nn]][1:7] } } - taxid <- t(as.data.frame(taxid)) + taxid <- as.data.frame(t(taxid)) flog.info('Done.') if(verbose == 3){ @@ -144,34 +169,40 @@ idtaxa_assign_fasta_fun <- function(fasta, id_db, output = "./assign_fasta/", co names(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") flog.info("Filling missing taxonomy ranks...") PREFIX = c("k__","p__","c__","o__","f__","g__","s__") + # handling possible prefix + noprefix_taxid = taxid[grep("k__",taxid[,1], invert = TRUE),] + prefix_taxid = taxid[grep("k__",taxid[,1]),] + + if(nrow(noprefix_taxid) != 0){ + noprefix_taxid = fill_tax_fun(noprefix_taxid, prefix = FALSE) + noprefix_taxid = as.data.frame( t(apply(noprefix_taxid, 1, function(x){ paste(PREFIX, x, sep="")})), stringAsFactors = FALSE) + names(noprefix_taxid) = names(taxid) + } - if( length(grep("p__",taxid[,2])) == 0 ){ - taxid = fill_tax_fun(taxid, prefix = FALSE) - taxid = as.data.frame( t(apply(taxid, 1, function(x){ paste(PREFIX, x, sep="")})), stringAsFactors = FALSE) - }else{ - taxid = fill_tax_fun(taxid, prefix = TRUE) + if(nrow(prefix_taxid) != 0){ + prefix_taxid = fill_tax_fun(prefix_taxid, prefix = TRUE) } - flog.info('Done.') + filled_taxid = rbind.data.frame(noprefix_taxid, prefix_taxid) - names(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") - tax.table <- tax_table(as.matrix(taxid)) + tax.table = filled_taxid[names(dna),] + flog.info('Done.') flog.info("Check taxonomy consistency...") - tax.table = check_tax_fun(tax.table, output = NULL) + tax.tablecheck = check_tax_fun(tax.table, output = NULL, rank = 6, verbose = 3) flog.info("Done.") #Output table 2 - Tab2 = apply( as.data.frame(tax.table, stringsAsFactors=FALSE) , 1 , paste , collapse = ";" ) + Tab2 = apply( as.data.frame(tax.tablecheck, stringsAsFactors=FALSE) , 1 , paste , collapse = ";" ) Tab2 <- cbind(Tab2, seq=as.character(dna)) write.table(Tab2, paste(output,"/final_tax_table.csv",sep=""), quote = FALSE, sep = "\t", col.names = FALSE, row.names = TRUE) flog.info('Saving R objects.') - save(tax.table, file=paste(output,'/robjects.Rdata',sep='')) + save(tax.tablecheck, file=paste(output,'/robjects.Rdata',sep='')) flog.info('Finish.') - if(returnval){return(tax.table)} + if(returnval){return(tax.tablecheck)} } diff --git a/R/assign_taxo_fun.R b/R/assign_taxo_fun.R index 6e3f8886b34321cb2b5493cfcd2741a534711742..6e754147364c7a78e0665c68c2ecbee48a5885e2 100644 --- a/R/assign_taxo_fun.R +++ b/R/assign_taxo_fun.R @@ -33,162 +33,15 @@ assign_taxo_fun <- function(dada_res = dada_res, output = "./idtaxa/", id_db = } - if(!dir.exists(output)){ - flog.debug('Creating output directory...') - dir.create(output) - flog.debug('Done.') - } ## IDATAXA assignment - flog.info(paste('Taxonomy assignation with IDTAXA',sep='')) dna <- DNAStringSet(getSequences(dada_res$seqtab.nochim)) + names(dna) <- sapply(colnames(dada_res$seqtab.nochim), digest::digest, algo="md5") + if(!all( names(dna) == rownames(dada_res$otu.table) )){stop("Seq names and ASV table row names are different")} - db_list <- unlist(strsplit(id_db,",")) - taxid_list=vector("list", length(db_list)+1) - for (i in 1:length(db_list)){ - db_file <- db_list[i] - taxid_list[[i]] <- idTaxa_assign(db_file, dna, colnames(dada_res$seqtab.export), confidence, ncpu) - } - - if(verbose == 3){ - save(taxid_list, file = "./annot_taxid_list_debug.rdata") - } - - if(length(db_list)>1){ - flog.info('Merging both annotation...') - for (seq_name in colnames(dada_res$seqtab.export)){ - #for each seq - if(length(taxid_list[[1]][[seq_name]]$taxon) == 0 & length(taxid_list[[2]][[seq_name]]$taxon) == 0){ - taxid_list[[3]][[seq_name]]$taxon = rep("unassigned", 8) - taxid_list[[3]][[seq_name]]$confidence = rep(0, 8) - } else { - if(length(taxid_list[[1]][[seq_name]]$taxon) == length(taxid_list[[2]][[seq_name]]$taxon)){ - #if same assignation depth, test on confidence - last_rank <- length(taxid_list[[1]][[seq_name]]$taxon) - if(taxid_list[[1]][[seq_name]]$confidence[last_rank] > taxid_list[[2]][[seq_name]]$confidence[last_rank]){ - taxid_list[[3]][[seq_name]] <- taxid_list[[1]][[seq_name]] #DB1 - } else{ - taxid_list[[3]][[seq_name]] <- taxid_list[[2]][[seq_name]] #DB2 - } - } else { - #if different assignation depth - if(length(taxid_list[[1]][[seq_name]]$taxon) > length(taxid_list[[2]][[seq_name]]$taxon)){ - taxid_list[[3]][[seq_name]] <- taxid_list[[1]][[seq_name]] #DB1 - } else{ - taxid_list[[3]][[seq_name]] <- taxid_list[[2]][[seq_name]] #DB2 - } - } - - } - - } - flog.info('Done.') - - flog.info('Output table...') - #Process output table with different assignations. - all_annot_tab = data.frame(row.names=names(taxid_list[[1]])) - for (i in 1:3){ - if(i<3){print(db_list[i])}else{print("Final")} - ALLassign <- sapply(taxid_list[[i]],function (id) { - paste(id$taxon," (",round(id$confidence, digits=1),"%)",sep="",collapse="; ") - }) - all_annot_tab[,i] <- ALLassign - if(verbose == 3){print(head(all_annot_tab))} - } - - seq_tab <- data.frame(sequences=as.character(dna)) - row.names(seq_tab) <- colnames(dada_res$seqtab.export) - final_tax_table <- merge(all_annot_tab, seq_tab, by = "row.names") - - - colnames(final_tax_table)=c("ASV",db_list[1], db_list[2], "FINAL", "sequences") - write.table(final_tax_table, paste(output,"/allDB_tax_table.csv",sep=""), quote = FALSE, sep = "\t", row.names = FALSE) - - # Keep merged assignment (taxid_list[[3]]) - taxid <- sapply(taxid_list[[3]], function(x) { - taxa <- rep(NA,7) - assign <- x$taxon - # print(assign) - if(length(assign[-1])==0){ - taxa <- rep("unassigned",7) - } else { - taxa[1:length(assign[-1])] <- assign[-1] - } - - return(taxa) - }) - - # taxid <- as.data.frame(taxid) - - }else{ - # One DB assignment - taxid <- sapply(taxid_list[[1]], function(x) { - taxa <- rep(NA,7) - assign <- x$taxon - # print(assign) - if(length(assign[-1])==0){ - taxa <- rep("unassigned",7) - } else { - taxa[1:length(assign[-1])] <- assign[-1] - } - - return(taxa) - }) - } - - if(max(sapply(taxid, length)) > 7){ - LL=sapply(taxid, length) - Tnames=names(LL[LL>7]) - for(nn in Tnames){ - taxid[[nn]] = taxid[[nn]][1:7] - } - } - FeatNames = colnames(taxid); if(is.null(FeatNames)){FeatNames = names(taxid)} - taxid <- t(as.data.frame(taxid)) - row.names(taxid) = FeatNames - flog.info('Done.') - - if(verbose == 3){ - save(taxid, file = "./annot_taxid_debug.rdata") - } - - - # Filling taxonomy with last assigned rank. - names(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") - flog.info("Filling missing taxonomy ranks...") - PREFIX = c("k__","p__","c__","o__","f__","g__","s__") - - if( length(grep("p__",taxid[,2])) == 0 ){ - taxid = fill_tax_fun(taxid, prefix = FALSE) - taxid = as.data.frame( t(apply(taxid, 1, function(x){ paste(PREFIX, x, sep="")})), stringAsFactors = FALSE) - }else{ - taxid = fill_tax_fun(taxid, prefix = TRUE) - } - - - flog.info('Done.') - - names(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") - tax.table <- tax_table(as.matrix(taxid)) - - - flog.info("Check taxonomy consistency...") - tax.table = check_tax_fun(tax.table, output = NULL) - flog.info("Done.") - - #Output table 2 - Tab2 = apply( as.data.frame(tax.table, stringsAsFactors=FALSE) , 1 , paste , collapse = ";" ) - Tab2 <- cbind(Tab2, seq=colnames(dada_res$seqtab.nochim)) - write.table(Tab2, paste(output,"/final_tax_table.csv",sep=""), quote = FALSE, sep = "\t", - col.names = FALSE, row.names = TRUE) - - flog.info('Saving R objects.') - save(tax.table, file=paste(output,'/robjects.Rdata',sep='')) - flog.info('Finish.') - - if(returnval){return(tax.table)} - + tt2 = idtaxa_assign_fasta_fun(fasta = dna, id_db = id_db, + output = output, confidence = confidence, verbose = verbose, returnval = returnval) } diff --git a/R/idtaxaDB_formatting_functions.R b/R/idtaxaDB_formatting_functions.R index d5cdcf7ef25b27a58068c26864c3ae976d5ae7fe..9e9782a4a257be97fdfd53f2de3ae158fecc3201 100644 --- a/R/idtaxaDB_formatting_functions.R +++ b/R/idtaxaDB_formatting_functions.R @@ -67,7 +67,6 @@ fill_tax_fun <- function(taxtable = taxtable, prefix = TRUE){ check_tax_fun <- function(taxtable = taxtable, output = NULL, rank = 7, verbose=3, returnval = TRUE){ RANKS = c("_domain","_phylum","_class","_order","_family","_genus","_species") - print("Check taxonomy consistency...") # Check for multiple ancestors at each rank, choose first occurence for each problematic taxon sink(paste('./check_tax_fun.log', sep=""), split = TRUE) for(rk in rank:2){ @@ -101,15 +100,17 @@ check_tax_fun <- function(taxtable = taxtable, output = NULL, rank = 7, verbose= ftax <- names(uniqTax2[order(uniqTax2,decreasing=TRUE)])[1] ftax <- unlist(strsplit(ftax,";")) + cat( glue::glue( "CORRECTED : {paste(ftax, collapse = ';')}" ) ); cat("\n") #Change taxonomy with final ftax. the most common in taxtable for(j in row.names(taxtable[taxtable[,rk]==i,])){ if(rk == 7){ + # print(ftax) taxtable[j,] = ftax }else{ - taxtable[j,] = c(ftax, taxtable[j,(rk+1):ncol(taxtable)]) + taxtable[j,] = c(ftax, as.matrix(taxtable[j,(rk+1):ncol(taxtable)]) ) } } } diff --git a/R/idtaxa_assign_fun.R b/R/idtaxa_assign_fun.R index 7e81d368b5b9a4de4f17fb58fc4b0f83d5ad122f..26e08b4df18662fea5d6fbff19027bfbbe3d71b6 100644 --- a/R/idtaxa_assign_fun.R +++ b/R/idtaxa_assign_fun.R @@ -15,7 +15,7 @@ #' @export -idTaxa_assign = function(db_file, dna, asv_names, confidence, ncpu = NULLS){ +idTaxa_assign = function(db_file, dna, asv_names, confidence, ncpu = NULL){ flog.info(paste('Using database ',db_file,sep='')) toto <- load(db_file) ids <- IdTaxa(dna, trainingSet, strand="both", processors=ncpu, verbose=TRUE)