From 6c2d16d9703d056e70073be50c887347e7578d28 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 8 Aug 2023 10:59:05 +0200 Subject: [PATCH 1/6] Increase memory for seqtk sample process Ref: #72 --- conf/base.config | 2 +- nextflow.config | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 43666af..19f7c45 100644 --- a/conf/base.config +++ b/conf/base.config @@ -229,7 +229,7 @@ process { ext.args = '-s100' ext.args2 = params.subset_seq - memory = { 5.GB * task.attempt } + memory = { 50.GB * task.attempt } module = toolsModuleHash['SEQTK_SAMPLE'] publishDir = [ diff --git a/nextflow.config b/nextflow.config index 5541f97..e9728fe 100644 --- a/nextflow.config +++ b/nextflow.config @@ -137,6 +137,7 @@ process.container = "$baseDir/template-nf.sif" profiles { dev { includeConfig "$baseDir/conf/test.config" } + prod { System.out.println "Mode prod !" } } // Avoid this error: -- GitLab From ae809cf9745f050a8f39f9de8d65f1985fb9633d Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 9 Aug 2023 16:48:39 +0200 Subject: [PATCH 2/6] Add diversity QC sub-wf - Add joinPairs process Ref: #71 --- assets/multiqc_config.yaml | 9 +++++++++ conf/base.config | 20 +++++++++++++++++++ conf/dependencies_genobioinfo.config | 3 +++ conf/dependencies_genologin.config | 3 +++ modules/local/module_diversity.nf | 30 ++++++++++++++++++++++++++++ nextflow.config | 5 +++-- sub-workflows/local/diversity_qc.nf | 8 +++++++- workflow/illumina_qc.nf | 15 ++++++++++++++ 8 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 modules/local/module_diversity.nf diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 593ca97..a401889 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -103,3 +103,12 @@ custom_logo_title: "GeT-GenoToul" fastqscreen_simpleplot: true # Qualimap + +# Flash +flash: + use_output_name: true +flash/log: + contents: "[FLASH]" + 
shared: true +flash/hist: + fn: "*_flash.hist" \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index 19f7c45..466213b 100644 --- a/conf/base.config +++ b/conf/base.config @@ -161,6 +161,26 @@ process { cpus = 2 } + // ----- 16S/Amplicon ----- // + withName: JOIN_PAIR { + module = toolsModuleHash['FLASH'] + time = { 30.m * task.attempt } + memory = { 500.MB * task.attempt } + cpus = 2 + + ext.args = [ + "-x ${params.max_mismatch_density}", + "-m ${params.min_overlap}", + "-M ${params.max_overlap}" + ].join(' ') + + publishDir = [ + path: "${params.outdir}/joinPair", + mode: 'copy', + pattern: "*.{log,hist}" + ] + } + // ----- WithLabel withLabel: littleJob { executor = 'local' diff --git a/conf/dependencies_genobioinfo.config b/conf/dependencies_genobioinfo.config index 5276bce..96b1ed9 100644 --- a/conf/dependencies_genobioinfo.config +++ b/conf/dependencies_genobioinfo.config @@ -16,6 +16,9 @@ toolsModuleHash['STAR'] = ['bioinfo/STAR/2.7.5a'] // version upgraded face to toolsModuleHash['BWA'] = ['bioinfo/bwa/0.7.17'] toolsModuleHash['SAMTOOLS'] = ['bioinfo/samtools/1.18'] // version upgraded face to genologin +// ----- 16S/Amplicon ----- // +toolsModuleHash['FLASH'] = ['bioinfo/FLASH/1.2.11'] // version upgraded face to genologin + // ======================================== // SHARED MODULES //========================================= diff --git a/conf/dependencies_genologin.config b/conf/dependencies_genologin.config index 7c9fa92..1b2636e 100644 --- a/conf/dependencies_genologin.config +++ b/conf/dependencies_genologin.config @@ -16,6 +16,9 @@ toolsModuleHash['STAR'] = ['bioinfo/STAR-2.7.10a_alpha_220314'] toolsModuleHash['BWA'] = ['/tools/share/Modules/bioinfo/bwa-0.7.17'] toolsModuleHash['SAMTOOLS'] = ['bioinfo/samtools-1.16.1'] +// ----- 16S/Amplicon ----- // +toolsModuleHash['FLASH'] = ['bioinfo/FLASH-1.2.6'] + // ======================================== // SHARED MODULES //========================================= diff --git 
a/modules/local/module_diversity.nf b/modules/local/module_diversity.nf new file mode 100644 index 0000000..b611b4b --- /dev/null +++ b/modules/local/module_diversity.nf @@ -0,0 +1,30 @@ +/* + * Module pour les analyses des données 16S et Amplicon +*/ + +process JOIN_PAIR { + tag "$sample" + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.extendedFrags.fastq.gz"), emit: extendedFrags + tuple val(sample), path("*.notCombined_*.fastq.gz"), emit: notCombined + tuple val(sample), path("*.log"), emit: logs + tuple val(sample), path("*.hist"), emit: histogram + + script: + def args = task.ext.args ?: '' + """ + flash \\ + $reads \\ + -z \\ + -t ${task.cpus} \\ + -o ${sample} \\ + $args \\ + > ${sample}_flash.log + + mv ${sample}.hist ${sample}_flash.hist + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index e9728fe..2de3b8c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,8 +27,9 @@ params { reference_transcriptome = "" // Amplicon / 16S params - min_overlap = "" - max_overlap = "" + min_overlap = 20 + max_overlap = 55 + max_mismatch_density = 0.1 // 10X params diff --git a/sub-workflows/local/diversity_qc.nf b/sub-workflows/local/diversity_qc.nf index 8bc288d..35a0616 100644 --- a/sub-workflows/local/diversity_qc.nf +++ b/sub-workflows/local/diversity_qc.nf @@ -8,7 +8,7 @@ // ------------------------------------------------- // MODULES // ------------------------------------------------- -include { } from "$baseDir/modules/local/module_diversity.nf" +include { JOIN_PAIR } from "$baseDir/modules/local/module_diversity.nf" // ------------------------------------------------- @@ -17,6 +17,12 @@ include { } from "$baseDir/modules/local/module_diversity.nf" workflow DIVERSITY_QC { take: fastq + main: + JOIN_PAIR(fastq) + emit: + extendedFrags = JOIN_PAIR.out.extendedFrags + histogram = JOIN_PAIR.out.histogram + logs = JOIN_PAIR.out.logs } \ No newline at end of file diff --git 
a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index 5d2e895..aed0e65 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -62,6 +62,7 @@ include { CORE_ILLUMINA } from "$baseDir/sub-workflows/local/core_illumina.nf" include { CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" include { RNA_QC } from "$baseDir/sub-workflows/local/rna_qc.nf" +include { DIVERSITY_QC } from "$baseDir/sub-workflows/local/diversity_qc.nf" include { PARSE_REPORTS } from "$baseDir/modules/local/module_DTM.nf" include { TREATMENT_DEMUXSTAT as TREATMENT_DEMUX_RUN; TREATMENT_DEMUXSTAT as TREATMENT_DEMUX_READSETS @@ -126,6 +127,20 @@ workflow ILLUMINA_QC { RNA_QC.out.align_results.collect{it[1]}.ifEmpty([]), RNA_QC.out.sortmerna_log.collect{it[1]}.ifEmpty([]) ) + + } else if (params.data_nature =~ "16S|Amplicon") { + DIVERSITY_QC(fastq + .collect{it[1]} + .flatten() + .map { $it -> [ ($it.simpleName =~ /(.*)_R[1-2]_.*/)[0][1] , $it ] } + .groupTuple() + ) // les deux en meme temps !!!! + + ch_mqc = ch_mqc.mix( + DIVERSITY_QC.out.histogram.collect{it[1]}.ifEmpty([]), + DIVERSITY_QC.out.logs.collect{it[1]}.ifEmpty([]) + ) + } else { System.out.println "Le QC des données non ADN n'est pas prit en charge pour le moment." 
ch_mqc = ch_mqc.mix( Channel.empty() ) -- GitLab From 7516ccf8180030a067cd1fd3e55f52fc1a492ba3 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Fri, 11 Aug 2023 17:15:57 +0200 Subject: [PATCH 3/6] Add ressources for subsetAssignation Subworkflow : diversity QC Ref: #71 --- assets/multiqc_config.yaml | 4 +++ conf/base.config | 46 ++++++++++++++++++++++++++-- conf/dependencies_genobioinfo.config | 4 ++- conf/dependencies_genologin.config | 4 ++- modules/local/module_diversity.nf | 30 +++++++++++++++++- nextflow.config | 3 ++ sub-workflows/local/diversity_qc.nf | 26 +++++++++++++++- 7 files changed, 111 insertions(+), 6 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index a401889..b5f8ae5 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -80,6 +80,10 @@ module_order: name: "AlignmentStat" href: "https://combine-lab.github.io/salmon/" target: "Salmon" + - flash: + name: "JoinPairs" + href: "https://ccb.jhu.edu/software/FLASH/" + target: "Flash" # Pattern diff --git a/conf/base.config b/conf/base.config index 466213b..e3c8bd6 100644 --- a/conf/base.config +++ b/conf/base.config @@ -179,7 +179,20 @@ process { mode: 'copy', pattern: "*.{log,hist}" ] - } + } + + withName: BLAST_N { + module = toolsModuleHash['BLAST'] + time = { 5.h * task.attempt } + memory = { 2.GB * task.attempt } + cpus = 4 + + ext.args = [ + "-max_target_seqs ${params.blast_max_target}", + "-outfmt ${params.blast_outfmt}" + ].join(' ') + + } // ----- WithLabel withLabel: littleJob { @@ -250,7 +263,7 @@ process { ext.args2 = params.subset_seq memory = { 50.GB * task.attempt } - module = toolsModuleHash['SEQTK_SAMPLE'] + module = toolsModuleHash['SEQTK'] publishDir = [ path: { "${params.outdir}/subset" }, @@ -260,6 +273,35 @@ process { ] } + withName: KRONA_BLAST { + label = 'littleJob' + module = toolsModuleHash['KRONA'] + + ext.args = [ + '-i', + '-b' + ].join(' ') + + publishDir = [ + path: 
"${params.outdir}/subsetAssignation", + mode: 'copy', + pattern: "*.html" + ] + + } + + withName: SUBSET { + ext.args = '-s100' + ext.args2 = "1000000" + + memory = { 50.GB * task.attempt } + module = toolsModuleHash['SEQTK'] + } + + withName: FQ_TO_FA { + module = toolsModuleHash['SEQTK'] + } + withName: MULTIQC { ext.args = [ "--config ${baseDir}/assets/multiqc_config.yaml", diff --git a/conf/dependencies_genobioinfo.config b/conf/dependencies_genobioinfo.config index 96b1ed9..e32e586 100644 --- a/conf/dependencies_genobioinfo.config +++ b/conf/dependencies_genobioinfo.config @@ -18,11 +18,13 @@ toolsModuleHash['SAMTOOLS'] = ['bioinfo/samtools/1.18'] // version upgraded fa // ----- 16S/Amplicon ----- // toolsModuleHash['FLASH'] = ['bioinfo/FLASH/1.2.11'] // version upgraded face to genologin +toolsModuleHash['BLAST'] = ['bioinfo/NCBI_Blast+/2.10.0+'] // ======================================== // SHARED MODULES //========================================= -toolsModuleHash['SEQTK_SAMPLE'] = ['bioinfo/Seqtk/1.3'] +toolsModuleHash['SEQTK'] = ['bioinfo/Seqtk/1.3'] toolsModuleHash['MULTIQC'] = ['bioinfo/MultiQC/1.14'] toolsModuleHash['SORTMERNA'] = ['bioinfo/SortMeRNA/4.3.6'] // version upgraded face to genologin toolsModuleHash['QUALIMAP'] = ['bioinfo/Qualimap/31-08-20'] +toolsModuleHash['KRONA'] = ['bioinfo/Krona/2.8.1'] // version upgraded face to genologin diff --git a/conf/dependencies_genologin.config b/conf/dependencies_genologin.config index 1b2636e..fd18ea5 100644 --- a/conf/dependencies_genologin.config +++ b/conf/dependencies_genologin.config @@ -18,11 +18,13 @@ toolsModuleHash['SAMTOOLS'] = ['bioinfo/samtools-1.16.1'] // ----- 16S/Amplicon ----- // toolsModuleHash['FLASH'] = ['bioinfo/FLASH-1.2.6'] +toolsModuleHash['BLAST'] = ['bioinfo/ncbi-blast-2.10.0+'] // ======================================== // SHARED MODULES //========================================= -toolsModuleHash['SEQTK_SAMPLE'] = ['bioinfo/seqtk-1.3'] +toolsModuleHash['SEQTK'] = 
['bioinfo/seqtk-1.3'] toolsModuleHash['MULTIQC'] = ['bioinfo/MultiQC-1.14'] toolsModuleHash['SORTMERNA'] = ['bioinfo/sortmerna-4.3.2'] toolsModuleHash['QUALIMAP'] = ['bioinfo/qualimap-31-08-20'] +toolsModuleHash['KRONA'] = ['bioinfo/KronaTools-2.7'] diff --git a/modules/local/module_diversity.nf b/modules/local/module_diversity.nf index b611b4b..b2a27ab 100644 --- a/modules/local/module_diversity.nf +++ b/modules/local/module_diversity.nf @@ -27,4 +27,32 @@ process JOIN_PAIR { mv ${sample}.hist ${sample}_flash.hist """ -} \ No newline at end of file +} + +// Blastn +process BLAST_N { + tag "$sample" + + input: + tuple val(sample), path(fasta) + val db + + output: + tuple val(sample), path("*.blastn"), emit: results + + script: + def args = task.ext.args ?: '' + """ + db_dir=\$(dirname $db) + [[ `find -L \$db_dir -name "*.00.idx"` ]] && isIndexed='true' || isIndexed='false' + + blastn \\ + -num_threads $task.cpus \\ + -db $db \\ + -query $fasta \\ + -use_index \$isIndexed \\ + $args \\ + -out ${sample}.blastn + """ + +} diff --git a/nextflow.config b/nextflow.config index 2de3b8c..7d38ea8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,6 +30,9 @@ params { min_overlap = 20 max_overlap = 55 max_mismatch_density = 0.1 + assignation_databank = '' + blast_outfmt = 7 + blast_max_target = 10 // 10X params diff --git a/sub-workflows/local/diversity_qc.nf b/sub-workflows/local/diversity_qc.nf index 35a0616..06c59d0 100644 --- a/sub-workflows/local/diversity_qc.nf +++ b/sub-workflows/local/diversity_qc.nf @@ -8,7 +8,12 @@ // ------------------------------------------------- // MODULES // ------------------------------------------------- -include { JOIN_PAIR } from "$baseDir/modules/local/module_diversity.nf" +include { JOIN_PAIR; + BLAST_N } from "$baseDir/modules/local/module_diversity.nf" +include { GUNZIP } from "${params.shared_modules}/gzip.nf" +include { SEQTK_SAMPLE as SUBSET; + SEQTK_SEQ_A as FQ_TO_FA } from "${params.shared_modules}/seqtk.nf" +include { 
KRONA_BLAST } from "${params.shared_modules}/krona.nf" // ------------------------------------------------- @@ -19,10 +24,29 @@ workflow DIVERSITY_QC { fastq main: + // Pairs merging JOIN_PAIR(fastq) + // SubsetAssignation + if (params.assignation_databank != '') { + GUNZIP(JOIN_PAIR.out.extendedFrags) + SUBSET(GUNZIP.out) + + // -- Fastq to Fasta + FQ_TO_FA(SUBSET.out) + + // -- Taxonomic assignation + BLAST_N(FQ_TO_FA.out.fasta, params.assignation_databank) + KRONA_BLAST(BLAST_N.out.results) + krona_html = KRONA_BLAST.out.html + + } else { + krona_html = Channel.empty() + } + emit: extendedFrags = JOIN_PAIR.out.extendedFrags histogram = JOIN_PAIR.out.histogram logs = JOIN_PAIR.out.logs + krona = krona_html } \ No newline at end of file -- GitLab From 887e279eece65aa5e1a9004ed63fdadbb22ed7b0 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 28 Aug 2023 10:59:35 +0200 Subject: [PATCH 4/6] replace NAs by 0 Ref: #73 --- bin/demuxStatsFromXML.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R index 247e498..5ad8819 100755 --- a/bin/demuxStatsFromXML.R +++ b/bin/demuxStatsFromXML.R @@ -218,10 +218,12 @@ colnames(percentOfFragment)<-"percentageOfFragment" df2<-cbind(df2, percentOfFragment) +# Remplacement des NA par 0 +df2[is.na(df2)] <- 0 + # Export du data.frame cat("\nSauvegarde du data.frame.\n") #myProject<-"DEBUG" -# mettre des 0 à la place des NA dans df2 write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats.tsv")) # Ecrire un fichier par valeur de myProject ! Cas ou il y a plusieurs projets sur la même lane. 
cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats.tsv\n")) -- GitLab From de87eef071da47c38f6fac1861d333ed6b198bf2 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Thu, 31 Aug 2023 16:07:52 +0200 Subject: [PATCH 5/6] Delete useless parameters in config Ref: #74 --- README.md | 2 -- assets/params.config_example | 19 ----------------- assets/params.yml_example | 2 -- bin/DTM/data_prepare.pl | 2 -- conf/base.config | 3 +-- nextflow.config | 41 ++++++++++++++++-------------------- workflow/illumina_qc.nf | 2 +- 7 files changed, 20 insertions(+), 51 deletions(-) delete mode 100644 assets/params.config_example diff --git a/README.md b/README.md index ec0e744..41fc3a7 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,6 @@ project: 'GwOAK_small' is_multiplex: true data_nature: "DNA" pairedEnd: true -split_reads: true # // ???? reference_genome: "/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna" addBankForConta: "" run_name: "ContaComparison" @@ -38,7 +37,6 @@ run_date: "2022" machine_id: "NOVA" fc_id: "HNMTTDSX2" lane: "1" -demux_uniqueness: "1638345606" ``` diff --git a/assets/params.config_example b/assets/params.config_example deleted file mode 100644 index b197245..0000000 --- a/assets/params.config_example +++ /dev/null @@ -1,19 +0,0 @@ -params { - inputdir="/home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x" - samplesheet = inputdir+'/SampleSheet.csv' - project = 'MAGICs' - data=inputdir+'/'+project - is_multiplex = true - data_nature = 'DNA' - //pairedEnd = true - split_reads = true - reference_genome = '' - addBankForConta = '' - run_name='Test_10X' - sequencer='NovaSeq' - run_date='230116' - machine_id='NOVA' - fc_id='BHNKY7DRX2' - lane='1' - demux_uniqueness='1673933427' -} \ No newline at end of file diff --git a/assets/params.yml_example b/assets/params.yml_example index ab6728c..5d2ed15 100644 --- 
a/assets/params.yml_example +++ b/assets/params.yml_example @@ -3,7 +3,6 @@ inputdir: "/home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1 project: "MAGICs" is_multiplex: true data_nature: "DNA" -split_reads: true species: "Mus musculus" reference_genome: "" reference_transcriptome: "" @@ -15,6 +14,5 @@ run_date: "230116" fc_id: "BHNKY7DRX2" fc_type: "Flowcell Standard - Lane 1" lane: "1" -demux_uniqueness: "1673933427" min_overlap: 100 max_overlap: 230 \ No newline at end of file diff --git a/bin/DTM/data_prepare.pl b/bin/DTM/data_prepare.pl index 05d9b77..9dfc6fd 100644 --- a/bin/DTM/data_prepare.pl +++ b/bin/DTM/data_prepare.pl @@ -200,7 +200,6 @@ MAIN: print NF_PARAMS "project: '$project'\n"; print NF_PARAMS "is_multiplex: $isMultiplex\n"; print NF_PARAMS "data_nature: '$dataNature'\n"; - print NF_PARAMS "split_reads: true\n"; print NF_PARAMS "species: '$species'\n"; print NF_PARAMS "reference_genome: '$genomeRef'\n" if (defined($genomeRef)); # parametre non obligatoire print NF_PARAMS "make_star_index: true\n" if ($force_indexing); # parametre non obligatoire @@ -215,7 +214,6 @@ MAIN: print NF_PARAMS "insert_to_ngl: false\n" unless (defined($sq_xp_code)); print NF_PARAMS "sq_xp_code: '$sq_xp_code'\n" if (defined($sq_xp_code)); print NF_PARAMS "bi_run_code: '$ngl_bi_run_name'\n" if (defined($ngl_bi_run_name)); - print NF_PARAMS "demux_uniqueness: $demuxUniqueness\n"; print NF_PARAMS "min_overlap: $minOverlap\n" if (defined($minOverlap)); # parametre non obligatoire print NF_PARAMS "max_overlap: $maxOverlap\n" if (defined($maxOverlap)); # parametre non obligatoire print NF_PARAMS "email: '$nf_mailRecipient'\n" if (defined($nf_mailRecipient)); # parametre non obligatoire diff --git a/conf/base.config b/conf/base.config index e3c8bd6..c5d1005 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,7 +11,6 @@ params { System.out.println "run_date : "+run_date System.out.println "fc_id : "+fc_id System.out.println "lane : "+lane - 
System.out.println "demux_uniqueness : "+demux_uniqueness System.out.println "outdir : "+outdir System.out.println "" } @@ -345,7 +344,7 @@ process { withName: QUALIMAP { module = toolsModuleHash['QUALIMAP'] cpus = { 8 * task.attempt } - memory = { 2.GB * task.attempt } + memory = { 8.GB * task.attempt } time = { 3.h * task.attempt } publishDir = [ diff --git a/nextflow.config b/nextflow.config index 7d38ea8..38b1344 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,6 @@ params { fc_id = "" fc_type = "" lane = "" - demux_uniqueness = "" data_nature = "" species = "" @@ -19,12 +18,29 @@ params { run_name = "" run_date = "" description = "" - split_reads = false + // CORE params + /// FASTP + fastp_n_reads = 100000000 + + /// Subset fastq files params + no_subset = false // to skip subset step -> use every reads to align + large_sampling_threshold = 200 // 200 samples run is high multiplexed + miseq_subset_seq = "50000" // in reads must be a string + nova_subset_seq = "50000000" // in reads + large_indexing_nova_subset_seq = "500000" // in reads + // DNA / RNA params reference_genome = "" make_star_index = false reference_transcriptome = "" + sortmerna_db_path = '/usr/local/bioinfo/src/SortMeRNA/sortmerna-2.1b/rRNA_databases' + sortmerna_bac_16s = sortmerna_db_path + '/silva-bac-16s-id90.fasta' + sortmerna_bac_23s = sortmerna_db_path + '/silva-bac-23s-id98.fasta' + sortmerna_arc_16s = sortmerna_db_path + '/silva-arc-16s-id95.fasta' + sortmerna_arc_23s = sortmerna_db_path + '/silva-arc-23s-id98.fasta' + sortmerna_euk_18s = sortmerna_db_path + '/silva-euk-18s-id95.fasta' + sortmerna_euk_28s = sortmerna_db_path + '/silva-euk-28s-id98.fasta' // Amplicon / 16S params min_overlap = 20 @@ -61,27 +77,6 @@ params { //email_labo="get-plage.labo@genotoul.fr" email_labo="" - - // ----- TOOLS PARAMETERS ----- - // Subset fastq files params - no_subset = false // to skip subset step -> use every reads to align - large_sampling_threshold = 200 // 200 samples run is 
high multiplexed - miseq_subset_seq = "50000" // in reads must be a string - nova_subset_seq = "50000000" // in reads - large_indexing_nova_subset_seq = "500000" // in reads - - // RNA QC - sortmerna_db_path = '/usr/local/bioinfo/src/SortMeRNA/sortmerna-2.1b/rRNA_databases' - sortmerna_bac_16s = sortmerna_db_path + '/silva-bac-16s-id90.fasta' - sortmerna_bac_23s = sortmerna_db_path + '/silva-bac-23s-id98.fasta' - sortmerna_arc_16s = sortmerna_db_path + '/silva-arc-16s-id95.fasta' - sortmerna_arc_23s = sortmerna_db_path + '/silva-arc-23s-id98.fasta' - sortmerna_euk_18s = sortmerna_db_path + '/silva-euk-18s-id95.fasta' - sortmerna_euk_28s = sortmerna_db_path + '/silva-euk-28s-id98.fasta' - - // FASTP - fastp_n_reads = 100000000 - // skip parameters skip_core_illumina = false diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index aed0e65..9407f9c 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -121,7 +121,7 @@ workflow ILLUMINA_QC { PARSE_REPORTS(CORE.out.fastp_report, DNA_QC.out.qualimap_report) } - } else if (params.data_nature =~ 'RNA') { + } else if (params.data_nature =~ 'RNA-*') { RNA_QC(CORE.out.subset_fastq, ch_sortmerna_db) ch_mqc = ch_mqc.mix( RNA_QC.out.align_results.collect{it[1]}.ifEmpty([]), -- GitLab From 41fd14b4de286f69c7411ed41269a4e5641d2c2c Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Thu, 31 Aug 2023 16:08:30 +0200 Subject: [PATCH 6/6] Create usage.md file Ref : #74 --- docs/usage.md | 311 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 216 insertions(+), 95 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index def788a..c967a19 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,131 +1,252 @@ -# get-nextflow-ngl-bi/template-nf: Usage +# get-nextflow-ngl-bi/wf-Illumina-nf: Usage -## Inputs +Below is a description of the configurable options available for the pipeline. 
-You will need to create a samplesheet `samples.csv` file with information about the samples in your input directory before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 5 columns, and a header row as shown in the examples below. +## Useful core Nextflow arguments +> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -```bash ---inputdir '/directory/to/data' -``` -or -```bash ---inputdir '/directory/to/data' --samplesheet /path/to/samples.csv -``` +- **`-name`** [str] +Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. -Below is an example for a single reads, than paireds reasd: +- **`-resume`** +Specify this flag when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. +You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. -```bash -#id,name,fastq_1,fastq_2 -1,sample1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -2,control,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -3,controlsingle,AEG588A3_S3_L002_R1_001.fastq.gz -``` +- **`-profile`** [str] +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Conda) - see below. +Note that multiple profiles can be loaded, for example: `-profile dev,docker` - the order of arguments is important! +They are loaded in sequence, so later profiles can overwrite earlier profiles. 
+Available profiles: + * `dev` + * A profile with a complete configuration for automated testing + * `prod` + * Use in production context only (largest resources) + +## Pipeline Options +The pipeline offers the following configurable options with their default values. It is possible to use them with double hyphens in the command line, but the pipeline needs a lot of them, so to avoid a very long command line it is recommended to put these options into a YAML parameter file (see example in the README.md), with `-params-file [path]`. -## Running the pipeline +### Mandatory parameters +Some parameters have no default value, therefore they must be set to run the pipeline. Here is the exhaustive list of them: -The typical command for running the pipeline is as follows: +- **`--inputdir`** [str] +Path to the input directory for the data to be analyzed. No default value, MUST be given on the command line. This is the output directory of bcl2fastq. See below for the particular structure of it. +_Default_ : null -```bash -nextflow run path/to/main.nf --inputdir '/directory/to/data' --samplesheet /path/to/samples.csv -profile singularity -``` -This will launch the pipeline with the `singularity` configuration profile. See below for more information about profiles. +- **`--project`** [str] +The project name associated with the analysis. The value of this parameter MUST be a directory name found in the `inputdir` path. +_Default_ : null -Note that the pipeline will create the following files in your working directory: +- **`--data_nature`** [str] +Nature of the data sequenced. This parameter will be used to automatically select the workflow. Authorized values are : `DNA`, `RNA-*`, `Amplicon`, `16S`, (soon : `10X`, `Emseq-DNA`, `Hi-C`, `sparse`). +If the value of data_nature is unknown, only the CORE pipeline is executed. 
+_Default_ : null -```bash -work # Directory containing the nextflow working files -results # Finished results (configurable, see below) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. -``` +- **`--sequencer`** [str] +The sequencing platform used, such as `NovaSeq` or `AVITI`. +_Default_ : null -## Pipeline arguments +- **`--is_multiplex`** [bool] +Indicates if the data is multiplexed or not. +_Default_ : false -### `--contaminant` +- **`--run_name`** [str] +The name of the analysis run, defined in NGL-SQ. Will be used among other things for the naming of some files. +_Default_ : null -Set value define in `conf/genomes.config`. Depend on your pipeline needs. +- **`--host`** [str] +The name of the server on which the pipeline is launched. This value is used to select slurm modules to load. +_Default_ : genologin -### `--email myemail@fai.com` - Set to receive email when pipeline is complete. +- **`--shared_modules`** [str] +Path to the shared_modules sources. This is nextflow modules shared between several pipelines. +_Default_ : '/home/sbsuser/save/scripts-ngs/shared_modules_Current' -> Add here parameters specific to your pipeline +- **`--ngl_bi_client`** [str] +Path to NGL-Bi client source. +_Default_ : '/home/sbsuser/save/scripts-ngs/NGL-Bi_client_Current' -## Core Nextflow arguments +- **`--insert_to_ngl`** [bool] +Whether to insert data into NGL-Bi or not. +_Default_ : true -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +- **`--sq_xp_code`** [str] +Sequencing experiment code from NGL-SQ. Mandatory if `insert_to_ngl = true`. +_Default_ : null -### `-profile` - -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. 
- -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Conda) - see below. - -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. - -Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! -They are loaded in sequence, so later profiles can overwrite earlier profiles. - -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. - -* `docker` - * A generic configuration profile to be used with [Docker](https://docker.com/) - * Pulls software from Docker Hub: [`nfcore/rnaseq`](https://hub.docker.com/r/nfcore/rnaseq/) -* `singularity` - * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) - * Pulls software from Docker Hub: [`nfcore/rnaseq`](https://hub.docker.com/r/nfcore/rnaseq/) -* `conda` - * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity or Podman. - * A generic configuration profile to be used with [Conda](https://conda.io/docs/) - * Pulls most software from [Bioconda](https://bioconda.github.io/) -* `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters -* `path` - * A profile with a configuration to use binaries store in directory specified with --globalPath -* `multipath` - * A profile with a specific configuration for each process - * The user must configure file in `conf/path.config` - +- **`--bi_run_code`** [str] +Run code for NGL-Bi. 
+_Default_ : null +### Optional parameters +Some other parameters are only for traceability and have no effect on analysis, they are: +- **`--machine_id`** [str] +The machine identifier, such as `A00318` or `AV232702`. +_Default_ : null -### `-resume` +- **`--fc_type`** [str] +Information about the flow cell used. Example : `Flowcell Standard - Lane 1` +_Default_ : null -Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. +- **`--fc_id`** [str] +The flow cell identifier/barcode. +_Default_ : null -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. +- **`--lane`** [str] +The lane number(s). +_Default_ : null + +- **`--species`** [str] +(Scientific) Name of the species sequenced. +_Default_ : null + +- **`--run_date`** [str] +The date of the run, formatted as YYMMDD. +_Default_ : null + +- **`--description`** [str] +The nG6 like description of the analysis. +_Default_ : null + +### Skipping parameters +Some flags are available that can be set to not run some parts of the pipeline. +- **`--no_subset`** [bool] +To skip subsampling step in core pipeline. +_Default_ : false + +- **`--skip_core_illumina`** [bool] +To skip core illumina sub-workflow in core pipeline. To be used to analyze data produced by platforms other than Illumina. +_Default_ : false + +## Workflows related parameters +Here are listed mandatory parameters used to perform one particular analysis. Most of these options have a default value, so it is not necessary to override them every time. +### CORE +- **`--samplesheet`** [str] +Path to the IEM sampleSheet, only used for CORE illumina sub-workflow. +_Default_ : params.inputdir + "/SampleSheet.csv" + +- **`--fastp_n_reads`** [num] +Number of reads to process for duplicate estimation with FASTP (`--reads_to_process` parameter). 
+_Default_ : 100000000 + +- **`--miseq_subset_seq`** [str] +Number of sequences to use to subset reads for MiSeq datasets. +_Default_ : 50000 + +- **`--nova_subset_seq`** [str] +Number of sequences to use to subset reads for NovaSeq datasets. +_Default_ : 50000000 + +- **`--large_sampling_threshold`** [int] +Number of samples from which a run is considered a very large multiplexed run. This option takes into account that the quantity of data for each sample is small. +_Default_ : 200 + +- **`--large_indexing_nova_subset_seq`** [str] +Number of sequences to use to subset reads for NovaSeq in the case of a very large multiplexed run. +_Default_ : 500000 + + +### DNA / RNA +- **`--reference_genome`** [str] +Path to the reference genome. +_Default_ : null + +- **`--reference_transcriptome`** [str] +Path to the reference transcriptome. +_Default_ : null + +- **`--make_star_index`** [bool] +Whether to force the creation of a STAR index, for RNA analysis using a reference genome. +_Default_ : false + + +### Amplicon / 16S +- **`--min_overlap`** [int] +Minimum overlap for paired reads merging. +_Default_ : 20 + +- **`--max_overlap`** [int] +Maximum overlap for paired reads merging. +_Default_ : 55 + +- **`--max_mismatch_density`** [float] +Maximum mismatch density for paired reads merging (flash -x option). +_Default_ : 0.1 + +- **`--assignation_databank`** [str] +Path to the databank for taxonomic assignment. +_Default_ : null + +- **`--blast_outfmt`** [int] +BLAST output format. +_Default_ : 7 + +- **`--blast_max_target`** [int] +Maximum BLAST targets. +_Default_ : 10 + +- **`--sortmerna_db_path`** [str] +Path where all SortMeRNA databases are stored. +_Default_ : '/usr/local/bioinfo/src/SortMeRNA/sortmerna-2.1b/rRNA_databases' + + +### 10X +Not available yet. + +### MethylSeq +Not available yet. +- **`--puc19`** [str] +Path to the fasta of the pUC19 (methylated control). +_Default_ : null + +- **`--lambda`** [str] +Path to the fasta of the lambda (unmethylated control). 
+_Default_ : null -#### Custom resource requests +## Other parameters +- **`--cluster_options`** [str] +Option used to launch Slurm jobs. Useful for excluding some busy nodes, for example. +_Default_ : null -Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. +- **`--is_dev_mode`** [bool] +Development mode flag, automatically set to `true` when using the dev profile (see below). +_Default_ : false -Whilst these default requirements will hopefully work for most people with most data, you may find that you want to customise the compute resources that the pipeline requests. You can do this by creating a custom config file. For example, to give the workflow process `star` 32GB of memory, you could use the following config: +- **`--DTM_mode`** [bool] +Set to true to add some special processes for DTM validation. +_Default_ : false -```nextflow -process { - withName: star { - memory = 32.GB - } -} -``` +- **`--email`** [bool/str] +Set to false to disable email notifications, or set to an email address to receive them. +_Default_ : null -See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. +- **`--email_dev`** [str] +The only email address used in dev mode. +_Default_ : jules.sabban@inrae.fr -### Running in the background +- **`--email_on_fail`** [str] +Email address for failure notifications. +_Default_ : jules.sabban@inrae.fr -Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. +- **`--email_bioinfo`** [str] +Bioinformatics team email address. 
+_Default_ : get-plage.bioinfo@genotoul.fr -The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. +- **`--email_labo`** [str] +Laboratory email address. (Currently null) +_Default_ : get-plage.labo@genotoul.fr -Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. -Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). +## Parameters overwritten in dev mode +Listed here are all parameters whose default values are overwritten in dev mode. In most cases, these parameters should not be set on the command line. +- **`--ngl_bi_client`** set to '/home/sbsuser/work/test/jules/VisualStudioSources/ngl-bi_client/' +- **`--shared_modules`** set to '/home/sbsuser/work/Nextflow/shared_modules/ExportSources_Jules/' +- **`--is_dev_mode`** set to true -#### Nextflow memory requirements -In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. -We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): +## Structure of the inputdir +The `--inputdir` parameter has no default value, so it must be given on the command line. This is a path to a folder that must have a particular structure. The `inputdir` folder must contain at least the following: -```bash -NXF_OPTS='-Xms1g -Xmx4g' -``` +For every workflow, the inputdir folder must contain a single directory named after the project. In this project folder (or its subfolders), fastq files must be gzipped. The inputdir folder must also contain the fastq_screen configuration file (see example in assets/). +For the core illumina pipeline, the inputdir folder must also contain a Stats folder, with the bcl2fastq statistics files inside. -- GitLab