From 0db84a697ef53ec2946e5ede0d7165d1521561e7 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 4 Feb 2025 13:36:09 +0100
Subject: [PATCH 01/10] remove inappropriate mdbg references

---
 README.md             | 2 +-
 docs/source/output.md | 2 +-
 docs/source/usage.md  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 31a795d..f243e25 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Many of these steps are optional and their necessity depends on the desired anal
    * controls the quality of raw and cleaned data ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
    * makes a taxonomic classification of cleaned reads ([Kaiju MEM](https://github.com/bioinformatics-centre/kaiju) + [kronaTools](https://github.com/marbl/Krona/wiki/KronaTools) + [plot_kaiju_stat.py](bin/plot_kaiju_stat.py) + [merge_kaiju_results.py](bin/merge_kaiju_results.py))
 * `S02_ASSEMBLY` 
-   * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye) or [metaMDBG](https://github.com/GaetanBenoitDev/metaMDBG))
+   * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye))
    * assesses the quality of assembly ([metaQUAST](http://quast.sourceforge.net/metaquast))
    * reads deduplication, alignment against contigs for short reads ([BWA-MEM2](https://github.com/bwa-mem2/bwa-mem2) + [Samtools](http://www.htslib.org/))
    * reads alignment against contigs for HiFi reads ([Minimap2](https://github.com/lh3/minimap2)  + [Samtools](http://www.htslib.org/))
diff --git a/docs/source/output.md b/docs/source/output.md
index cbea201..e9ca730 100644
--- a/docs/source/output.md
+++ b/docs/source/output.md
@@ -53,7 +53,7 @@ The `results/` directory contains a sub-directory for each step launched:
 
 ```{note}
 In this directory you have either the results of assembly with `metaspades` or `megahit`
-if you analyse short read data and `hifiasm-meta`, `metaflye` or `metamdbg`
+if you analyse short read data and `hifiasm-meta` or `metaflye`
 if you analyse HiFi data. You have chosen your assembly tool with `--assembly` parameter.
 ```
 
diff --git a/docs/source/usage.md b/docs/source/usage.md
index c49c5b2..5aae59f 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -351,7 +351,7 @@ See [II. Input files](usage.md#ii-input-files) and warnings.
 
 | Parameter | Description |
 | ------------ | ----------- |
-| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`:  Default: `metaspades`.<br /> For HiFi reads:  `["hifiasm-meta", "metaflye", "metamdbg"]`. Default: `hifiasm-meta`.|
+| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`:  Default: `metaspades`.<br /> For HiFi reads:  `["hifiasm-meta", "metaflye"]`. Default: `hifiasm-meta`.|
 | `--coassembly` | allows to assemble together the samples labeled with the same group in the samplesheet. <br /> It will generate one assembly for each group. To co-assemble all of your samples together, <br /> you must indicate a unique group for each sample in the samplesheet.|
 
 ```{warning}
@@ -362,7 +362,7 @@ for each group co-assembled and automatically mapping with every sample of his g
 * For short reads, the user can choose between `metaspades` or `megahit` for `--assembly` parameter. 
 The choice can be based on CPUs and memory availability: `metaspades` needs more CPUs and memory than `megahit` but
 our tests showed that assembly metrics are better for `metaspades` than `megahit`. For PacBio HiFi reads, the user
-can choose between `hifiasm-meta`, `metaflye` or `metamdbg`.
+can choose between `hifiasm-meta` or `metaflye`.
 ```
 
 ```{note}
-- 
GitLab


From 8b2275fd6a044cb10def4b381a3d660db329a754 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Tue, 4 Feb 2025 13:37:00 +0100
Subject: [PATCH 02/10] use external gtdbtk image

---
 conf/base.config  | 1 +
 env/binning.yml   | 1 -
 modules/gtdbtk.nf | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index aa3d2cd..65e0d69 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -105,6 +105,7 @@ process {
         cpus = 10
     }
     withName: GTDBTK {
+        container = 'quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_2'
         memory = { 64.GB * task.attempt }
         cpus = 16
     }
diff --git a/env/binning.yml b/env/binning.yml
index 77d3f73..45a2405 100644
--- a/env/binning.yml
+++ b/env/binning.yml
@@ -26,6 +26,5 @@ dependencies:
   - metabat2=2.15
   - maxbin2=2.2.7
   - concoct=1.1.0
-  - gtdbtk=2.1
   - drep=3.0.0
   - pprodigal
diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf
index 73499ba..ba9b86d 100644
--- a/modules/gtdbtk.nf
+++ b/modules/gtdbtk.nf
@@ -15,7 +15,7 @@ process GTDBTK {
 
   export GTDBTK_DATA_PATH=$gtdbtk_db
 
-  gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
+  gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --skip_ani_screen --pplacer_cpus ${task.cpus} --cpus ${task.cpus}
   echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt
   """
 }
\ No newline at end of file
-- 
GitLab


From d6ea7ddd7f2b5f3ed952a9e016f5570a54909212 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Thu, 16 Jan 2025 13:42:10 +0100
Subject: [PATCH 03/10] calculate cpm for each sample one by one; reorder
 kept_contigs and idxstat

---
 bin/filter_contig_per_cpm.py | 123 ++++++++++++++++++++++++++---------
 1 file changed, 91 insertions(+), 32 deletions(-)

diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py
index fdea68d..21f2fe6 100755
--- a/bin/filter_contig_per_cpm.py
+++ b/bin/filter_contig_per_cpm.py
@@ -35,7 +35,7 @@ import pyfastx
 
 def parse_arguments():
     """Parse script arguments."""
-    parser = ArgumentParser(description="...",
+    parser = ArgumentParser(description="Calculate CPM per contig",
                             formatter_class=ArgumentDefaultsHelpFormatter)
 
     parser.add_argument("-i", "--samtools_idxstats", nargs='+',  required = True, 
@@ -59,36 +59,77 @@ def parse_arguments():
     args = parser.parse_args()
     return args
 
-def combine_idxstat_files(idxstat_files):
+def combine_idxstat_files(idxstat_dfs):
     """
-    Combine multiple idxstat files that have the same contigs.
+    Combine multiple idxstat df that have the same contigs.
 
     Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences.
     """
+    #columns_names = ['reference_sequence_name', 
+    #                'sequence_length',
+    #                '#_mapped_read_segments',
+    #                '#_unmapped_read-segments']
+
+    combined_df = idxstat_dfs[0].copy()
+    
+    for idxstat_df in idxstat_dfs[1:]:
+        combined_df['#_mapped_read_segments'] += idxstat_df['#_mapped_read_segments']
+        combined_df['cpm_count'] += idxstat_df['cpm_count']
+        
+    return combined_df
+
+    #idxstat_df = pd.read_csv(idxstat_files[0],
+    #                        sep ='\t',
+    #                        names = columns_names, 
+    #                        usecols = ['reference_sequence_name', 
+    #                                    'sequence_length',
+    #                                    '#_mapped_read_segments',],
+    #                        comment="*").set_index('reference_sequence_name')
+                            
+    #for idxstat_file in idxstat_files[1:]:
+    #    other_idxstat_df = pd.read_csv(idxstat_file,
+    #                        sep ='\t',
+    #                        names = columns_names, 
+    #                        usecols = ['reference_sequence_name',
+    #                                    '#_mapped_read_segments',],
+    #                        comment="*").set_index('reference_sequence_name')
+            
+    #    idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments']
+
+    #return idxstat_df
+
+def filter_cpm(idxstat_file, cpm_cutoff):
+    """
+    Calculate and filter byCPM for each contig.
+    """
     columns_names = ['reference_sequence_name', 
                     'sequence_length',
-                    '#_mapped_read_segments',
+                    '#_mapped_read_segments',    
                     '#_unmapped_read-segments']
 
-    idxstat_df = pd.read_csv(idxstat_files[0],
-                            sep ='\t',
+    idxstat_df = pd.read_csv(idxstat_file, sep ='\t',
                             names = columns_names, 
                             usecols = ['reference_sequence_name', 
                                         'sequence_length',
                                         '#_mapped_read_segments',],
                             comment="*").set_index('reference_sequence_name')
-                            
-    for idxstat_file in idxstat_files[1:]:
-        other_idxstat_df = pd.read_csv(idxstat_file,
-                            sep ='\t',
-                            names = columns_names, 
-                            usecols = ['reference_sequence_name',
-                                        '#_mapped_read_segments',],
-                            comment="*").set_index('reference_sequence_name')
-            
-        idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments']
+    
+    
+    logging.info(f'idxstat_file: {idxstat_df}')
+    
+    
+    sum_reads = idxstat_df['#_mapped_read_segments'].sum()
+    idxstat_df['cpm_count'] = 1e6 * (idxstat_df['#_mapped_read_segments'] / sum_reads)
+    
+    # Apply CPM cutoff  
+    #filtered_idxstat_df = idxstat_df[idxstat_df['cpm_count'] >= cpm_cutoff]
+    kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
+    
+    logging.info(f'length of idxstat_file: {len(idxstat_df)}')
+    logging.info(f'length of kept_contigs: {len(kept_contigs)}')
+
+    return kept_contigs, idxstat_df
 
-    return idxstat_df
 
 def main():
     args = parse_arguments()
@@ -103,28 +144,46 @@ def main():
     cpm_cutoff = float(args.cutoff_cpm)
 
     # Read input tables 
-    idxstat_df = combine_idxstat_files(args.samtools_idxstats)
-
+    # idxstat_df = combine_idxstat_files(args.samtools_idxstats)
+    
+    # logging.info(f'idxstat file: {idxstat_file}')
+    idxstat_results = [filter_cpm(idxstat_file, cpm_cutoff) for idxstat_file in args.samtools_idxstats]
+    
+    # Separate idxstat and kept_contigs
+    kept_contigs, idxstat_dfs = zip(*idxstat_results)
+    
+    logging.info(f'number of kept_contigs: {len(kept_contigs)}')
+    logging.info(f'number of idxstat_file: {len(idxstat_dfs)}')
+    
+    # Combine idxstat dataframes
+    #combined_idxstat_df = combine_idxstat_files(idxstat_dfs)   
+    #logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}')
+    #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}')
+    
     # Calculates cpm for each contig
-    sum_reads = idxstat_df['#_mapped_read_segments'].sum()
-    logging.info(f'Total number of mapped reads {sum_reads}')
-
-    logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.')
-    idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads
+    #sum_reads = idxstat_df['#_mapped_read_segments'].sum()
+    #logging.info(f'Total number of mapped reads {sum_reads}')
 
+    #logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.')
+    #idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads
+        
+    #list_idxstat_df.append(idxstat_df)
     
+    #logging.info(f'head of list_idxstat_df[1]: {list_idxstat_df[1].head()}')
+
+
     # Contigs with nb reads > cutoff
-    kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
+    #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
 
-    logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
+    #logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
     # Write new fasta files with kept and unkept contigs
-    with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
+   # with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
         
-        for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
-            if contig in kept_contigs:
-                out_select_handle.write(f'>{contig}\n{seq}\n')
-            else:
-                out_discard_handle.write(f'>{contig}\n{seq}\n')
+   #     for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
+   #         if contig in kept_contigs:
+   #             out_select_handle.write(f'>{contig}\n{seq}\n')
+   #         else:
+   #             out_discard_handle.write(f'>{contig}\n{seq}\n')
 
 if __name__ == "__main__":
     main()
-- 
GitLab


From d8d8c4e1a3aad2621ae38bb04c2f35758746b72a Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Thu, 16 Jan 2025 15:31:35 +0100
Subject: [PATCH 04/10] combine output of filter_cpm

---
 bin/filter_contig_per_cpm.py | 65 ++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 10 deletions(-)

diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py
index 21f2fe6..e7e72a4 100755
--- a/bin/filter_contig_per_cpm.py
+++ b/bin/filter_contig_per_cpm.py
@@ -59,7 +59,7 @@ def parse_arguments():
     args = parser.parse_args()
     return args
 
-def combine_idxstat_files(idxstat_dfs):
+def combine_idxstat_files_old(idxstat_dfs):
     """
     Combine multiple idxstat df that have the same contigs.
 
@@ -98,6 +98,44 @@ def combine_idxstat_files(idxstat_dfs):
 
     #return idxstat_df
 
+def combine_idxstat_files(idxstat_dfs):
+    """
+    Combine multiple idxstat df that have the same contigs.
+
+    Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences.
+    """
+    #columns_names = ['reference_sequence_name', 
+    #                'sequence_length',
+    #                '#_mapped_read_segments',
+    #                '#_unmapped_read-segments']
+
+    common_contigs = set(idxstat_dfs[0].index)
+    for df in idxstat_dfs[1:]:
+        common_contigs = common_contigs.intersection(set(df.index))
+        
+    # Convert set to list for pandas indexing
+    common_contigs_list = sorted(list(common_contigs))
+    
+    # Filter dataframes to keep only common contigs
+    filtered_dfs = [df.loc[common_contigs_list] for df in idxstat_dfs]
+    
+    # Combine dataframes
+    combined_df = filtered_dfs[0].copy()
+    
+    for df in filtered_dfs[1:]:
+        combined_df['#_mapped_read_segments'] += df['#_mapped_read_segments']
+        combined_df['cpm_count'] += df['cpm_count']
+        
+    return combined_df      
+   
+def combine_kept_contigs(kept_contigs_list):
+    """Combine multiple lists of kept contigs."""
+    # Get contigs that appear in all lists (intersection)
+    common_contigs = set(kept_contigs_list[0])
+    for contigs in kept_contigs_list[1:]:
+        common_contigs = common_contigs.intersection(set(contigs))
+    return pd.Index(list(common_contigs))
+
 def filter_cpm(idxstat_file, cpm_cutoff):
     """
     Calculate and filter byCPM for each contig.
@@ -152,14 +190,18 @@ def main():
     # Separate idxstat and kept_contigs
     kept_contigs, idxstat_dfs = zip(*idxstat_results)
     
+    print(kept_contigs)
+    
     logging.info(f'number of kept_contigs: {len(kept_contigs)}')
     logging.info(f'number of idxstat_file: {len(idxstat_dfs)}')
     
     # Combine idxstat dataframes
-    #combined_idxstat_df = combine_idxstat_files(idxstat_dfs)   
-    #logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}')
+    combined_idxstat_df = combine_idxstat_files(idxstat_dfs)   
+    logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}')
     #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}')
     
+    combined_kept_contigs = combine_kept_contigs(kept_contigs)
+    logging.info(f'length of combined_kept_contigs: {len(combined_kept_contigs)}')
     # Calculates cpm for each contig
     #sum_reads = idxstat_df['#_mapped_read_segments'].sum()
     #logging.info(f'Total number of mapped reads {sum_reads}')
@@ -175,15 +217,18 @@ def main():
     # Contigs with nb reads > cutoff
     #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
 
-    #logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
+    logging.info(f'{len(combined_kept_contigs)}/{len(combined_idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
     # Write new fasta files with kept and unkept contigs
-   # with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
+    with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
         
-   #     for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
-   #         if contig in kept_contigs:
-   #             out_select_handle.write(f'>{contig}\n{seq}\n')
-   #         else:
-   #             out_discard_handle.write(f'>{contig}\n{seq}\n')
+        for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
+            logging.info(f'contig in last step: {contig}\n{seq}\n')
+            if contig in kept_contigs:
+                logging.info(f'contig in kept_contigs: {contig}')
+                #out_select_handle.write(f'>{contig}\n{seq}\n')
+            else:
+                logging.info(f'contig not in kept_contigs: {contig}')
+                #out_discard_handle.write(f'>{contig}\n{seq}\n')
 
 if __name__ == "__main__":
     main()
-- 
GitLab


From 319324d42f1e5d0678cdf182407b7abccd16b615 Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Fri, 17 Jan 2025 10:55:15 +0100
Subject: [PATCH 05/10] delete old code; delete unused library

---
 bin/filter_contig_per_cpm.py | 82 ++++--------------------------------
 1 file changed, 8 insertions(+), 74 deletions(-)

diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py
index e7e72a4..8963aa3 100755
--- a/bin/filter_contig_per_cpm.py
+++ b/bin/filter_contig_per_cpm.py
@@ -25,7 +25,6 @@ __status__ = 'dev'
 
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 import pandas as pd
-import numpy as np
 import logging
 import pyfastx
 
@@ -59,56 +58,13 @@ def parse_arguments():
     args = parser.parse_args()
     return args
 
-def combine_idxstat_files_old(idxstat_dfs):
-    """
-    Combine multiple idxstat df that have the same contigs.
-
-    Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences.
-    """
-    #columns_names = ['reference_sequence_name', 
-    #                'sequence_length',
-    #                '#_mapped_read_segments',
-    #                '#_unmapped_read-segments']
-
-    combined_df = idxstat_dfs[0].copy()
-    
-    for idxstat_df in idxstat_dfs[1:]:
-        combined_df['#_mapped_read_segments'] += idxstat_df['#_mapped_read_segments']
-        combined_df['cpm_count'] += idxstat_df['cpm_count']
-        
-    return combined_df
-
-    #idxstat_df = pd.read_csv(idxstat_files[0],
-    #                        sep ='\t',
-    #                        names = columns_names, 
-    #                        usecols = ['reference_sequence_name', 
-    #                                    'sequence_length',
-    #                                    '#_mapped_read_segments',],
-    #                        comment="*").set_index('reference_sequence_name')
-                            
-    #for idxstat_file in idxstat_files[1:]:
-    #    other_idxstat_df = pd.read_csv(idxstat_file,
-    #                        sep ='\t',
-    #                        names = columns_names, 
-    #                        usecols = ['reference_sequence_name',
-    #                                    '#_mapped_read_segments',],
-    #                        comment="*").set_index('reference_sequence_name')
-            
-    #    idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments']
-
-    #return idxstat_df
-
 def combine_idxstat_files(idxstat_dfs):
     """
     Combine multiple idxstat df that have the same contigs.
 
     Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences.
     """
-    #columns_names = ['reference_sequence_name', 
-    #                'sequence_length',
-    #                '#_mapped_read_segments',
-    #                '#_unmapped_read-segments']
-
+    
     common_contigs = set(idxstat_dfs[0].index)
     for df in idxstat_dfs[1:]:
         common_contigs = common_contigs.intersection(set(df.index))
@@ -159,8 +115,7 @@ def filter_cpm(idxstat_file, cpm_cutoff):
     sum_reads = idxstat_df['#_mapped_read_segments'].sum()
     idxstat_df['cpm_count'] = 1e6 * (idxstat_df['#_mapped_read_segments'] / sum_reads)
     
-    # Apply CPM cutoff  
-    #filtered_idxstat_df = idxstat_df[idxstat_df['cpm_count'] >= cpm_cutoff]
+    # Apply CPM cutoff
     kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
     
     logging.info(f'length of idxstat_file: {len(idxstat_df)}')
@@ -180,55 +135,34 @@ def main():
         logging.basicConfig(format="%(levelname)s: %(message)s")
 
     cpm_cutoff = float(args.cutoff_cpm)
-
-    # Read input tables 
-    # idxstat_df = combine_idxstat_files(args.samtools_idxstats)
     
     # logging.info(f'idxstat file: {idxstat_file}')
     idxstat_results = [filter_cpm(idxstat_file, cpm_cutoff) for idxstat_file in args.samtools_idxstats]
     
     # Separate idxstat and kept_contigs
     kept_contigs, idxstat_dfs = zip(*idxstat_results)
-    
-    print(kept_contigs)
-    
+        
     logging.info(f'number of kept_contigs: {len(kept_contigs)}')
     logging.info(f'number of idxstat_file: {len(idxstat_dfs)}')
     
     # Combine idxstat dataframes
     combined_idxstat_df = combine_idxstat_files(idxstat_dfs)   
     logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}')
-    #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}')
     
     combined_kept_contigs = combine_kept_contigs(kept_contigs)
+    combined_kept_contigs = set(combined_kept_contigs.tolist())
     logging.info(f'length of combined_kept_contigs: {len(combined_kept_contigs)}')
-    # Calculates cpm for each contig
-    #sum_reads = idxstat_df['#_mapped_read_segments'].sum()
-    #logging.info(f'Total number of mapped reads {sum_reads}')
-
-    #logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.')
-    #idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads
-        
-    #list_idxstat_df.append(idxstat_df)
     
-    #logging.info(f'head of list_idxstat_df[1]: {list_idxstat_df[1].head()}')
-
-
-    # Contigs with nb reads > cutoff
-    #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
-
     logging.info(f'{len(combined_kept_contigs)}/{len(combined_idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
+    
     # Write new fasta files with kept and unkept contigs
     with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
         
         for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
-            logging.info(f'contig in last step: {contig}\n{seq}\n')
-            if contig in kept_contigs:
-                logging.info(f'contig in kept_contigs: {contig}')
-                #out_select_handle.write(f'>{contig}\n{seq}\n')
+            if contig in combined_kept_contigs:
+                out_select_handle.write(f'>{contig}\n{seq}\n')
             else:
-                logging.info(f'contig not in kept_contigs: {contig}')
-                #out_discard_handle.write(f'>{contig}\n{seq}\n')
+                out_discard_handle.write(f'>{contig}\n{seq}\n')
 
 if __name__ == "__main__":
     main()
-- 
GitLab


From a1f47bd4c78313f589e11c4565db74c87f56122b Mon Sep 17 00:00:00 2001
From: Philippe Ruiz <philippe.ruiz@inrae.fr>
Date: Fri, 17 Jan 2025 15:36:45 +0100
Subject: [PATCH 06/10] use union in place of intersection to merge tables

---
 bin/filter_contig_per_cpm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py
index 8963aa3..7ec0b9d 100755
--- a/bin/filter_contig_per_cpm.py
+++ b/bin/filter_contig_per_cpm.py
@@ -67,7 +67,7 @@ def combine_idxstat_files(idxstat_dfs):
     
     common_contigs = set(idxstat_dfs[0].index)
     for df in idxstat_dfs[1:]:
-        common_contigs = common_contigs.intersection(set(df.index))
+        common_contigs = common_contigs.union(set(df.index))
         
     # Convert set to list for pandas indexing
     common_contigs_list = sorted(list(common_contigs))
@@ -86,10 +86,10 @@ def combine_idxstat_files(idxstat_dfs):
    
 def combine_kept_contigs(kept_contigs_list):
     """Combine multiple lists of kept contigs."""
-    # Get contigs that appear in all lists (intersection)
+    # Get contigs that appear in any of the lists (union)
     common_contigs = set(kept_contigs_list[0])
     for contigs in kept_contigs_list[1:]:
-        common_contigs = common_contigs.intersection(set(contigs))
+        common_contigs = common_contigs.union(set(contigs))
     return pd.Index(list(common_contigs))
 
 def filter_cpm(idxstat_file, cpm_cutoff):
-- 
GitLab


From ab9d99b20f59bda4a2b2cb5d716003262ba788e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr>
Date: Mon, 10 Feb 2025 15:01:02 +0100
Subject: [PATCH 07/10] Upgrade multiQC

---
 conf/base.config   | 1 +
 conf/test.config   | 1 +
 modules/multiqc.nf | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/conf/base.config b/conf/base.config
index 65e0d69..919746c 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -28,6 +28,7 @@ process {
         memory = { 8.GB * task.attempt }
     }
     withName: MULTIQC {
+        container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0'
         memory = { 8.GB * task.attempt }
     }
     withName: HOST_FILTER {
diff --git a/conf/test.config b/conf/test.config
index 9bac780..cd3005c 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -18,6 +18,7 @@ process {
         memory = { 2.GB * task.attempt }
     }
     withName: MULTIQC {
+        container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0'
         executor = "local"
         memory = { 2.GB * task.attempt }
     }
diff --git a/modules/multiqc.nf b/modules/multiqc.nf
index d4581ad..68b8816 100644
--- a/modules/multiqc.nf
+++ b/modules/multiqc.nf
@@ -26,7 +26,7 @@ process MULTIQC {
   
   script:
     """
-    multiqc . --config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featureCounts -m samtools
+    multiqc . --config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featurecounts -m samtools
     multiqc --version > v_multiqc.txt
     """
 }
-- 
GitLab


From fd725c16488d537e07b6bbcb705b34e4219eae13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr>
Date: Mon, 10 Feb 2025 15:08:41 +0100
Subject: [PATCH 08/10] remove multiQC version from process

---
 modules/get_software_versions.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/get_software_versions.nf b/modules/get_software_versions.nf
index a60f217..97cbef9 100644
--- a/modules/get_software_versions.nf
+++ b/modules/get_software_versions.nf
@@ -17,7 +17,6 @@ process GET_SOFTWARE_VERSIONS {
     echo $workflow.manifest.version > v_pipeline.txt
     echo $workflow.nextflow.version > v_nextflow.txt
     python --version &> v_python.txt
-    multiqc --version &> v_multiqc.txt
     scrape_software_versions.py > software_versions_mqc.yaml
     """
 }
\ No newline at end of file
-- 
GitLab


From f8e8403aac248adc41ec8fc0f712745e9617ba68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr>
Date: Mon, 10 Feb 2025 15:36:54 +0100
Subject: [PATCH 09/10] remove multiQC from yml

---
 env/metagWGS.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/env/metagWGS.yml b/env/metagWGS.yml
index 463510d..dbde1c5 100644
--- a/env/metagWGS.yml
+++ b/env/metagWGS.yml
@@ -19,7 +19,6 @@ dependencies:
   - krona=2.8.1
   - megahit=1.2.9
   - minimap2=2.24
-  - multiqc=1.14
   - pandas=1.5.2
   - plotly
   - prodigal=2.6.3
-- 
GitLab


From 53045c1546a93ba9a21ee639f6705112a044e97c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr>
Date: Mon, 10 Feb 2025 16:46:09 +0100
Subject: [PATCH 10/10] Upgrade cutadapt to 5.0

---
 conf/base.config | 1 +
 conf/test.config | 1 -
 env/metagWGS.yml | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index 919746c..061d014 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -17,6 +17,7 @@ process {
     maxErrors = '-1'
     
     withName: CUTADAPT {
+        container = 'quay.io/biocontainers/cutadapt:5.0--py310h1fe012e_0'
         cpus = 8
         memory = { 8.GB * task.attempt }
     }
diff --git a/conf/test.config b/conf/test.config
index cd3005c..9bac780 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -18,7 +18,6 @@ process {
         memory = { 2.GB * task.attempt }
     }
     withName: MULTIQC {
-        container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0'
         executor = "local"
         memory = { 2.GB * task.attempt }
     }
diff --git a/env/metagWGS.yml b/env/metagWGS.yml
index dbde1c5..cb02fd8 100644
--- a/env/metagWGS.yml
+++ b/env/metagWGS.yml
@@ -8,7 +8,6 @@ dependencies:
   - bcbio-gff=0.6.9
   - bwa-mem2=2.2.1
   - cd-hit=4.8.1
-  - cutadapt=4.2
   - diamond=2.0.15
   - eggnog-mapper=2.1.9
   - fastqc=0.11.9
-- 
GitLab