diff --git a/.config/.masterconfig.yaml b/.config/.masterconfig.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b9bb5f7075f3fe5d2be5ecf206fff208aa3638d5
--- /dev/null
+++ b/.config/.masterconfig.yaml
@@ -0,0 +1,11 @@
+# Configuration file for genome simulation parameters per sample.
+# Each sample is defined by a unique name and associated parameters.
+
+samples:
+  # Unique identifier for each sample with specific simulation parameters.
+  test_sample1:        
+    fasta_gz: "test_genome.fa.gz"       # Path to the compressed FASTA file (genome sequence) for this sample.
+    population_size: 5                  # Effective population size (Ne) of the simulated population.
+    mutation_rate: 1e-7                 # Mutation rate (µ), the rate at which mutations arise in the genome.
+    recombination_rate: 1e-9            # Recombination rate (r), the frequency of recombination events.
+    sample_size: 2                      # Number of individuals to simulate (number of FASTA outputs).
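+
+  # Additional samples can be added with the same structure, e.g. (hypothetical sample):
+  # test_sample2:
+  #   fasta_gz: "another_genome.fa.gz"
+  #   population_size: 5
+  #   mutation_rate: 1e-7
+  #   recombination_rate: 1e-9
+  #   sample_size: 2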
diff --git a/vg_exact_data/snakemake_profile/slurm/CookieCutter.py b/.config/snakemake_profile/slurm/CookieCutter.py
old mode 100755
new mode 100644
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/CookieCutter.py
rename to .config/snakemake_profile/slurm/CookieCutter.py
diff --git a/.config/snakemake_profile/slurm/cluster_config.yml b/.config/snakemake_profile/slurm/cluster_config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7bb55b87ec9223956e9b85796daa32878fe1b18b
--- /dev/null
+++ b/.config/snakemake_profile/slurm/cluster_config.yml
@@ -0,0 +1,103 @@
+### default resources used by snakemake (applied to all rules)
+__default__:
+  job-name: "{rule}"
+  time: "96:00:00" # max run time "hours:minutes:seconds"
+  ntasks: 1 # nb of processes
+  cpus-per-task: 4 # nb of cores for each process (1 process)
+  mem: "60G"
+  nodes: 1 # Requirements nodes/servers (default: 1)
+  ntasks-per-node: 1 # Requirements cpu/core/task (default: 1)
+  output: "slurm_logs/{rule}.%N.%j.out"
+  error: "slurm_logs/{rule}.%N.%j.err"
+  mail-type: END,FAIL #email notification
+  mail-user: your.email@here.fr
+
+create_chr_config_file:
+  output: "/dev/null"
+  mail-type: NONE 
+  mem: "5G"
+  cpus-per-task: 1
+
+generate_fai_index:
+  output: "/dev/null"
+  mem: "100G"
+  cpus-per-task: 20
+
+split_index:
+  output: "/dev/null"
+  mail-type: NONE 
+  mem: "5G"
+  cpus-per-task: 1
+
+run_msprime:
+  error: "/dev/null"
+  output: "/dev/null"
+  mail-type: NONE 
+  cpus-per-task: 1
+  mem: "50G"
+
+compress_sim_vcf:
+  error: "/dev/null"
+  output: "/dev/null"
+  mail-type: NONE 
+
+index_sim_vcf:
+    error: "/dev/null"
+    output: "/dev/null"
+    mail-type: NONE 
+
+add_header:
+  output: "/dev/null"
+  mail-type: NONE 
+  cpus-per-task: 1
+
+sort_header:
+  output: "/dev/null"
+  mail-type: NONE 
+  cpus-per-task: 1
+
+remove_vcf_header:
+  output: "/dev/null"
+  mail-type: NONE 
+  cpus-per-task: 1
+  
+extract_vcf_header:
+  output: "/dev/null"
+  mail-type: NONE 
+  cpus-per-task: 1
+
+unzip_simulated_vcf:
+  output: "/dev/null"
+  mail-type: NONE 
+
+unzip_input_fasta:
+  output: "/dev/null"
+  mail-type: NONE 
+
+compress_vcf_for_giraffe:
+  mail-type: NONE 
+
+generate_variants:
+  mem: "200G"
+
+index_giraffe:
+  mail-type: NONE
+  mem: "150G"
+
+sort_vcf:
+  mail-type: NONE
+  mem: "150G"
+
+construct_graph:
+  mail-type: NONE
+  mem: "150G"
+
+vg_to_gfa:
+  mem: "150G"
+
+gbz_to_gfa:
+  mail-type: NONE
+  mem: "150G"
+  
+graph_to_fasta:
+  mem: "150G"
\ No newline at end of file
diff --git a/vg_exact_data/snakemake_profile/slurm/config.yaml b/.config/snakemake_profile/slurm/config.yaml
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/config.yaml
rename to .config/snakemake_profile/slurm/config.yaml
diff --git a/.config/snakemake_profile/slurm/settings.json b/.config/snakemake_profile/slurm/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ffbf2bca3ea56cef33ed6cd9998496366dc634f
--- /dev/null
+++ b/.config/snakemake_profile/slurm/settings.json
@@ -0,0 +1,6 @@
+{
+    "SBATCH_DEFAULTS": "",
+    "CLUSTER_NAME": "",
+    "CLUSTER_CONFIG": "",
+    "ADVANCED_ARGUMENT_CONVERSION": "no"
+}
diff --git a/vg_exact_data/snakemake_profile/slurm/slurm-jobscript.sh b/.config/snakemake_profile/slurm/slurm-jobscript.sh
old mode 100644
new mode 100755
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/slurm-jobscript.sh
rename to .config/snakemake_profile/slurm/slurm-jobscript.sh
diff --git a/vg_exact_data/snakemake_profile/slurm/slurm-status.py b/.config/snakemake_profile/slurm/slurm-status.py
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/slurm-status.py
rename to .config/snakemake_profile/slurm/slurm-status.py
diff --git a/vg_exact_data/snakemake_profile/slurm/slurm-submit.py b/.config/snakemake_profile/slurm/slurm-submit.py
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/slurm-submit.py
rename to .config/snakemake_profile/slurm/slurm-submit.py
diff --git a/vg_exact_data/snakemake_profile/slurm/slurm_utils.py b/.config/snakemake_profile/slurm/slurm_utils.py
old mode 100755
new mode 100644
similarity index 100%
rename from vg_exact_data/snakemake_profile/slurm/slurm_utils.py
rename to .config/snakemake_profile/slurm/slurm_utils.py
diff --git a/.config/visor_sv_type.yaml b/.config/visor_sv_type.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7a782dd9c60be062853913ee933e37fa1882cae1
--- /dev/null
+++ b/.config/visor_sv_type.yaml
@@ -0,0 +1,11 @@
+# Percentage of each variant type in the simulation
+# Sum of values MUST be 100
+SNP: 20
+Deletion: 20
+Insertion: 20
+Inversion: 10
+TandemDuplication: 10
+InvertedTandemDuplication: 10
+Translocation: 10
+Transduplication: 0 # Not supported yet
+ReciprocalTranslocation: 0 # Not supported yet
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 73777e9c3755c2d23d9363a99bfde46165d67667..fadcb32593c9d81357fa1654903a947ea61ab4ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,16 +47,18 @@
 #!*.template
 #!*.txt
 !*.yml
-!*.yaml
 !Snakefile
 !*.def
+!tes
+!settings.json
 
-# 3) add a pattern to track the file patterns of section2 even if they are in
-    # subdirectories
+# 3) add a pattern to track the file patterns of section2 even if they are in subdirectories
 !*/
-
-
+!**.smk
+!dag.svg
 # 4) specific files or folder to TRACK (the '**' sign means 'any path')
-!/sv_distributions/*
-
-# 5) specific folders to UNTRACK (wherever it may be in the treeview)
\ No newline at end of file
+!/simulation_data/**
+!test**fa.gz
+!**setting.json
+!slurm_logs/
+# 5) specific folders to UNTRACK (wherever it may be in the treeview)
diff --git a/README.md b/README.md
index 34d913ee925de2e9f96773e8735b848e04780ba2..ca8336d77ee7276d5c7c5b81a9f4e915019b4629 100644
--- a/README.md
+++ b/README.md
@@ -1,50 +1,71 @@
-# Generate structural variants in a population
-With a population generated by msprime (or any VCF), generate structural variants with VISOR and obtain a final VCF with structural variant sequences and population genotypes.
+# Warnings / Issues
 
-Help for commands is available:
+> **`/!\`:** Act with care; this workflow can use significant memory if you increase the values in `.masterconfig.yaml`. We recommend keeping the default settings and running a test first.
+
+> **`/!\`:** For now, do not run multiple `split` jobs at once.
+
+> **`/!\`:** The Transduplication and Reciprocal Translocation sections in the `visor_sv_type.yaml` config file are placeholders; do not use them yet.
+
+# How to Use
+## A. Running on the CBIB
+### 1. Set up
+Clone the Git repository and switch to the `dev_lpiat` branch:
 ```bash
-python tree_generation.py -h
-python variants_generation.py -h
+git clone https://forgemia.inra.fr/pangepop/MSpangepop.git
+cd MSpangepop
+git checkout dev_lpiat
 ```
 
-The code may be slow and should be ran on a cluster if you wish to generate a large population. A Singularity image can be created (`singularity_img` folder).
+### 2. Add your files
+- Add a `.fasta.gz` file; an example (`test_genome.fa.gz`) can be found in the repository.
+
+### 3. Configure the pipeline
+- Edit the `.masterconfig.yaml` file in the `.config/` directory with your sample information.
+- Edit the `visor_sv_type.yaml` file with the mutations you want to simulate.
+- Edit line 17 of `job.sh` and line 13 of `.config/snakemake_profile/slurm/cluster_config.yml` with your email address.
 
-## 1. Use a VCF as a base
-A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with:
+### 4. Run the workflow
+The workflow has two parts: `split` and `simulate`. Always run `split` first; once it is done (it is quick), run `simulate`.
 ```bash
-python tree_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE>
+sbatch job.sh [split or simulate] dry
 ```
-This command output as many VCF as there are chromosomes in the reference FAI.
-
-## 2. Generate structural variants
+If no warnings are displayed, run:
 ```bash
-python variants_generation.py -fai <FAI> -fa <FASTA> -v <VCF> -y <YAML>
+sbatch job.sh [split or simulate] 
 ```
-The VCF is the one created with the previous command using `msprime`.
+> **Nb 1:** To create a visual representation of the workflow, use `dag` instead of `dry`. Open the generated `.dot` file with a [viewer](https://dreampuf.github.io/GraphvizOnline/) that supports the format.
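+> If Graphviz is installed locally, you can also render it yourself (a sketch; adjust the path if `dag.dot` was written elsewhere):
+> ```bash
+> dot -Tsvg dag.dot > workflow/dag.svg
+> ```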
 
-The variants generation is inspired by [VISOR](https://github.com/davidebolo1993/VISOR). YAML template is available in `VISOR_random_bed` folder. Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100).
+> **Nb 2:** The first execution of the workflow will be slow, since the container images need to be pulled.
 
-Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394).
+> **Nb 3:** The workflow is in two parts because we want to execute the simulations chromosome by chromosome. Snakemake cannot retrieve the number of chromosomes in one go and needs to index and split first.
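+> For reference, the `split` step writes a `chr_config.yaml` per sample that simply lists the chromosome names found in the FAI; the names below are illustrative:
+> ```yaml
+> chromosomes:
+>   - "chr1"
+>   - "chr2"
+> ```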
 
-The distributions are used to randomly sample each structural variant size.
+> **Nb 4:** Since the CBIB does not support `python:3.9.7`, we cannot use the cookiecutter profile; use `cbib_job.sh` to run instead.
 
-# TODO
-- [ ] Add not yet supported VISOR variant types
-    - [ ] MNP
-    - [ ] tandem repeat contraction
-    - [ ] tandem repeat expansion
-    - [ ] approximate tandem repetition
-- [x] Add VCF merging when there are multiple chromosomes
+## B. Run locally
+- Ensure `snakemake` and `singularity` are installed on your machine, then run the workflow:
+```bash
+./local_run.sh [split or simulate] dry
+```
+If the workflow cannot download images from the container registry, install `Docker`, log in with your credentials, and rerun the workflow:
+```bash
+docker login -u "<your_username>" -p "<your_token>" "registry.forgemia.inra.fr" 
+```
 
-## 3. Create exact data with vg
-In the `vg_extact_data` folder.
+# Workflow
+![DAG of the workflow](workflow/dag.svg)
 
-Snakemake/Singularity pipeline to get a pangenome in GFA format and a FASTA with all individuals from the VCF. Starts from a reference FASTA and a VCF to specify in the `config.yaml` file, with a name for the output (**WARNING**: think to set the correct email address in `config.yaml`).
+# More information
 
-Create directory or modify the SBATCH options in `job.sh` (**WARNING**: think to set the correct email address in `job.sh` if you want to receive the slurm emails).
-```
-mkdir -p slurm_logs
-```
-On SLURM cluster, run `sbatch job.sh dry` for a dry run or `sbatch job.sh` directly. Adjust the `SNG_BIND` variable if files are not found and the snakemake profile as necessary for performance.
+The variant generation is inspired by [VISOR](https://github.com/davidebolo1993/VISOR).
 
 You can extract a VCF from the graph using the `vg deconstruct` command. It is not implemented in the pipeline.
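+
+For example (a sketch, not a pipeline step; `<ref_path>` is a placeholder for the name of the reference path embedded in the graph):
+```bash
+vg deconstruct -p <ref_path> results/<sample>_results/05_vg_graph/<sample>_vg_graph.gfa > deconstructed.vcf
+```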
+
+You can use the script `workflow/scripts/split_path.sh` to cut the final FASTA into chromosome-level FASTA files.
+```bash
+./workflow/scripts/split_path.sh input.fasta /path/to/output_directory
+```
+```
+# Dependencies
+TODO
+Python: pandas, msprime, argparse, os, multiprocessing, yaml, Bio.Seq
+Tools: singularity, snakemake
+Containers: vg:1.60.0, bcftools:1.12, bgzip:latest, tabix:1.7.
\ No newline at end of file
diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml
deleted file mode 100644
index 518bcaffc0c7edf4595fe3b6f5ae42f438d0ff7b..0000000000000000000000000000000000000000
--- a/VISOR_random_bed/visor_sv_type.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-### percentage of each variant type in the simulation, sum of values should be 100
-SNP: 0
-deletion: 0
-insertion: 0
-inversion: 100
-tandem duplication: 0
-inverted tandem duplication: 0
-translocation copy-paste: 0
-translocation cut-paste : 0
-reciprocal translocation: 0
\ No newline at end of file
diff --git a/bed2vcf.py b/bed2vcf.py
deleted file mode 100644
index 1b366166d9a66e371b8656f5389a6ac4361bb664..0000000000000000000000000000000000000000
--- a/bed2vcf.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import pandas as pd
-import re, random, json, os
-import numpy as np
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-# read FASTA as a BioSeq dictionary
-# (not directly loaded in memory)
-def read_fa(input_file):
-	record_dict = SeqIO.index(input_file, "fasta")
-	return record_dict
-
-# generate a random sequence for an insertion
-def DNA(length):
-    return ''.join(random.choice('CGTA') for _ in range(length))
-
-def SNP(aa):
-	s = 'ACGT'
-	s = s.replace(aa, '')
-	return(random.choice(s))
-
-# check if reverse
-def is_reverse(s):
-	return(s=="reverse")
-
-# reverse sequence
-def reverse(s):
-	seq = Seq(str(s))
-	return(seq.reverse_complement())
-
-# create deletion by switching alt and ref seq
-def deletion(ref_seq, alt_seq):
-	ref = str(ref_seq) + str(alt_seq)
-	alt = str(ref_seq)
-	return(ref,alt)
-
-def set_ref_alt(ref, alt, row, df):
-	df.at[row, "REF"] = ref
-	df.at[row, "ALT"] = alt
-
-def get_random_len(svtype):	
-	# INS, DEL, DUP, CNV, INV
-	df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t")
-	pb = df["pb"].tolist()
-	li = np.random.multinomial(1, pb)
-	i = np.argmax(li)
-	interval = df["size_interval"].iloc[i]
-	interval = json.loads(interval)
-	s = np.random.uniform(interval[0], interval[1], 1).round()
-	return(int(s[0]))
-
-### get the sequence of each variants in BED, output VCF
-# vcf_df : msprime VCF
-# bed_def : VISOR BED
-# fa_dict : FASTA where variants will be generated
-def get_seq(vcf_df, bed_df, fa_dict, output_file):
-	cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
-	ncol = len(vcf_df.columns) - len(cols)
-	li = [i for i in range(ncol)]
-	cols.extend(li)
-	vcf_df.columns = cols
-
-	df = vcf_df.join(bed_df)
-	# number of variants
-	n = len(df)
-	# add variants
-	series = []
-
-	for i in range(n):
-		## chromosome name taken from the VCF
-		chr_name = df["CHROM"].iloc[i]
-		## variant position taken from the VCF
-		start = df["POS"].iloc[i]
-		start = int(start)-1 # adjust to Python 0-based indexing
-		## type of SV to generate
-		sv_info = df['SVINFO'].iloc[i]
-		sv_type = df['SVTYPE'].iloc[i]
-
-		fasta_seq = fa_dict[chr_name].upper()
-
-		if sv_type == "SNP":
-			ref = str(fasta_seq.seq[start])
-			alt = SNP(ref)
-
-		elif sv_type == "deletion":
-			end = start + get_random_len("DEL")
-			ref = str(fasta_seq.seq[start:end])
-			alt = str(fasta_seq.seq[start])
-
-		elif sv_type == "insertion":
-			end = start + get_random_len("INS")
-			alt_seq = DNA(get_random_len("INS"))
-			ref = str(fasta_seq.seq[start])
-			alt = ref + str(alt_seq)
-
-		elif sv_type == "inversion":
-			end = start + get_random_len("INV")
-			ref = str(fasta_seq.seq[start:end])
-			alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq.seq[start+1:end]))
-
-		elif re.search("duplication", sv_type):
-			end = start + get_random_len("DUP")
-			ref = str(fasta_seq.seq[start])
-			alt_seq1 = str(fasta_seq.seq[start+1:end])
-			cp = sv_info-1
-			# multiply to obtain the copies
-			alt_seq = alt_seq1*cp
-			if sv_type == "inverted tandem duplication":
-				alt_seq = reverse(alt_seq)
-			
-			alt = ref + alt_seq
-
-		elif re.search("translocation", sv_type):
-			# TODO choose which distribution to use
-			l = get_random_len("INS")
-			end = start + l
-			
-			# information about the translocation
-			trans_chr = sv_info[0]
-			trans_start = sv_info[1]-1
-			trans_end = trans_start + l
-			fasta_seq_trans = fa_dict[trans_chr].upper()
-			
-			# reciprocal translocation
-			if sv_type == "reciprocal translocation":
-				ref = str(fasta_seq.seq[start:end])
-				ref2 = str(fasta_seq_trans.seq[trans_start:trans_end])
-				
-				if sv_info[2] == "reverse":
-					alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq_trans.seq[trans_start+1:trans_end]))
-				else :
-					alt = str(fasta_seq.seq[start]) + str(fasta_seq_trans.seq[trans_start+1:trans_end])
-
-				if sv_info[3] == "reverse":
-					alt2 = str(fasta_seq_trans.seq[trans_start]) + reverse(str(fasta_seq.seq[start+1:end]))
-				else:
-					alt2 = str(fasta_seq_trans.seq[trans_start]) + str(fasta_seq.seq[start+1:end])
-			
-			# cut-paste translocation
-			elif sv_type == "translocation cut-paste":
-				# deletion
-				ref = str(fasta_seq.seq[start:end])
-				alt = str(fasta_seq.seq[start])
-				
-				# insertion
-				ref2 = str(fasta_seq_trans.seq[trans_start])
-				if sv_info[2] == "reverse":
-					alt2 = ref2 + reverse(str(fasta_seq.seq[start+1:end]))
-				else :
-					alt2 = ref2 + str(fasta_seq.seq[start+1:end])
-
-			# copy-paste translocation
-			else:
-				ref = str(fasta_seq.seq[start])
-				alt = SNP(ref)
-
-				ref2 = str(fasta_seq_trans.seq[trans_start])
-				alt2 = ref2 + str(fasta_seq.seq[start+1:end])
-			
-			new_vcf_var = vcf_df.iloc[i].tolist()
-			new_vcf_var[0] = trans_chr
-			new_vcf_var[1] = trans_start+1
-			new_vcf_var[3] = ref2
-			new_vcf_var[4] = alt2
-			save_series = pd.Series(new_vcf_var, index=cols)
-			series.append(save_series)
-
-		set_ref_alt(ref, alt, i, df)
-
-	df_add = pd.DataFrame(series)
-	df = pd.concat([df, df_add], ignore_index=True)
-	df = df.drop(['SVTYPE', 'SVINFO'], axis=1)
-
-	# remove unnecessary columns for VCF
-	df["ID"] = "."
-	df["QUAL"] = 99
-	# output VCF
-	if not os.path.exists("results"):
-		os.mkdir("results")
-	df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False)
\ No newline at end of file
diff --git a/cbib_job.sh b/cbib_job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3efcf0c7dedd238840ab526666b52051430811c9
--- /dev/null
+++ b/cbib_job.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+################################ Slurm options #################################
+### prepare_calling_jobs
+#SBATCH -J smk_main
+### Max run time "hours:minutes:seconds"
+#SBATCH --time=95:00:00
+#SBATCH --ntasks=1 #nb of processes
+#SBATCH --cpus-per-task=1 # nb of cores for each process (1 process)
+#SBATCH --mem=10G # max of memory (-m) 
+### Requirements nodes/servers (default: 1)
+#SBATCH --nodes=1
+### Requirements cpu/core/task (default: 1)
+#SBATCH --ntasks-per-node=1
+#SBATCH -o slurm_logs/snakemake.%N.%j.out
+#SBATCH -e slurm_logs/snakemake.%N.%j.err
+#SBATCH --mail-type=END,FAIL
+#SBATCH --mail-user=<your.email@here.fr>
+################################################################################
+
+# Function to load modules
+load_modules() {
+    module purge  # Clear any previously loaded modules
+
+    # Loop through each module and load it
+    for module_name in "$@"; do
+        module load "$module_name"
+    done
+}
+
+load_modules "containers/singularity/3.9.9" "bioinfo/Snakemake/8.3.1"
+
+SNG_BIND=$(pwd)
+MAX_CORES=10
+
+echo 'Starting Snakemake workflow'
+
+run_snakemake() {
+    local snakefile="$1"  # The Snakefile to run
+    local option="$2"     # The option for dry run or DAG
+
+    echo "Starting $snakefile..."
+
+    # Execute the Snakemake command with the specified option
+    if [[ "$option" == "dry" ]]; then
+        snakemake -s "$snakefile" -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" -n
+    elif [[ "$option" == "dag" ]]; then
+        snakemake -s "$snakefile" -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --dag > dag.dot
+        echo "DAG has been generated as dag.dot"
+        return
+    else
+        snakemake -s "$snakefile" -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND"
+    fi
+
+    # Check if the Snakemake command was successful
+    if [ $? -eq 0 ]; then
+        echo "$snakefile completed successfully."
+    else
+        echo "Error: $snakefile failed."
+        exit 1
+    fi
+}
+
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [split|simulate] [dry|dag|run]"
+    echo "    split - run the split Snakefile"
+    echo "    simulate - run the simulate Snakefile"
+    echo "    dry - run the specified Snakefile in dry-run mode"
+    echo "    dag - generate DAG for the specified Snakefile"
+    echo "    run - run the specified Snakefile normally (default)"
+    exit 1
+fi
+
+# Determine the workflow and option based on the arguments
+workflow="$1"
+option="$2"
+
+# Run the specified Snakefile based on user input
+case "$workflow" in
+    split)
+        snakefile="workflow/Snakefile_split.smk"
+        ;;
+    simulate)
+        snakefile="workflow/Snakefile_simulate.smk"
+        ;;
+    *)
+        echo "Invalid workflow: $workflow"
+        echo "Usage: $0 [split|simulate] [dry|dag|run]"
+        exit 1
+        ;;
+esac
+
+# Run the specified Snakefile with the provided option
+run_snakemake "$snakefile" "$option"
\ No newline at end of file
diff --git a/job.sh b/job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..408d17f581cfcf89397613b92711be2527d9d9df
--- /dev/null
+++ b/job.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+################################ Slurm options #################################
+### prepare_calling_jobs
+#SBATCH -J smk_main
+### Max run time "hours:minutes:seconds"
+#SBATCH --time=96:00:00
+#SBATCH --ntasks=1 #nb of processes
+#SBATCH --cpus-per-task=1 # nb of cores for each process (1 process)
+#SBATCH --mem=10G # max of memory (-m) 
+### Requirements nodes/servers (default: 1)
+#SBATCH --nodes=1
+### Requirements cpu/core/task (default: 1)
+#SBATCH --ntasks-per-node=1
+#SBATCH -o slurm_logs/snakemake.%N.%j.out
+#SBATCH -e slurm_logs/snakemake.%N.%j.err
+#SBATCH --mail-type=END,FAIL
+#SBATCH --mail-user=<your.email@here.fr>
+################################################################################
+
+# Useful information to print
+echo '########################################'
+echo 'Date:' $(date --iso-8601=seconds)
+echo 'User:' $USER
+echo 'Host:' $HOSTNAME
+echo 'Job Name:' $SLURM_JOB_NAME
+echo 'Job ID:' $SLURM_JOB_ID
+echo 'Number of nodes assigned to job:' $SLURM_JOB_NUM_NODES
+echo 'Total number of cores for job (?):' $SLURM_NTASKS
+echo 'Number of requested cores per node:' $SLURM_NTASKS_PER_NODE
+echo 'Nodes assigned to job:' $SLURM_JOB_NODELIST
+echo 'Number of CPUs assigned for each task:' $SLURM_CPUS_PER_TASK
+echo 'Directory:' $(pwd)
+# Detail Information:
+echo 'scontrol show job:'
+scontrol show job $SLURM_JOB_ID
+echo '########################################'
+
+# Function to load modules
+load_modules() {
+    module purge  # Clear any previously loaded modules
+
+    # Loop through each module and load it
+    for module_name in "$@"; do
+        module load "$module_name"
+    done
+}
+
+# Here specify the modules to load and their path
+load_modules "python/3.9.7" "snakemake/6.5.1" 
+
+### variables
+SNG_BIND=$(pwd)
+CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml"
+MAX_CORES=10
+PROFILE=".config/snakemake_profile/slurm"
+
+echo 'Starting Snakemake workflow'
+
+
+run_snakemake() {
+    local snakefile="$1"  # The Snakefile to run
+    local option="$2"     # The option for dry run or DAG
+
+    echo "Starting $snakefile..."
+
+    # Execute the Snakemake command with the specified option
+    if [[ "$option" == "dry" ]]; then
+        snakemake -s "$snakefile" --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -n -r
+    elif [[ "$option" == "dag" ]]; then
+        snakemake -s "$snakefile" --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG --dag > dag.dot
+        echo "DAG has been generated as dag.dot"
+        return
+    else
+        snakemake -s "$snakefile" --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG
+    fi
+
+    # Check if the Snakemake command was successful
+    if [ $? -eq 0 ]; then
+        echo "$snakefile completed successfully."
+    else
+        echo "Error: $snakefile failed."
+        exit 1
+    fi
+}
+
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [split|simulate] [dry|dag|run]"
+    echo "    split - run the split Snakefile"
+    echo "    simulate - run the simulate Snakefile"
+    echo "    dry - run the specified Snakefile in dry-run mode"
+    echo "    dag - generate DAG for the specified Snakefile"
+    echo "    run - run the specified Snakefile normally (default)"
+    exit 1
+fi
+
+# Determine the workflow and option based on the arguments
+workflow="$1"
+option="$2"
+
+# Run the specified Snakefile based on user input
+case "$workflow" in
+    split)
+        snakefile="workflow/Snakefile_split.smk"
+        ;;
+    simulate)
+        snakefile="workflow/Snakefile_simulate.smk"
+        ;;
+    *)
+        echo "Invalid workflow: $workflow"
+        echo "Usage: $0 [split|simulate] [dry|dag|run]"
+        exit 1
+        ;;
+esac
+
+# Run the specified Snakefile with the provided option
+run_snakemake "$snakefile" "$option"
+squeue -u $USER
\ No newline at end of file
diff --git a/local_run.sh b/local_run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2a96fa122e6f5d9c8d66e8e73f1aa6efa2cf832d
--- /dev/null
+++ b/local_run.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Script to run the workflow locally. DO NOT USE AS IS ON A CLUSTER!
+
+SNG_BIND=$(pwd)
+
+run_snakemake() {
+    local snakefile="$1"  # The Snakefile to run
+    local option="$2"     # The option for dry run or DAG
+
+    echo "Starting $snakefile..."
+
+    # Execute the Snakemake command with the specified option
+    if [[ "$option" == "dry" ]]; then
+        snakemake -s "$snakefile" --use-singularity --singularity-args "-B $SNG_BIND" -c4 -n
+    elif [[ "$option" == "dag" ]]; then
+        snakemake -s "$snakefile" --use-singularity --singularity-args "-B $SNG_BIND" -c4 --dag > dag.dot
+        echo "DAG has been generated as dag.dot"
+        return
+    else
+        snakemake -s "$snakefile" --use-singularity --singularity-args "-B $SNG_BIND" -c4
+    fi
+
+    # Check if the Snakemake command was successful
+    if [ $? -eq 0 ]; then
+        echo "$snakefile completed successfully."
+    else
+        echo "Error: $snakefile failed."
+        exit 1
+    fi
+}
+
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [split|simulate] [dry|dag|run]"
+    echo "    split - run the split Snakefile"
+    echo "    simulate - run the simulate Snakefile"
+    echo "    dry - run the specified Snakefile in dry-run mode"
+    echo "    dag - generate DAG for the specified Snakefile"
+    echo "    run - run the specified Snakefile normally (default)"
+    exit 1
+fi
+
+# Determine the workflow and option based on the arguments
+workflow="$1"
+option="$2"
+
+# Run the specified Snakefile based on user input
+case "$workflow" in
+    split)
+        snakefile="workflow/Snakefile_split.smk"
+        ;;
+    simulate)
+        snakefile="workflow/Snakefile_simulate.smk"
+        ;;
+    *)
+        echo "Invalid workflow: $workflow"
+        echo "Usage: $0 [split|simulate] [dry|dag|run]"
+        exit 1
+        ;;
+esac
+
+# Run the specified Snakefile with the provided option
+run_snakemake "$snakefile" "$option"
+
+
diff --git a/randombed.py b/randombed.py
deleted file mode 100644
index 3f15a719a596e4a1eb6c5bfc722b159e9bf29833..0000000000000000000000000000000000000000
--- a/randombed.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pandas as pd
-import yaml, random, re
-
-## read yaml file containing variants informations
-def read_yaml(f):
-	stream = open(f, 'r')
-	d = yaml.safe_load(stream)
-	stream.close()
-	if sum(d.values()) != 100:
-		raise ValueError("Sum of values must be 100")
-	return(d)
-
-infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None,
-		 'tandem duplication': 2, 'inverted tandem duplication': 2, 
-		 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0}
-
-## generate info field for translocations
-# randomly select chromosome to add sequence to
-def select_chr(fai):
-	df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"])
-	names=df["name"].to_list()
-	chr = random.choice(names)
-	df = pd.pivot_table(df, columns='name')
-	len = df.iloc[0][chr]
-	pos = random.randrange(len)
-	li = [chr, pos]
-	return(li)
-
-# randomly select sequence orientation
-def select_orientation(li):
-	li.append(random.choice(["forward", "reverse"]))
-	return(li)
-
-## create dataframe with all the variants to simulate
-def generate_type(nvar, yml, fai):
-	d = read_yaml(yml)
-	## list for variant type and info
-	vartype = []
-	infos = []
-	## filter dict to keep value != 0
-	dict = {k: v for k, v in d.items() if v != 0}
-	for k in dict:
-		# generate n variants
-		perc = dict[k]
-		n = round(perc*nvar/100)
-		# add info field
-		if re.search("translocation", k):
-			for i in range(n):
-				info = select_chr(fai)
-				info = select_orientation(info)		
-				if k == "reciprocal translocation":
-					info = select_orientation(info)
-				infos.extend([info])
-				i += 1
-		else:
-			info = infos_dict[k]
-			infos.extend([info]*n)
-		vartype.extend([k]*n)
-	
-	if len(vartype) != nvar:
-		n = nvar - len(vartype)
-		vartype.extend(["SNP"]*n)
-		infos.extend([None]*n)
-
-	for_df = {'SVTYPE': vartype, 'SVINFO': infos}
-
-	df = pd.DataFrame(data = for_df)
-	# shuffle values in dataframe
-	df = df.sample(frac = 1, ignore_index=True)
-	# print(df)
-	df.to_csv("random_var.tsv", sep="\t", index=False)
-	return(df)
diff --git a/simulation_data/README.md b/simulation_data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f8531f650071bf817a1e9307399aea07b46f4934
--- /dev/null
+++ b/simulation_data/README.md
@@ -0,0 +1,5 @@
+This folder contains information about the size distribution of each variant type.
+
+Each variant type has a size distribution file (bins = 100 bp) in this folder. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394). The distributions are used to randomly sample each structural variant size.
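+
+A minimal sketch of how one size is drawn from a distribution file (the column names `pb` and `size_interval` follow the original `bed2vcf.py` implementation):
+
+```python
+import json
+
+import numpy as np
+import pandas as pd
+
+# Load one distribution file; each row is a 100 bp size bin with its probability.
+df = pd.read_csv("simulation_data/size_distribDEL.tsv", sep="\t")
+i = np.argmax(np.random.multinomial(1, df["pb"]))  # draw one bin according to "pb"
+lo, hi = json.loads(df["size_interval"].iloc[i])   # bin bounds, e.g. "[100, 200]"
+size = int(round(np.random.uniform(lo, hi)))       # sample a size within the bin
+```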
+
+Ultimately, this will be replaced by values for the studied organism.
\ No newline at end of file
diff --git a/sv_distributions/size_distribCNV.tsv b/simulation_data/size_distribCNV.tsv
similarity index 100%
rename from sv_distributions/size_distribCNV.tsv
rename to simulation_data/size_distribCNV.tsv
diff --git a/sv_distributions/size_distribDEL.tsv b/simulation_data/size_distribDEL.tsv
similarity index 100%
rename from sv_distributions/size_distribDEL.tsv
rename to simulation_data/size_distribDEL.tsv
diff --git a/sv_distributions/size_distribDUP.tsv b/simulation_data/size_distribDUP.tsv
similarity index 100%
rename from sv_distributions/size_distribDUP.tsv
rename to simulation_data/size_distribDUP.tsv
diff --git a/sv_distributions/size_distribINS.tsv b/simulation_data/size_distribINS.tsv
similarity index 100%
rename from sv_distributions/size_distribINS.tsv
rename to simulation_data/size_distribINS.tsv
diff --git a/sv_distributions/size_distribINV.tsv b/simulation_data/size_distribINV.tsv
similarity index 100%
rename from sv_distributions/size_distribINV.tsv
rename to simulation_data/size_distribINV.tsv
diff --git a/singularity_img/build_singularity.sh b/singularity_img/build_singularity.sh
deleted file mode 100644
index 1e15b3ddd8b44ffddeda0731f4488811e751f0c0..0000000000000000000000000000000000000000
--- a/singularity_img/build_singularity.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-
-### outside of msprime_VISOR_fusionVCF folder
-sudo singularity build mspvisor.sif msprime_VISOR_fusionVCF/singularity_img/to_singularity.def
-
-### use singularity image
-# singularity exec mspvisor.sif python3 /mspv/tree_generation.py --help
diff --git a/singularity_img/to_singularity.def b/singularity_img/to_singularity.def
deleted file mode 100644
index 8525ee504dbd10c0b89878217b2f778b351743cf..0000000000000000000000000000000000000000
--- a/singularity_img/to_singularity.def
+++ /dev/null
@@ -1,19 +0,0 @@
-Bootstrap: docker
-# From: ubuntu:focal-20220415
-From: ubuntu:jammy-20231004
-IncludeCmd: yes
-
-%files
-	msprime_VISOR_fusionVCF mspv
-
-%environment
-	export PATH="/mspv/tree_generation.py":$PATH
-	export PATH="/mspv/variants_generation.py":$PATH
-
-%post
-	##Python installation
-	apt update -y
-	apt install python3 -y
-	python3 --version
-	apt install python3-pip -y
-	pip install msprime pandas defopt numpy biopython pyyaml
diff --git a/slurm_logs/.gitignore b/slurm_logs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a05540034ae706ccd697f2d967ba840daf133d32
--- /dev/null
+++ b/slurm_logs/.gitignore
@@ -0,0 +1,6 @@
+### This file is here to avoid a SLURM error when running a job because of the missing slurm_logs directory
+
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/test_genome.fa.gz b/test_genome.fa.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0d3c08761e296622e73488f1a7e4dc7aaebeeacf
Binary files /dev/null and b/test_genome.fa.gz differ
diff --git a/tree_generation.py b/tree_generation.py
deleted file mode 100644
index 223b71d494e4463a9aa380114da056c913144288..0000000000000000000000000000000000000000
--- a/tree_generation.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import pandas as pd
-import math, msprime, argparse, os
-import numpy as np
-from multiprocessing import Process
-
-def chrom_pos(df):
-    l=df["length"].to_list()
-    map_positions=[0]
-    for i in l:
-        map_positions.append(i+map_positions[-1])
-    n=len(map_positions)
-    
-    for i in range(1,n*2-3, 2):
-        map_positions.insert(i+1, map_positions[i]+1)
-    return(map_positions)
-
-
-### simulation of multiple chromosomes
-def chrom_rate(df):
-    n=len(df)
-    r_chrom = 1e-8
-    r_break = math.log(2)
-    li = np.resize([r_chrom,r_break], (n*2-1))
-    return(li)
-
-def msprime_map(df):
-    map_positions=chrom_pos(df)
-    rates=chrom_rate(df)
-    rate_map=msprime.RateMap(position=map_positions, rate=rates)
-    return(rate_map)
-
-### ts_chroms = list of trees for each chromosome
-### names = list of chromosome name
-def output_files(ts_chroms, names, outdir="results"):
-    if not os.path.exists(outdir):
-            os.mkdir(outdir)
-
-        ## write vcf
-    filename = outdir + "/msprime_simulation.vcf"
-    for i in range(len(ts_chroms)):
-        txt = ts_chroms[i].as_vcf(contig_id=names[i])
-        t = ''.join(txt.splitlines(keepends=True)[6:])
-        # filename = 'tmp_vcf/output' + str(i) + ".vcf"
-        if i==0:
-            header_top = ''.join(txt.splitlines(keepends=True)[:4])
-            header_bot = ''.join(txt.splitlines(keepends=True)[4:6])
-            with open(filename, "w") as vcf_file:
-                vcf_file.write(t)
-                # ts_chroms[i].write_vcf(vcf_file, contig_id=names[i])
-        else:
-            header_top += txt.splitlines(keepends=True)[3]
-            
-            with open(filename, "a") as vcf_file:
-                vcf_file.write(t)
-            vcf_file.close()
-    ## write header to file
-    header = header_top + header_bot
-    with open(outdir + "/header.vcf", "w") as head:
-        head.write(header)
-    head.close()
-
-# input_fai = samtools FAI of FASTA where variants will be generated
-# pop_size = population size
-# mut_rate = mutation rate
-# n = sample size / number of indiv
-def msprime_vcf(fai, pop_size, mut_rate, n, outdir):
-
-    df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"])
-    rate_map=msprime_map(df)
-
-    ts = msprime.sim_ancestry(
-		samples=n,
-		recombination_rate=rate_map,
-		population_size=pop_size,
-		# random_seed=123456
-        )
-
-	# with mutations
-    mutated_ts = msprime.sim_mutations(
-        ts, rate=mut_rate, 
-        # random_seed=5678, 
-        discrete_genome=True)
-
-    ts_chroms = []
-
-    l=df["length"].to_list()
-
-    chrom_positions=[0]
-    for i in l:
-        chrom_positions.append(i+chrom_positions[-1])
-
-    for j in range(len(chrom_positions) - 1):
-        start, end = chrom_positions[j: j + 2]
-        chrom_ts = mutated_ts.keep_intervals([[start, end]], simplify=False).trim()
-        ts_chroms.append(chrom_ts)
-
-    names=df["name"].to_list()
-
-    ## create files    
-    if outdir == None:
-        output_files(ts_chroms, names)
-    else:
-        output_files(ts_chroms, names, outdir)
-
-parser = argparse.ArgumentParser(description='Generate VCF for each chromosome in reference FASTA.')
-parser.add_argument('-fai', '--fai', type=str, required = True, help='FAI samtools index of reference FASTA')
-parser.add_argument('-p', '--popSize', type=int, required = True, help='population size (Ne)')
-parser.add_argument('-r', '--rate', type=float, required = True, help='mutation rate (µ)')
-parser.add_argument('-n', '--sampleSize', type=int, required = True, help='sample size')
-parser.add_argument('-o', '--outDir', type=str, help='output directory')
-
-if __name__ == '__main__':
-	args = parser.parse_args()
-	d = vars(args)
-	li = list(d.values())
-	p = Process(target=msprime_vcf, args=(li))
-	p.start()
-	p.join()
\ No newline at end of file
diff --git a/variants_generation.py b/variants_generation.py
deleted file mode 100644
index df622cbf6a3a96a2a46643840382cfbe286c4cea..0000000000000000000000000000000000000000
--- a/variants_generation.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from bed2vcf import read_fa, get_seq
-from randombed import generate_type
-import argparse
-import pandas as pd
-from multiprocessing import Process
-
-## n : length of variant
-def sv_vcf(vcf, fasta, fai, yml, outName):
-	vcf_df = pd.read_table(vcf, sep="\t")
-	nvar = len(vcf_df)
-	bed_df = generate_type(nvar, yml, fai)
-	fa = read_fa(fasta)
-
-	get_seq(vcf_df, bed_df, fa, outName)
-
-# parse arguments
-parser = argparse.ArgumentParser(description='create final VCF')
-# parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include')
-parser.add_argument('-v', '--vcf', type=str, required = True, help='VCF generated with msprime --> tree generation script')
-parser.add_argument('-fa', '--fasta', type=str, required = True, help='Reference FASTA for the VCF and the variants')
-parser.add_argument('-fai', '--fai', type=str, required = True, help='Samtools index of reference FASTA')
-parser.add_argument('-y', '--yaml', type=str, required = True, help='Reference FASTA for the VCF and the variants')
-parser.add_argument('-o', '--outName', type=str, required = True, help='Output name')
-# parser.add_argument('--header', type=int, help='vcf header size')
-
-if __name__ == '__main__':
-	args = parser.parse_args()
-	d = vars(args)
-	li = list(d.values())
-	p = Process(target=sv_vcf, args=(li))
-	p.start()
-	p.join()
\ No newline at end of file
diff --git a/vg_exact_data/Snakefile b/vg_exact_data/Snakefile
deleted file mode 100644
index 4739017f56df06eb76c00833d20a0a8877faad7b..0000000000000000000000000000000000000000
--- a/vg_exact_data/Snakefile
+++ /dev/null
@@ -1,75 +0,0 @@
-configfile: "config.yaml"
-SAMPLES = config["name"]
-
-
-rule all:
-	input:
-		expand("results/{sample}/{sample}.gfa", sample=SAMPLES),
-		# expand("results/{sample}/{sample}.fa", sample=SAMPLES),
-		expand("results/{sample}/{sample}_from_vg.gfa", sample=SAMPLES)
-
-rule sort:
-	input:
-		vcf = config["vcf"]
-	output:
-		sorted_vcf = "results/{sample}/{sample}.vcf.gz"
-	container:
-		"docker://mgibio/bcftools:1.12"
-	shell:
-		"bcftools sort {input.vcf} -Oz -o {output}"
-
-rule construct:
-	input:
-		ref = config["fa"],
-		vcf = rules.sort.output
-	output:
-		"results/{sample}/graph.vg"
-	container:
-		"docker://quay.io/vgteam/vg:v1.53.0"
-	shell:
-		"vg construct -m 2000000000 -r {input.ref} -v {input.vcf} -f -p > {output}"
-
-rule convert:
-	input:
-		vg = rules.construct.output
-	output:
-		gfa = "results/{sample}/{sample}_from_vg.gfa"
-	container:
-		"docker://quay.io/vgteam/vg:v1.53.0"
-	shell:
-		"vg convert -f {input.vg} > {output.gfa}"
-
-rule graph:
-	input:
-		ref = config["fa"],
-		vcf = rules.sort.output
-		# vcf = config["vcf"]
-	output:
-		gbz = "results/" + "{sample}/index.giraffe.gbz"
-	params:
-		prefix = "results/" + "{sample}/index"
-	container:
-		"docker://quay.io/vgteam/vg:v1.53.0"
-	shell:
-		"vg autoindex -r {input.ref} -v {input.vcf} -w giraffe -p {params.prefix}"
-
-rule graph_gfa:
-	input:
-		gbz = rules.graph.output
-	output:
-		gfa = "results/{sample}/{sample}.gfa"
-	container:
-		"docker://quay.io/vgteam/vg:v1.53.0"
-	shell:
-		"vg convert -f {input.gbz} > {output.gfa}"
-
-rule fa:
-	input:
-		gbz = rules.graph.output
-	output:
-		fa = "results/{sample}/{sample}.fa"
-	container:
-		"docker://quay.io/vgteam/vg:v1.53.0"
-	shell:
-		"vg paths -F -x {input.gbz} > {output.fa}"
-
diff --git a/vg_exact_data/config.yaml b/vg_exact_data/config.yaml
deleted file mode 100644
index a233aa7b8f91825effd87cf366a0b235748bcc7a..0000000000000000000000000000000000000000
--- a/vg_exact_data/config.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-fa: "../Sibirica_v1.0.fa"
-vcf: "../with_singularity/results/sib_10.vcf"
-name: "sibirica_10k"
\ No newline at end of file
diff --git a/vg_exact_data/job.sh b/vg_exact_data/job.sh
deleted file mode 100644
index 5c80ff10f5ff29a49371be2e7868a66bda8a87d3..0000000000000000000000000000000000000000
--- a/vg_exact_data/job.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-################################ Slurm options #################################
-### prepare_calling_jobs
-#SBATCH -J vg_data
-### Max run time "hours:minutes:seconds"
-#SBATCH --time=120:00:00
-#SBATCH --ntasks=1 #nb of processes
-#SBATCH --cpus-per-task=1 # nb of cores for each process(1 process)
-#SBATCH --mem=10G # max of memory (-m) 
-### Requirements nodes/servers (default: 1)
-#SBATCH --nodes=1
-### Requirements cpu/core/task (default: 1)
-#SBATCH --ntasks-per-node=1
-#SBATCH -o slurm_logs/snakemake.%N.%j.out
-#SBATCH -e slurm_logs/snakemake.%N.%j.err
-#SBATCH --mail-type=END,FAIL
-#SBATCH --mail-user=my.email@somewhere.com
-################################################################################
-
-# Useful information to print
-echo '########################################'
-echo 'Date:' $(date --iso-8601=seconds)
-echo 'User:' $USER
-echo 'Host:' $HOSTNAME
-echo 'Job Name:' $SLURM_JOB_NAME
-echo 'Job ID:' $SLURM_JOB_ID
-echo 'Number of nodes assigned to job:' $SLURM_JOB_NUM_NODES
-echo 'Total number of cores for job (?):' $SLURM_NTASKS
-echo 'Number of requested cores per node:' $SLURM_NTASKS_PER_NODE
-echo 'Nodes assigned to job:' $SLURM_JOB_NODELIST
-echo 'Number of CPUs assigned for each task:' $SLURM_CPUS_PER_TASK
-echo 'Directory:' $(pwd)
-# Detail Information:
-echo 'scontrol show job:'
-scontrol show job $SLURM_JOB_ID
-echo '########################################'
-
-
-### variables
-CLUSTER_CONFIG="snakemake_profile/slurm/cluster_config.yml"
-MAX_CORES=10
-PROFILE="snakemake_profile/slurm"
-SNG_BIND=".,/gpfs"
-
-### Module Loading:
-module purge
-module load snakemake/6.5.1
-# module load singularity
-
-echo 'Starting Snakemake workflow'
-
-### Snakemake commands
-
-if [ "$1" = "dry" ]
-then
-    # dry run
-    snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG -n -r
-else
-    # run
-    snakemake --profile $PROFILE -j $MAX_CORES --use-singularity --singularity-args "-B $SNG_BIND" --cluster-config $CLUSTER_CONFIG
-fi
diff --git a/vg_exact_data/snakemake_profile/slurm/cluster_config.yml b/vg_exact_data/snakemake_profile/slurm/cluster_config.yml
deleted file mode 100644
index 3740c7eab4a289b7709bcfc41edc72edc022f8c0..0000000000000000000000000000000000000000
--- a/vg_exact_data/snakemake_profile/slurm/cluster_config.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-### default ressources used by snakemake (applied to all rules)
-__default__:
-  job-name: "{rule}"
-  time: "120:00:00" # max run time "hours:minutes:seconds"
-  ntasks: 1 # nb of processes
-  cpus-per-task: 10 # nb of cores for each process(1 process)
-  mem: "60G"
-  nodes: 1 # Requirements nodes/servers (default: 1)
-  ntasks-per-node: 1 # Requirements cpu/core/task (default: 1)
-  output: "slurm_logs/{rule}.%N.%j.out"
-  error: "slurm_logs/{rule}.%N.%j.err"
-  mail-type: FAIL #email notification
-  mail-user: my.email@mydomain.com
-
-### rule resources
diff --git a/workflow/Snakefile_simulate.smk b/workflow/Snakefile_simulate.smk
new file mode 100644
index 0000000000000000000000000000000000000000..cc4a2cb9e63d04665166ed574eba821ae0c99366
--- /dev/null
+++ b/workflow/Snakefile_simulate.smk
@@ -0,0 +1,264 @@
+# Load the configuration file
+configfile: ".config/.masterconfig.yaml"
+
+import os
+import yaml
+
+# Set the output directory for results
+output_dir = "results/"
+
+# Function to load chromosomes dynamically from chr_config.yaml for each sample
+def load_chromosomes(sample):
+    chr_config_path = os.path.join(output_dir, f"{sample}_results", "01_chromosome_index", "chr_config.yaml")
+    if os.path.exists(chr_config_path):
+        with open(chr_config_path, 'r') as f:
+            chromosomes = yaml.safe_load(f)["chromosomes"]
+            return chromosomes
+    return []
+
+# Rule to define all final outputs
+
+rule all:
+    input:
+        expand(os.path.join(output_dir, "{sample}_results", "04_generated_variants", "{sample}_simulated_variants.vcf.gz"),
+               sample=config["samples"].keys()) + 
+        expand(os.path.join(output_dir, "{sample}_results", "05_vg_graph", "{sample}_vg_graph.gfa"),
+               sample=config["samples"].keys()) + 
+        expand(os.path.join(output_dir, "{sample}_results", "06_graph_paths", "{sample}_paths.fasta"),
+               sample=config["samples"].keys())
+
+# Define a function to get the path of the FAI file for each sample and chromosome
+def get_fai(wildcards):
+    sample = wildcards.sample
+    chromosome = wildcards.chromosome
+    return os.path.join(output_dir, f"{sample}_results", "02_splited_index", f"{chromosome}.fai")
+
+# Rule to generate trees for each chromosome of each sample
+rule run_msprime:
+    input:
+        fai=get_fai
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "{chromosome}_msprime_simulation.vcf"))
+    params:
+        pop_size=lambda wildcards: config["samples"][wildcards.sample]["population_size"],
+        mut_rate=lambda wildcards: config["samples"][wildcards.sample]["mutation_rate"],
+        reco_rate=lambda wildcards: config["samples"][wildcards.sample]["recombination_rate"],
+        n=lambda wildcards: config["samples"][wildcards.sample]["sample_size"],
+        out=lambda wildcards: os.path.join(output_dir, f"{wildcards.sample}_results", "temp")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/mspangepop_dep:0.0.1"
+    shell:
+        """
+        mkdir -p {params.out} &&
+        python3 workflow/scripts/tree_generation.py -fai {input.fai} -p {params.pop_size} -m {params.mut_rate} -r {params.reco_rate} -n {params.n} -o {params.out} -c {wildcards.chromosome}
+        """
+
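+# Rule to compress each per-chromosome msprime VCF with bgzip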
+rule compress_sim_vcf:
+    input:
+        vcf=rules.run_msprime.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "{chromosome}_msprime_simulation.vcf.gz"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/bgzip:latest"
+    shell:
+        """
+        bgzip -c {input.vcf} > {output}
+        """
+
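+# Rule to index each compressed per-chromosome VCF with tabix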
+rule index_sim_vcf:
+    input:
+        vcf_gz=rules.compress_sim_vcf.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "{chromosome}_msprime_simulation.vcf.gz.tbi"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/tabix:1.7"
+    shell:
+        """
+        tabix -p vcf {input.vcf_gz}
+        """
+
+# Rule to merge VCF files for each sample by combining VCFs from all chromosomes
+rule merge_simulations:
+    input:
+        vcf_files=lambda wildcards: expand(
+            os.path.join(output_dir, "{sample}_results", "temp", "{chromosome}_msprime_simulation.vcf.gz"),
+            sample=[wildcards.sample],
+            chromosome=load_chromosomes(wildcards.sample)
+        ),
+        index_files=lambda wildcards: expand(
+            os.path.join(output_dir, "{sample}_results", "temp", "{chromosome}_msprime_simulation.vcf.gz.tbi"),
+            sample=[wildcards.sample],
+            chromosome=load_chromosomes(wildcards.sample)
+        )
+    output:
+        merged_vcf=os.path.join(output_dir, "{sample}_results", "03_msprime_simulation", "msprime_simulation.vcf.gz")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/bcftools:1.12"
+    shell:
+        """
+        bcftools concat -a -O z -o {output.merged_vcf} {input.vcf_files}
+        """
+
+# Rule to unzip the FASTA file for each sample
+rule unzip_input_fasta:
+    input:
+        fasta_gz=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+    output:
+        temp(output_dir + "{sample}_results/temp/{sample}.fasta")
+    shell:
+        """
+        gunzip -c {input.fasta_gz} > {output}
+        """
+
+rule unzip_simulated_vcf:
+    input:
+        vcf_gz=rules.merge_simulations.output.merged_vcf
+    output:
+        temp(output_dir + "{sample}_results/temp/{sample}_msprime.vcf")
+    shell:
+        """
+        gunzip -c {input.vcf_gz} > {output}
+        """
+
+rule remove_vcf_header:
+    input:
+        vcf=rules.unzip_simulated_vcf.output
+    output:
+        temp(output_dir + "{sample}_results/temp/{sample}_msprime_no_header.vcf")
+    shell:
+        """
+        awk '!/^#/' {input.vcf} > {output}
+        """
+
+# Rule to extract the header from the uncompressed VCF file
+rule extract_vcf_header:
+    input:
+        vcf=rules.unzip_simulated_vcf.output
+    output:
+        temp(output_dir + "{sample}_results/temp/{sample}_msprime_header.vcf")
+    shell:
+        """
+        awk '/^#/' {input.vcf} > {output}
+        """
+
+# Rule to generate structural variants using variants_generation.py
+rule generate_variants:
+    input:
+        fai=os.path.join(output_dir, "{sample}_results", "01_chromosome_index", "{sample}_full.fai"),
+        fasta=rules.unzip_input_fasta.output,
+        vcf=rules.remove_vcf_header.output,
+        yaml=".config/visor_sv_type.yaml"
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "simulated_variants.vcf"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/mspangepop_dep:0.0.1"
+    shell:
+        """
+        python3 workflow/scripts/generate_variant.py --fai {input.fai} --fasta {input.fasta} --vcf {input.vcf} --yaml {input.yaml} --output {output}
+        """
+
+# Rule to sort the extracted VCF header using the external script
+rule sort_header:
+    input:
+        header=rules.extract_vcf_header.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "sorted_header.vcf"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/mspangepop_dep:0.0.1"
+    shell:
+        """
+        python3 workflow/scripts/sort_vcf_header.py {input.header} {output}
+        """
+
+# Rule to add the header to the generated VCF output
+rule add_header:
+    input:
+        header=rules.sort_header.output,
+        vcf=rules.generate_variants.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "final_simulated_variants.vcf"))
+    shell:
+        """
+        cat {input.header} {input.vcf} > {output}
+        """
+
+# Workflow to sort, compress, index VCF files and construct graphs
+rule sort_vcf:
+    input:
+        vcf = rules.add_header.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "sorted_simulated_variants.vcf"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/bcftools:1.12"
+    shell:
+        "bcftools sort {input.vcf} -Oz -o {output}"
+
+
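+# Rule to construct the variation graph from the reference FASTA and the sorted VCF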
+rule construct_graph:
+    input:
+        fasta = rules.unzip_input_fasta.output, 
+        vcf = rules.sort_vcf.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "05_vg_graph", "graph.vg"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/vg:1.60.0"
+    shell:
+        "vg construct -m 2000000000 -r {input.fasta} -v {input.vcf} -f -p > {output}"
+
+
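+# Rule to convert the vg graph to GFA format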
+rule vg_to_gfa:
+    input:
+        vg = rules.construct_graph.output
+    output:
+        outfile = os.path.join(output_dir, "{sample}_results", "05_vg_graph", "{sample}_vg_graph.gfa")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/vg:1.60.0"
+    shell:
+        "vg convert -f {input.vg} > {output.outfile}"
+
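+# Rule to compress the sorted VCF for giraffe indexing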
+rule compress_vcf_for_giraffe:
+    input:
+        vcf = rules.sort_vcf.output
+    output:
+        os.path.join(output_dir, "{sample}_results", "04_generated_variants", "{sample}_simulated_variants.vcf.gz")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/bgzip:latest"
+    shell:
+        """
+        bgzip -c {input.vcf} > {output}
+        """
+
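+# Rule to build the giraffe index (GBZ) from the reference FASTA and the compressed VCF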
+rule index_giraffe:
+    input:
+        fasta = rules.unzip_input_fasta.output, 
+        vcf_gz = rules.compress_vcf_for_giraffe.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "index.giraffe.gbz"))
+    params:
+        out = os.path.join(output_dir, "{sample}_results", "temp", "index")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/vg:1.60.0"
+    shell:
+        "vg autoindex -r {input.fasta} -v {input.vcf_gz} -w giraffe -p {params.out}"
+
+
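+# Rule to convert the GBZ index to GFA format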
+rule gbz_to_gfa:
+    input:
+        gbz = rules.index_giraffe.output
+    output:
+        temp(os.path.join(output_dir, "{sample}_results", "temp", "giraffe_graph.gfa"))
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/vg:1.60.0"
+    shell:
+        "vg convert -f {input.gbz} > {output}"
+
+
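+# Rule to extract all path sequences from the graph into a single FASTA file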
+rule graph_to_fasta:
+    input:
+        gfa = rules.gbz_to_gfa.output
+    output:
+        os.path.join(output_dir, "{sample}_results", "06_graph_paths", "{sample}_paths.fasta")
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/vg:1.60.0"
+    shell:
+        "vg paths -F -x {input.gfa} > {output}"
\ No newline at end of file
diff --git a/workflow/Snakefile_split.smk b/workflow/Snakefile_split.smk
new file mode 100644
index 0000000000000000000000000000000000000000..928a0600bcf2296f34d4e46f68682b2fd824de3d
--- /dev/null
+++ b/workflow/Snakefile_split.smk
@@ -0,0 +1,57 @@
+# Load the configuration file
+configfile: ".config/.masterconfig.yaml"
+
+import os
+import yaml
+
+# Set the output directory for results
+output_dir = "results/"
+
+# Rule all ensures final files are generated
+rule all:
+    input:
+        expand(os.path.join(output_dir, "{sample}_results", "02_splited_index"), sample=config["samples"].keys()),
+        expand(os.path.join(output_dir, "{sample}_results", "01_chromosome_index", "chr_config.yaml"), sample=config["samples"].keys())
+
+# Rule to generate FAI index for each FASTA file
+rule generate_fai_index:
+    input:
+        fasta=lambda wildcards: config["samples"][wildcards.sample]["fasta_gz"]
+    output:
+        fai=os.path.join(output_dir, "{sample}_results", "01_chromosome_index", "{sample}_full.fai")  
+    params: 
+        out=os.path.join(output_dir, "{sample}_results", "01_chromosome_index")  
+    container:
+        "docker://registry.forgemia.inra.fr/pangepop/mspangepop/samtool:1.21"
+    shell:
+        """
+        samtools faidx {input.fasta} &&
+        mv {input.fasta}.fai {output.fai} &&
+        rm {input.fasta}.gzi || true
+        """
+
+# Rule to split the FAI file into separate chromosome files
+rule split_index:
+    input:
+        fai=rules.generate_fai_index.output.fai
+    output:
+        directory(os.path.join(output_dir, "{sample}_results", "02_splited_index"))
+    params:
+        out=os.path.join(output_dir, "{sample}_results", "02_splited_index")
+    shell:
+        """
+        mkdir -p {params.out}
+        awk '{{print > "{params.out}/" $1 ".fai"}}' {input.fai}
+        """
+
+# Rule to create chromosome configuration YAML with chromosome names
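+# For example, for a FAI listing chr01 and chr02 (illustrative names), the
+# generated file would contain:
+#   chromosomes:
+#     - "chr01"
+#     - "chr02"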
+rule create_chr_config_file:
+    input:
+        fai=rules.generate_fai_index.output.fai
+    output:
+        yaml=os.path.join(output_dir, "{sample}_results", "01_chromosome_index", "chr_config.yaml")
+    shell:
+        """
+        awk '{{print $1}}' {input.fai} | \
+        awk 'BEGIN {{print "chromosomes:"}} {{print "  - \\"" $1 "\\""}}' > {output.yaml}
+        """
\ No newline at end of file
diff --git a/workflow/dag.svg b/workflow/dag.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2e4de8a6bbd08d53bc946824dbac4af80ba80209
--- /dev/null
+++ b/workflow/dag.svg
@@ -0,0 +1,415 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+ -->
+<!-- Title: snakemake_dag Pages: 1 -->
+<svg width="476pt" height="1196pt" viewBox="0.00 0.00 476.00 1196.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1192)">
+<title>snakemake_dag</title>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-1192 472,-1192 472,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<path fill="none" stroke="#56c1d8" stroke-width="2" d="M309,-36C309,-36 279,-36 279,-36 273,-36 267,-30 267,-24 267,-24 267,-12 267,-12 267,-6 273,0 279,0 279,0 309,0 309,0 315,0 321,-6 321,-12 321,-12 321,-24 321,-24 321,-30 315,-36 309,-36"/>
+<text text-anchor="middle" x="294" y="-15" font-family="sans" font-size="10.00" fill="#000000">all</text>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<path fill="none" stroke="#568ad8" stroke-width="2" d="M238.441,-324C238.441,-324 179.559,-324 179.559,-324 173.559,-324 167.559,-318 167.559,-312 167.559,-312 167.559,-300 167.559,-300 167.559,-294 173.559,-288 179.559,-288 179.559,-288 238.441,-288 238.441,-288 244.441,-288 250.441,-294 250.441,-300 250.441,-300 250.441,-312 250.441,-312 250.441,-318 244.441,-324 238.441,-324"/>
+<text text-anchor="middle" x="209" y="-303" font-family="sans" font-size="10.00" fill="#000000"> Compress VCF </text>
+</g>
+<!-- 66 -->
+<g id="node25" class="node">
+<title>66</title>
+<path fill="none" stroke="#568ad8" stroke-width="2" d="M216.4121,-252C216.4121,-252 167.5879,-252 167.5879,-252 161.5879,-252 155.5879,-246 155.5879,-240 155.5879,-240 155.5879,-228 155.5879,-228 155.5879,-222 161.5879,-216 167.5879,-216 167.5879,-216 216.4121,-216 216.4121,-216 222.4121,-216 228.4121,-222 228.4121,-228 228.4121,-228 228.4121,-240 228.4121,-240 228.4121,-246 222.4121,-252 216.4121,-252"/>
+<text text-anchor="middle" x="192" y="-231" font-family="sans" font-size="10.00" fill="#000000"> Index giraffe </text>
+</g>
+<!-- 1&#45;&gt;66 -->
+<g id="edge39" class="edge">
+<title>1-&gt;66</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M204.7102,-287.8314C202.892,-280.131 200.7301,-270.9743 198.7095,-262.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="202.0519,-261.3414 196.3476,-252.4133 195.2392,-262.95 202.0519,-261.3414"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M311.6152,-396C311.6152,-396 276.3848,-396 276.3848,-396 270.3848,-396 264.3848,-390 264.3848,-384 264.3848,-384 264.3848,-372 264.3848,-372 264.3848,-366 270.3848,-360 276.3848,-360 276.3848,-360 311.6152,-360 311.6152,-360 317.6152,-360 323.6152,-366 323.6152,-372 323.6152,-372 323.6152,-384 323.6152,-384 323.6152,-390 317.6152,-396 311.6152,-396"/>
+<text text-anchor="middle" x="294" y="-375" font-family="sans" font-size="10.00" fill="#000000"> Sort VCF </text>
+</g>
+<!-- 2&#45;&gt;0 -->
+<g id="edge1" class="edge">
+<title>2-&gt;0</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M294,-359.8146C294,-332.4983 294,-279.25 294,-234 294,-234 294,-234 294,-162 294,-121.876 294,-75.4631 294,-46.4177"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="297.5001,-46.1853 294,-36.1854 290.5001,-46.1854 297.5001,-46.1853"/>
+</g>
+<!-- 2&#45;&gt;1 -->
+<g id="edge11" class="edge">
+<title>2-&gt;1</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M272.5509,-359.8314C262.1822,-351.0485 249.5771,-340.3712 238.3354,-330.8489"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="240.3349,-327.9556 230.4422,-324.1628 235.8105,-333.297 240.3349,-327.9556"/>
+</g>
+<!-- 63 -->
+<g id="node22" class="node">
+<title>63</title>
+<path fill="none" stroke="#5b3366" stroke-width="2" d="M413.09,-324C413.09,-324 334.91,-324 334.91,-324 328.91,-324 322.91,-318 322.91,-312 322.91,-312 322.91,-300 322.91,-300 322.91,-294 328.91,-288 334.91,-288 334.91,-288 413.09,-288 413.09,-288 419.09,-288 425.09,-294 425.09,-300 425.09,-300 425.09,-312 425.09,-312 425.09,-318 419.09,-324 413.09,-324"/>
+<text text-anchor="middle" x="374" y="-303" font-family="sans" font-size="10.00" fill="#000000"> Construct VG graph </text>
+</g>
+<!-- 2&#45;&gt;63 -->
+<g id="edge35" class="edge">
+<title>2-&gt;63</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M314.1874,-359.8314C323.8514,-351.1337 335.5796,-340.5783 346.0816,-331.1265"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="348.7275,-333.454 353.8191,-324.1628 344.0447,-328.251 348.7275,-333.454"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M315.866,-468C315.866,-468 272.134,-468 272.134,-468 266.134,-468 260.134,-462 260.134,-456 260.134,-456 260.134,-444 260.134,-444 260.134,-438 266.134,-432 272.134,-432 272.134,-432 315.866,-432 315.866,-432 321.866,-432 327.866,-438 327.866,-444 327.866,-444 327.866,-456 327.866,-456 327.866,-462 321.866,-468 315.866,-468"/>
+<text text-anchor="middle" x="294" y="-447" font-family="sans" font-size="10.00" fill="#000000"> Add header </text>
+</g>
+<!-- 3&#45;&gt;2 -->
+<g id="edge12" class="edge">
+<title>3-&gt;2</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M294,-431.8314C294,-424.131 294,-414.9743 294,-406.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="297.5001,-406.4132 294,-396.4133 290.5001,-406.4133 297.5001,-406.4132"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M256.3133,-540C256.3133,-540 213.6867,-540 213.6867,-540 207.6867,-540 201.6867,-534 201.6867,-528 201.6867,-528 201.6867,-516 201.6867,-516 201.6867,-510 207.6867,-504 213.6867,-504 213.6867,-504 256.3133,-504 256.3133,-504 262.3133,-504 268.3133,-510 268.3133,-516 268.3133,-516 268.3133,-528 268.3133,-528 268.3133,-534 262.3133,-540 256.3133,-540"/>
+<text text-anchor="middle" x="235" y="-519" font-family="sans" font-size="10.00" fill="#000000"> Sort header </text>
+</g>
+<!-- 4&#45;&gt;3 -->
+<g id="edge13" class="edge">
+<title>4-&gt;3</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M249.8882,-503.8314C256.753,-495.454 265.0301,-485.3531 272.5511,-476.1749"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="275.2803,-478.3665 278.9114,-468.4133 269.8659,-473.9297 275.2803,-478.3665"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M263.467,-612C263.467,-612 186.533,-612 186.533,-612 180.533,-612 174.533,-606 174.533,-600 174.533,-600 174.533,-588 174.533,-588 174.533,-582 180.533,-576 186.533,-576 186.533,-576 263.467,-576 263.467,-576 269.467,-576 275.467,-582 275.467,-588 275.467,-588 275.467,-600 275.467,-600 275.467,-606 269.467,-612 263.467,-612"/>
+<text text-anchor="middle" x="225" y="-591" font-family="sans" font-size="10.00" fill="#000000"> Extract VCF header </text>
+</g>
+<!-- 5&#45;&gt;4 -->
+<g id="edge15" class="edge">
+<title>5-&gt;4</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M227.5234,-575.8314C228.5929,-568.131 229.8647,-558.9743 231.0532,-550.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="234.5336,-550.7997 232.4426,-540.4133 227.6001,-549.8367 234.5336,-550.7997"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M260.3691,-684C260.3691,-684 189.6309,-684 189.6309,-684 183.6309,-684 177.6309,-678 177.6309,-672 177.6309,-672 177.6309,-660 177.6309,-660 177.6309,-654 183.6309,-648 189.6309,-648 189.6309,-648 260.3691,-648 260.3691,-648 266.3691,-648 272.3691,-654 272.3691,-660 272.3691,-660 272.3691,-672 272.3691,-672 272.3691,-678 266.3691,-684 260.3691,-684"/>
+<text text-anchor="middle" x="225" y="-663" font-family="sans" font-size="10.00" fill="#000000"> Decompress  VCF </text>
+</g>
+<!-- 6&#45;&gt;5 -->
+<g id="edge16" class="edge">
+<title>6-&gt;5</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M225,-647.8314C225,-640.131 225,-630.9743 225,-622.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="228.5001,-622.4132 225,-612.4133 221.5001,-622.4133 228.5001,-622.4132"/>
+</g>
+<!-- 61 -->
+<g id="node20" class="node">
+<title>61</title>
+<path fill="none" stroke="#c5e544" stroke-width="2" d="M380.2403,-612C380.2403,-612 305.7597,-612 305.7597,-612 299.7597,-612 293.7597,-606 293.7597,-600 293.7597,-600 293.7597,-588 293.7597,-588 293.7597,-582 299.7597,-576 305.7597,-576 305.7597,-576 380.2403,-576 380.2403,-576 386.2403,-576 392.2403,-582 392.2403,-588 392.2403,-588 392.2403,-600 392.2403,-600 392.2403,-606 386.2403,-612 380.2403,-612"/>
+<text text-anchor="middle" x="343" y="-591" font-family="sans" font-size="10.00" fill="#000000"> Remove vcf header </text>
+</g>
+<!-- 6&#45;&gt;61 -->
+<g id="edge32" class="edge">
+<title>6-&gt;61</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M254.7764,-647.8314C269.8693,-638.6221 288.3757,-627.3301 304.5343,-617.4706"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="306.5198,-620.3593 313.2331,-612.1628 302.8737,-614.3838 306.5198,-620.3593"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M242.7084,-756C242.7084,-756 171.2916,-756 171.2916,-756 165.2916,-756 159.2916,-750 159.2916,-744 159.2916,-744 159.2916,-732 159.2916,-732 159.2916,-726 165.2916,-720 171.2916,-720 171.2916,-720 242.7084,-720 242.7084,-720 248.7084,-720 254.7084,-726 254.7084,-732 254.7084,-732 254.7084,-744 254.7084,-744 254.7084,-750 248.7084,-756 242.7084,-756"/>
+<text text-anchor="middle" x="207" y="-735" font-family="sans" font-size="10.00" fill="#000000"> Merge simulations </text>
+</g>
+<!-- 7&#45;&gt;6 -->
+<g id="edge17" class="edge">
+<title>7-&gt;6</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M211.5422,-719.8314C213.4884,-712.0463 215.8068,-702.7729 217.9663,-694.1347"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="221.3668,-694.9636 220.3967,-684.4133 214.5758,-693.2658 221.3668,-694.9636"/>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M165.4422,-900C165.4422,-900 108.5578,-900 108.5578,-900 102.5578,-900 96.5578,-894 96.5578,-888 96.5578,-888 96.5578,-876 96.5578,-876 96.5578,-870 102.5578,-864 108.5578,-864 108.5578,-864 165.4422,-864 165.4422,-864 171.4422,-864 177.4422,-870 177.4422,-876 177.4422,-876 177.4422,-888 177.4422,-888 177.4422,-894 171.4422,-900 165.4422,-900"/>
+<text text-anchor="middle" x="137" y="-879" font-family="sans" font-size="10.00" fill="#000000"> Compress VCF</text>
+</g>
+<!-- 8&#45;&gt;7 -->
+<g id="edge18" class="edge">
+<title>8-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M117.8416,-863.7291C100.6542,-845.0452 80.2359,-815.509 95,-792 107.2456,-772.5013 128.7302,-759.8766 149.6291,-751.7827"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="150.9185,-755.0387 159.1547,-748.3743 148.5602,-748.4479 150.9185,-755.0387"/>
+</g>
+<!-- 42 -->
+<g id="node15" class="node">
+<title>42</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M157.7156,-828C157.7156,-828 116.2844,-828 116.2844,-828 110.2844,-828 104.2844,-822 104.2844,-816 104.2844,-816 104.2844,-804 104.2844,-804 104.2844,-798 110.2844,-792 116.2844,-792 116.2844,-792 157.7156,-792 157.7156,-792 163.7156,-792 169.7156,-798 169.7156,-804 169.7156,-804 169.7156,-816 169.7156,-816 169.7156,-822 163.7156,-828 157.7156,-828"/>
+<text text-anchor="middle" x="137" y="-807" font-family="sans" font-size="10.00" fill="#000000"> Index VCF </text>
+</g>
+<!-- 8&#45;&gt;42 -->
+<g id="edge27" class="edge">
+<title>8-&gt;42</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M137,-863.8314C137,-856.131 137,-846.9743 137,-838.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="140.5001,-838.4132 137,-828.4133 133.5001,-838.4133 140.5001,-838.4132"/>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M165.1632,-972C165.1632,-972 108.8368,-972 108.8368,-972 102.8368,-972 96.8368,-966 96.8368,-960 96.8368,-960 96.8368,-948 96.8368,-948 96.8368,-942 102.8368,-936 108.8368,-936 108.8368,-936 165.1632,-936 165.1632,-936 171.1632,-936 177.1632,-942 177.1632,-948 177.1632,-948 177.1632,-960 177.1632,-960 177.1632,-966 171.1632,-972 165.1632,-972"/>
+<text text-anchor="middle" x="137" y="-957" font-family="sans" font-size="10.00" fill="#000000">  Run msprime  </text>
+<text text-anchor="middle" x="137" y="-945" font-family="sans" font-size="10.00" fill="#000000">chr01</text>
+</g>
+<!-- 9&#45;&gt;8 -->
+<g id="edge24" class="edge">
+<title>9-&gt;8</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M137,-935.8314C137,-928.131 137,-918.9743 137,-910.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="140.5001,-910.4132 137,-900.4133 133.5001,-910.4133 140.5001,-910.4132"/>
+</g>
+<!-- 10 -->
+<g id="node11" class="node">
+<title>10</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M263.4422,-900C263.4422,-900 206.5578,-900 206.5578,-900 200.5578,-900 194.5578,-894 194.5578,-888 194.5578,-888 194.5578,-876 194.5578,-876 194.5578,-870 200.5578,-864 206.5578,-864 206.5578,-864 263.4422,-864 263.4422,-864 269.4422,-864 275.4422,-870 275.4422,-876 275.4422,-876 275.4422,-888 275.4422,-888 275.4422,-894 269.4422,-900 263.4422,-900"/>
+<text text-anchor="middle" x="235" y="-879" font-family="sans" font-size="10.00" fill="#000000"> Compress VCF</text>
+</g>
+<!-- 10&#45;&gt;7 -->
+<g id="edge19" class="edge">
+<title>10-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M227.8555,-863.7757C224.0733,-853.433 219.6711,-840.1616 217,-828 212.5103,-807.5585 209.9858,-784.0391 208.5968,-766.1708"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="212.0824,-765.8409 207.8876,-756.1118 205.0997,-766.3333 212.0824,-765.8409"/>
+</g>
+<!-- 43 -->
+<g id="node16" class="node">
+<title>43</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M279.7156,-828C279.7156,-828 238.2844,-828 238.2844,-828 232.2844,-828 226.2844,-822 226.2844,-816 226.2844,-816 226.2844,-804 226.2844,-804 226.2844,-798 232.2844,-792 238.2844,-792 238.2844,-792 279.7156,-792 279.7156,-792 285.7156,-792 291.7156,-798 291.7156,-804 291.7156,-804 291.7156,-816 291.7156,-816 291.7156,-822 285.7156,-828 279.7156,-828"/>
+<text text-anchor="middle" x="259" y="-807" font-family="sans" font-size="10.00" fill="#000000"> Index VCF </text>
+</g>
+<!-- 10&#45;&gt;43 -->
+<g id="edge28" class="edge">
+<title>10-&gt;43</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M241.0562,-863.8314C243.6512,-856.0463 246.7424,-846.7729 249.6218,-838.1347"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="253.0203,-839.0069 252.8622,-828.4133 246.3795,-836.7933 253.0203,-839.0069"/>
+</g>
+<!-- 11 -->
+<g id="node12" class="node">
+<title>11</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M263.1632,-972C263.1632,-972 206.8368,-972 206.8368,-972 200.8368,-972 194.8368,-966 194.8368,-960 194.8368,-960 194.8368,-948 194.8368,-948 194.8368,-942 200.8368,-936 206.8368,-936 206.8368,-936 263.1632,-936 263.1632,-936 269.1632,-936 275.1632,-942 275.1632,-948 275.1632,-948 275.1632,-960 275.1632,-960 275.1632,-966 269.1632,-972 263.1632,-972"/>
+<text text-anchor="middle" x="235" y="-957" font-family="sans" font-size="10.00" fill="#000000">  Run msprime  </text>
+<text text-anchor="middle" x="235" y="-945" font-family="sans" font-size="10.00" fill="#000000">chr02</text>
+</g>
+<!-- 11&#45;&gt;10 -->
+<g id="edge25" class="edge">
+<title>11-&gt;10</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M235,-935.8314C235,-928.131 235,-918.9743 235,-910.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="238.5001,-910.4132 235,-900.4133 231.5001,-910.4133 238.5001,-910.4132"/>
+</g>
+<!-- 12 -->
+<g id="node13" class="node">
+<title>12</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M335,-900C335,-900 305,-900 305,-900 299,-900 293,-894 293,-888 293,-888 293,-876 293,-876 293,-870 299,-864 305,-864 305,-864 335,-864 335,-864 341,-864 347,-870 347,-876 347,-876 347,-888 347,-888 347,-894 341,-900 335,-900"/>
+<text text-anchor="middle" x="320" y="-879" font-family="sans" font-size="10.00" fill="#000000">...</text>
+</g>
+<!-- 12&#45;&gt;7 -->
+<g id="edge20" class="edge">
+<title>12-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M320.3698,-863.6908C319.868,-844.1606 316.4023,-813.182 301,-792 291.5573,-779.0138 277.9688,-768.6859 264.0464,-760.6868"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="265.3544,-757.4161 254.8895,-755.7619 262.0387,-763.5811 265.3544,-757.4161"/>
+</g>
+<!-- 44 -->
+<g id="node17" class="node">
+<title>44</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M390,-828C390,-828 360,-828 360,-828 354,-828 348,-822 348,-816 348,-816 348,-804 348,-804 348,-798 354,-792 360,-792 360,-792 390,-792 390,-792 396,-792 402,-798 402,-804 402,-804 402,-816 402,-816 402,-822 396,-828 390,-828"/>
+<text text-anchor="middle" x="375" y="-807" font-family="sans" font-size="10.00" fill="#000000">...</text>
+</g>
+<!-- 12&#45;&gt;44 -->
+<g id="edge29" class="edge">
+<title>12-&gt;44</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M333.8788,-863.8314C340.2136,-855.5386 347.8384,-845.557 354.7926,-836.4533"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="357.6452,-838.4847 360.9343,-828.4133 352.0825,-834.2353 357.6452,-838.4847"/>
+</g>
+<!-- 13 -->
+<g id="node14" class="node">
+<title>13</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M335,-972C335,-972 305,-972 305,-972 299,-972 293,-966 293,-960 293,-960 293,-948 293,-948 293,-942 299,-936 305,-936 305,-936 335,-936 335,-936 341,-936 347,-942 347,-948 347,-948 347,-960 347,-960 347,-966 341,-972 335,-972"/>
+<text text-anchor="middle" x="320" y="-951" font-family="sans" font-size="10.00" fill="#000000">...</text>
+</g>
+<!-- 13&#45;&gt;12 -->
+<g id="edge26" class="edge">
+<title>13-&gt;12</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M320,-935.8314C320,-928.131 320,-918.9743 320,-910.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="323.5001,-910.4132 320,-900.4133 316.5001,-910.4133 323.5001,-910.4132"/>
+</g>
+<!-- 42&#45;&gt;7 -->
+<g id="edge21" class="edge">
+<title>42-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M154.664,-791.8314C162.9732,-783.2848 173.0264,-772.9443 182.0918,-763.6198"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="184.6369,-766.023 189.0982,-756.4133 179.6179,-761.1435 184.6369,-766.023"/>
+</g>
+<!-- 43&#45;&gt;7 -->
+<g id="edge22" class="edge">
+<title>43-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M245.8782,-791.8314C239.9501,-783.6232 232.8271,-773.7606 226.3067,-764.7323"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="228.9908,-762.4708 220.2985,-756.4133 223.316,-766.5693 228.9908,-762.4708"/>
+</g>
+<!-- 44&#45;&gt;7 -->
+<g id="edge23" class="edge">
+<title>44-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M347.6722,-797.9496C343.1106,-795.9533 338.4244,-793.9118 334,-792 309.4505,-781.3923 282.1647,-769.7688 259.196,-760.032"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="260.5397,-756.8002 249.9666,-756.1225 257.8094,-763.2458 260.5397,-756.8002"/>
+</g>
+<!-- 59 -->
+<g id="node18" class="node">
+<title>59</title>
+<path fill="none" stroke="#56a9d8" stroke-width="2" d="M376.7373,-540C376.7373,-540 309.2627,-540 309.2627,-540 303.2627,-540 297.2627,-534 297.2627,-528 297.2627,-528 297.2627,-516 297.2627,-516 297.2627,-510 303.2627,-504 309.2627,-504 309.2627,-504 376.7373,-504 376.7373,-504 382.7373,-504 388.7373,-510 388.7373,-516 388.7373,-516 388.7373,-528 388.7373,-528 388.7373,-534 382.7373,-540 376.7373,-540"/>
+<text text-anchor="middle" x="343" y="-519" font-family="sans" font-size="10.00" fill="#000000"> Generate variants </text>
+</g>
+<!-- 59&#45;&gt;3 -->
+<g id="edge14" class="edge">
+<title>59-&gt;3</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M330.6352,-503.8314C325.0491,-495.6232 318.337,-485.7606 312.1928,-476.7323"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="315.051,-474.7112 306.5313,-468.4133 309.264,-478.6496 315.051,-474.7112"/>
+</g>
+<!-- 60 -->
+<g id="node19" class="node">
+<title>60</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M353.3123,-1188C353.3123,-1188 310.6877,-1188 310.6877,-1188 304.6877,-1188 298.6877,-1182 298.6877,-1176 298.6877,-1176 298.6877,-1164 298.6877,-1164 298.6877,-1158 304.6877,-1152 310.6877,-1152 310.6877,-1152 353.3123,-1152 353.3123,-1152 359.3123,-1152 365.3123,-1158 365.3123,-1164 365.3123,-1164 365.3123,-1176 365.3123,-1176 365.3123,-1182 359.3123,-1188 353.3123,-1188"/>
+<text text-anchor="middle" x="332" y="-1167" font-family="sans" font-size="10.00" fill="#000000"> Unzip fasta </text>
+</g>
+<!-- 60&#45;&gt;59 -->
+<g id="edge30" class="edge">
+<title>60-&gt;59</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M355.2246,-1151.7681C384.2046,-1126.78 430,-1078.9204 430,-1026 430,-1026 430,-1026 430,-666 430,-623.9747 423.5509,-611.4624 401,-576 394.1941,-565.2975 384.7687,-555.3005 375.4928,-546.8888"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="377.6451,-544.1225 367.7971,-540.2151 373.059,-549.4109 377.6451,-544.1225"/>
+</g>
+<!-- 60&#45;&gt;63 -->
+<g id="edge34" class="edge">
+<title>60-&gt;63</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M365.618,-1153.1649C405.8915,-1130.3375 468,-1085.49 468,-1026 468,-1026 468,-1026 468,-450 468,-401.9707 430.8785,-357.4394 403.2912,-330.9769"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="405.5058,-328.2567 395.8018,-324.0044 400.736,-333.3801 405.5058,-328.2567"/>
+</g>
+<!-- 60&#45;&gt;66 -->
+<g id="edge38" class="edge">
+<title>60-&gt;66</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M298.6465,-1163.0043C214.4558,-1144.3906 0,-1090.6334 0,-1026 0,-1026 0,-1026 0,-378 0,-305.4792 88.75,-265.0071 145.5709,-246.527"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="146.9398,-249.7659 155.4343,-243.4339 144.8452,-243.0866 146.9398,-249.7659"/>
+</g>
+<!-- 67 -->
+<g id="node26" class="node">
+<title>67</title>
+<path fill="none" stroke="#61d856" stroke-width="2" d="M273.4088,-1116C273.4088,-1116 196.5912,-1116 196.5912,-1116 190.5912,-1116 184.5912,-1110 184.5912,-1104 184.5912,-1104 184.5912,-1092 184.5912,-1092 184.5912,-1086 190.5912,-1080 196.5912,-1080 196.5912,-1080 273.4088,-1080 273.4088,-1080 279.4088,-1080 285.4088,-1086 285.4088,-1092 285.4088,-1092 285.4088,-1104 285.4088,-1104 285.4088,-1110 279.4088,-1116 273.4088,-1116"/>
+<text text-anchor="middle" x="235" y="-1095" font-family="sans" font-size="10.00" fill="#000000"> Generate FAI index </text>
+</g>
+<!-- 60&#45;&gt;67 -->
+<g id="edge4" class="edge">
+<title>60-&gt;67</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M307.5228,-1151.8314C295.4605,-1142.8779 280.746,-1131.9558 267.7314,-1122.2955"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="269.5852,-1119.3127 259.4694,-1116.1628 265.413,-1124.9335 269.5852,-1119.3127"/>
+</g>
+<!-- 61&#45;&gt;59 -->
+<g id="edge31" class="edge">
+<title>61-&gt;59</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M343,-575.8314C343,-568.131 343,-558.9743 343,-550.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="346.5001,-550.4132 343,-540.4133 339.5001,-550.4133 346.5001,-550.4132"/>
+</g>
+<!-- 62 -->
+<g id="node21" class="node">
+<title>62</title>
+<path fill="none" stroke="#5b3366" stroke-width="2" d="M414.031,-180C414.031,-180 333.969,-180 333.969,-180 327.969,-180 321.969,-174 321.969,-168 321.969,-168 321.969,-156 321.969,-156 321.969,-150 327.969,-144 333.969,-144 333.969,-144 414.031,-144 414.031,-144 420.031,-144 426.031,-150 426.031,-156 426.031,-156 426.031,-168 426.031,-168 426.031,-174 420.031,-180 414.031,-180"/>
+<text text-anchor="middle" x="374" y="-159" font-family="sans" font-size="10.00" fill="#000000"> Convert graph to gfa </text>
+</g>
+<!-- 62&#45;&gt;0 -->
+<g id="edge3" class="edge">
+<title>62-&gt;0</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M363.8679,-143.7623C349.9824,-118.7682 324.9402,-73.6924 308.9116,-44.8409"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="311.9658,-43.1314 304.0498,-36.0896 305.8467,-46.531 311.9658,-43.1314"/>
+</g>
+<!-- 63&#45;&gt;62 -->
+<g id="edge33" class="edge">
+<title>63-&gt;62</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M374,-287.7623C374,-263.201 374,-219.2474 374,-190.3541"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="377.5001,-190.0896 374,-180.0896 370.5001,-190.0897 377.5001,-190.0896"/>
+</g>
+<!-- 64 -->
+<g id="node23" class="node">
+<title>64</title>
+<path fill="none" stroke="#568ad8" stroke-width="2" d="M254.2452,-108C254.2452,-108 129.7548,-108 129.7548,-108 123.7548,-108 117.7548,-102 117.7548,-96 117.7548,-96 117.7548,-84 117.7548,-84 117.7548,-78 123.7548,-72 129.7548,-72 129.7548,-72 254.2452,-72 254.2452,-72 260.2452,-72 266.2452,-78 266.2452,-84 266.2452,-84 266.2452,-96 266.2452,-96 266.2452,-102 260.2452,-108 254.2452,-108"/>
+<text text-anchor="middle" x="192" y="-87" font-family="sans" font-size="10.00" fill="#000000">    Convert graph to fasta paths    </text>
+</g>
+<!-- 64&#45;&gt;0 -->
+<g id="edge2" class="edge">
+<title>64-&gt;0</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M217.7389,-71.8314C230.5437,-62.7927 246.1909,-51.7476 259.9719,-42.0198"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="262.118,-44.7891 268.2693,-36.1628 258.0812,-39.0703 262.118,-44.7891"/>
+</g>
+<!-- 65 -->
+<g id="node24" class="node">
+<title>65</title>
+<path fill="none" stroke="#568ad8" stroke-width="2" d="M227.7025,-180C227.7025,-180 156.2975,-180 156.2975,-180 150.2975,-180 144.2975,-174 144.2975,-168 144.2975,-168 144.2975,-156 144.2975,-156 144.2975,-150 150.2975,-144 156.2975,-144 156.2975,-144 227.7025,-144 227.7025,-144 233.7025,-144 239.7025,-150 239.7025,-156 239.7025,-156 239.7025,-168 239.7025,-168 239.7025,-174 233.7025,-180 227.7025,-180"/>
+<text text-anchor="middle" x="192" y="-159" font-family="sans" font-size="10.00" fill="#000000"> Convert gbz to gfa </text>
+</g>
+<!-- 65&#45;&gt;64 -->
+<g id="edge36" class="edge">
+<title>65-&gt;64</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M192,-143.8314C192,-136.131 192,-126.9743 192,-118.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="195.5001,-118.4132 192,-108.4133 188.5001,-118.4133 195.5001,-118.4132"/>
+</g>
+<!-- 66&#45;&gt;65 -->
+<g id="edge37" class="edge">
+<title>66-&gt;65</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M192,-215.8314C192,-208.131 192,-198.9743 192,-190.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="195.5001,-190.4132 192,-180.4133 188.5001,-190.4133 195.5001,-190.4132"/>
+</g>
+<!-- 68 -->
+<g id="node27" class="node">
+<title>68</title>
+<path fill="none" stroke="#61d856" stroke-width="2" d="M147.2353,-1044C147.2353,-1044 64.7647,-1044 64.7647,-1044 58.7647,-1044 52.7647,-1038 52.7647,-1032 52.7647,-1032 52.7647,-1020 52.7647,-1020 52.7647,-1014 58.7647,-1008 64.7647,-1008 64.7647,-1008 147.2353,-1008 147.2353,-1008 153.2353,-1008 159.2353,-1014 159.2353,-1020 159.2353,-1020 159.2353,-1032 159.2353,-1032 159.2353,-1038 153.2353,-1044 147.2353,-1044"/>
+<text text-anchor="middle" x="106" y="-1023" font-family="sans" font-size="10.00" fill="#000000"> Create chr config file </text>
+</g>
+<!-- 67&#45;&gt;68 -->
+<g id="edge5" class="edge">
+<title>67-&gt;68</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M202.4479,-1079.8314C185.7952,-1070.5368 165.3415,-1059.1208 147.5615,-1049.1971"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="148.9795,-1045.9803 138.5417,-1044.1628 145.5679,-1052.0927 148.9795,-1045.9803"/>
+</g>
+<!-- 69 -->
+<g id="node28" class="node">
+<title>69</title>
+<path fill="none" stroke="#61d856" stroke-width="2" d="M255.1151,-1044C255.1151,-1044 214.8849,-1044 214.8849,-1044 208.8849,-1044 202.8849,-1038 202.8849,-1032 202.8849,-1032 202.8849,-1020 202.8849,-1020 202.8849,-1014 208.8849,-1008 214.8849,-1008 214.8849,-1008 255.1151,-1008 255.1151,-1008 261.1151,-1008 267.1151,-1014 267.1151,-1020 267.1151,-1020 267.1151,-1032 267.1151,-1032 267.1151,-1038 261.1151,-1044 255.1151,-1044"/>
+<text text-anchor="middle" x="235" y="-1023" font-family="sans" font-size="10.00" fill="#000000"> Split index </text>
+</g>
+<!-- 67&#45;&gt;69 -->
+<g id="edge6" class="edge">
+<title>67-&gt;69</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M235,-1079.8314C235,-1072.131 235,-1062.9743 235,-1054.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="238.5001,-1054.4132 235,-1044.4133 231.5001,-1054.4133 238.5001,-1054.4132"/>
+</g>
+<!-- 68&#45;&gt;7 -->
+<g id="edge10" class="edge">
+<title>68-&gt;7</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M97.4506,-1007.7586C79.1631,-965.7959 41.3948,-860.752 86,-792 100.1955,-770.1199 125.5509,-757.0285 149.3967,-749.2329"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="150.5664,-752.536 159.1339,-746.3031 148.5495,-745.8329 150.5664,-752.536"/>
+</g>
+<!-- 69&#45;&gt;9 -->
+<g id="edge7" class="edge">
+<title>69-&gt;9</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M210.2705,-1007.8314C198.0838,-998.8779 183.2177,-987.9558 170.0688,-978.2955"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="171.8527,-975.263 161.7216,-972.1628 167.7082,-980.9042 171.8527,-975.263"/>
+</g>
+<!-- 69&#45;&gt;11 -->
+<g id="edge8" class="edge">
+<title>69-&gt;11</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M235,-1007.8314C235,-1000.131 235,-990.9743 235,-982.4166"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="238.5001,-982.4132 235,-972.4133 231.5001,-982.4133 238.5001,-982.4132"/>
+</g>
+<!-- 69&#45;&gt;13 -->
+<g id="edge9" class="edge">
+<title>69-&gt;13</title>
+<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M256.4491,-1007.8314C266.8178,-999.0485 279.4229,-988.3712 290.6646,-978.8489"/>
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="293.1895,-981.297 298.5578,-972.1628 288.6651,-975.9556 293.1895,-981.297"/>
+</g>
+</g>
+</svg>
\ No newline at end of file
diff --git a/workflow/scripts/fai2yaml.sh b/workflow/scripts/fai2yaml.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff3f5ef8e0f95e61f3afa9a66a4aa3c29cb2a36f
--- /dev/null
+++ b/workflow/scripts/fai2yaml.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Author: Lucien Piat
+# Date: October 24, 2024
+# Project: PangenOak at INRAE
+
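+# Usage example (hypothetical paths):
+#   ./fai2yaml.sh results/sample_full.fai chr_config.yaml
+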
+# Check that an input FAI file and an output YAML file are provided
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <input_fai_file> <output_yaml_file>"
+    exit 1
+fi
+
+input_fai_file="$1"
+output_yaml_file="$2"
+
+# Initialize the output YAML file
+echo "chromosomes:" > "$output_yaml_file"
+
+# Read the FAI file and extract chromosome names
+while IFS=$'\t' read -r chromosome _; do
+    # Append each chromosome name to the YAML file
+    echo "  - \"$chromosome\"" >> "$output_yaml_file"
+done < "$input_fai_file"
+
+echo "YAML index file generated: $output_yaml_file"
diff --git a/workflow/scripts/generate_variant.py b/workflow/scripts/generate_variant.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee4e0eeaeb644150320e7bcaf1fc54ea9afbfa05
--- /dev/null
+++ b/workflow/scripts/generate_variant.py
@@ -0,0 +1,104 @@
+"""
+Author: Lucien Piat, based on Sukanya Denni's initial script
+Date: 28 Oct 2024
+Institution: INRAE
+Project: PangenOak
+
+This script generates genetic variants based on input data from msprime simulated VCF, FAI, and FASTA files.
+It reads variant probabilities and lengths from specified files, creates variants for each 
+row in the VCF file, and writes the generated variants to a new VCF file.
+"""
+
+import random
+import argparse
+from readfile import read_fasta, read_variant_length_file, read_fai, read_vcf, read_yaml
+from variants_class import *
+
+def create_variant(variant_probs, chrom, position, length_files, chrom_lengths, chromosome_dict, samples):
+    """Create a variant with a random length based on probabilities and file data, respecting chromosome length."""
+    variant_types = list(variant_probs.keys())
+    probabilities = list(variant_probs.values())
+    
+    selected_variant_type = random.choices(variant_types, weights=probabilities, k=1)[0]
+    
+    # Dynamically get the class
+    variant_class = globals().get(selected_variant_type)
+    if variant_class is None:
+        raise ValueError(f"Variant type '{selected_variant_type}' does not correspond to an implemented class.")
+    
+    # Instantiate the variant object with the relevant parameters
+    # TODO: include no_ref as a variant attribute
+    if selected_variant_type == 'SNP':
+        variant = variant_class(chrom, position, chromosome_dict, chrom_lengths)
+    else:
+        length_df = length_files[selected_variant_type]
+        variant = variant_class(chrom, position, chromosome_dict, chrom_lengths, length_df)
+
+    no_ref = (selected_variant_type == 'Insertion')
+    variant.reference_seq = variant.retrieve_reference_sequence(no_ref=no_ref)
+    
+    variant.samples = samples
+    variant.compute_alt_seq()
+    
+    # Special handling for Translocation: compute_alt_seq() above is a no-op
+    # until a destination is set, so pick one and recompute
+    if isinstance(variant, Translocation):
+        # Choose a random destination chromosome and position
+        dest_chrom = random.choice(list(chrom_lengths.keys()))
+        dest_position = random.randint(1, chrom_lengths[dest_chrom])
+        variant.set_destination(dest_chrom, dest_position)
+        variant.compute_alt_seq()
+
+    return variant
+
+def main(fai_file, vcf_input_file, vcf_output_file, fasta_file, yaml_file):
+    # Load variant probabilities and length files dynamically from YAML input
+    variant_probs = read_yaml(yaml_file)
+
+    # Load length distributions for each variant type from files
+    length_files = {
+        'Deletion': read_variant_length_file('simulation_data/size_distribDEL.tsv'),
+        'Insertion': read_variant_length_file('simulation_data/size_distribINS.tsv'),
+        'Inversion': read_variant_length_file('simulation_data/size_distribINV.tsv'),
+        'TandemDuplication': read_variant_length_file('simulation_data/size_distribDUP.tsv'),
+        'InvertedTandemDuplication': read_variant_length_file('simulation_data/size_distribDUP.tsv'),
+        'Translocation': read_variant_length_file('simulation_data/size_distribINS.tsv'),
+        'Transduplication': read_variant_length_file('simulation_data/size_distribINS.tsv'),
+        'ReciprocalTranslocation': read_variant_length_file('simulation_data/size_distribINS.tsv')
+    }
+
+    # Load chromosome lengths, VCF data, and chromosome dictionary
+    chrom_lengths = read_fai(fai_file)
+    vcf_data = read_vcf(vcf_input_file)
+    chromosome_dict = read_fasta(fasta_file)
+
+    # Open the output VCF file for writing
+    with open(vcf_output_file, 'w') as vcf_output:
+        for _, row in vcf_data.iterrows():
+            chrom = row['CHROM']
+            position = row['POS']
+
+            # Extract the sample columns from the VCF row
+            samples = {col: row[col] for col in vcf_data.columns if col.startswith('SAMPLE')}
+
+            # Create a variant instance and write its VCF line
+            variant_instance = create_variant(variant_probs, chrom, position, length_files, chrom_lengths, chromosome_dict, samples)
+            vcf_line = variant_instance.vcf_line()
+            print(variant_instance.describe())
+            vcf_output.write(vcf_line + '\n')
+
+if __name__ == "__main__":
+    # Set up argument parsing with YAML file path
+    parser = argparse.ArgumentParser(description="Generate variants from VCF and other genomic data.")
+    parser.add_argument("--fai", help="Path to the FAI file.")
+    parser.add_argument("--vcf", help="Path to the input VCF file.")
+    parser.add_argument("--output", help="Path to the output VCF file.")
+    parser.add_argument("--fasta", help="Path to the FASTA file.")
+    parser.add_argument("--yaml", help="Path to the YAML file with variant probabilities.")
+
+    args = parser.parse_args()
+    main(args.fai, args.vcf, args.output, args.fasta, args.yaml)
diff --git a/workflow/scripts/readfile.py b/workflow/scripts/readfile.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c520b2fcf54d80852ffae6c23d984168e7dc41e
--- /dev/null
+++ b/workflow/scripts/readfile.py
@@ -0,0 +1,82 @@
+"""
+Author: Lucien Piat
+Date: 28 Oct 2024
+Institution: INRAE
+Project: PangenOak
+"""
+
+import yaml
+import pandas as pd
+from Bio import SeqIO
+
+
+def read_yaml(file_path):
+    """Read a YAML file and ensure that percentages sum to 100."""
+    try:
+        with open(file_path, 'r') as stream:
+            data = yaml.safe_load(stream)
+        
+        if sum(data.values()) != 100:
+            raise ValueError("Sum of values in the config file MUST be 100.")
+        
+        return data
+    
+    except Exception as e:
+        print(f"Error reading YAML file: {e}")
+        raise
+
+def read_vcf(vcf_file):
+    """Read a VCF file, format it with custom column names, and extract chromosome, position, and sample data."""
+    try:
+        # Read the first line to determine the number of columns
+        with open(vcf_file, 'r') as file:
+            first_line = file.readline().strip()
+        
+        # Split the first line to get the number of columns
+        cols = first_line.split('\t')
+        num_samples = len(cols) - 9  # Subtract the first 9 standard VCF columns
+        
+        # Create a dynamic list of sample column names
+        sample_cols = [f"SAMPLE{i + 1}" for i in range(num_samples)]
+        all_cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + sample_cols
+        
+        # Read the entire VCF file with the appropriate column names
+        vcf_df = pd.read_table(vcf_file, sep="\t", header=None, names=all_cols)
+        vcf_df['CHROM'] = vcf_df['CHROM'].astype(str)
+        vcf_df['POS'] = vcf_df['POS'].astype(int)
+
+        return vcf_df[['CHROM', 'POS'] + sample_cols]
+    
+    except Exception as e:
+        print(f"Error reading VCF file: {e}")
+        raise
+
+def read_fai(fai_file):
+    """Read a .fai file and return a dictionary of chromosome lengths."""
+    try:
+        fai_df = pd.read_table(fai_file, header=None, names=["CHROM", "LENGTH", "OFFSET", "LINEBASES", "LINEWIDTH"])
+        return dict(zip(fai_df["CHROM"], fai_df["LENGTH"]))
+    
+    except Exception as e:
+        print(f"Error reading FAI file: {e}")
+        raise
+
+def read_variant_length_file(file_path):
+    """Read length distribution file and parse intervals with probabilities."""
+    try:
+        df = pd.read_table(file_path)
+        df['cumulative_pb'] = df['pb'].cumsum()
+        return df
+    
+    except Exception as e:
+        print(f"Error reading variant length file {file_path}: {e}")
+        raise
+
+def read_fasta(input_fasta):
+    """Read a FASTA file and return a dictionary of sequences."""
+    try:
+        return SeqIO.index(input_fasta, "fasta")
+    
+    except Exception as e:
+        print(f"Error reading FASTA file: {e}")
+        raise
diff --git a/workflow/scripts/sort_vcf_header.py b/workflow/scripts/sort_vcf_header.py
new file mode 100644
index 0000000000000000000000000000000000000000..01d7b13e5f98f84da3fe9ccc7db56550ab6f5688
--- /dev/null
+++ b/workflow/scripts/sort_vcf_header.py
@@ -0,0 +1,105 @@
+"""
+Author: Lucien Piat
+Date: 28 Oct 2024
+Institution: INRAE
+Project: PangenOak
+
+Description:
+This script parses the header of a VCF file, extracts its sections,
+and sorts the contig lines (genomic references) by priority:
+  - Contigs whose IDs start with "." come first.
+  - Alphabetic contig IDs follow, sorted lexicographically.
+  - Numeric contig IDs (e.g., bare chromosome numbers) come last, in ascending order.
+The script saves the reorganized header to an output file.
+
+This is done because vg requires a consistently ordered VCF header.
+
+Usage:
+python sort_vcf_header.py input_file output_file
+"""
+
+import argparse
+import os
+
+def parse_vcf_header(vcf_header):
+    """
+    Parses the VCF header to extract general info, contigs, and sorts contigs.
+
+    :param vcf_header: str, the VCF header as a string.
+    :return: list, containing combined lines of parsed results.
+    """
+
+    lines = vcf_header.strip().split('\n')
+
+    first_part = []
+    contigs = []
+    second_part = []
+    
+    start = True
+    for line in lines:
+        if line.startswith("##contig"):
+            contigs.append(line)
+            start = False 
+        elif start:
+            first_part.append(line)
+        else:
+            second_part.append(line)
+    
+    def sort_key(contig_line):
+        """
+        Sorting key function to prioritize and order contigs:
+        - Contig IDs starting with "." come first.
+        - Alphabetic IDs follow, sorted lexicographically.
+        - Digit-only IDs (e.g., chromosome numbers) come last, sorted numerically.
+        """
+        try:
+            contig_id = contig_line.split('<ID=')[1].split(',')[0]
+        except IndexError:
+            contig_id = ""
+        
+        if contig_id.startswith("."):
+            return (0, contig_id)
+        elif contig_id.isdigit():
+            return (2, int(contig_id))
+        else:
+            return (1, contig_id)
+
+    sorted_contigs = sorted(contigs, key=sort_key)
+    
+    combined_lines = first_part + sorted_contigs + second_part
+    return combined_lines
+
+def main(input_file, output_file):
+    """
+    Main function to process VCF header from input and save sorted output.
+
+    :param input_file: str, path to input file with VCF header.
+    :param output_file: str, path to output file for sorted header.
+    """
+
+    if not os.path.isfile(input_file):
+        print(f"Error: Input file '{input_file}' does not exist.")
+        return
+    if os.path.getsize(input_file) == 0:
+        print(f"Error: Input file '{input_file}' is empty.")
+        return
+
+    with open(input_file, 'r') as infile:
+        vcf_header = infile.read()
+        
+    parsed_result = parse_vcf_header(vcf_header)
+
+    with open(output_file, 'w') as outfile:
+        outfile.write('\n'.join(parsed_result) + '\n')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Parse VCF header and sort contigs.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("input_file", help="Path to the input VCF header file.")
+    parser.add_argument("output_file", help="Path to the output file for results.")
+
+    args = parser.parse_args()
+    
+    main(args.input_file, args.output_file)
diff --git a/workflow/scripts/split_path.sh b/workflow/scripts/split_path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8fba85991ea566a13da879c3b279328b09e940dd
--- /dev/null
+++ b/workflow/scripts/split_path.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
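+
+# Split a multi-record FASTA into one file per record in the given output
+# directory, naming each file after its header line (everything after ">").
+# Usage: ./split_path.sh <input_fasta> <output_directory>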
+
+in="$1"
+out_dir="$2"
+out=""
+
+while read -r line; do
+    if [[ "$line" == \>* ]]; then
+        [ -n "$out" ] && exec 3>&-
+        out="${line##>}.fasta"
+        exec 3> "$out_dir/$out"
+        echo "$line" >&3
+    else
+        echo "$line" >&3
+    fi
+done < "$in"
+
+[ -n "$out" ] && exec 3>&-
diff --git a/workflow/scripts/tree_generation.py b/workflow/scripts/tree_generation.py
new file mode 100755
index 0000000000000000000000000000000000000000..540cb7b84bc8b13fe22f823b592b9107c4ba3294
--- /dev/null
+++ b/workflow/scripts/tree_generation.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+"""
+Author: Sukanya Denni, Lucien Piat
+Date: 28 Oct 2024
+Institution: INRAE
+Project: PangenOak
+
+This script generates a VCF file for a specified chromosome using msprime simulations.
+It utilizes recombination and mutation rates provided by the user and outputs the result to a specified directory.
+
+Usage:
+    python tree_generation.py -fai reference.fai -p 1000 -m 1e-8 -r 1e-8 -n 10 -o output_directory -c chromosome_name
+"""
+
+import pandas as pd
+import msprime
+import argparse
+import os
+
+def get_chromosome_bounds(chrom_length):
+    """
+    Define the chromosome boundaries based on length.
+    """
+    return [0, chrom_length]
+
+def create_recombination_map(chrom_length, recombination_rate):
+    """
+    Create a recombination rate map for the chromosome.
+    """
+    chrom_positions = get_chromosome_bounds(chrom_length)
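+    # A single uniform rate over [0, chrom_length]; msprime takes the sequence
+    # length from the RateMap positions when one is passed to sim_ancestry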
+    return msprime.RateMap(position=chrom_positions, rate=[recombination_rate])
+
+def save_vcf_output(ts_chrom, chromosome_name, output_dir="results", batch_size=1000):
+    """
+    Save the VCF generated from the simulation, writing lines in batches to limit the number of write calls.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    vcf_filename = os.path.join(output_dir, f"{chromosome_name}_msprime_simulation.vcf")
+    
+    with open(vcf_filename, "w") as vcf_file:
+        vcf_content = ts_chrom.as_vcf(contig_id=chromosome_name).splitlines(keepends=True)
+        for i in range(0, len(vcf_content), batch_size):
+            vcf_file.writelines(vcf_content[i:i + batch_size])
+
+def simulate_chromosome_vcf(fai_file, population_size, mutation_rate, recombination_rate, sample_size, output_dir, chromosome_name):
+    """
+    Main function to generate a VCF for a specified chromosome using msprime simulations.
+
+    :param fai_file: str, path to the FAI index file of the reference FASTA.
+    :param population_size: int, effective population size for the simulation.
+    :param mutation_rate: float, mutation rate per base pair.
+    :param recombination_rate: float, recombination rate per base pair.
+    :param sample_size: int, number of samples to simulate.
+    :param output_dir: str, directory to save the output VCF file.
+    :param chromosome_name: str, specific chromosome to simulate.
+    """
+    # Load the chromosome length from the FAI file (the per-chromosome FAI from split_index holds a single entry; take the first)
+    chrom_length = pd.read_table(fai_file, header=None, usecols=[1], names=["length"])['length'].values[0]
+
+    recombination_map = create_recombination_map(chrom_length, recombination_rate)
+
+    ancestry_ts = msprime.sim_ancestry(
+        samples=sample_size,
+        recombination_rate=recombination_map,
+        population_size=population_size
+    )
+
+    mutated_ts = msprime.sim_mutations(ancestry_ts, rate=mutation_rate, discrete_genome=True)
+    ts_chrom = mutated_ts.keep_intervals([[0, chrom_length]], simplify=False).trim()
+
+    # Save the output with optimized batch write
+    save_vcf_output(ts_chrom, chromosome_name, output_dir)
+
+if __name__ == '__main__':
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description="Generate a VCF for a specific chromosome using msprime simulations.")
+    parser.add_argument('-fai', '--fai', type=str, required=True, help='Path to the FAI index file for the reference FASTA.')
+    parser.add_argument('-p', '--population_size', type=int, required=True, help='Effective population size (Ne).')
+    parser.add_argument('-m', '--mutation_rate', type=float, required=True, help='Mutation rate per base pair (µ).')
+    parser.add_argument('-r', '--recombination_rate', type=float, required=True, help='Recombination rate per base pair.')
+    parser.add_argument('-n', '--sample_size', type=int, required=True, help='Sample size (number of individuals to simulate).')
+    parser.add_argument('-o', '--output_dir', type=str, required=True, help='Directory to save the output VCF file.')
+    parser.add_argument('-c', '--chromosome', type=str, required=True, help='Chromosome to simulate.')
+
+    args = parser.parse_args()
+
+    # Run the simulation with specified parameters
+    simulate_chromosome_vcf(
+        fai_file=args.fai,
+        population_size=args.population_size,
+        mutation_rate=args.mutation_rate,
+        recombination_rate=args.recombination_rate,
+        sample_size=args.sample_size,
+        output_dir=args.output_dir,
+        chromosome_name=args.chromosome
+    )
diff --git a/workflow/scripts/variants_class.py b/workflow/scripts/variants_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..8da6f15686e49473e8d2a597e2cc0e32108b58cc
--- /dev/null
+++ b/workflow/scripts/variants_class.py
@@ -0,0 +1,158 @@
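+"""
+Variant class hierarchy used by generate_variant.py to build VCF records.
+
+Minimal usage sketch (assuming `chromosome_dict` comes from readfile.read_fasta
+and `chrom_lengths` from readfile.read_fai):
+
+    snp = SNP("chr01", 42, chromosome_dict, chrom_lengths)
+    snp.reference_seq = snp.retrieve_reference_sequence()
+    snp.compute_alt_seq()
+    print(snp.vcf_line())
+"""
+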
+import random
+
+class Variant:
+    """Base class for genetic variants."""
+    bases = ['A', 'C', 'G', 'T']
+    repetition_range = (2, 3)
+
+    def __init__(self, variant_type, chrom, position, chromosome_dict, chrom_lengths, length=1, reference_seq="", samples=None):
+        self.variant_type = variant_type
+        self.chrom = chrom
+        self.position = position
+        self.chromosome_dict = chromosome_dict
+        self.chrom_lengths = chrom_lengths
+        self.length = length
+        self.reference_seq = reference_seq
+        self.alt_seq = ""
+        self.samples = samples if samples is not None else {}
+
+    def set_length(self, length_df):
+        """Determine variant length based on provided length distribution."""
+        rand_val = random.random()
+        row = length_df[length_df['cumulative_pb'] >= rand_val].iloc[0]
+        interval = row['size_interval']
+        lower_bound, upper_bound = map(float, interval.strip('[]').split(','))
+        self.length = random.randint(int(lower_bound), int(upper_bound))
+
+        # Ensure length does not exceed chromosome boundaries
+        max_length = self.chrom_lengths[self.chrom] - self.position
+        self.length = min(self.length, max_length)
+        if self.length == 0:
+            self.length = 1
+
+    def retrieve_reference_sequence(self, chrom=None, position=None, no_ref=False):
+        """Retrieve the reference sequence from a FASTA file."""
+        chrom = chrom or self.chrom
+        position = position or self.position
+        
+        if chrom in self.chromosome_dict:
+            seq_record = self.chromosome_dict[chrom]
+            start = position - 1
+            if no_ref:
+                return str(seq_record.seq[start:start + 1])  # Only one base needed for insertions
+            end = min(start + self.length, len(seq_record))
+            return str(seq_record.seq[start:end])
+        else:
+            raise ValueError(f"Chromosome {chrom} not found in FASTA.")
+
+    def compute_alt_seq(self):
+        raise NotImplementedError("Subclasses must implement this method.")
+
+    def reverse_sequence(self, sequence):
+        return sequence[::-1]
+
+    def describe(self):
+        return f"{self.variant_type}\t{self.chrom}\t{self.position}\t{self.length}\t{self.reference_seq[:10]}\t{self.alt_seq[:10]}\t{self.samples}"
+
+    def vcf_line(self):
+        samples_str = '\t'.join(map(str, self.samples.values()))
+        return f"{self.chrom}\t{self.position}\t.\t{self.reference_seq}\t{self.alt_seq}\t.\tPASS\t.\tGT\t{samples_str}"
+
+class SingleBaseVariant(Variant):
+    """Variants that involve a change at a single base."""
+    
+    def __init__(self, variant_type, chrom, position, chromosome_dict, chrom_lengths):
+        super().__init__(variant_type, chrom, position, chromosome_dict, chrom_lengths, length=1)
+
+class SNP(SingleBaseVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths):
+        super().__init__('SNP', chrom, position, chromosome_dict, chrom_lengths)
+
+    def compute_alt_seq(self):
+        self.alt_seq = random.choice([base for base in self.bases if base != self.reference_seq])
+
+class StructuralVariant(Variant):
+    """Variants that involve multiple bases and structural changes."""
+    
+    def __init__(self, variant_type, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__(variant_type, chrom, position, chromosome_dict, chrom_lengths)
+        self.set_length(length_df)
+
+class Deletion(StructuralVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('Deletion', chrom, position, chromosome_dict, chrom_lengths, length_df)
+
+    def compute_alt_seq(self):
+        self.alt_seq = self.reference_seq[0]  # VCF-style deletion: ALT keeps only the first REF base
+
+class Insertion(StructuralVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('Insertion', chrom, position, chromosome_dict, chrom_lengths, length_df)
+        # Insertions anchor on a single reference base; the inserted bases are appended in compute_alt_seq
+        self.reference_seq = self.retrieve_reference_sequence(no_ref=True)
+
+    def compute_alt_seq(self):
+        self.alt_seq = self.reference_seq + "".join(random.choice(self.bases) for _ in range(self.length))
+
+class Inversion(StructuralVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('Inversion', chrom, position, chromosome_dict, chrom_lengths, length_df)
+
+    def compute_alt_seq(self):
+        self.alt_seq = self.reverse_sequence(self.reference_seq)
+
+class RepeatableVariant(StructuralVariant):
+    """Variants that involve repetitions."""
+    
+    def compute_alt_seq(self):
+        number_of_repeats = random.randint(self.repetition_range[0], self.repetition_range[1])
+        self.alt_seq = self.reference_seq * number_of_repeats
+
+class TandemDuplication(RepeatableVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('TandemDuplication', chrom, position, chromosome_dict, chrom_lengths, length_df)
+
+class InvertedTandemDuplication(RepeatableVariant):
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('InvertedTandemDuplication', chrom, position, chromosome_dict, chrom_lengths, length_df)
+
+    def compute_alt_seq(self):
+        number_of_repeats = random.randint(self.repetition_range[0], self.repetition_range[1])
+        self.alt_seq = self.reverse_sequence(self.reference_seq * number_of_repeats)
+
+class Translocation(Variant):
+    """Variants that involve translocations."""
+    def __init__(self, chrom, position, chromosome_dict, chrom_lengths, length_df):
+        super().__init__('Translocation', chrom, position, chromosome_dict, chrom_lengths)
+        self.set_length(length_df)
+        self.dest_chrom = None
+        self.dest_position = None
+        self.dest_reference_seq = ""  # This must be one base at dest_position on dest_chrom
+        self.dest_alt_seq = ""
+
+    def set_destination(self, dest_chrom, dest_position):
+        """Set the destination chromosome and position, retrieving the reference sequence."""
+        self.dest_chrom = dest_chrom
+        self.dest_position = dest_position
+        
+        # Retrieve the reference sequence for the destination position
+        self.dest_reference_seq = self.retrieve_reference_sequence(self.dest_chrom, self.dest_position, no_ref=True)
+        self.dest_alt_seq = ""  # Initialize the destination alternative sequence
+
+    def compute_alt_seq(self):
+        """Compute the alternative sequences for the translocation."""
+        if self.dest_chrom is not None and self.dest_position is not None:
+            # Set the alternative sequence for the original position
+            self.alt_seq = self.reference_seq[0]  # Use the first base from the original sequence
+            # Set the destination alternative sequence to be the same as the destination reference sequence
+            self.dest_alt_seq = self.dest_reference_seq + self.reference_seq
+
+    def describe(self):
+        """Return a description of the translocation variant, including destination info."""
+        return f"{self.variant_type}\t{self.chrom}\t{self.position}\t{self.dest_chrom}\t{self.dest_position}\t{self.dest_alt_seq[:10]}\t{self.samples}"
+
+    def vcf_line(self):
+        """Generate a VCF line for both the original and destination variants."""
+        samples_str = '\t'.join(map(str, self.samples.values()))
+        orig_vcf = f"{self.chrom}\t{self.position}\t.\t{self.reference_seq}\t{self.alt_seq}\t.\tPASS\t.\tGT\t{samples_str}"
+        dest_vcf = f"{self.dest_chrom}\t{self.dest_position}\t.\t{self.dest_reference_seq}\t{self.dest_alt_seq}\t.\tPASS\t.\tGT\t{samples_str}"
+        return f"{orig_vcf}\n{dest_vcf}"