diff --git a/README.md b/README.md index 552a70b43ef1bb1535f327d088cd1d49335d13dc..4ee10d3a2ef3ab9a989f8b5cfb9f6d8b3d531a81 100644 --- a/README.md +++ b/README.md @@ -7,26 +7,27 @@ Tools used within the workflow : The file architecture for the workflow is as follow : ``` -pan1c +Pan1c/ ├── config.yaml -├── copyHaplotypes.sh -├── data -│  └── haplotypes +├── README.md ├── runSnakemake.sh ├── scripts -│  ├── bin_split.py -│  └── statsAggregation.py -├── Snakefile -└── README.md +│ ├── bin_split.py +│ ├── getPanachePAV.sh +│ ├── inputClustering.py +│ ├── ragtagChromInfer.sh +│ └── statsAggregation.py +└── Snakefile ``` # Prepare your data -This workflow can take chromosome level assemblies as well as contig level assembly. +This workflow can take chromosome level assemblies as well as contig level assemblies. **Fasta files need to be compressed** using bgzip2 (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). -Sequence names **should follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). -Make your input file read only to prevent snakemake to mess with them. +Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). +Because of the clustering step, this pattern is only needed for the reference assembly. +Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases). # Usage -Create `data/haplotypes` and put all haplotypes in it. -Change reference name and apptainer image path in `config.yml`. -Change variables in `runSnakemake.sh` to match your needs (job name, mail, etc...). -Run `runSnakemake.sh` and wait ! \ No newline at end of file +Clone this repo and create `data/haplotypes`. Place all your haplotypes in it. +Change the reference name and the apptainer image path in `config.yaml`. +Finally, change variables in `runSnakemake.sh` to match your needs (threads, memory, job name, mail, etc...). +Go in the root directory of the repo and run `sbatch runSnakemake.sh` ! \ No newline at end of file diff --git a/Snakefile b/Snakefile index 45335d568e6411fa5144b0079e7dda3eb960dae9..a2eef6fd6651c937ca4de78a651f28e853283e26 100644 --- a/Snakefile +++ b/Snakefile @@ -4,7 +4,6 @@ import os import numpy as np SAMPLES = np.unique([os.path.basename(f).split('.fa')[0] for f in os.listdir("data/haplotypes/")]) -print(SAMPLES) nHAP = len(SAMPLES) rule all: @@ -44,34 +43,33 @@ rule samtools_index: "apptainer run --app samtools {params.apppath}/PanGeTools.sif " "faidx {input.fa}" -rule wfmash_premapping: - # Mapping haplotypes contigs/chr on a reference +rule ragtag_scaffolding: + # Scaffold input haplotype against the reference to infer chromosome scale sequences input: ref="data/haplotypes/"+config['reference'], reffai="data/haplotypes/"+config['reference']+".fai", refgzi="data/haplotypes/"+config['reference']+".gzi", - fa="data/input_haplotypes/aggregated.fa.gz", - fagzi="data/input_haplotypes/aggregated.fa.gz.gzi", - fafai="data/input_haplotypes/aggregated.fa.gz.fai" + fa="data/haplotypes/{haplotype}.fa.gz" output: - protected("data/premapping/premapping.paf") - params: - wfmash=config['wfmash.params'], + "data/ragtag_hap/{haplotype}.ragtagged.fa.gz" + threads: 8 + params: apppath=config["app.path"] - threads: workflow.cores - log: - time="logs/premapping/premapping.time.log" shell: """ - /usr/bin/time -v -o {log.time} \ - apptainer run --app wfmash {params.apppath}/PanGeTools.sif \ - -t {threads} {params.wfmash} {input.ref} {input.fa} > {output} + bash scripts/ragtagChromInfer.sh \ + -d $(dirname {output})/{wildcards.haplotype} \ + -a {params.apppath} \ + -t {threads} \ + -r {input.ref} \ + -q {input.fa} \ + -o {output} """ + checkpoint clustering: # Reading alignment file to create bins for each chromosome input: - paf="data/premapping/premapping.paf", - fa="data/input_haplotypes/aggregated.fa.gz" + expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: directory("data/chrInputs") threads: workflow.cores @@ -80,7 +78,7 @@ checkpoint clustering: shell: """ mkdir -p {output} - python scripts/bin_split.py --paf {input.paf} --dir {output} --fasta {input.fa} + apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py --fasta {input} --output {output} for file in {output}/*.fa; do apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file done diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py new file mode 100644 index 0000000000000000000000000000000000000000..8ceed3406048705634322cc7c10549314ca4419e --- /dev/null +++ b/scripts/inputClustering.py @@ -0,0 +1,89 @@ +""" +Clustering script for Pan1c workflow + +Given a list of fasta files with records ids following the pattern <haplotype>#<chromosome id>, +the script clusters sequence by chromosome and returns several fasta. +Each output fasta contains sequences related to one chromosome only. + +@author: alexis.mergez@inrae.fr +@version: 1.0 +""" + +from Bio import SeqIO +import numpy as np +import argparse +import gzip +import os + +## Arguments +arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype clustering') +arg_parser.add_argument( + "--fasta", + "-f", + nargs="+", + dest = "fastafiles", + required = True, + help = "Fasta file(s)" + ) +arg_parser.add_argument( + "--output", + "-o", + dest = "outdir", + required = True, + help = "Output directory" + ) +arg_parser.add_argument( + "--debug", + "-d", + action="store_true", + dest = "debug", + help = "Show debug" + ) +args = arg_parser.parse_args() + +## Toolbox +def getChr(name): + """ + Simply returns the chromosome name of a given fasta record + """ + return name.split('#')[-1] + +## Main script +seqDict = {} + +# Parsing fasta files +for filename in args.fastafiles: + + # Reading .fa.gz file and adding records to seqDict + with gzip.open(filename, "rt") as handle: + sequences = SeqIO.parse(handle, "fasta") + + for record in sequences: + seqDict[record.id] = record + +# Inferring the list of available chromosomes from the sequence record ids +chrList = np.unique([ + getChr(recordid) for recordid in seqDict.keys() +]) + +# Clustering records based on chromosome tag in records id +chrSeq = {} + +for recordid, record in seqDict.items(): + # Getting the chromosome id + _chrname = getChr(recordid) + + # If not encountered before, create a key for this chromosome in chrSeq + if _chrname not in list(chrSeq.keys()): + chrSeq[_chrname] = [] + + # Add the record to the chromosome bin in chrSeq + chrSeq[_chrname].append(record) + +# Debug purpose print +if args.debug: print(chrSeq.keys()) + +# Writing chromosome specific fasta file +for chrName in chrSeq.keys(): + with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle: + SeqIO.write(chrSeq[chrName], output_handle, "fasta") \ No newline at end of file diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh new file mode 100755 index 0000000000000000000000000000000000000000..bb529b6aaccc95b767e0b61bd31b754487e539ba --- /dev/null +++ b/scripts/ragtagChromInfer.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Produce a Scaffolded version of the haplotype using RagTag + +# Initializing arguments +tmpdir="" # Directory used by ragtag +appdir="" # Directory containing apptainer images +threads="" # Threads count +inputref="" # Reference fasta +inputquery="" # Query fasta +output="" # Output fasta + +## Getting arguments +while getopts "d:a:t:r:q:o:" option; do + case "$option" in + d) tmpdir="$OPTARG";; + a) appdir="$OPTARG";; + t) threads="$OPTARG";; + r) inputref="$OPTARG";; + q) inputquery="$OPTARG";; + o) output="$OPTARG";; + \?) echo "Usage: $0 [-d tmpdir] [-a appdir] [-t threads] [-r inputref] [-q inputquery] [-o output]" >&2 + exit 1;; + esac +done + +## Main script +hap=$(basename $inputquery .fa.gz) +ref=$(basename $inputref .fa.gz) +mkdir -p $tmpdir + +# Running ragtag scaffold +apptainer run $appdir/pytools.sif ragtag.py scaffold \ + -t $threads -o $tmpdir $inputref $inputquery + +# Renaming sequence according to naming scheme +grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ + sed "s/${ref}#chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa + +# Compressing fasta +apptainer run --app bgzip $appdir/PanGeTools.sif \ + -@ $threads $tmpdir/${hap}.ragtagged.fa + +# Moving fa.gz to output dir +mv $tmpdir/${hap}.ragtagged.fa.gz $output