diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 636d9c75dcb3907911b74a291911a2020bb0f322..085078bf8ca9a2d98667e28292967d5823de4621 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,32 +40,32 @@ create_graph: paths: - workflow.svg -deep_run_workflow: - stage: test - image: - name: kaczmarj/apptainer:latest - entrypoint: [""] - tags: - - custom - script: - ## Downloading apps - - mkdir appimgs - - apptainer build appimgs/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest - - apptainer build appimgs/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest - - apptainer build appimgs/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest - - apptainer build appimgs/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest +# deep_run_workflow: +# stage: test +# image: +# name: kaczmarj/apptainer:latest +# entrypoint: [""] +# tags: +# - custom +# script: +# ## Downloading apps +# - mkdir appimgs +# - apptainer build appimgs/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest +# - apptainer build appimgs/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest +# - apptainer build appimgs/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest +# - apptainer build appimgs/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest - ## Running workflow - - mkdir -p data/haplotypes - - cp example/*.fa.gz data/haplotypes - - HOME=$(pwd) - - rm config.yaml && mv example/config_CICD.yaml config.yaml - - apptainer run --bind $HOME:/root appimgs/pan1c-box.sif snakemake -c 4 - artifacts: - paths: - - output/pan1c.pggb.03SC_CICD.gfa - - output/pan1c.pggb.03SC_CICD.workflow.stats.tsv - allow_failure: true +# ## Running workflow +# - mkdir -p data/haplotypes +# - cp example/*.fa.gz data/haplotypes +# - HOME=$(pwd) +# - rm config.yaml && mv example/config_CICD.yaml config.yaml +# - apptainer run --bind $HOME:/root appimgs/pan1c-box.sif snakemake -c 4 +# artifacts: +# paths: +# - output/pan1c.pggb.03SC_CICD.gfa +# - output/pan1c.pggb.03SC_CICD.workflow.stats.tsv +# allow_failure: true diff --git a/Snakefile b/Snakefile index b1304234749e36e263050cca1bfe24b102887292..81cf6432214f176c39698a892b00b7f9b8a2a1b2 100644 --- a/Snakefile +++ b/Snakefile @@ -99,6 +99,7 @@ rule ragtag_scaffolding: output: "data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" threads: 8 + retries: 1 params: apppath=config["app.path"] shell: @@ -110,6 +111,11 @@ rule ragtag_scaffolding: -r {input.ref} \ -q {input.fa} \ -o {output} + + if [ ! -s {output} ]; then + echo "Empty fasta" + exit 1 + fi """ rule clustering: @@ -164,6 +170,8 @@ rule create_sglAsm_syri_fig: fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", wrkdir=directory('data/asm.syri/{haplotype}') threads: 4 + resources: + mem_gb=48 params: apppath=config["app.path"] shell: @@ -222,29 +230,169 @@ rule create_pggb_input_syri_fig: rm {output.wrkdir}/*.fa """ -rule pggb_on_chr: - # Run pggb on a specific chromosome +# rule pggb_on_chr: +# # Run pggb on a specific chromosome +# input: +# fa="data/chrInputs/{chromosome}.fa.gz", +# gzi="data/chrInputs/{chromosome}.fa.gz.gzi" +# output: +# gfa="data/chrGraphs/{chromosome}.gfa" +# threads: 16 +# params: +# pggb=config['pggb.params'], +# apppath=config['app.path'] +# log: +# cmd="logs/pggb/{chromosome}.pggb.cmd.log", +# time="logs/pggb/{chromosome}.pggb.time.log" +# shell: +# """ +# /usr/bin/time -v -o {log.time} \ +# apptainer run {params.apppath}/pggb.sif \ +# -i {input.fa} {params.pggb} -n {nHAP} -t {threads} -T {threads} -o data/chrGraphs/{wildcards.chromosome} 2>&1 | \ +# tee {log.cmd} +# mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output.gfa} +# """ + +## WIP - Adding rules to split pggb workflow into dedicated steps + +rule wfmash_on_chr: + # Run wfmash on a specific chromosome input input: fa="data/chrInputs/{chromosome}.fa.gz", - gzi="data/chrInputs/{chromosome}.fa.gz.gzi" + fai="data/chrInputs/{chromosome}.fa.gz.fai" output: - gfa="data/chrGraphs/{chromosome}.gfa" + mapping="data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf", + aln="data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf" threads: 16 - params: - pggb=config['pggb.params'], - apppath=config['app.path'] + params: + apppath=config['app.path'], + segment_length=config['wfmash.segment_length'], + mapping_id=config['wfmash.mapping_id'], + wfmash_sec=config['wfmash.secondary'] + log: + cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", + time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", + cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", + time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", + shell: + """ + ## Mapping + /usr/bin/time -v -o {log.time_map} \ + apptainer exec {params.apppath}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} --approx-map \ + 1> {output.mapping} \ + 2> >(tee {log.cmd_map} >&2) + + ## Aligning + /usr/bin/time -v -o {log.time_aln} \ + apptainer exec {params.apppath}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} -i {output.mapping} \ + --invert-filtering \ + 1> {output.aln} \ + 2> >(tee {log.cmd_aln} >&2) + """ + +rule seqwish: + # Run seqwish on alignement produced by wfmash + input: + fa="data/chrInputs/{chromosome}.fa.gz", + aln=rules.wfmash_on_chr.output.aln + output: + "data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa" + threads: 8 + params: + apppath=config['app.path'], + seqwish=config['seqwish.params'] log: - cmd="logs/pggb/{chromosome}.pggb.cmd.log", - time="logs/pggb/{chromosome}.pggb.time.log" + cmd="logs/pggb/{chromosome}.seqwish.cmd.log", + time="logs/pggb/{chromosome}.seqwish.time.log" shell: """ /usr/bin/time -v -o {log.time} \ - apptainer run {params.apppath}/pggb.sif \ - -i {input.fa} {params.pggb} -n {nHAP} -t {threads} -T {threads} -o data/chrGraphs/{wildcards.chromosome} 2>&1 | \ + apptainer exec {params.apppath}/pggb.sif seqwish \ + -s {input.fa} -p {input.aln} -g {output} \ + {params.seqwish} -t {threads} \ + --temp-dir $(dirname {output}) -P 2>&1 | \ tee {log.cmd} - mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output.gfa} """ +rule gfaffix_on_chr: + # Run gfaffix on seqwish graph + input: + rules.seqwish.output + output: + gfa="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa", + transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" + threads: 1 + params: + apppath=config['app.path'] + log: + time="logs/pggb/{chromosome}.gfaffix.time.log" + shell: + """ + /usr/bin/time -v -o {log.time} \ + apptainer exec {params.apppath}/pggb.sif gfaffix \ + {input} -o {output.gfa} -t {output.transform} \ + > /dev/null + """ + +rule odgi_postprocessing: + # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph + input: + rules.gfaffix_on_chr.output.gfa + output: + gfa="data/chrGraphs/{chromosome}.gfa" + threads: 8 + params: + apppath=config['app.path'] + log: + cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", + time_build="logs/pggb/{chromosome}.odgi_build.time.log", + cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", + time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", + cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", + time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", + cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", + time_view="logs/pggb/{chromosome}.odgi_view.time.log" + shell: + """ + OGfile="$(dirname {input})/$(basename {input} .gfa)" + + ## Converting to odgi format and optimizing namespace + /usr/bin/time -v -o {log.time_build} \ + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ + tee {log.cmd_build} + + ## Unchoping the nodes (merges unitigs into single nodes) + /usr/bin/time -v -o {log.time_unchop} \ + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ + tee {log.cmd_unchop} + + ## Sorting the nodes + /usr/bin/time -v -o {log.time_sort} \ + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ + -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ + tee {log.cmd_sort} + + ## Exporting to gfa + /usr/bin/time -v -o {log.time_view} \ + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + view -i $OGfile.unchoped.sorted.og -g \ + 1> {output.gfa} \ + 2> >(tee {log.cmd_view} >&2) + """ + +## ---------------------------------------- + rule generate_graph_list: # Generate a text file containing all created graphs input: @@ -288,7 +436,7 @@ rule graph_stats: shell: """ apptainer run --app gfastats {params.apppath}/PanGeTools.sif \ - -g {input.graph} -P -o $(dirname {output.genstats})/{wildcards.chromosome} + -g {input.graph} -P -o $(dirname {output.genstats})/{wildcards.chromosome} -t {threads} """ rule graph_figs: @@ -331,12 +479,13 @@ rule aggregate_graphs_stats: --outputPaths {output.pathstats} --panname {params.panname} """ -rule gfaTagR: +rule final_graph_tagging: # Add metadata to the final GFA input: graph="output/pan1c.pggb."+config['name']+".gfa", output: "output/pan1c.pggb."+config['name']+".gfa.metadata" + threads: 1 params: apppath=config['app.path'], panname=config['name'] @@ -365,7 +514,6 @@ rule pggb_input_stats: rule core_statistics: # Aggregate chrInput, chrGraph and pggb statistics into a single tsv input: - pggbStats=expand("logs/pggb/{chromosome}.pggb.time.log", chromosome=CHRLIST), chrInputStats="output/stats/pan1c.pggb."+config['name']+".chrInput.stats.tsv", chrGraphStats="output/stats/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" output: @@ -378,7 +526,7 @@ rule core_statistics: """ mkdir -p {output.dir} apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \ - --pggbStats {input.pggbStats} --chrInputStats {input.chrInputStats} \ + --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.panname} """ diff --git a/config.yaml b/config.yaml index d3677ba3a71233ce885f79f80e85faf70bbb2637..7780f37515a3d69019c4a9e172c9f37f41f4c34a 100644 --- a/config.yaml +++ b/config.yaml @@ -5,6 +5,16 @@ name: '<pangenome name>' reference: 'CHM13.hap1.fa.gz' # Directory of apptainer images (downloaded with getApps.sh) app.path: '<path>' + +# Core parameters (WIP) +# Wfmash alignement parameters : leave to None if unknown +wfmash.segment_length: 5000 +wfmash.mapping_id: 90 +wfmash.secondary: '-k 19 -H 0.001 -X' + +# Seqwish parameters +seqwish.params: '-B 10000000 -k 19 -f 0' + # Optional parameters for PGGB pggb.params: '-X --skip-viz' # Odgi 1D and path coverage viz parameters @@ -18,3 +28,6 @@ get_PAV: 'False' get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference +# Debug options +debug: 'False' + diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml index ccec4c32daa3f4f60d5064351c528f10d7d4fb8b..111a904ade2c07331426caae26a0aa1984d134d6 100644 --- a/example/config_CICD.yaml +++ b/example/config_CICD.yaml @@ -1,12 +1,29 @@ +## Main parameters +# Pangenome name name: '03SC_CICD' +# Reference fasta (BGziped) reference: 'R64.hap1.fa.gz' +# Directory of apptainer images (downloaded with getApps.sh) app.path: 'appimgs/' + +# Core parameters (WIP) +# Wfmash alignement parameters : leave to None if unknown +wfmash.segment_length: 5000 +wfmash.mapping_id: 90 +wfmash.secondary: '-k 19 -H 0.001 -X' + +# Seqwish parameters +seqwish.params: '-B 10000000 -k 19 -f 0' + +# Optional parameters for PGGB pggb.params: '-X --skip-viz' +# Odgi 1D and path coverage viz parameters odgi.1Dviz.params: '-x 500 -b' odgi.pcov.params: '-x 500 -O' -## Control over optional parts of the workflow -# get_PAV is very long the more haplotypes there are (all vs all comparison) + +## Optional parts of the workflow +# Computes Presence Absence Variant matrices for Panache (not recommended; very long) get_PAV: 'False' -# get_allASM_SyRI controls if the SyRI figure with all haplotypes diplayed should be created (longer with #haplotypes) -get_allASM_SyRI: 'False' -get_ASMs_SyRI: 'False' +# Computes SyRI figures for haplotypes +get_allASM_SyRI: 'False' # All vs all +get_ASMs_SyRI: 'False' # Haplotype vs Reference diff --git a/runSnakemake.sh b/runSnakemake.sh index 5f63d8d1715590abab5f8cfebd110a5844f67bad..4206dc0ac47eb9ab6ed7e1b4818f8b14e9039a9d 100755 --- a/runSnakemake.sh +++ b/runSnakemake.sh @@ -12,5 +12,5 @@ module load containers/Apptainer/1.2.5 # Creating DAG apptainer run <path_to_pan1c-box>/pan1c-box.sif snakemake -c $(nproc) --dag | dot -Tsvg > workflow.svg # Running the workflow -apptainer run <path_to_pan1c-box>/pan1c-box.sif snakemake -c $(nproc) +/usr/bin/time -v -o whole.run.time.log apptainer run <path_to_pan1c-box>/pan1c-box.sif snakemake -c $(nproc) diff --git a/scripts/coreStats.py b/scripts/coreStats.py index c81a347837235f3057e5a8e6194d263b57b87403..91e693fc7aa1bc9bcb03f7c889ead49a48958363 100644 --- a/scripts/coreStats.py +++ b/scripts/coreStats.py @@ -24,10 +24,9 @@ import seaborn as sns arg_parser = argparse.ArgumentParser(description='Pan1c usage stats script') arg_parser.add_argument( "--pggbStats", - nargs="+", - dest = "pggbStatsFiles", + dest = "pggbStats", required = True, - help = "Time logs for pggb" + help = "Time logs for all pggb steps" ) arg_parser.add_argument( "--chrInputStats", @@ -83,12 +82,15 @@ def getRegEquation(x, y): ## Main script aggregatedData = {} # Temporary data holder. Will be transformed into dataframe after having parsed every inputs +PGGBsteps = [] # Storing PGGB steps name #%% Parsing section ## Parsing ressources usage for pggb # Iterating through all pggb time stats -for pggbStats in args.pggbStatsFiles: +TimeStatsList = [os.path.join(args.pggbStats, file) for file in os.listdir(args.pggbStats) if file.split('.')[-2] == "time"] +print(TimeStatsList) +for pggbStats in TimeStatsList: _tmpdf = pd.read_csv( pggbStats, @@ -101,7 +103,11 @@ for pggbStats in args.pggbStatsFiles: # Getting Time, CPU and memory stats time, cpu, memory = _tmpdf.iloc[4][1], int(_tmpdf.iloc[3][1][:-1]), int(_tmpdf.iloc[9][1])/((1024)**2) - chrid = (os.path.basename(pggbStats)).split(".")[0] + chrid, stepName = (os.path.basename(pggbStats)).split(".")[:2] + stepName = stepName.replace("_", ".") + + # Adding steps to identified pggb steps + if not stepName in PGGBsteps: PGGBsteps.append(stepName) # Convert time to pd.timedelta (handling the 2 input types) try : @@ -109,10 +115,15 @@ for pggbStats in args.pggbStatsFiles: except : time = pd.to_timedelta(f"00:{time}") - if args.debug: print(f"[timeStats::debug] Pangenome Name: {args.panname}\tChr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb") + if args.debug: print(f"[timeStats::debug] Pangenome Name: {args.panname}\tChr: {chrid}\tStep: {stepName}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb") # Adding to aggregatedData - aggregatedData[(args.panname, chrid)] = {"pggb.time": time, "pggb.cpu": cpu, "pggb.mem": memory} + if (args.panname, chrid) not in aggregatedData.keys(): + aggregatedData[(args.panname, chrid)] = {f"{stepName}.time": time, f"{stepName}.cpu": cpu, f"{stepName}.mem": memory} + else : + aggregatedData[(args.panname, chrid)][f"{stepName}.time"] = time + aggregatedData[(args.panname, chrid)][f"{stepName}.cpu"] = cpu + aggregatedData[(args.panname, chrid)][f"{stepName}.mem"] = memory ## Parsing chromosome input file size (in #bases). # Iterating over input length files @@ -157,9 +168,13 @@ for graphStats in args.chrGraphStatsFiles: df = pd.DataFrame.from_dict(aggregatedData, orient='index') df.reset_index(inplace=True) df.rename(columns={'level_0': 'Pangenome.name', 'level_1': 'Chr.id'}, inplace=True) -df["pggb.time"] = pd.to_timedelta(df["pggb.time"]) -df["pggb.mem"] = df["pggb.mem"].astype(float) -df["pggb.cpu"] = df["pggb.cpu"].astype(int) + +if args.debug: print(df) + +for step in PGGBsteps: + df[f"{step}.time"] = pd.to_timedelta(df[f"{step}.time"]) + df[f"{step}.mem"] = df[f"{step}.mem"].astype(float) + df[f"{step}.cpu"] = df[f"{step}.cpu"].astype(int) df["input.total.length"] = df["input.total.length"].astype(int) if args.debug: print("[timeStats::debug]", df.dtypes) @@ -167,6 +182,8 @@ if args.debug: print("[timeStats::debug]", df.dtypes) # Saving the dataframe df.to_csv(args.output, sep='\t', index=False) +## Deprecated functions that may be repurposed in another script +""" # Creating some figures # Time versus base count sns.regplot(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5}) @@ -203,3 +220,4 @@ plt.ylabel('Peak memory usage (GB)') plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png")) plt.close() +""" \ No newline at end of file diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 761955cd2b445ebd1e1ea60af93080f9550bc310..2d068542076294be5327637d28cb3ec3c365470e 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -51,8 +51,9 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do #Â Minimap2 genome vs genome alignment apptainer run --app minimap2 $appdir/PanGeTools.sif \ - -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \ - apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile + -a -x asm5 -c --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads > $wrkdir/$bamFile.sam + apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile + rm $wrkdir/$bamFile.sam # Syri on previous alignment apptainer run $appdir/pan1c-env.sif \ diff --git a/scripts/getTags.py b/scripts/getTags.py index 778c7ef0ebaac988f645a2620dbf0a87624ef223..4a5a95ede17382d8c2afc4b703b30f5fa53d0cd8 100644 --- a/scripts/getTags.py +++ b/scripts/getTags.py @@ -1,7 +1,7 @@ """ Tag list creation script for Pan1c workflow -Returns a list of tags to ad at the top of the final gfa file as commented lines. +Returns a list of tags to add at the top of the final gfa file as commented lines. @author: alexis.mergez@inrae.fr @version: 1.0 diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py index b3688bb4cbbf12ee3c5a9c8f09b068e020fceb4e..6284eb4946b7e742726714c8917ad148e818beae 100644 --- a/scripts/inputClustering.py +++ b/scripts/inputClustering.py @@ -52,6 +52,8 @@ def getChr(name): ## Main script seqDict = {} +if args.debug: print(args.fastafiles) + # Parsing fasta files for filename in args.fastafiles: