diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b70e77af8f95fbabadccba9b585b1368b9efce64..636d9c75dcb3907911b74a291911a2020bb0f322 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,6 +40,32 @@ create_graph: paths: - workflow.svg +deep_run_workflow: + stage: test + image: + name: kaczmarj/apptainer:latest + entrypoint: [""] + tags: + - custom + script: + ## Downloading apps + - mkdir appimgs + - apptainer build appimgs/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest + - apptainer build appimgs/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest + - apptainer build appimgs/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest + - apptainer build appimgs/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest + + ## Running workflow + - mkdir -p data/haplotypes + - cp example/*.fa.gz data/haplotypes + - HOME=$(pwd) + - rm config.yaml && mv example/config_CICD.yaml config.yaml + - apptainer run --bind $HOME:/root appimgs/pan1c-box.sif snakemake -c 4 + artifacts: + paths: + - output/pan1c.pggb.03SC_CICD.gfa + - output/pan1c.pggb.03SC_CICD.workflow.stats.tsv + allow_failure: true diff --git a/README.md b/README.md index 25129741dc8860d82b659d5e1b133db4baf951de..eab2dc396127232bc3ffbc170fc70aa968d55c12 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Pan1c workflow +# Pan1c (Pangenome at chromosome level) workflow -Pan1c *(Pangenome 1 Chromosome)* : a snakemake workflow for creating a pangenome at a chromosomic scale. +Pan1c : a snakemake workflow for creating pangenomes at chromosome scale. Tools used within the workflow : - PanGeTools : https://forgemia.inra.fr/alexis.mergez/pangetools - PanGraTools : https://forgemia.inra.fr/alexis.mergez/pangratools @@ -10,12 +10,13 @@ The file architecture for the workflow is as follows : ``` Pan1c/ ├── config.yaml +├── data +│  └── haplotypes +│ ├── ref.hap<x>.fa.gz +│ ├── samp1.hap<x>.fa.gz +│ └── ... ├── example -│  ├── CICC1445.fa.gz -│  ├── config_CICD.yaml -│  ├── R64.fa.gz -│  ├── SX2.fa.gz -│  └── workflow.svg +│  └── ... ├── getApps.sh ├── README.md ├── runSnakemake.sh @@ -33,10 +34,14 @@ This DAG shows the workflow for a pangenome of `Arabidopsis thaliana` using the `  # Prepare your data -This workflow can take chromosome level assemblies as well as contig level assemblies. +This workflow can take chromosome-level as well as contig-level assemblies, but requires a reference assembly. **Fasta files need to be compressed** using **bgzip** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). -Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example where the fasta file is named CHM13.fa.gz). -Because of the clustering step, this pattern is only needed for the reference assembly. +Sequence names of the reference **must follow this pattern** : `<sample>#<haplotype>#<contig or chromosome name>`. +For example, CHM13 chromosomes (haploid) must be named `CHM13#1#chr..`. Only the reference needs to follow this pattern for its sequence names. Other haplotypes' sequences will be renamed based on the reference and their respective fasta file names. +Fasta files **must also follow a naming pattern** : `<sample>.hap<haplotype>.fa.gz`. Once again with CHM13, the fasta file should be named : `CHM13.hap1.fa.gz`. + +See [PanSN](https://github.com/pangenome/PanSN-spec) for more info on sequence naming. 
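As a quick sanity check, a snippet along these lines (hypothetical, not shipped with Pan1c; `REFERENCE` stands for the `reference` value of your `config.yaml`) can verify both naming conventions before launching snakemake :

```python
import gzip
import os
import re

HAP_DIR = "data/haplotypes"
REFERENCE = "CHM13.hap1.fa.gz"  # assumption: matches 'reference' in config.yaml

# Fasta files must be named <sample>.hap<haplotype>.fa.gz
file_pattern = re.compile(r"^[^.#]+\.hap\d+\.fa\.gz$")
# Reference records must be named <sample>#<haplotype>#<contig or chromosome name> (PanSN-like)
seq_pattern = re.compile(r"^[^#]+#\d+#\S+$")

for fname in os.listdir(HAP_DIR):
    if not file_pattern.match(fname):
        print(f"[warn] {fname} does not match <sample>.hap<haplotype>.fa.gz")

with gzip.open(os.path.join(HAP_DIR, REFERENCE), "rt") as handle:
    for line in handle:
        if line.startswith(">"):
            seqname = line[1:].split()[0]
            if not seq_pattern.match(seqname):
                print(f"[warn] reference record '{seqname}' does not follow the expected pattern")
```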
+ > Note : Input files should be read-only to prevent snakemake from messing with them (which seems to happen in some rare cases). # Download apptainer images diff --git a/Snakefile b/Snakefile index 72fa1e82fcf8ec100466251da1daed494fc23307..260d97dc1662f863105d4ebc1502d8bdc58e7d60 100644 --- a/Snakefile +++ b/Snakefile @@ -5,6 +5,7 @@ import numpy as np import gzip import re +# Getting the list of haplotypes SAMPLES = np.unique([ os.path.basename(f).split('.fa')[0] for f in os.listdir("data/haplotypes/") @@ -16,13 +17,25 @@ nHAP = len(SAMPLES) with gzip.open("data/haplotypes/"+config['reference'], "r") as handle: CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"] +def which_post_analysis(): + ## Simple function to configure which parts of the workflow need to be run + post_analysis_inputs = [ # Default post-analysis steps + "output/pan1c.pggb."+config['name']+".workflow.stats.tsv", + "output/figures", + "output/panacus.reports" + ] + + # Optional post-analysis steps + if config["get_PAV"] == "True": + post_analysis_inputs.append("output/pav.matrices") + + return post_analysis_inputs + rule all: input: "output/pan1c.pggb."+config['name']+".gfa", - "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv", - "output/figures", - "output/pav.matrices", - "output/pan1c.pggb."+config['name']+".stats.tsv" + "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv", + which_post_analysis() rule samtools_index: # Using samtools faidx to index compressed fasta @@ -177,7 +190,7 @@ rule aggregate_stats: input: "output/stats/" output: - "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" + "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" params: apppath=config['app.path'], panname=config['name'] @@ -207,34 +220,37 @@ rule get_pav: rule pggb_log_compression: input: - flag="output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" + flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" output: tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", - chrLen="logs/pan1c.pggb."+config['name']+".chrInputLen.tsv" shell: """ cd logs/pggb tar -zcvf ../../{output.tar} * cd ../.. 
+ """ - if [ -f {output.chrLen} ]; then - rm {output.chrLen} - fi - - for chrFasta in data/chrInputs/*.fa.gz; do - filename=$(basename $chrFasta .fa.gz) - num_bases=$(zgrep -v '^>' $chrFasta | tr -d '\\n' | wc -c) - echo "$filename\t$num_bases" >> {output.chrLen} - done +rule pggb_input_stats: + input: + flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" + output: + "data/chrInputs/pan1c.pggb."+config['name']+".chrInput.stats.tsv" + params: + apppath=config['app.path'], + panname=config['name'] + shell: + """ + apptainer run {params.apppath}/pan1c-env.sif python scripts/chrInputStats.py \ + -f data/chrInputs/*.fa.gz -o {output} -p {params.panname} """ rule workflow_statistics: input: tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", - chrLen="logs/pan1c.pggb."+config['name']+".chrInputLen.tsv", - chrGraphStats="output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" + chrInputStats="data/chrInputs/pan1c.pggb."+config['name']+".chrInput.stats.tsv", + chrGraphStats="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" output: - tsv="output/pan1c.pggb."+config['name']+".stats.tsv", + tsv="output/pan1c.pggb."+config['name']+".workflow.stats.tsv", dir=directory("output/pggb.usage.figs") params: apppath=config['app.path'] @@ -242,5 +258,24 @@ rule workflow_statistics: """ mkdir -p {output.dir} apptainer run {params.apppath}/pan1c-env.sif python scripts/workflowStats.py \ - -i {input.tar} -l {input.chrLen} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir} - """ \ No newline at end of file + -i {input.tar} -c {input.chrInputStats} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir} + """ + +rule panacus_stats: + input: + "data/chrGraphs/graphsList.txt" + output: + directory("output/panacus.reports") + params: + apppath=config['app.path'], + panname=config['name'], + refname=config['reference'] + threads: 8 + run: + shell("mkdir {output}") + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: + shell("bash scripts/getPanacusHG.sh -g {graph} -r $(basename {params.refname} .fa.gz) -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).histgrowth.html -a {params.apppath} -t {threads}") diff --git a/config.yaml b/config.yaml index 190b4bb7316bb49454f97d36999fcfe6536ebe17..7c8337f53157be50a0559a47559cc488fa76a661 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,7 @@ name: '02H' -reference: 'CHM13v2.0.fa.gz' +reference: 'CHM13.hap1.fa.gz' app.path: '/home/amergez/work/apptainer' pggb.params: '-X --skip-viz' odgi.1Dviz.params: '-x 500 -b' -odgi.pcov.params: '-x 500 -O' \ No newline at end of file +odgi.pcov.params: '-x 500 -O' +get_PAV: 'False' diff --git a/example/CICC1445.fa.gz b/example/CICC1445.hap1.fa.gz similarity index 100% rename from example/CICC1445.fa.gz rename to example/CICC1445.hap1.fa.gz diff --git a/example/R64.fa.gz b/example/R64.hap1.fa.gz similarity index 93% rename from example/R64.fa.gz rename to example/R64.hap1.fa.gz index 11e4e68704043afddb9f15926d30a25e8e934b07..dcdd7ed5bb290fdbf634c3156fc31aecd57bd826 100644 Binary files a/example/R64.fa.gz and b/example/R64.hap1.fa.gz differ diff --git a/example/SX2.fa.gz b/example/SX2.hap1.fa.gz similarity index 100% rename from example/SX2.fa.gz rename to example/SX2.hap1.fa.gz diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml index 775a7fc6d3036b53d11b000ba4ecc1deea96c9e2..aeebbf68068d3e005c17201176b49eb1b7957cc4 100644 --- 
a/example/config_CICD.yaml +++ b/example/config_CICD.yaml @@ -1,6 +1,7 @@ name: '03SC_CICD' -reference: 'R64.fa.gz' +reference: 'R64.hap1.fa.gz' app.path: 'appimgs/' pggb.params: '-X --skip-viz' odgi.1Dviz.params: '-x 500 -b' -odgi.pcov.params: '-x 500 -O' \ No newline at end of file +odgi.pcov.params: '-x 500 -O' +get_PAV: 'False' diff --git a/scripts/analyzeWorflowData.py b/scripts/analyzeWorflowData.py new file mode 100644 index 0000000000000000000000000000000000000000..de0e25985707db454bd9d0f628b5cb66007a2a01 --- /dev/null +++ b/scripts/analyzeWorflowData.py @@ -0,0 +1,113 @@ +""" +Workflow statistics analysis script for Pan1c workflow + +Given one or more workflow statistics TSV files produced by the Pan1c workflow, +the script concatenates them and plots resource usage (e.g. peak memory) +against input sequence length statistics. + +@author: alexis.mergez@inrae.fr +@version: 1.0 +""" + +from Bio import SeqIO +import numpy as np +import pandas as pd +import argparse +import gzip +import os +import seaborn as sns +import matplotlib.pyplot as plt + +## Arguments +arg_parser = argparse.ArgumentParser(description='Pan1c workflow statistics analysis') +arg_parser.add_argument( + "--statsfiles", + "-i", + nargs="+", + dest = "statsfiles", + required = True, + help = "Workflow statistics file(s)" + ) +arg_parser.add_argument( + "--output", + "-o", + dest = "outdir", + required = True, + help = "Output directory" + ) +arg_parser.add_argument( + "--debug", + "-d", + action="store_true", + dest = "debug", + help = "Show debug" + ) +args = arg_parser.parse_args() + +## Toolbox +def getRegEquation(x, y): + (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True) + ymean = np.mean(y) + sst = np.sum((y - ymean)**2) + r2 = (1 - (ssr/sst))[0] + return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})' + +## Main script +dfList = [] +for file in args.statsfiles: + dfList.append( + pd.read_csv( + file, + sep='\t', + ) + ) + +df = pd.concat(dfList, ignore_index = True) + +if args.debug: print(df) +df.to_csv("Final.tsv", sep='\t', index=False) + +# Memory versus mean base count +sns.regplot(x=df["input.mean.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df["input.mean.length"], y=df.mem) +plt.xlabel('Mean input sequences length (#bases)') +plt.ylabel('Peak memory usage (GB)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig("MemoryVSMeanSeqLength.png") +plt.close() + +# Memory versus base count +sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df["input.total.length"], y=df.mem) +plt.xlabel('Total input sequences length (#bases)') +plt.ylabel('Peak memory usage (GB)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig("MemoryVSSeqLength.png") +plt.close() + +# PCA (experimental, currently disabled) +if False: + excluded_columns = ["pangenome.name", "chrom.id", "time"] + columns = [ + col + for col in df.columns.tolist() + if col not in excluded_columns + ] + crdf = df.copy() + crdf[columns] = (df[columns]-df[columns].mean())/df[columns].std() + + if args.debug: print(crdf) + + from sklearn.decomposition import PCA + pca = PCA(n_components=3) + newX = pca.fit_transform(crdf[columns]) + + pcaDF = pd.DataFrame(newX) + pcaDF.columns = ["D1", "D2", "D3"] + print(pcaDF) + + finalDF = pd.concat([df[excluded_columns], pcaDF], axis = 1) + print(finalDF) + + fig = sns.lmplot(x="D1", 
y="D2", hue="chrom.id", data = finalDF, fit_reg=False) + plt.savefig("Test.png") \ No newline at end of file diff --git a/scripts/chrInputStats.py b/scripts/chrInputStats.py new file mode 100644 index 0000000000000000000000000000000000000000..9d23c75864aa9939babbc37aa9cfacf15fd04eac --- /dev/null +++ b/scripts/chrInputStats.py @@ -0,0 +1,115 @@ +""" +Input statistics script for Pan1c workflow + +Given a fasta input made for pggb, computes stats regarding the complexity of the sequences + +@author: alexis.mergez@inrae.fr +@version: 1.0 +""" + +from Bio import SeqIO +import numpy as np +import pandas as pd +import argparse +import gzip +import os + +## Arguments +arg_parser = argparse.ArgumentParser(description='Pan1c pggb input statistics') +arg_parser.add_argument( + "--fasta", + "-f", + nargs="+", + dest = "fastafiles", + required = True, + help = "Fasta file(s)" + ) +arg_parser.add_argument( + "--output", + "-o", + dest = "output", + required = True, + help = "Output TSV" + ) +arg_parser.add_argument( + "--panname", + "-p", + dest = "panname", + required = True, + help = "Pangenome name" + ) +arg_parser.add_argument( + "--debug", + "-d", + action="store_true", + dest = "debug", + help = "Show debug" + ) +args = arg_parser.parse_args() + +## Toolbox + +## Main script +seqDict = {} + +# Parsing fasta files +for filename in args.fastafiles: + + chrName = os.path.basename(filename).split(".fa.gz")[0] + + # Reading .fa.gz file and adding records to seqDict + with gzip.open(filename, "rt") as handle: + sequences = SeqIO.parse(handle, "fasta") + + for record in sequences: + seqDict[(chrName, record.id)] = record.seq + +chromList = np.unique([x for x, y in seqDict.keys()]) + +aggregatedStats = {} + +for chrom in chromList: + _stats = { + "input.#N": 0, + "input.total.length": 0 + } + _lengths, _Nper = [], [] + for (chrName, seqid), seq in seqDict.items(): + if chrName == chrom: + # Counting number of Ns + _stats["input.#N"] += seq.count("N") + # Counting total bases + _stats["input.total.length"] += len(seq) + # Saving length and N percentage + _lengths.append(len(seq)) + _Nper.append((seq.count("N")/len(seq))) + + _stats["input.mean.N%"] = np.mean(_Nper)*100 + _stats["input.mean.length"] = np.mean(_lengths) + _stats["input.#sequences"] = len(_lengths) + + #Computing L50 + _lengths = sorted(_lengths, reverse = True) + halfTotal = _stats["input.total.length"]/2 + cumulativeLength, L50 = 0, 0 + for length in _lengths: + cumulativeLength += length + L50 += 1 + + if cumulativeLength >= halfTotal: + break + + _stats["input.L50"] = L50 + + aggregatedStats[(args.panname,chrom)] = _stats + +if args.debug: print(aggregatedStats) + +df = pd.DataFrame.from_dict(aggregatedStats, orient='index') +df.reset_index(inplace=True) +df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True) +df.to_csv( + args.output, + sep='\t', + index = False +) \ No newline at end of file diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh new file mode 100755 index 0000000000000000000000000000000000000000..25ba26bcee0a520775b18a0f4b928b9be824c647 --- /dev/null +++ b/scripts/getPanacusHG.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Get a GFA and returns a Panacus report + +# Initializing arguments +gfa="" # GFA path +refname="" # Reference haplotype +appdir="" # Directory containing apptainer images +threads="" +chrdir="" # Chromosome directory (directory used by pggb to store step files like .paf, etc...) 
+output="" # Output panacus figure + +## Getting arguments +while getopts "g:r:a:t:d:o:" option; do + case "$option" in + g) gfa="$OPTARG";; + r) refname="$OPTARG";; + a) appdir="$OPTARG";; + t) threads="$OPTARG";; + d) chrdir="$OPTARG";; + o) output="$OPTARG";; + \?) echo "Usage: $0 [-g gfa] [-r refname] [-a appdir] [-t threads] [-d chrdir] [-o output]" >&2 + exit 1;; + esac +done + +# Getting chromosome name +chrname=$(basename ${gfa} .gfa) +ref=$(echo $refname | sed 's/.hap/#/') + + +# Getting paths in chromosome graph +echo "[getPanacusHG::odgi::paths] Running on ${gfa}" +apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt + +# Running Panacus +echo "[getPanacusHG::panacus] Running on ${gfa}" +apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \ + -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output} + + diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index 7721a181f79d19ec6a408817956613b6e8619c38..c68f0d1fd8f05f9b7471d360f2f42b91cba229e7 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -24,21 +24,24 @@ while getopts "d:a:t:r:q:o:" option; do done ## Main script -hap=$(basename $inputquery .fa.gz) -ref=$(basename $inputref .fa.gz) +sample=$(basename $inputquery .fa.gz) +hapID=$(echo $sample | sed 's/.hap/#/') +fullref=$(basename $inputref .fa.gz) +ref=$(echo $fullref | sed 's/.hap/#/') + mkdir -p $tmpdir # Running ragtag scaffold -apptainer run $appdir/pytools.sif ragtag.py scaffold \ +apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \ -t $threads -o $tmpdir $inputref $inputquery # Renaming sequence according to naming scheme grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ - sed "s/${ref}#chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa + sed "s/${ref}#chr\([^_]*\)_RagTag/${hapID}#chr\1/g" > $tmpdir/${sample}.ragtagged.fa # Compressing fasta apptainer run --app bgzip $appdir/PanGeTools.sif \ - -@ $threads $tmpdir/${hap}.ragtagged.fa + -@ $threads $tmpdir/${sample}.ragtagged.fa # Moving fa.gz to output dir -mv $tmpdir/${hap}.ragtagged.fa.gz $output +mv $tmpdir/${sample}.ragtagged.fa.gz $output diff --git a/scripts/sr_mapping2graph.sh b/scripts/sr_mapping2graph.sh new file mode 100755 index 0000000000000000000000000000000000000000..e203385b0731a6f5582372150cc1add48ab4787f --- /dev/null +++ b/scripts/sr_mapping2graph.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Map given shrot reads to the pangenome graph using vg giraffe. See wiki for detailled explanation. + +# Initializing arguments +shortreads="" # Fastq file to map against the graph +appdir="" # Directory containing apptainer images +threads="" # Threads count +graph="" # Pangenome graph +output="" # Output gam + +## Getting arguments +while getopts "a:t:g:f:o:" option; do + case "$option" in + a) appdir="$OPTARG";; + t) threads="$OPTARG";; + g) graph="$OPTARG";; + f) shortreads="$OPTARG";; + o) output="$OPTARG";; + \?) 
echo "Usage: $0 [-a apptainer dir] [-t threads] [-g graph] [-f fastq] [-o output gam]" >&2 + exit 1;; + esac +done + +## Main script +apptainer run --app vg $appdir/PanGeTools.sif \ + autoindex -t $threads -g $graph --workflow giraffe -p $(basename $graph) + +apptainer run --app vg $appdir/PanGeTools.sif \ + giraffe -Z ${graph}.giraffe.gbz -m ${graph}.min -d ${graph}.dist -f $shortreads -t $threads -p > $output + +apptainer run --app vg $appdir/PanGeTools.sif \ + stats -a $output diff --git a/scripts/workflowStats.py b/scripts/workflowStats.py index 8426780be5756c324948afd22fa3b847cb299deb..fa9e4f317fb5ae12be9830e3ff9ad1113cfc0fac 100644 --- a/scripts/workflowStats.py +++ b/scripts/workflowStats.py @@ -29,12 +29,12 @@ arg_parser.add_argument( help = "Tarballs containing time logs" ) arg_parser.add_argument( - "--chrinputlen", - "-l", + "--chrInputStats", + "-c", nargs = "+", - dest = "chrinputlenfiles", + dest = "chrInputStatsFiles", required = True, - help = "One or more TSV containing input sequence length (generated by Pan1c workflow)" + help = "One or more TSV containing input chrInput stats (generated by Pan1c workflow)" ) arg_parser.add_argument( "--graphstats", @@ -118,23 +118,25 @@ for tarball in args.tarballs: ## Parsing chromosome input file size (in #bases). # Iterating over input length files -for chrInputLength in args.chrinputlenfiles: +for chrInputStats in args.chrInputStatsFiles: - panname = os.path.basename(chrInputLength).split(".")[2] # Works if name is pan1c.pggb.<panname>.chrInputLen.tsv + panname = os.path.basename(chrInputStats).split(".")[2] # Works if name is pan1c.pggb.<panname>.chrInput.stats.tsv # Reading chrInputLen to get the number of bases used to construct chromosomes - chrSeqLength = pd.read_csv( - chrInputLength, - sep="\t", - header=None, - index_col=0 - ).to_dict()[1] - - if args.debug: print("[timeStats::debug]", chrSeqLength) + chrInputStatsdf = pd.read_csv( + chrInputStats, + sep="\t" + ) - # Adding base counts to aggregatedData - for chromName, chromLength in chrSeqLength.items(): - aggregatedData[(panname,chromName)]["nbases"] = chromLength + # Iterating over dataframe's rows + for index, row in chrInputStatsdf.iterrows(): + colnames = [ + col + for col in chrInputStatsdf.columns.tolist() + if col not in ["pangenome.name", "chrom.id"] + ] + for column in colnames: + aggregatedData[(row["pangenome.name"], row["chrom.id"])][column] = row[column] ## Parsing graph statistics generated by odgi stats graphColDict = { @@ -170,7 +172,7 @@ df.rename(columns=graphColDict, inplace = True) df.time = pd.to_timedelta(df.time) df.mem = df.mem.astype(float) df.cpu = df.cpu.astype(int) -df.nbases = df.nbases.astype(int) +df["input.total.length"] = df["input.total.length"].astype(int) if args.debug: print("[timeStats::debug]", df.dtypes) @@ -179,8 +181,8 @@ df.to_csv(args.output, sep='\t', index=False) # Creating some figures # Time versus base count -sns.regplot(x=df.nbases, y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5}) -equation = getRegEquation(x=df.nbases, y=df.time.dt.total_seconds()) +sns.regplot(x=df["input.total.length"], y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df["input.total.length"], y=df.time.dt.total_seconds()) plt.xlabel('Total input sequences length (#bases)') plt.ylabel('Graph creation time (s)') plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') @@ -188,8 +190,8 @@ 
plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png")) plt.close() # Memory versus base count -sns.regplot(x=df.nbases, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) -equation = getRegEquation(x=df.nbases, y=df.mem) +sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df["input.total.length"], y=df.mem) plt.xlabel('Total input sequences length (#bases)') plt.ylabel('Peak memory usage (GB)') plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') @@ -197,8 +199,8 @@ plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png")) plt.close() # CPU versus base count -sns.regplot(x=df.nbases, y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5}) -equation = getRegEquation(x=df.nbases, y=df.cpu) +sns.regplot(x=df["input.total.length"], y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df["input.total.length"], y=df.cpu) plt.xlabel('Total input sequences length (#bases)') plt.ylabel('CPU usage (%)') plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
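For reviewers who want to sanity-check the regression annotation used in these figures, here is a minimal, self-contained sketch of the same `np.polyfit`-based helper on toy data (toy values only, not workflow output):

```python
import numpy as np

def getRegEquation(x, y):
    # Same approach as in scripts/workflowStats.py: least-squares line,
    # with R2 derived from the residual sum of squares returned by polyfit.
    (slope, intercept), ssr, _rank, _sv, _rcond = np.polyfit(x, y, 1, full = True)
    ymean = np.mean(y)
    sst = np.sum((y - ymean)**2)
    r2 = (1 - (ssr/sst))[0]
    return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})'

# Slightly noisy line so the residual array returned by polyfit is non-empty
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = 2.0 * x + 1.0 + np.array([0.05, -0.04, 0.03, -0.02, 0.01])
print(getRegEquation(x, y))  # expected: slope ~2.00, intercept ~1.00, R2 ~1.00
```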