Commit 0e2c7b24 authored by Erwan DELAGE's avatar Erwan DELAGE
Browse files

Reference based read filtering added, read loss tracking updated (only for dada2)

parent e63b5b2d
......@@ -53,11 +53,11 @@ def initialize_pipeline(arguments):
configfile: config_file
json_config_path = ("snakefile", config_file)
else:
try :
try:
json_config_path = ("command line", arguments[arguments.index("--configfile") + 1])
except ValueError :
except ValueError:
json_config_path = ("command line", arguments[arguments.index("--configfiles") + 1])
# Check the configuration file
check_config()
......@@ -192,9 +192,9 @@ def check_config():
pass
# Qiime2 authors recommend writing the adonis formula between quotes to "avoid unpleasant surprises"
try :
try:
config["adonis"]["p-formula"] = "\"" + config["adonis"]["p-formula"] + "\""
except KeyError :
except KeyError:
pass
......@@ -441,9 +441,12 @@ def getInput(tool, given_file=""):
if tool == "cutadapt":
return getOutput("import", "import.qza")
if tool == "join_pairs":
if tool == "filter_reads":
return getOutput("cutadapt", "cutadapt.qza") if "cutadapt" in config else getOutput("import", "import.qza")
if tool == "join_pairs":
return getOutput("filter_reads", "filtered_reads.qza") if "filter_reads" in config else getOutput("cutadapt", "cutadapt.qza") if "cutadapt" in config else getOutput("import", "import.qza")
if tool == "quality_filter":
return getOutput("join_pairs", "joined_sequences.qza") if config["global"]["library-type"] == "paired-end" else getOutput("cutadapt", "cutadapt.qza") if "cutadapt" in config else getOutput("import", "import.qza")
......@@ -457,7 +460,7 @@ def getInput(tool, given_file=""):
return getOutput("quality_filter", "filtered_sequences.qza")
if tool == "dada2":
return getOutput("cutadapt", "cutadapt.qza") if "cutadapt" in config else getOutput("import", "import.qza")
return getOutput("filter_reads", "filtered_reads.qza") if "filter_reads" in config else getOutput("cutadapt", "cutadapt.qza") if "cutadapt" in config else getOutput("import", "import.qza")
if tool == "merge_denoiser_results":
return getOutput("denoiser", "table.qza")
......@@ -519,6 +522,10 @@ def getOutput(tool, given_file=""):
if tool == "denoiser":
tool = config["global"]["denoiser"]
# Read filtering (based on reference sequences) can be performed either with Qiime or with a custom process based on SortMeRNA
if tool == "filter_reads":
tool = "filter_reads/" + config["filter_reads"]["method"]
# Default path format
out_tool = outdir + "/" + tool + "/"
......@@ -630,7 +637,6 @@ def format_all_rule():
all_input.append(getOutput("deseq2/asv", config["deseq2"]["condition"]))
all_input += expand(getOutput("deseq2/{taxorank}", config["deseq2"]["condition"]), taxorank=["phylum", "class", "order", "family", "genus", "species"])
if "ancom" in config.keys():
all_input += expand(getOutput("ancom", "{taxorank}/volcano/index.html"), taxorank=["phylum", "class", "order", "family", "genus", "species", "asv"])
......@@ -713,11 +719,18 @@ rule split_manifest:
# Import Fastq files into a QIIME2 artifact, then summarize per-sample read
# counts (demux summarize) and export the HTML summary.
# NOTE(review): this span is a merged diff view — the first `output:` line and
# the first `qiime tools import` shell line are the PRE-commit versions; the
# lines immediately following each are their POST-commit replacements. Only one
# of each pair belongs in the real file.
rule import:
input: getInput("import")
output: getOutput("import", "import.qza")
output: getOutput("import", "import.qza"),
getOutput("import", "summary.qzv"),
getOutput("import", "summary/index.html")
conda: qiime2_yml
params: options = getOptions("import")
shell: """
qiime tools import --input-path {input} --output-path {output} {params.options}
# Import fastqs in Qiime
qiime tools import --input-path {input} --output-path {output[0]} {params.options}
# Fastqs stats summary
qiime demux summarize --i-data {output[0]} --o-visualization {output[1]}
qiime tools export --input-path {output[1]} --output-path `dirname {output[2]}`
"""
#####################
......@@ -726,13 +739,37 @@ rule import:
# Remove PCR primers with cutadapt, discarding reads that do not start with the
# primer, then summarize per-sample read counts and export the HTML summary.
# NOTE(review): merged diff view — the first `output:` line and the first
# `qiime cutadapt` shell line are the PRE-commit versions; the lines that follow
# each are their POST-commit replacements. Only one of each pair belongs in the
# real file.
rule cutadapt:
input: getInput("cutadapt")
output: getOutput("cutadapt", "cutadapt.qza")
output: getOutput("cutadapt", "cutadapt.qza"),
getOutput("cutadapt", "summary.qzv"),
getOutput("cutadapt", "summary/index.html")
conda: qiime2_yml
params: options = getOptions("cutadapt"),
lib_type = pairedOrSingle("cutadapt")
shell: """
# Remove PCR primers and discard reads that do not start with it
qiime cutadapt {params.lib_type} --i-demultiplexed-sequences {input} --o-trimmed-sequences {output} --p-discard-untrimmed {params.options}
qiime cutadapt {params.lib_type} --i-demultiplexed-sequences {input} --o-trimmed-sequences {output[0]} --p-discard-untrimmed {params.options}
# Fastqs stats summary
qiime demux summarize --i-data {output[0]} --o-visualization {output[1]}
qiime tools export --input-path {output[1]} --output-path `dirname {output[2]}`
"""
# Optional reference-based read filtering (qiime quality-control filter-reads).
# The rule only exists when the config has a "filter_reads" section; its input is
# resolved by getInput() (cutadapt output if cutadapt ran, otherwise the import
# artifact), and getOutput() routes results under filter_reads/<method>/.
if "filter_reads" in config:
rule filter_reads:
input: getInput("filter_reads")
output: getOutput("filter_reads", "filtered_reads.qza"),
getOutput("filter_reads", "summary.qzv"),
getOutput("filter_reads", "summary/index.html")
conda: qiime2_yml
# "method" is used by getOutput() to pick the output sub-directory; the second
# argument to getOptions() presumably excludes it from the CLI options —
# TODO(review): confirm against getOptions()'s definition.
params: options = getOptions("filter_reads", ["method"])
shell: """
# Positive/Negative filtering
qiime quality-control filter-reads --i-demultiplexed-sequences {input[0]} {params.options} --o-filtered-sequences {output[0]}
# Fastqs stats summary
qiime demux summarize --i-data {output[0]} --o-visualization {output[1]}
qiime tools export --input-path {output[1]} --output-path `dirname {output[2]}`
"""
# If the library is paired-end, reads need to be joined together for further analysis
......@@ -1043,22 +1080,22 @@ if "denoiser" in config["global"]:
"""
rule adonis:
input: uw_unifrac = getInput("adonis", "unweighted_unifrac_distance_matrix.qza"),
w_unifrac = getInput("adonis", "weighted_unifrac_distance_matrix.qza"),
jaccard = getInput("adonis", "jaccard_distance_matrix.qza"),
bray_curtis = getInput("adonis", "bray_curtis_distance_matrix.qza"),
metadata = config["global"]["context"]
output: getOutput("adonis", "unweighted_unifrac_distance_matrix.qzv"),
getOutput("adonis", "weighted_unifrac_distance_matrix.qzv"),
getOutput("adonis", "jaccard_distance_matrix.qzv"),
getOutput("adonis", "bray_curtis_distance_matrix.qzv"),
getOutput("adonis", "unweighted_unifrac_distance_matrix/index.html"),
getOutput("adonis", "weighted_unifrac_distance_matrix/index.html"),
getOutput("adonis", "jaccard_distance_matrix/index.html"),
getOutput("adonis", "bray_curtis_distance_matrix/index.html")
params: options = getOptions("adonis")
conda: qiime2_yml
shell: """
input: uw_unifrac = getInput("adonis", "unweighted_unifrac_distance_matrix.qza"),
w_unifrac = getInput("adonis", "weighted_unifrac_distance_matrix.qza"),
jaccard = getInput("adonis", "jaccard_distance_matrix.qza"),
bray_curtis = getInput("adonis", "bray_curtis_distance_matrix.qza"),
metadata = config["global"]["context"]
output: getOutput("adonis", "unweighted_unifrac_distance_matrix.qzv"),
getOutput("adonis", "weighted_unifrac_distance_matrix.qzv"),
getOutput("adonis", "jaccard_distance_matrix.qzv"),
getOutput("adonis", "bray_curtis_distance_matrix.qzv"),
getOutput("adonis", "unweighted_unifrac_distance_matrix/index.html"),
getOutput("adonis", "weighted_unifrac_distance_matrix/index.html"),
getOutput("adonis", "jaccard_distance_matrix/index.html"),
getOutput("adonis", "bray_curtis_distance_matrix/index.html")
params: options = getOptions("adonis")
conda: qiime2_yml
shell: """
qiime diversity adonis --i-distance-matrix {input.uw_unifrac} --m-metadata-file {input.metadata} {params.options} --o-visualization {output[0]}
qiime diversity adonis --i-distance-matrix {input.w_unifrac} --m-metadata-file {input.metadata} {params.options} --o-visualization {output[1]}
qiime diversity adonis --i-distance-matrix {input.jaccard} --m-metadata-file {input.metadata} {params.options} --o-visualization {output[2]}
......
import pandas as pd
import seaborn as sns
import sys
import json

# Summarize read loss across the pipeline: collect per-sample read counts after
# each processing step (import, optional cutadapt, optional reference-based
# filtering, optional dada2), plot the evolution of read numbers and export the
# resulting table.
#
# NOTE(review): the original span was a merged diff view containing both the
# pre- and post-commit variants of this script (duplicate argv parsing, a
# removed multiqc/manifest section, dada2 stats processed twice, two boxplot
# calls, two table exports). This body keeps only the post-commit variant.
#
# Usage: <script> <outdir> <config.json>
OUTDIR = sys.argv[1]
CONFIG = sys.argv[2]

#############
# LOAD DATA #
#############
with open(CONFIG) as json_file:
    config = json.load(json_file)

################
#  Input stats #
################
# Per-sample read counts right after import ("qiime demux summarize" export).
stats = pd.read_csv(OUTDIR + "/import/summary/per-sample-fastq-counts.tsv", sep="\t", index_col=0)
stats = pd.DataFrame(stats.iloc[:, 0])
stats.columns = ["input"]

################################################################
# If cutadapt was run, get number of reads after its execution #
################################################################
try:
    df = pd.read_csv(OUTDIR + "/cutadapt/summary/per-sample-fastq-counts.tsv", sep="\t", index_col=0)
    df = pd.DataFrame(df.iloc[:, 0])
    df.columns = ["cutadapt"]
    stats = pd.merge(stats, df, left_index=True, right_index=True)
except FileNotFoundError:
    print("No cutadapt filtering found for this run.")

########################################################################################
# If a filtering on a reference was performed, get number of reads after its execution #
########################################################################################
try:
    # KeyError: no "filter_reads" section in the config.
    # FileNotFoundError: the step did not produce a summary file.
    method = config["filter_reads"]["method"]
    df = pd.read_csv(OUTDIR + "/filter_reads/{}/summary/per-sample-fastq-counts.tsv".format(method),
                     sep="\t", index_col=0)
    df = pd.DataFrame(df.iloc[:, 0])
    df.columns = ["reference_based_filtering"]
    stats = pd.merge(stats, df, left_index=True, right_index=True)
except (KeyError, FileNotFoundError):
    print("No reference based filtering found for this run.")

#####################################################
# If dada2 was run, append its per-step read counts #
#####################################################
try:
    dada2_stats = pd.read_csv(OUTDIR + "/dada2/stats/stats.csv", sep="\t", index_col=0)
    # Drop the Qiime2 type-annotation row so the counts can be cast to int.
    dada2_stats.drop("#q2:types", inplace=True)
    stats = pd.merge(stats, dada2_stats[["filtered", "denoised", "merged",
                                         "non-chimeric"]], left_index=True, right_index=True)
    stats = stats.astype(int)
except FileNotFoundError:
    print("No dada2 data found for this run.")

#####################################
# Plot evolution of number of reads #
#####################################
stats_plot = stats.copy()
steps = list(stats.columns)
stats_plot["sample_id"] = stats_plot.index
stats_plot = pd.melt(stats_plot, id_vars="sample_id", value_vars=steps)
stats_plot.columns = ["sample_id", "processing_step", "nb_reads"]
nb_reads_boxplot = sns.boxplot(x="processing_step", y="nb_reads", data=stats_plot)
# Export plot
nb_reads_boxplot.figure.savefig(OUTDIR + "/summarize_table/plots/evolution_nbreads.svg")

#####################################
# Add percentage #
#####################################
# Express every post-input step as a percentage of the input read count.
for step in stats.columns[1:]:
    stats["percentage of reads after {}".format(step)] = round(stats[step] * 100 / stats["input"], 2)

# Export table
stats.to_csv(OUTDIR + "/summarize_table/plots/evolution_nbreads.tsv", sep="\t")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment