Commit 61372ac6 authored by Erwan DELAGE's avatar Erwan DELAGE
Browse files

Merge branch 'develop'

parents 7343e34a 225aee30
......@@ -82,6 +82,13 @@
"m-metadata-column" : { "type": "string" }
},
"required": ["m-metadata-column"]
},
"deseq2": {
"type": "object",
"properties": {
"condition" : { "type": "string" }
},
"required": ["condition"]
}
},
"required": ["global"],
......
.DS_Store
.snakemake
out_dada2
out_deblur
reference
config-deblur.json
microSysMics.code-workspace
conda/phylodeseq.yml
test/manifest_forward.csv
test/manifest.csv
......@@ -50,11 +50,14 @@ def initialize_pipeline(arguments):
# If no configuration file has been specified via the snakemake command line, use the one specified at the top of this snakefile
global json_config_path
if "--configfile" not in arguments:
if "--configfile" not in arguments and "--configfiles" not in arguments:
configfile: config_file
json_config_path = ("snakefile", config_file)
else:
json_config_path = ("command line", arguments[arguments.index("--configfile") + 1])
try:
json_config_path = ("command line", arguments[arguments.index("--configfile") + 1])
except ValueError:
json_config_path = ("command line", arguments[arguments.index("--configfiles") + 1])
# Check the configuration file
check_config()
......@@ -598,6 +601,7 @@ def format_all_rule():
all_input.append(getOutput("export_table_tsv", "abundance_table.tsv"))
all_input.append(getOutput("alpha_rarefaction", "results/index.html"))
all_input.append(getOutput("summarize_table", "plots/evolution_nbreads.tsv"))
if config["global"]["parallel-samples"]:
all_input.append(getOutput("merge_denoiser_stats", "html"))
......@@ -611,12 +615,16 @@ def format_all_rule():
if "taxonomy" in config.keys():
all_input.append(getOutput("taxonomy", "barplot/index.html"))
all_input.append(getOutput("summarize_table", "plots"))
all_input.append(getOutput("summarize_table", "plots/prevalence_abundance.html"))
all_input += expand(getOutput("taxorank_tables", "{taxorank}_table.tsv"), taxorank=["phylum", "class", "order", "family", "genus", "species"])
if "ancom" in config.keys():
all_input += expand(getOutput("ancom", "{taxorank}/volcano/index.html"), taxorank=["phylum", "class", "order", "family", "genus", "species", "asv"])
if "deseq2" in config.keys():
all_input.append(getOutput("deseq2/asv", config["deseq2"]["condition"]))
all_input += expand(getOutput("deseq2/{taxorank}", config["deseq2"]["condition"]), taxorank=["phylum", "class", "order", "family", "genus", "species"])
return all_input
......@@ -714,6 +722,7 @@ rule cutadapt:
params: options = getOptions("cutadapt"),
lib_type = pairedOrSingle("cutadapt")
shell: """
# Remove PCR primers and discard reads that do not start with it
qiime cutadapt {params.lib_type} --i-demultiplexed-sequences {input} --o-trimmed-sequences {output} --p-discard-untrimmed {params.options}
"""
......@@ -1083,6 +1092,7 @@ if "denoiser" in config["global"]:
qiime taxa barplot --i-table {input[2]} --i-taxonomy {output[0]} --m-metadata-file {input[3]} --o-visualization {output[2]}
qiime tools export --input-path {output[1]} --output-path `dirname {output[3]}`
qiime tools export --input-path {output[2]} --output-path `dirname {output[4]}`
python scripts/format_taxo.py {output[4]}
echo "\nTaxonomy.....OK" >> {params.logfile}
"""
......@@ -1116,18 +1126,36 @@ if "denoiser" in config["global"]:
shell: """
qiime tools export --input-path {input} --output-path `dirname {output}`/{wildcards.taxorank}
biom convert -i `dirname {output}`/{wildcards.taxorank}/feature-table.biom -o `dirname {output}`/{wildcards.taxorank}.tsv --to-tsv
# Remove first header line to match with regular TSV format
tail -n +2 `dirname {output}`/{wildcards.taxorank}.tsv > `dirname {output}`/{wildcards.taxorank}_temp.tsv
mv `dirname {output}`/{wildcards.taxorank}_temp.tsv `dirname {output}`/{wildcards.taxorank}.tsv
rm -r `dirname {output}`/{wildcards.taxorank}
"""
rule prevalence_abundance:
input: getOutput("export_table_tsv", "abundance_table.tsv"),
getOutput("taxonomy", "taxons/metadata.tsv")
output: directory(getOutput("summarize_table", "plots"))
output: getOutput("summarize_table", "plots/prevalence_abundance.html")
shell: """
python scripts/plot_prevalence_abundance.py {input[0]} {input[1]} `dirname {output}`
"""
rule evolution_nbreads:
input: getOutput("export_table_tsv", "abundance_table.tsv")
output: getOutput("summarize_table", "plots/evolution_nbreads.tsv")
params: manifest = config["global"]["manifest"],
outdir = config["global"]["outdir"]
shell: """
python scripts/plot_prevalence_abundance.py {input[0]} {input[1]} {output}
python scripts/evolution_nbreads.py {params.manifest} {params.outdir}
"""
# ANCOM (differential expression)
##################################
## DIFFERENTIAL ABUNDANCE ##
##################################
# ANCOM (differential abundance)
if 'ancom' in config.keys():
rule ancom:
......@@ -1146,6 +1174,34 @@ if "denoiser" in config["global"]:
qiime tools export --input-path {output[1]} --output-path `dirname {output[2]}`
"""
# DESeq2 (differential abundance)
if 'deseq2' in config.keys():
rule deseq2:
input:
getOutput("export_table_tsv", "abundance_table.tsv"),
config["global"]["context"],
json_config_path[1],
getOutput("taxonomy", "taxons/metadata.tsv")
output:
directory(getOutput("deseq2/asv", config["deseq2"]["condition"]))
conda: "conda/deseq2.yml"
shell: """
Rscript scripts/deseq2_microSysMics.R {input[0]} {input[1]} {input[2]} {output} --taxo {input[3]}
"""
rule deseq2_taxorank:
input:
getOutput("taxorank_tables", "{taxorank}_table.tsv"),
config["global"]["context"],
json_config_path[1],
getOutput("taxonomy", "taxons/metadata.tsv")
output:
directory(getOutput("deseq2/{taxorank}", config["deseq2"]["condition"]))
conda: "conda/deseq2.yml"
shell: """
Rscript scripts/deseq2_microSysMics.R {input[0]} {input[1]} {input[2]} {output} --taxo {input[3]}
"""
############################
## BUILD REPORT #
......
name: deseq2
channels:
- bioconda
- conda-forge
dependencies:
- bioconductor-deseq2
- r-docopt
- bioconductor-enhancedvolcano
- r-rjson
- r-ggplot2
- r-tidyverse
- r-hrbrthemes
- r-svglite
\ No newline at end of file
......@@ -8,3 +8,4 @@ dependencies :
- jsonschema
- jinja2
- plotly
- seaborn
\ No newline at end of file
......@@ -18,6 +18,13 @@
},
"ancom" : {
"m-metadata-column" : "time"
},
"deseq2" : {
"condition" : "time",
"filtering" : {
"type" : "prevalence",
"threshold" : 5
}
}
}
'DESeq2_microSysMics.
The DESeq2 implementation used in microSysMics pipeline for
differential abundance estimation.
Usage:
deseq2_microSysMics.R <abundance_table> <contextual_table> <config> <outdir> [--taxo <taxofile>]
deseq2_microSysMics.R (-h | --help)
deseq2_microSysMics.R --version
Options:
-h --help Show this screen.
--version Show version.
--taxo=taxofile Taxonomic file
' -> doc
library(DESeq2, quietly = TRUE)
library(docopt, quietly = TRUE)
#library(docstring)
library(rjson, quietly = TRUE)
library(ggplot2, quietly = TRUE)
library(EnhancedVolcano, quietly = TRUE)
library(tidyverse, quietly = TRUE)
library(hrbrthemes, quietly = TRUE)
#######################
# FILTERING FUNCTIONS #
#######################
# We can filter the abundance table in several ways (based on abundance, prevalence, quantiles etc...)
# We describe here some functions that allows for different filtering mechanism
filter_on_prevalence <- function(abundance_table, prevalence_threshold){
  #' Filter abundance table based on feature prevalence
  #'
  #' @description Keep every feature (row) detected (count > 0) in at least
  #' @prevalence_threshold samples; discard all others.
  #'
  #' @param abundance_table array abundance table (features x samples)
  #' @param prevalence_threshold int minimum number of samples a feature must be seen in
  #' @return Filtered abundance table.
  # BUGFIX: the prevalence was previously computed on the global `abundance`
  # object instead of the `abundance_table` argument, so the function silently
  # ignored its input. `drop = FALSE` keeps a data frame even if one row remains.
  filtered_df <- abundance_table[rowSums(abundance_table > 0) >= prevalence_threshold, , drop = FALSE]
  return(filtered_df)
}
filter_on_prevalence_per_condition <- function(abundance_table, prevalence_threshold, contextual_table, conditionCol){
  #' Filter abundance table based on feature prevalence within sample groups
  #'
  #' @description Keep every feature detected (count > 0) in at least
  #' @prevalence_threshold samples in at least one group of samples (groups are
  #' defined by the levels of @conditionCol in the contextual table); discard all others.
  #'
  #' @param abundance_table array abundance table (features x samples)
  #' @param prevalence_threshold int minimum number of samples per group a feature must be seen in
  #' @param contextual_table array contextual table (samples x variables)
  #' @param conditionCol string condition describing groups of samples
  #' @return Filtered abundance table.
  category = levels(factor(contextual_table[, conditionCol]))
  # Start with an all-FALSE keep mask
  keep <- rep(FALSE, nrow(abundance_table))
  # BUGFIX: the original loop started at index 2 (skipping the first condition
  # level) and evaluated prevalence on the full table (via the globals
  # `abundance`/`context`) instead of the per-group subset it had extracted.
  for (cat_level in category){
    group_samples <- rownames(contextual_table)[contextual_table[, conditionCol] == cat_level]
    group_df <- abundance_table[, intersect(group_samples, colnames(abundance_table)), drop = FALSE]
    keep <- keep | (rowSums(group_df > 0) >= prevalence_threshold)
  }
  filtered_df <- abundance_table[keep, , drop = FALSE]
  return(filtered_df)
}
######################
# PLOTTING FUNCTIONS #
######################
enhanced_volcano_deseq2 <- function(deseq2_res, title="", outfile=""){
  #' Export volcano plot of feature fold change
  #'
  #' @description Build an EnhancedVolcano plot (log2 fold change vs adjusted
  #' p-value, FDR cutoff at 0.05) from a DESeq2 results table. The plot object
  #' is returned when no output file is given, otherwise it is saved to @outfile.
  #'
  #' @param deseq2_res array deseq2 results
  #' @param title string plot title
  #' @param outfile string output plot destination
  # Create EnhancedVolcano plot
  volcano <- EnhancedVolcano(deseq2_res,
                             lab = rownames(deseq2_res),
                             x = 'log2FoldChange',
                             y = 'padj',
                             pCutoff = 0.05,
                             col=c('black', 'black', 'black', 'red3'),
                             colAlpha = 0.8,
                             labSize = 0,
                             title = title,
                             drawConnectors = FALSE,
                             widthConnectors = 0.2,
                             colConnectors = 'grey30')
  # Plot graph if no outfile specified (testing mode)
  if (outfile == "")
  {
    # BUGFIX: the original returned the undefined variable `p`
    return(volcano)
  }
  # Otherwise save it
  else
  {
    ggsave(file=outfile, plot = volcano)
  }
}
boxplot_significant_features <- function(feature_id, abundance_table, context_table, context_condition, outfile="") {
  #' Boxplot of the abundance of one feature across sample groups
  #'
  #' @description Draw a boxplot of the abundance profile of a single feature,
  #' with samples grouped by a contextual variable. The plot object is returned
  #' when no output file is given, otherwise it is saved to @outfile.
  #'
  #' @param feature_id string Feature ID
  #' @param abundance_table array Abundance table
  #' @param context_table array Contextual table
  #' @param context_condition string Column from the contextual file for grouping samples
  #' @param outfile string output plot destination
  # One-column data frame holding the abundance profile of the feature
  abundance_row <- as.data.frame(t(abundance_table[feature_id, ]))
  # Grouping variable restricted to the column of interest
  grouping <- context_table[, context_condition, drop=FALSE]
  # Join abundance and grouping on the sample identifiers
  plot_df <- merge(abundance_row, grouping, by="row.names")
  colnames(plot_df) <- c("SampleID", "Feature_TSS_abundance", context_condition)
  # Build the boxplot (Dark2 palette, no legend, feature id as title)
  bplot <- ggplot(plot_df, aes_string(x=context_condition, y="Feature_TSS_abundance", fill=context_condition)) +
    geom_boxplot(alpha=0.9) +
    scale_fill_brewer(palette="Dark2") +
    ggtitle(feature_id) +
    theme_ipsum() +
    theme(legend.position="none")
  # Return the plot when no destination is given (testing mode)
  if (outfile == "") {
    return(bplot)
  }
  # Otherwise save it
  else {
    ggsave(file=outfile, plot = bplot)
  }
}
#############
# LOAD DATA #
#############
# Read input parameters (docopt parses the CLI against the `doc` usage string)
arguments <- docopt(doc, version = 'DESeq2 microSysMics 1.0')
# Load abundance table (features x samples, tab-separated, first column = feature id)
abundance <- read.csv(arguments[["<abundance_table>"]],row.names=1, check.names = FALSE, sep="\t")
# Total-sum-scaling normalisation: each column scaled to percentages
tss_abundance = as.data.frame(t(t(abundance) * 100 / colSums(abundance)))
# Load context and remove Qiime "categorical header" row (if there)
context <- read.csv(arguments[["<contextual_table>"]], sep="\t", row.names = 1, check.names=FALSE)
header.remove <- c("#q2:types")
context <- context[!(row.names(context) %in% header.remove), ]
# Restrain context to samples found in abundance matrix
context = context[colnames(abundance),]
# Load JSON config file and extract the "deseq2" section
config <- fromJSON(file = arguments[["<config>"]])
deseq2_params <- config["deseq2"][[1]]
# Set default values ("poscounts" size-factor estimator handles zero-inflated counts)
if (is.null(deseq2_params[["sftype"]])) deseq2_params[["sftype"]] <- "poscounts"
# Create output directory
dir.create(arguments[["<outdir>"]], showWarnings = FALSE)
# Load taxo file (if provided)
# NOTE(review): docopt results typically always contain the "taxo" key (NULL
# when the option is absent), so this test may be TRUE even without --taxo —
# confirm against the docopt R package behavior.
if ("taxo" %in% names(arguments)){
taxo <- read.csv(toString(arguments["taxo"]), sep="\t", check.names = FALSE, row.names = 1)
}
##########################
# FILTER ABUNDANCE TABLE #
##########################
# We can filter the abundance table in several ways (based on abundance, prevalence, quantiles etc...)
# The "filtering" sub-section of the deseq2 config drives which strategy is used.
if("filtering" %in% names(deseq2_params)){
filteringType <- deseq2_params[["filtering"]][["type"]]
filteringThreshold <- deseq2_params[["filtering"]][["threshold"]]
# "prevalence" filters on the whole table; any other value falls back to the
# per-condition variant (grouping column read from filtering.condition)
if (filteringType == "prevalence")
{
filtered_abundance <- filter_on_prevalence(abundance, filteringThreshold)
}
else
{
filtered_abundance <- filter_on_prevalence_per_condition(abundance, filteringThreshold, context, deseq2_params[["filtering"]][["condition"]])
}
# Report how many features were dropped, then replace the working table
message("Filtering : ", dim(abundance)[1] - dim(filtered_abundance)[1], " features are removed after filtering on ", deseq2_params[["filtering"]][["type"]], " with a threshold of ", deseq2_params[["filtering"]][["threshold"]])
message("Filtered matrix contains ", dim(filtered_abundance)[1], " features and ", dim(filtered_abundance)[2], " samples.")
abundance <- filtered_abundance
}
##########
# DESEQ2 #
##########
# Formula used in the GLM. The user can either indicate a formula or a condition on which the formula will be built
if ("formula" %in% names(deseq2_params)) formulaModel <- deseq2_params[["formula"]] else formulaModel <- paste0("~", deseq2_params[["condition"]])
# Build DESeq object from raw counts + sample metadata + design formula
dds <- DESeqDataSetFromMatrix(countData=abundance,
colData=context,
design=formula(formulaModel))
# Run DESeq2 (size-factor type comes from config, default "poscounts")
dds <- DESeq(dds, sfType=deseq2_params[["sftype"]])
# The studied condition may have more than two values, we want to compare all pairwise differences
category <- levels(factor(context[,deseq2_params[["condition"]]]))
# For each pairwise comparison (i < j over all condition levels)
for(i in 1:(length(category)-1)){
for(j in (i+1):length(category))
{
# NOTE(review): "condtion" typo in this user-facing message — left unchanged here
message("Treating condtion ", deseq2_params[["condition"]], " with levels ", category[i], " and ",category[j])
# Get DESeq2 results for the contrast level_i vs level_j
# NOTE(review): `contrast` is assigned but the call below rebuilds it inline
contrast = c(deseq2_params[["condition"]], category[i], category[j])
res <- results(dds, contrast = c(deseq2_params[["condition"]], category[i], category[j]))
# Get significant results (FDR at 0.05)
resSig <- res[ which(res$padj < 0.05), ]
# Create one directory per comparison
outdir = paste0(arguments["<outdir>"], "/", category[i],"_vs_", category[j])
dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
if (dim(resSig)[1] != 0) {
# Add direction to DESeq2 results: "over_abundant" names the level where the
# feature is more abundant (negative log2FC means over-abundant in level j)
resSig$over_abundant <- category[i]
resSig[resSig$log2FoldChange < 0,"over_abundant"] = category[j]
# Move over_abundant to the first column
resSig <- resSig[,c(ncol(resSig),1:(ncol(resSig)-1))]
# Add taxo info if provided
if ("taxo" %in% names(arguments)){
resSig <- merge(as.data.frame(resSig), taxo, by="row.names",all.x=TRUE)
rownames(resSig) <- resSig$Row.names
resSig <- resSig[2:length(resSig)]
}
# Sort ASVs according to condition of over abundance and baseMean
resSig <- resSig[with(resSig, order(over_abundant, -baseMean)), ]
}
# Export results (an empty table is written when no feature is significant)
write.csv(resSig, file=paste0(outdir,"/deseq2_results.csv"))
# Volcano plot over the FULL (unfiltered) result table
enhanced_volcano_deseq2(res, title=paste(deseq2_params[["condition"]], ":", category[i], "vs", category[j]), outfile=paste0(outdir,"/volcano_plot.svg"))
# Put boxplot in subdirectory (outdir is intentionally re-pointed per comparison)
outdir = paste0(outdir, "/boxplot_tss_abundance")
dir.create(outdir, showWarnings = FALSE)
# TSS abundance boxplot for each significant feature
for (feature_id in rownames(resSig)){
boxplot_significant_features(feature_id, tss_abundance, context, deseq2_params[["condition"]], outfile=paste0(outdir,"/",feature_id,".svg"))
}
}
}
"""Plot the evolution of the number of reads through the dada2 pipeline steps.

Usage: evolution_nbreads.py <manifest_file> <outdir>

Reads multiqc/fastqc stats (raw input reads) and the dada2 denoising stats,
merges them into one table, exports a boxplot (SVG) of read counts per
processing step and the reshaped stats table (TSV) under
<outdir>/summarize_table/plots/.
"""
import sys

import pandas as pd
import seaborn as sns

MANIFEST_FILE = sys.argv[1]
OUTDIR = sys.argv[2]

#############
# LOAD DATA #
#############
# Load multiqc stats (one row per fastq file)
multiqc_stats = pd.read_csv(OUTDIR + "/multiqc/multiqc_data/multiqc_fastqc.txt", sep="\t", index_col=0)

# Load dada2 stats and drop the Qiime2 type-annotation row
dada2_stats = pd.read_csv(OUTDIR + "/dada2/stats/stats.csv", sep="\t", index_col=0)
dada2_stats.drop("#q2:types", inplace=True)
dada2_stats = dada2_stats.astype({"input" : int, "filtered" : int, "percentage of input passed filter" : float,
                                  "denoised" : int, "merged" : int, "percentage of input merged" : float,
                                  "non-chimeric" : int, "percentage of input non-chimeric" : float })

# Load manifest (index = sample id; columns include absolute-filepath, direction)
manifest = pd.read_csv(MANIFEST_FILE, index_col=0)

##########################################
# Get number of input reads from multiqc #
##########################################
dict_nb_raw_reads = dict()
for _, row in multiqc_stats.iterrows():
    filename = row["Filename"]
    nb_reads = row["Total Sequences"]
    # Literal (non-regex) match: fastq filenames contain dots that would
    # otherwise be interpreted as regex wildcards
    row_manifest = manifest.loc[manifest["absolute-filepath"].str.contains(filename, regex=False)]
    # Skip fastq files that are not referenced in the manifest
    if row_manifest.empty:
        continue
    # Only keep forward reads as the per-sample raw-read count.
    # BUGFIX: use positional .iloc[0]; the original label-based [0] breaks on
    # a sample-id-indexed frame.
    if row_manifest["direction"].iloc[0] == "forward":
        sample_id = row_manifest.index[0]
        dict_nb_raw_reads[sample_id] = int(nb_reads)

##############################################################
# Reshape dada2 results table to include pre-filtering stats #
##############################################################
# dada2's "input" column is actually the post-cutadapt count; rename it and
# prepend the true raw-read count from multiqc as the new "input"
dada2_stats.rename(columns={"input" : "pre-filtering"}, inplace=True)
dada2_stats.insert(0, "input", pd.Series(dict_nb_raw_reads))
dada2_stats.insert(2, "percentage of input passed pre-filtering", round(dada2_stats["pre-filtering"] * 100 / dada2_stats["input"], 2))
# Recompute downstream percentages against the true raw-read count
dada2_stats["percentage of input passed filter"] = round(dada2_stats["filtered"] * 100 / dada2_stats["input"], 2)
dada2_stats["percentage of input merged"] = round(dada2_stats["merged"] * 100 / dada2_stats["input"], 2)
dada2_stats["percentage of input non-chimeric"] = round(dada2_stats["non-chimeric"] * 100 / dada2_stats["input"], 2)

#####################################
# Plot evolution of number of reads #
#####################################
# .copy() avoids mutating a view of dada2_stats (SettingWithCopy)
dada2_stats_plot = dada2_stats[["input", "pre-filtering", "filtered", "denoised", "merged", "non-chimeric"]].copy()
dada2_stats_plot["sample_id"] = dada2_stats_plot.index
# Long format: one row per (sample, processing step)
dada2_stats_plot = pd.melt(dada2_stats_plot, id_vars="sample_id", value_vars=["input", "pre-filtering", "filtered", "denoised", "merged", "non-chimeric"])
dada2_stats_plot.columns = ["sample_id", "processing_step", "nb_reads"]
nb_reads_boxplot = sns.boxplot(x="processing_step", y="nb_reads", data=dada2_stats_plot)

# Export plot
nb_reads_boxplot.figure.savefig(OUTDIR + "/summarize_table/plots/evolution_nbreads.svg")

# Export table
dada2_stats.to_csv(OUTDIR + "/summarize_table/plots/evolution_nbreads.tsv", sep="\t")
\ No newline at end of file
"""Split a Qiime2 Taxon string into one column per taxonomic rank.

Usage: format_taxo.py <taxonomy_metadata_tsv>

Rewrites the file in place, adding Domain..Species columns parsed from the
"Taxon" column (entries like "d__Bacteria; p__Firmicutes; ...").
"""
import sys

import pandas as pd

# Qiime2 rank prefix letter -> column name (insertion order fixes column order)
TAXORANK_PREFIXES = {"d": "Domain", "p": "Phylum", "c": "Class",
                     "o": "Order", "f": "Family", "g": "Genus", "s": "Species"}


def format_taxonomy(taxo):
    """Add one column per taxonomic rank parsed from the "Taxon" column.

    Mutates *taxo* in place and returns it. Unparseable entries (no "__"
    separator, or an unknown rank prefix) are skipped instead of raising.
    """
    # Initialise every rank column to the empty string
    for rank_name in TAXORANK_PREFIXES.values():
        taxo[rank_name] = ""
    for asv_id, row in taxo.iterrows():
        for taxon in row["Taxon"].split(";"):
            parts = taxon.split("__")
            if len(parts) < 2:
                # e.g. "Unassigned" — no rank prefix, nothing to extract
                continue
            # BUGFIX: an unknown prefix raised an uncaught KeyError in the
            # original (only IndexError was handled); .get() skips it instead
            rank_name = TAXORANK_PREFIXES.get(parts[0].strip())
            if rank_name is None:
                continue
            taxo.loc[asv_id, rank_name] = parts[1]
    return taxo


if __name__ == "__main__":
    # Load taxo file and drop the Qiime2 type-annotation row
    TAXONOMY_FILE = sys.argv[1]
    taxo = pd.read_csv(TAXONOMY_FILE, sep="\t", index_col=0)
    taxo.drop("#q2:types", inplace=True)
    format_taxonomy(taxo)
    # Export taxo (overwrite the input file, as before)
    taxo.to_csv(TAXONOMY_FILE, sep="\t")
#SampleID time dpw
#q2:types categorical numeric
F3D0 Early 0
F3D1 Early 1
F3D2 Early 2
F3D3 Early 3
F3D5 Early 5
F3D6 Early 6
F3D7 Early 7
F3D8 Early 8
F3D9 Early 9
F3D141 Late 141
F3D142 Late 142
F3D143 Late 143
F3D144 Late 144
F3D145 Late 145
F3D146 Late 146
F3D147 Late 147
F3D148 Late 148
F3D149 Late 149
F3D150 Late 150
#SampleID time dpw sample
#q2:types categorical numeric categorical