import os, sys
import glob
# DIRECTORIES AND VARIABLES #
ROOT_dir = config["par_general"]["outdir"]
input_dir = config["par_general"]["indir"]
FASTQC_BT_dir = ROOT_dir + "/1.FastQC_beforeTrimming"
TRIM_dir = ROOT_dir + "/2.Trimming"
FASTQC_AT_dir = ROOT_dir + "/3.FastQC_afterTrimming"
ALIGN_dir = ROOT_dir + "/4.Alignment"
BAM_sorted_dir = ROOT_dir + "/5.BAM_s"
BAM_s_dups_rm_dir = ROOT_dir + "/6.BAM_s_dups_rm"
BAM_s_dups_chr_rm_dir = ROOT_dir + "/7.BAM_s_dups_chr_rm"
BAM_s_dups_chr_rm_mapped_qs_dir = ROOT_dir + "/8.BAM_s_dups_chr_rm_mapped_qs"
Post_align_stat_dir = ROOT_dir + "/9.Post_align_stat"
LOG_BENCHMARK_dir = ROOT_dir + "/Logs_and_Benchmarks"
########### Executables
fastqc_exec = config["executables"]["fastqc_exec"]
trimmomatic_exec = config["executables"]["Trimmomatic_exec"]
bowtie2_exec = config["executables"]["bowtie2_exec"]
picardtoolsJar = config["executables"]["picardtoolsJar"]
####### Wildcard patters
fastq_file_pattern = config["wildcard_pattern"]["fastq_input"]
fastqc_bt_pattern = config["wildcard_pattern"]["fastqc_bt"]
fastq_trim_pattern = config["wildcard_pattern"]["fastq_trim"]
fastqc_at_pattern = config["wildcard_pattern"]["fastqc_at"]
############# To get all the samples
sample_regex_full = input_dir + "/" + "{sample}" + fastq_file_pattern
sample_list, = glob_wildcards(sample_regex_full)
######### Number of threads
threadsMax = 16
thereadMin = 1
##################### All rules
rule all:
input:expand("{dir}/{sample}_stat.txt", dir = Post_align_stat_dir,sample = sample_list)
rule run_fastqc_bt:
input:
fastq_file = expand("{dir}/{{sample}}{pattern_in}", dir = input_dir, pattern_in = fastq_file_pattern)
output:
expand("{dir}/{{sample}}{pattern_bt}", dir = FASTQC_BT_dir, pattern_bt = fastqc_bt_pattern)
log:
expand("{dir}/fastqc_BT/{{sample}}.log", dir = LOG_BENCHMARK_dir)
message:
"Perform FASTQC on the samples {input.fastq_file:q} before trimming..."
threads: threadsMax
shell:
"""fastqc -o {FASTQC_BT_dir:q} -t {threads} {input.fastq_file:q} 2> {log:q}"""
rule trimmomatic_adapters:
input:
fastq_file = expand("{dir}/{{sample}}{pattern_in}", dir = input_dir, pattern_in = fastq_file_pattern)
output:
expand("{dir}/{{sample}}{pattern_trim}", dir = TRIM_dir, pattern_trim = fastq_trim_pattern)
log:
expand("{dir}/fastq_trim/{{sample}}.log", dir = LOG_BENCHMARK_dir)
message:
"Perform trimming on the samples {input.fastq_file:q} ..."
threads: threadsMax
params:
mode = config["par_trimming"]["trimmomatic_mode"],
ILLUMINACLIP = config["par_trimming"]["trimmomatic_ILLUMINACLIP"],
trailing = config["par_trimming"]["trimmomatic_trailing"],
leading = config["par_trimming"]["trimmomatic_leading"],
sliding_window = config["par_trimming"]["trimmomatic_sliding_window"],
minlen = config["par_trimming"]["trimmomatic_minlen"],
adapters = config["par_trimming"]["trimmomatic_adapters"]
shell:
""" java -jar {trimmomatic_exec} {params.mode} \
-threads {threads} \
{input.fastq_file}\
{output} \
ILLUMINACLIP:{params.adapters}:{params.ILLUMINACLIP} \
LEADING:{params.leading}\
TRAILING:{params.trailing} \
SLIDINGWINDOW:{params.sliding_window} \
MINLEN:{params.minlen} \
2>{log:q} """
rule run_fastqc_at:
input:
fastq_file = rules.trimmomatic_adapters.output
output:
expand("{dir}/{{sample}}{pattern_at}", dir = FASTQC_AT_dir, pattern_at = fastqc_at_pattern)
log:
expand("{dir}/fastqc_AT/{{sample}}.log", dir = LOG_BENCHMARK_dir)
message:
"Perform FASTQC on the samples {input.fastq_file:q} after trimming..."
threads: threadsMax
shell:
"""fastqc -o {FASTQC_AT_dir:q} -t {threads} {input.fastq_file:q} 2> {log:q}"""
rule alignment:
input:
file = rules.trimmomatic_adapters.output,
report1 = rules.run_fastqc_bt.output,
report2 = rules.run_fastqc_at.output
output:
expand("{dir}/{{sample}}.sam", dir = ALIGN_dir)
threads: threadsMax
log: expand("{dir}/alignment/{{sample}}.log", dir = LOG_BENCHMARK_dir)
message:
"Do Bowtie2 alignment for files {input:q}. This may take a while..."
params:
sensitivity = config["par_align"]["bowtie2_sensitivity"],
refGenome = config["par_align"]["bowtie2_refGenome"]
shell:
"""{bowtie2_exec} -p {threads} {params.sensitivity} -t -x {params.refGenome} \
-U {input.file} -S {output} 2> {log:q}"""
rule postalign_SAM_TO_BAM:
input:
sam = rules.alignment.output
output:
#unsortedBam = temp(expand('{dir}/{{sample}}.bam', dir = ALIGN_dir)),
sortedBam = expand("{dir}/{{sample}}.s.bam", dir = BAM_sorted_dir),
index = expand("{dir}/{{sample}}.s.bam.bai", dir = BAM_sorted_dir)
#post_align_stat = expand("{dir}/{{sample}}_stat.txt", dir = Post_align_stat_dir)
threads:threadsMax
message:
"Conversion to BAM, sort, index for file {input:q} ..."
shell:
"""samtools view -u -S {input.sam}|samtools sort --threads {threads} -o {output.sortedBam} &&
samtools index {output.sortedBam} """
#&&
#samtools view --threads {threads} -c {output.sortedBam} >> {output.post_align_stat} """
rule to_remove_duplicates:
input:
bam_s = rules.postalign_SAM_TO_BAM.output.sortedBam
#post_align_stat_s_bam = rules.postalign_SAM_TO_BAM.output.post_align_stat
output:
bam_s_dups_rm = expand("{dir}/{{sample}}_s_dups_rm.bam", dir = BAM_s_dups_rm_dir)
log:
log_file = expand("{dir}/dups_rm/{{sample}}.log", dir = LOG_BENCHMARK_dir),
marked_dups_metric_file = expand("{dir}/dups_metric/{{sample}}_dups_metric.txt", dir = LOG_BENCHMARK_dir)
threads:
threadsMax
message:
"Removing duplicates using PICARD {input.bam_s:q} ..."
params:
ValidationStringency = config["par_postalign"]["ValidationStringencyMarkDuplicates"]
shell:
""" java -Xmx30G -jar {picardtoolsJar} MarkDuplicates INPUT={input.bam_s}\
OUTPUT={output.bam_s_dups_rm} \
METRICS_FILE={log.marked_dups_metric_file} \
VALIDATION_STRINGENCY={params.ValidationStringency}\
REMOVE_DUPLICATES=true \
2> {log.log_file} """
#&&
#samtools view --threads {threads} -c {output.bam_s_dups_rm} >> {input.post_align_stat_s_bam} """
rule to_remove_chUn_chrM:
input:
s_bam_dups_rm = rules.to_remove_duplicates.output.bam_s_dups_rm
#post_align_stat_s_bam_dups_rm = rules.to_remove_duplicates.input.post_align_stat_s_bam
output:
bam_s_dups_chr_rm = expand("{dir}/{{sample}}_s_dups_chr_rm.bam", dir = BAM_s_dups_chr_rm_dir)
log:
expand("{dir}/dups_chr_rm/{{sample}}.log", dir = LOG_BENCHMARK_dir)
threads: threadsMax
message:
"Removing reads from mitochondrial Unknown chromosome {input.s_bam_dups_rm:q} ..."
shell:
""" samtools view -h --threads {threads} {input.s_bam_dups_rm}\
|awk '{{if ($3 !="chrM" && $3 !~ "chrUn.*") print $0}}'\
|samtools view --threads {threads} -S -b\
|samtools sort --threads {threads} -o {output.bam_s_dups_chr_rm} 2> {log} """
#&&
#samtools view --threads {threads} -c {output.bam_s_dups_chr_rm} >> {input.post_align_stat_s_bam_dups_rm} """
rule to_retain_mapped_reads_with_mapq_filter:
input:
s_bam_dups_chr_rm = rules.to_remove_chUn_chrM.output.bam_s_dups_chr_rm
#post_align_stat_s_bam_dups_chr_rm = rules.to_remove_chUn_chrM.input.post_align_stat_s_bam_dups_rm
output:
bam_s_dups_chr_rm_mapped_qs = expand("{dir}/{{sample}}_s_dups_chr_rm_mapped_qs.bam", dir = BAM_s_dups_chr_rm_mapped_qs_dir),
index = expand("{dir}/{{sample}}_s_dups_chr_rm_mapped_qs.bam.bai", dir = BAM_s_dups_chr_rm_mapped_qs_dir)
log:
expand("{dir}/dups_chr_rm_mapped_qs/{{sample}}.log", dir = LOG_BENCHMARK_dir)
threads: threadsMax
message:
"Retain only mapped reads above a mapq score {input.s_bam_dups_chr_rm:q} ..."
params:
minMAPQ = config["par_postalign"]["minMAPQscore"]
shell:
""" samtools view -F4 -q {params.minMAPQ} --threads {threads} \
-o {output.bam_s_dups_chr_rm_mapped_qs} -b {input.s_bam_dups_chr_rm} &&
samtools index {output.bam_s_dups_chr_rm_mapped_qs} """
#&&
#samtools view --threads {threads} -c {output.bam_s_dups_chr_rm_mapped_qs} >> {input.post_align_stat_s_bam_dups_chr_rm} """
rule stats:
input:
bam_1 = rules.postalign_SAM_TO_BAM.output.sortedBam,
bam_2 = rules.to_remove_duplicates.output.bam_s_dups_rm,
bam_3 = rules.to_remove_chUn_chrM.output.bam_s_dups_chr_rm,
bam_4 = rules.to_retain_mapped_reads_with_mapq_filter.output.bam_s_dups_chr_rm_mapped_qs
output:
post_align_stat = expand("{dir}/{{sample}}_stat.txt", dir = Post_align_stat_dir)
log:
expand("{dir}/post_align_stat_dir/{{sample}}.log", dir = LOG_BENCHMARK_dir)
threads: threadsMax
message:
"stat file under preparation ..."
shell:
""" samtools view --threads {threads} -c {input.bam_1} >> {output.post_align_stat} &&
samtools view --threads {threads} -c {input.bam_2} >> {output.post_align_stat} &&
samtools view --threads {threads} -c {input.bam_3} >> {output.post_align_stat} &&
samtools view --threads {threads} -c {input.bam_4} >> {output.post_align_stat} """