AD-Enhancer-Remodelling / ChIP-seq workflow / snakemake_chIP
snakemake_chIP
Raw
import os, sys
import glob

# DIRECTORIES AND VARIABLES #

ROOT_dir = config["par_general"]["outdir"]
input_dir = config["par_general"]["indir"]

FASTQC_BT_dir   = ROOT_dir + "/1.FastQC_beforeTrimming"
TRIM_dir        = ROOT_dir + "/2.Trimming"
FASTQC_AT_dir   = ROOT_dir + "/3.FastQC_afterTrimming"
ALIGN_dir       = ROOT_dir + "/4.Alignment"
BAM_sorted_dir = ROOT_dir + "/5.BAM_s"
BAM_s_dups_rm_dir = ROOT_dir + "/6.BAM_s_dups_rm"
BAM_s_dups_chr_rm_dir = ROOT_dir + "/7.BAM_s_dups_chr_rm"
BAM_s_dups_chr_rm_mapped_qs_dir = ROOT_dir + "/8.BAM_s_dups_chr_rm_mapped_qs"
Post_align_stat_dir = ROOT_dir + "/9.Post_align_stat"

LOG_BENCHMARK_dir  = ROOT_dir + "/Logs_and_Benchmarks"



########### Executables

fastqc_exec = config["executables"]["fastqc_exec"]
trimmomatic_exec = config["executables"]["Trimmomatic_exec"]
bowtie2_exec = config["executables"]["bowtie2_exec"]
picardtoolsJar   = config["executables"]["picardtoolsJar"]



####### Wildcard patters
fastq_file_pattern = config["wildcard_pattern"]["fastq_input"]
fastqc_bt_pattern = config["wildcard_pattern"]["fastqc_bt"]
fastq_trim_pattern = config["wildcard_pattern"]["fastq_trim"]
fastqc_at_pattern = config["wildcard_pattern"]["fastqc_at"]


############# To get all the samples
sample_regex_full = input_dir + "/" + "{sample}" + fastq_file_pattern
sample_list, = glob_wildcards(sample_regex_full)


######### Number of threads

threadsMax = 16
thereadMin = 1


##################### All rules


rule all:
    input:expand("{dir}/{sample}_stat.txt", dir = Post_align_stat_dir,sample = sample_list)

rule run_fastqc_bt:
    input:
        fastq_file = expand("{dir}/{{sample}}{pattern_in}", dir = input_dir, pattern_in = fastq_file_pattern)

    output:
        expand("{dir}/{{sample}}{pattern_bt}", dir = FASTQC_BT_dir, pattern_bt = fastqc_bt_pattern)

    log:
        expand("{dir}/fastqc_BT/{{sample}}.log", dir = LOG_BENCHMARK_dir)
    message:
        "Perform FASTQC on the samples {input.fastq_file:q} before trimming..."

    threads: threadsMax

    shell:
        """fastqc -o {FASTQC_BT_dir:q} -t {threads} {input.fastq_file:q} 2> {log:q}"""

rule trimmomatic_adapters:
    input:
        fastq_file = expand("{dir}/{{sample}}{pattern_in}", dir = input_dir, pattern_in = fastq_file_pattern)

    output:
        expand("{dir}/{{sample}}{pattern_trim}", dir = TRIM_dir, pattern_trim = fastq_trim_pattern)

    log:
        expand("{dir}/fastq_trim/{{sample}}.log", dir = LOG_BENCHMARK_dir)
    message:
        "Perform trimming on the samples {input.fastq_file:q} ..."

    threads: threadsMax

    params:
        mode = config["par_trimming"]["trimmomatic_mode"],
        ILLUMINACLIP = config["par_trimming"]["trimmomatic_ILLUMINACLIP"],
        trailing     = config["par_trimming"]["trimmomatic_trailing"],
        leading = config["par_trimming"]["trimmomatic_leading"],
        sliding_window = config["par_trimming"]["trimmomatic_sliding_window"],
        minlen       = config["par_trimming"]["trimmomatic_minlen"],
        adapters     = config["par_trimming"]["trimmomatic_adapters"]

    shell:
        """ java -jar {trimmomatic_exec} {params.mode} \
                -threads {threads}  \
                {input.fastq_file}\
                {output} \
                ILLUMINACLIP:{params.adapters}:{params.ILLUMINACLIP}  \
                LEADING:{params.leading}\
                TRAILING:{params.trailing}  \
                SLIDINGWINDOW:{params.sliding_window} \
                MINLEN:{params.minlen}  \
                2>{log:q} """

rule run_fastqc_at:
    input:
        fastq_file = rules.trimmomatic_adapters.output

    output:
        expand("{dir}/{{sample}}{pattern_at}", dir = FASTQC_AT_dir, pattern_at = fastqc_at_pattern)

    log:
        expand("{dir}/fastqc_AT/{{sample}}.log", dir = LOG_BENCHMARK_dir)
    message:
        "Perform FASTQC on the samples {input.fastq_file:q} after trimming..."

    threads: threadsMax

    shell:
        """fastqc -o {FASTQC_AT_dir:q} -t {threads} {input.fastq_file:q} 2> {log:q}"""



rule alignment:
    input:
        file  = rules.trimmomatic_adapters.output,
        report1 = rules.run_fastqc_bt.output,
        report2 = rules.run_fastqc_at.output
    output:
        expand("{dir}/{{sample}}.sam", dir = ALIGN_dir)

    threads: threadsMax

    log: expand("{dir}/alignment/{{sample}}.log", dir = LOG_BENCHMARK_dir)

    message:
        "Do Bowtie2 alignment for files {input:q}. This may take a while..."

    params:
        sensitivity = config["par_align"]["bowtie2_sensitivity"],
        refGenome   = config["par_align"]["bowtie2_refGenome"]

    shell:
        """{bowtie2_exec}  -p {threads} {params.sensitivity}  -t -x {params.refGenome} \
        -U {input.file} -S {output} 2> {log:q}"""

rule postalign_SAM_TO_BAM:
    input:
        sam = rules.alignment.output
    output:
        #unsortedBam = temp(expand('{dir}/{{sample}}.bam', dir = ALIGN_dir)),
        sortedBam   = expand("{dir}/{{sample}}.s.bam", dir = BAM_sorted_dir),
        index       = expand("{dir}/{{sample}}.s.bam.bai", dir = BAM_sorted_dir)
        #post_align_stat = expand("{dir}/{{sample}}_stat.txt", dir = Post_align_stat_dir)

    threads:threadsMax

    message:
        "Conversion to BAM, sort, index for file {input:q} ..."

    shell:
        """samtools view -u -S {input.sam}|samtools sort --threads {threads} -o {output.sortedBam} &&
            samtools index {output.sortedBam} """
            #&&
            #samtools view  --threads {threads} -c {output.sortedBam} >> {output.post_align_stat} """

rule to_remove_duplicates:
    input:
        bam_s = rules.postalign_SAM_TO_BAM.output.sortedBam
        #post_align_stat_s_bam = rules.postalign_SAM_TO_BAM.output.post_align_stat
    output:
        bam_s_dups_rm = expand("{dir}/{{sample}}_s_dups_rm.bam", dir = BAM_s_dups_rm_dir)
    log:
        log_file = expand("{dir}/dups_rm/{{sample}}.log", dir = LOG_BENCHMARK_dir),
        marked_dups_metric_file = expand("{dir}/dups_metric/{{sample}}_dups_metric.txt", dir = LOG_BENCHMARK_dir)

    threads:
        threadsMax
    message:
        "Removing duplicates using PICARD {input.bam_s:q} ..."
    params:
        ValidationStringency = config["par_postalign"]["ValidationStringencyMarkDuplicates"]

    shell:
        """ java -Xmx30G -jar {picardtoolsJar} MarkDuplicates INPUT={input.bam_s}\
        OUTPUT={output.bam_s_dups_rm} \
        METRICS_FILE={log.marked_dups_metric_file} \
        VALIDATION_STRINGENCY={params.ValidationStringency}\
        REMOVE_DUPLICATES=true \
        2> {log.log_file} """
        #&&
        #samtools view  --threads {threads} -c {output.bam_s_dups_rm} >> {input.post_align_stat_s_bam} """

rule to_remove_chUn_chrM:
    input:
        s_bam_dups_rm = rules.to_remove_duplicates.output.bam_s_dups_rm
        #post_align_stat_s_bam_dups_rm = rules.to_remove_duplicates.input.post_align_stat_s_bam
    output:
        bam_s_dups_chr_rm = expand("{dir}/{{sample}}_s_dups_chr_rm.bam", dir = BAM_s_dups_chr_rm_dir)

    log:
        expand("{dir}/dups_chr_rm/{{sample}}.log", dir = LOG_BENCHMARK_dir)

    threads: threadsMax
    message:
        "Removing reads from mitochondrial Unknown chromosome {input.s_bam_dups_rm:q} ..."

    shell:
        """ samtools view -h --threads {threads} {input.s_bam_dups_rm}\
        |awk '{{if ($3 !="chrM" && $3 !~ "chrUn.*") print $0}}'\
        |samtools view --threads {threads} -S -b\
        |samtools sort --threads {threads} -o {output.bam_s_dups_chr_rm} 2> {log} """
        #&&
        #samtools view  --threads {threads} -c {output.bam_s_dups_chr_rm} >> {input.post_align_stat_s_bam_dups_rm} """

rule to_retain_mapped_reads_with_mapq_filter:
    input:
        s_bam_dups_chr_rm = rules.to_remove_chUn_chrM.output.bam_s_dups_chr_rm
        #post_align_stat_s_bam_dups_chr_rm = rules.to_remove_chUn_chrM.input.post_align_stat_s_bam_dups_rm
    output:
        bam_s_dups_chr_rm_mapped_qs = expand("{dir}/{{sample}}_s_dups_chr_rm_mapped_qs.bam", dir = BAM_s_dups_chr_rm_mapped_qs_dir),
        index = expand("{dir}/{{sample}}_s_dups_chr_rm_mapped_qs.bam.bai", dir = BAM_s_dups_chr_rm_mapped_qs_dir)


    log:
        expand("{dir}/dups_chr_rm_mapped_qs/{{sample}}.log", dir = LOG_BENCHMARK_dir)

    threads: threadsMax
    message:
        "Retain only mapped reads above a mapq score {input.s_bam_dups_chr_rm:q} ..."
    params:
        minMAPQ = config["par_postalign"]["minMAPQscore"]

    shell:
        """ samtools view -F4 -q {params.minMAPQ} --threads {threads} \
        -o {output.bam_s_dups_chr_rm_mapped_qs} -b {input.s_bam_dups_chr_rm}  &&
        samtools index {output.bam_s_dups_chr_rm_mapped_qs} """
        #&&
        #samtools view  --threads {threads} -c {output.bam_s_dups_chr_rm_mapped_qs} >> {input.post_align_stat_s_bam_dups_chr_rm} """

rule stats:
    input:
        bam_1 = rules.postalign_SAM_TO_BAM.output.sortedBam,
        bam_2 = rules.to_remove_duplicates.output.bam_s_dups_rm,
        bam_3 = rules.to_remove_chUn_chrM.output.bam_s_dups_chr_rm,
        bam_4 = rules.to_retain_mapped_reads_with_mapq_filter.output.bam_s_dups_chr_rm_mapped_qs
    output:
        post_align_stat = expand("{dir}/{{sample}}_stat.txt", dir = Post_align_stat_dir)

    log:
        expand("{dir}/post_align_stat_dir/{{sample}}.log", dir = LOG_BENCHMARK_dir)

    threads: threadsMax
    message:
        "stat file under preparation ..."
    shell:
        """ samtools view  --threads {threads} -c {input.bam_1} >> {output.post_align_stat} &&
        samtools view  --threads {threads} -c {input.bam_2} >> {output.post_align_stat} &&
        samtools view  --threads {threads} -c {input.bam_3} >> {output.post_align_stat} &&
        samtools view  --threads {threads} -c {input.bam_4} >> {output.post_align_stat} """