Scripts/Human_Mouse_Separation/filter_fastqs_human.sh · Chimeras

#!/bin/bash
# Grid Engine options (lines prefixed with #$)
#$ -N filter_fastqs
#$ -cwd                  
#$ -l h_rt=10:00:00 
#$ -l h_vmem=16G

#Array job :
#$ -t 1-10 #!!!!!!IMPORTANT: change to 1-num_of_samples

# To run in chuncks of 3  uncoment this
##$ -tc 3

# #nitialise the environment modules
. /etc/profile.d/modules.sh

# load modules
module load igmm/apps/samtools/1.6


# define mouse our human
specie="human"
tag="Method1"
## Variables declaration, change them to match your setup:
# File with all the sample IDs, one per line
idfile="/exports/eddie/scratch/$USER/Chimeras/Data/samples.txt"
# Pathe to the runCellRanger.sh script


## Running
# Assigning SAMPLE variable from the built-in array counter
sample=`sed -n ${SGE_TASK_ID}p "$idfile"`

echo "processing $sample"
# some posts to think about how to write the script
# I want to extract the first field from the sam file that matches the barcodes,
# and then use that extraction to filter the fastq files
#https://stackoverflow.com/questions/5814377/grep-f-alternative-sedawk
#https://stackoverflow.com/questions/12204192/using-multiple-delimiters-in-awk
#https://stackoverflow.com/questions/32481877/what-are-nr-and-fnr-and-what-does-nr-fnr-imply
# https://www.tutorialspoint.com/awk/awk_arrays.htm

# Change directory to the main project folder
cd /exports/eddie/scratch/$USER/Chimeras

## Generate SAM file
samtools view -h -o "$TMPDIR/Aligned.sortedByCoord.out.sam" "Data/STARsolo-onlyCB/${sample}/Aligned.sortedByCoord.out.bam"
sam="$TMPDIR/Aligned.sortedByCoord.out.sam"

# variables
barcode_path="outs/filter_70/barcodes/"

### Extract the ids from the reads that have matching barcode
# modify the barcode file so it contains  the prefix to match the sam file
barcode_CB_path="$barcode_path/CB_barcodes"
mkdir $barcode_CB_path
echo "creating barcodes for filtering ($barcode_CB_path)"
sed -e 's/^/CB:Z:/' $barcode_path/${sample}_${specie}_barcodes.txt > $barcode_CB_path/${sample}_CB_${specie}_barcode.txt

# put each line of the first file, with the barcodes, in an array (a)
# then evaluate if the $12 element of the sam file ( now with STARsolo-onlyCB, this is the CB) is in the array (a) 
# finally print the first element of the sam file (the id) if the matching was sucessfull. 
# add an additional step removing all the dupicated reads.
reads="outs/filter_70/reads/${tag}"
mkdir -p $reads
echo "filter IDs from sam fles ($reads)"
awk 'NR==FNR{a[$0];next}$12 in a{print $1}' $barcode_CB_path/${sample}_CB_${specie}_barcode.txt $sam | awk '!seen[$0]++' > $reads/${sample}_${specie}_ids.txt


### Filter the fastq files
split_species="Data/SplitSpecies/${specie}/${tag}/fastqs"
mkdir -p $split_species/$sample

echo "filter fastq files ($split_species)"
for fastq in Data/Raw_Data/${sample}_*.fastq.gz
do
#  fastq="Data/Raw_Data/${sample}_L001_R1_001.fastq.gz"
  fastq_name=$(basename $fastq .fastq.gz)

  Filter_fastqs_Tools/seqtk/seqtk subseq $fastq $reads/${sample}_${specie}_ids.txt | gzip \
  > $split_species/$sample/${specie}_${fastq_name}.fastq.gz

done