Scripts/Human_Mouse_Separation/filter_fastqs_split.sh · Chimeras

#!/bin/bash
# Grid Engine options (lines prefixed with #$)
#$ -N filter_fastqs
#$ -cwd                  
#$ -l h_rt=10:00:00 
#$ -l h_vmem=300M # change this to arround 4 G when I do the mouse
#$ -P scs_speedup
#$ -pe sharedmem 8
#Array job :
#$ -t 1-1 #!!!!!!IMPORTANT: change to 1-num_of_samples

# To run in chuncks of 3  uncoment this
##$ -tc 3

# #nitialise the environment modules
. /etc/profile.d/modules.sh

# load modules
module load igmm/apps/samtools/1.6


# define mouse our human
specie="human"
tag="Method1-split"
## Variables declaration, change them to match your setup:
# File with all the sample IDs, one per line
idfile="/exports/eddie/scratch/$USER/Chimeras/Data/one-sample.txt"
# Pathe to the runCellRanger.sh script


## Running
# Assigning SAMPLE variable from the built-in array counter
sample=`sed -n ${SGE_TASK_ID}p "$idfile"`

echo "processing $sample"
# some posts to think about how to write the script
# I want to extract the first field from the sam file that matches the barcodes,
# and then use that extraction to filter the fastq files
#https://stackoverflow.com/questions/5814377/grep-f-alternative-sedawk
#https://stackoverflow.com/questions/12204192/using-multiple-delimiters-in-awk
#https://stackoverflow.com/questions/32481877/what-are-nr-and-fnr-and-what-does-nr-fnr-imply
# https://www.tutorialspoint.com/awk/awk_arrays.htm

# Change directory to the main project folder
cd /exports/eddie/scratch/$USER/Chimeras

## Generate SAM file
samtools view -h -o "$TMPDIR/Aligned.sortedByCoord.out.sam" "Data/STARsolo-onlyCB/${sample}/Aligned.sortedByCoord.out.bam"
sam="$TMPDIR/Aligned.sortedByCoord.out.sam"

# variables
barcode_path="outs/filter_70/barcodes/"

### Extract the ids from the reads that have matching barcode
# modify the barcode file so it contains  the prefix to match the sam file
barcode_CB_path="$barcode_path/CB_barcodes"
mkdir $barcode_CB_path
echo "creating barcodes for filtering ($barcode_CB_path)"
sed -e 's/^/CB:Z:/' $barcode_path/${sample}_${specie}_barcodes.txt > $barcode_CB_path/${sample}_CB_${specie}_barcode.txt

# put each line of the first file, with the barcodes, in an array (a)
# then evaluate if the $12 element of the sam file ( now with STARsolo-onlyCB, this is the CB) is in the array (a) 
# finally print the first element of the sam file (the id) if the matching was sucessfull. 

reads="outs/filter_70/reads/${tag}"
mkdir -p $reads/$sample

echo "filter IDs from sam fles ($reads)"
awk 'NR==FNR{a[$0];next}$12 in a{print $1}' $barcode_CB_path/${sample}_CB_${specie}_barcode.txt $sam > $reads/${sample}_${specie}_ids.txt

echo "split the read ids file"
split -n 50 $reads/${sample}_${specie}_ids.txt $reads/$sample/${sample}_${specie}_ids.

### Filter the fastq files
split_species="Data/SplitSpecies/${specie}/${tag}/fastqs"
mkdir -p $split_species/$sample

echo "filter fastq files ($split_species)"
for chunk in $reads/$sample/${sample}_${specie}_ids.*
do
echo "     Process chunk: $chunk"
  for fastq in Data/Raw_Data/${sample}_*.fastq.gz
  do
  #  fastq="Data/Raw_Data/${sample}_L001_R1_001.fastq.gz"
    fastq_name=$(basename $fastq .fastq.gz)
    
    # this will append the results from all chunks into the same file.   
    # and process the 8 fastq files in parallel
    Filter_fastqs_Tools/seqtk/seqtk subseq $fastq $chunk | gzip \
    >> $split_species/$sample/${specie}_${fastq_name}.fastq.gz &

  done
# wait for previous chunck to finish, I only have 8 threads
wait
done