Chimeras / Scripts / Compare_Filter_fastqs_Tools / filter_fastqs_bbmap.sh
filter_fastqs_bbmap.sh
Raw
#!/bin/bash
# Grid Engine options (lines prefixed with #$)
#$ -N filter_fastqs_bbmap-2
#$ -cwd                  
#$ -l h_rt=10:00:00 
#$ -l h_vmem=16G
#$ -pe sharedmem 8
#Array job :
#$ -t 1-1 #!!!!!!IMPORTANT: change to 1-num_of_samples

# To run in chunks of 3, uncomment this
##$ -tc 3

# Initialise the environment modules so the `module` command is available
source /etc/profile.d/modules.sh

# Load the tools this job needs: samtools (used for `samtools view` below)
# and bbmap (presumably the module that provides filterbyname.sh)
for required_module in igmm/apps/samtools/1.6 roslin/bbmap/38.11; do
  module load "$required_module"
done

# Species whose barcodes/reads are kept: "mouse" or "human"
specie="human"
# Label of the filtering tool; used below to name the output sub-directories
tag="bbmap"
## Variables declaration, change them to match your setup:
# File with all the sample IDs, one per line
idfile="/exports/eddie/scratch/$USER/Chimeras/Data/one-sample.txt"


## Running
# SGE_TASK_ID is the built-in array counter (see the "#$ -t" option above).
# Without a numeric task id, `sed -n p` would print every line of the id file,
# so refuse to continue instead of building paths from a bogus sample name.
if [[ ! "${SGE_TASK_ID:-}" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: SGE_TASK_ID must be a positive integer (got '${SGE_TASK_ID:-}')" >&2
  exit 1
fi
if [[ ! -r "$idfile" ]]; then
  echo "ERROR: cannot read sample id file: $idfile" >&2
  exit 1
fi

# Assigning SAMPLE variable: the ID found on line number SGE_TASK_ID of idfile
sample=$(sed -n "${SGE_TASK_ID}p" "$idfile")
if [[ -z "$sample" ]]; then
  # Happens when the array range (-t) is larger than the number of samples
  echo "ERROR: no sample ID on line ${SGE_TASK_ID} of $idfile" >&2
  exit 1
fi

echo "$(date +%T)  processing $sample"
# some posts to think about how to write the script
# I want to extract the first field from the sam file that matches the barcodes,
# and then use that extraction to filter the fastq files
#https://stackoverflow.com/questions/5814377/grep-f-alternative-sedawk
#https://stackoverflow.com/questions/12204192/using-multiple-delimiters-in-awk
#https://stackoverflow.com/questions/32481877/what-are-nr-and-fnr-and-what-does-nr-fnr-imply
# https://www.tutorialspoint.com/awk/awk_arrays.htm

# Change directory to the main project folder. Every path used afterwards is
# relative to it, so abort if it cannot be entered rather than reading and
# writing in whatever directory the job happened to start in.
cd "/exports/eddie/scratch/$USER/Chimeras" || {
  echo "ERROR: cannot cd to /exports/eddie/scratch/$USER/Chimeras" >&2
  exit 1
}

echo "Generate Sam file"
## Generate SAM file
# The SAM copy is written under $TMPDIR; stop if it is unset, otherwise the
# output path would silently become "/Aligned.sortedByCoord.out.sam".
: "${TMPDIR:?TMPDIR must be set to a writable scratch directory}"
sam="$TMPDIR/Aligned.sortedByCoord.out.sam"
# -h keeps the header; -o writes the text (SAM) output to $sam
if ! samtools view -h -o "$sam" "Data/STARsolo-onlyCB/${sample}/Aligned.sortedByCoord.out.bam"; then
  echo "ERROR: samtools failed to convert the BAM file of sample '$sample'" >&2
  exit 1
fi

# variables
barcode_path="outs/filter_70/barcodes/"

### Extract the ids from the reads that have matching barcode
# Modify the barcode file so every barcode carries the "CB:Z:" prefix and
# therefore matches the barcode tag exactly as it is written in the SAM file.
barcode_CB_path="$barcode_path/CB_barcodes"
# -p: do not fail when the directory already exists (re-runs, or several
# array tasks creating it at the same time)
mkdir -p "$barcode_CB_path"
echo "creating barcodes for filtering ($barcode_CB_path)"
if ! sed -e 's/^/CB:Z:/' "$barcode_path/${sample}_${specie}_barcodes.txt" \
  > "$barcode_CB_path/${sample}_CB_${specie}_barcode.txt"; then
  echo "ERROR: could not prefix barcodes from $barcode_path/${sample}_${specie}_barcodes.txt" >&2
  exit 1
fi

# Put each line of the first file (the prefixed barcodes) in an array (a),
# then evaluate if the $12 element of the sam file (now with STARsolo-onlyCB,
# this is the CB) is in the array (a), and finally print the first element of
# the sam file (the read id) if the matching was successful.
#
# The first file is recognised with FILENAME==ARGV[1] instead of NR==FNR:
# NR==FNR is also true for every line of the SAM file when the barcode file is
# empty, which would load the whole SAM file into memory and print nothing.

reads="outs/filter_70/reads/${tag}"
mkdir -p "$reads"
echo "$(date +%T)  filter IDs from sam fles ($reads)"
awk 'FILENAME==ARGV[1]{a[$0];next}$12 in a{print $1}' \
  "$barcode_CB_path/${sample}_CB_${specie}_barcode.txt" "$sam" \
  > "$reads/${sample}_${specie}_ids.txt"


### Filter the fastq files
# One output folder per sample, under the species/tool specific directory
split_species="Data/SplitSpecies/${specie}/${tag}/fastqs"
mkdir -p "$split_species/$sample"


echo "$(date +%T)  filter fastq files ($split_species)"
for fastq in Data/Raw_Data/"${sample}"_*.fastq.gz; do
  # When nothing matches, bash leaves the glob pattern as a literal string;
  # skip it instead of handing a non-existent file to filterbyname.sh.
  if [[ ! -e "$fastq" ]]; then
    echo "WARNING: no fastq file matches Data/Raw_Data/${sample}_*.fastq.gz" >&2
    continue
  fi
#  fastq="Data/Raw_Data/${sample}_L001_R1_001.fastq.gz"
  # File name without directory and without the .fastq.gz extension
  fastq_name=$(basename "$fastq" .fastq.gz)

  echo "     $(date +%T)  $fastq_name"
  # names= is the list of read ids extracted from the SAM file; include=t asks
  # filterbyname.sh to keep the listed reads rather than discard them.
  filterbyname.sh in="$fastq" \
    out="$split_species/$sample/${specie}_${fastq_name}.fastq.gz" \
    names="$reads/${sample}_${specie}_ids.txt" include=t

done