#!/bin/bash # Grid Engine options (lines prefixed with #$) #$ -N filter_fastqs #$ -cwd #$ -l h_rt=10:00:00 #$ -l h_vmem=16G #Array job : #$ -t 1-10 #!!!!!!IMPORTANT: change to 1-num_of_samples # To run in chuncks of 3 uncoment this ##$ -tc 3 # #nitialise the environment modules . /etc/profile.d/modules.sh # load modules module load igmm/apps/samtools/1.6 # define mouse our human specie="human" tag="Method1" ## Variables declaration, change them to match your setup: # File with all the sample IDs, one per line idfile="/exports/eddie/scratch/$USER/Chimeras/Data/samples.txt" # Pathe to the runCellRanger.sh script ## Running # Assigning SAMPLE variable from the built-in array counter sample=`sed -n ${SGE_TASK_ID}p "$idfile"` echo "processing $sample" # some posts to think about how to write the script # I want to extract the first field from the sam file that matches the barcodes, # and then use that extraction to filter the fastq files #https://stackoverflow.com/questions/5814377/grep-f-alternative-sedawk #https://stackoverflow.com/questions/12204192/using-multiple-delimiters-in-awk #https://stackoverflow.com/questions/32481877/what-are-nr-and-fnr-and-what-does-nr-fnr-imply # https://www.tutorialspoint.com/awk/awk_arrays.htm # Change directory to the main project folder cd /exports/eddie/scratch/$USER/Chimeras ## Generate SAM file samtools view -h -o "$TMPDIR/Aligned.sortedByCoord.out.sam" "Data/STARsolo-onlyCB/${sample}/Aligned.sortedByCoord.out.bam" sam="$TMPDIR/Aligned.sortedByCoord.out.sam" # variables barcode_path="outs/filter_70/barcodes/" ### Extract the ids from the reads that have matching barcode # modify the barcode file so it contains the prefix to match the sam file barcode_CB_path="$barcode_path/CB_barcodes" mkdir $barcode_CB_path echo "creating barcodes for filtering ($barcode_CB_path)" sed -e 's/^/CB:Z:/' $barcode_path/${sample}_${specie}_barcodes.txt > $barcode_CB_path/${sample}_CB_${specie}_barcode.txt # put each line of the first file, with the barcodes, in an array (a) # then evaluate if the $12 element of the sam file ( now with STARsolo-onlyCB, this is the CB) is in the array (a) # finally print the first element of the sam file (the id) if the matching was sucessfull. # add an additional step removing all the dupicated reads. reads="outs/filter_70/reads/${tag}" mkdir -p $reads echo "filter IDs from sam fles ($reads)" awk 'NR==FNR{a[$0];next}$12 in a{print $1}' $barcode_CB_path/${sample}_CB_${specie}_barcode.txt $sam | awk '!seen[$0]++' > $reads/${sample}_${specie}_ids.txt ### Filter the fastq files split_species="Data/SplitSpecies/${specie}/${tag}/fastqs" mkdir -p $split_species/$sample echo "filter fastq files ($split_species)" for fastq in Data/Raw_Data/${sample}_*.fastq.gz do # fastq="Data/Raw_Data/${sample}_L001_R1_001.fastq.gz" fastq_name=$(basename $fastq .fastq.gz) Filter_fastqs_Tools/seqtk/seqtk subseq $fastq $reads/${sample}_${specie}_ids.txt | gzip \ > $split_species/$sample/${specie}_${fastq_name}.fastq.gz done