#!/bin/bash # Grid Engine options (lines prefixed with #$) #$ -N filter_fastqs #$ -cwd #$ -l h_rt=10:00:00 #$ -l h_vmem=300M # change this to arround 4 G when I do the mouse #$ -P scs_speedup #$ -pe sharedmem 8 #Array job : #$ -t 1-1 #!!!!!!IMPORTANT: change to 1-num_of_samples # To run in chuncks of 3 uncoment this ##$ -tc 3 # #nitialise the environment modules . /etc/profile.d/modules.sh # load modules module load igmm/apps/samtools/1.6 # define mouse our human specie="human" tag="Method1-split" ## Variables declaration, change them to match your setup: # File with all the sample IDs, one per line idfile="/exports/eddie/scratch/$USER/Chimeras/Data/one-sample.txt" # Pathe to the runCellRanger.sh script ## Running # Assigning SAMPLE variable from the built-in array counter sample=`sed -n ${SGE_TASK_ID}p "$idfile"` echo "processing $sample" # some posts to think about how to write the script # I want to extract the first field from the sam file that matches the barcodes, # and then use that extraction to filter the fastq files #https://stackoverflow.com/questions/5814377/grep-f-alternative-sedawk #https://stackoverflow.com/questions/12204192/using-multiple-delimiters-in-awk #https://stackoverflow.com/questions/32481877/what-are-nr-and-fnr-and-what-does-nr-fnr-imply # https://www.tutorialspoint.com/awk/awk_arrays.htm # Change directory to the main project folder cd /exports/eddie/scratch/$USER/Chimeras ## Generate SAM file samtools view -h -o "$TMPDIR/Aligned.sortedByCoord.out.sam" "Data/STARsolo-onlyCB/${sample}/Aligned.sortedByCoord.out.bam" sam="$TMPDIR/Aligned.sortedByCoord.out.sam" # variables barcode_path="outs/filter_70/barcodes/" ### Extract the ids from the reads that have matching barcode # modify the barcode file so it contains the prefix to match the sam file barcode_CB_path="$barcode_path/CB_barcodes" mkdir $barcode_CB_path echo "creating barcodes for filtering ($barcode_CB_path)" sed -e 's/^/CB:Z:/' $barcode_path/${sample}_${specie}_barcodes.txt > $barcode_CB_path/${sample}_CB_${specie}_barcode.txt # put each line of the first file, with the barcodes, in an array (a) # then evaluate if the $12 element of the sam file ( now with STARsolo-onlyCB, this is the CB) is in the array (a) # finally print the first element of the sam file (the id) if the matching was sucessfull. reads="outs/filter_70/reads/${tag}" mkdir -p $reads/$sample echo "filter IDs from sam fles ($reads)" awk 'NR==FNR{a[$0];next}$12 in a{print $1}' $barcode_CB_path/${sample}_CB_${specie}_barcode.txt $sam > $reads/${sample}_${specie}_ids.txt echo "split the read ids file" split -n 50 $reads/${sample}_${specie}_ids.txt $reads/$sample/${sample}_${specie}_ids. ### Filter the fastq files split_species="Data/SplitSpecies/${specie}/${tag}/fastqs" mkdir -p $split_species/$sample echo "filter fastq files ($split_species)" for chunk in $reads/$sample/${sample}_${specie}_ids.* do echo " Process chunk: $chunk" for fastq in Data/Raw_Data/${sample}_*.fastq.gz do # fastq="Data/Raw_Data/${sample}_L001_R1_001.fastq.gz" fastq_name=$(basename $fastq .fastq.gz) # this will append the results from all chunks into the same file. # and process the 8 fastq files in parallel Filter_fastqs_Tools/seqtk/seqtk subseq $fastq $chunk | gzip \ >> $split_species/$sample/${specie}_${fastq_name}.fastq.gz & done # wait for previous chunck to finish, I only have 8 threads wait done