###Generating genome index. Genome consists sequences of genes analysed in RACE-Seq STAR \ --runMode genomeGenerate \ --runThreadN 2 \ --genomeDir /path/to/STAR_index \ --genomeFastaFiles ref_sequence.fa \ --genomeSAindexNbases 5 \ --sjdbGTFfile annotation.gtf \ --sjdbOverhang 100 ###Adapters trimming mkdir 01_trimmed cutadapt -j 10 -m 20 -a TGGAATTCTCGG -A GATCGTCGGACT -o ./01_trimmed/${file1}_R1_001.trimmed.fastq.gz -p ./01_trimmed/${file1}_R2_001.trimmed.fastq.gz ./raw_data/${file1}_R1_001.fastq.gz ./raw_data/${file1}_R2_001.fastq.gz > ./01_trimmed/${file1}.log ###Decompression of fastq files gunzip -d ./01_trimmed/*fastq.gz ###Removing low quality nucleotides from 3' end of reads mkdir 02_trimmed_quality perl condetri.pl -fastq1=./01_trimmed/${file1}_R1_001.trimmed.fastq -fastq2=./01_trimmed/${file1}_R2_001.trimmed.fastq -prefix=./02_trimmed_quality/${file1} -sc=33 -hq=37 -ml=1 -mh=5 -minlen=40;\ rename 's/_trim1.fastq/_R1.trimmedq.fastq/' ./02_trimmed_quality/*_trim1* rename 's/_trim2.fastq/_R2.trimmedq.fastq/' ./02_trimmed_quality/*_trim2* ###Filter reads R1 and R2 according to primer sequence used to amplifie genes; bbduk.sh program from BBMap package mkdir 03_amplicons #L1NGS4 bbduk.sh in=./02_trimmed_quality/${file1}_R1.trimmedq.fastq in2=./02_trimmed_quality/${file1}_R2.trimmedq.fastq outm=./03_amplicons/${file1}_L1NGS4_R1.fastq outm2=./03_amplicons/${file1}_L1NGS4_R2.fastq literal=TATTGCTTTATTTGTAACCA copyundefined k=20 #GAPDH bbduk.sh in=./02_trimmed_quality/${file1}_R1.trimmedq.fastq in2=./02_trimmed_quality/${file1}_R2.trimmedq.fastq outm=./03_amplicons/${file1}_GAPDH_R1.fastq outm2=./03_amplicons/${file1}_GAPDH_R2.fastq literal=CTAGGGAGCCGCACCTTGT copyundefined k=19 #PABPC4 bbduk.sh in=./02_trimmed_quality/${file1}_R1.trimmedq.fastq in2=./02_trimmed_quality/${file1}_R2.trimmedq.fastq outm=./03_amplicons/${file1}_PABPC4_R1.fastq outm2=./03_amplicons/${file1}_PABPC4_R2.fastq literal=CCCAGAAATTGGTTTTATTT copyundefined k=20 ###Filter R2 reads containing UMI and delimiter sequences and filtering the same R1 reads #Filter R2 reads containing UMI and delimiter sequences mkdir 04_filtered egrep -A 2 -B 1 -i "^.{15}GTCAG" ./03_amplicons/${file1}_R2.fastq | grep -v '^--$' > ./04_filtered/${file1}_R2.filtered.fastq #Read names from filtered R2 grep '@' ./04_filtered/${file1}_R2.filtered.fastq | sed 's/ 2/ 1/g' > ./03_amplicons/${file1}_R1.output_names.txt #R1 reads according to read names from the filtered R2 grep -A3 -f ./03_amplicons/${file1}_R1.output_names.txt ./03_amplicons/${file1}_R1.fastq | grep -v '^--$' > ./04_filtered/${file1}_R1.filtered.fastq #UMI extraction; umi_tools extract assumes that UMI is located in R1, that in the command R2 read is given as R1 mkdir 05_umi_extracted umi_tools extract -p NNNNNNNNNNNNNNNNNNNN -I ./04_filtered/${file1}_R2.filtered.fastq -S ./05_umi_extracted/${file1}_R2.extracted.fastq --read2-in=./04_filtered/${file1}_R1.filtered.fastq --read2-out=./05_umi_extracted/${file1}_R1.extracted.fastq --log=./05_umi_extracted/${file1}.log ###Remove empty reads; reformat.sh program from the BBMap package reformat.sh in=./05_umi_extracted/${file1}_R1.extracted.fastq in2=./05_umi_extracted/${file1}_R2.extracted.fastq out=./05_umi_extracted/${file1}_R1.extracted.reformat.fastq out2=./05_umi_extracted/${file1}_R2.extracted.reformat.fastq minlength=1 #Delete files before reformat rm ./05_umi_extracted/*extracted.fastq ##Mapping mkdir 06_results_STAR STAR --genomeDir /path/to/STAR_index \ --runThreadN 10 \ --genomeLoad LoadAndKeep \ --limitBAMsortRAM 35000000000 \ --readFilesIn ${file1}_R1.extracted.reformat.fastq ${file1}_R2.extracted.reformat.fastq \ --outFileNamePrefix ./06_results_STAR/${file1} \ --outSAMtype BAM SortedByCoordinate \ --outSAMunmapped Within \ --outSAMattributes Standard \ --outFilterMatchNmin 20 \ --outFilterScoreMinOverLread 0.1 \ --outFilterMatchNminOverLread 0.1 #Remove genome from the memory STAR --genomeDir /media/arek/76366F5F366F1EFD1/genomes/hs_38_104_ensembl/LINE_nomarker/STAR_index \ --runThreadN 10 \ --genomeLoad Remove ##Deduplication mkdir 07_deducplicated java -jar umicollapse.jar bam -i ${file1}.bam -o ./07_deducplicated/${file1}.deduplicated.bam ##Index bam files after deduplication ls *bam | xargs -I xxx samtools index xxx xxx.bai ##Change bam to sam samtools view -h ${file1}.bam -o ${file1}.sam ##Tails identification; move to section 2 mkdir 08_tails_result bash Tail_analysis.sh ${file1}.sam ./08_tails_result/