#!/bin/bash
# RACE-Seq / 02_tails_identification / Tail_analysis.sh
#
# Usage: Tail_analysis.sh file_core_name results_dir
#
#   file_core_name  Prefix of the input files. The script reads
#                   <file_core_name>.sam and
#                   <file_core_name>_R2.extracted.reformat.fastq from the
#                   current working directory.
#   results_dir     Directory that receives every intermediate and final file.
#
# The helper scripts get_softclipped_reads_from_sam.pl, seq.R and
# poli_clip_all.R are invoked by relative name, so the script has to be run
# from the directory that contains them; seqtk must be available on PATH.

# Both positional arguments are required: without them the output paths
# below would collapse to "/<name>" or "<dir>/_163.sam".
if [ "$#" -lt 2 ]; then
  printf 'Usage: %s file_core_name results_dir\n' "$(basename "$0")" >&2
  exit 2
fi

file=$1
dir_results=$2

# Split the alignment records of "$file".sam into one file per SAM FLAG value.
#
#   FLAG   output file            original annotation
#   83     <file>_163.sam         R2 +
#   147    <file>_99.sam          R2 -
#   73     <file>_73.sam          R1 +
#   89     <file>_89.sam          R1 -
#
# Header lines are skipped by testing for a leading "@" only. Filtering on
# "@" anywhere in the line would also discard alignment records, because the
# QUAL column may legitimately contain "@" (Phred+33 quality 31).
#
# NOTE(review): the first two outputs are named after a different number
# (163, 99) than the FLAG that is selected (83, 147). The names are kept
# because every later step refers to them - confirm the pairing is intended.
for flag_and_suffix in 83:163 147:99 73:73 89:89; do
  sam_flag=${flag_and_suffix%%:*}
  out_suffix=${flag_and_suffix##*:}
  awk -F '\t' -v flag="$sam_flag" '!/^@/ && $2 == flag' "${file}.sam" \
    > "${dir_results}/${file}_${out_suffix}.sam"
done

#Soft-clips with get_softclipped_reads_from_sam.pl script (from https://github.com/smaegol/LINE_1_RACE_seq_analysis)
# The Perl script is called once per per-flag SAM file and writes
# <results_dir>/<file>_<set>.softclips. It is referenced by relative name, so
# it must be present in the current working directory.
# Paths are quoted so that directories containing spaces or glob characters
# are passed to perl as a single argument.
for sam_set in 163 99 73 89; do
  perl get_softclipped_reads_from_sam.pl \
    -input "${dir_results}/${file}_${sam_set}.sam" \
    -output "${dir_results}/${file}_${sam_set}.softclips"
done

#Parsing results
echo "Parsing results"

# For each .softclips file keep only the lines that contain ">" and strip the
# ">" marker together with the field labels "clip5: ", "clip3: ", "ref: " and
# "pos: ", leaving the bare values in <file>_<set>.softclips.parsed.
for clip_set in 163 99 73 89; do
  grep ">" "${dir_results}/${file}_${clip_set}.softclips" \
    | sed 's/>//g ; s/clip5: //g; s/clip3: //g; s/ref: //g; s/pos: //g' \
    > "${dir_results}/${file}_${clip_set}.softclips.parsed"
done

#Reformatting soft-clip output
echo "Reformatting soft-clip output"

# Sets 163 and 99 keep tab-separated columns 1,3,4,5 (the original annotation
# for these two lines was "clip3").
for clip_set in 163 99; do
  cut -f 1,3,4,5 "${dir_results}/${file}_${clip_set}.softclips.parsed" \
    > "${dir_results}/${file}_${clip_set}.softclips.parsed.formatted"
done

# Sets 73 and 89 keep columns 1,4,5 only.
for clip_set in 73 89; do
  cut -f 1,4,5 "${dir_results}/${file}_${clip_set}.softclips.parsed" \
    > "${dir_results}/${file}_${clip_set}.softclips.parsed.formatted"
done

#Adding a column with strand info
echo "Adding a column with strand info"

# Append the set code (163 or 99) as a trailing column of each formatted line.
# NOTE(review): the comma in `print` makes awk insert its default output
# separator (one space) before the tab, so every line ends in " <TAB><code>".
# This exact output is preserved because the downstream R script's parsing is
# not visible here - confirm before removing the extra space.
for strand_code in 163 99; do
  awk -v code="$strand_code" '{print $0, "\t" code}' \
    "${dir_results}/${file}_${strand_code}.softclips.parsed.formatted" \
    > "${dir_results}/${file}_${strand_code}.softclips.parsed.strand"
done

#Selecting R2 from sam based on mapped R1
echo "Selecting R2 from sam based on mapped R1"

# For each R1 set (73, then 89):
#   1. take the read names (column 1) of the formatted soft-clip table,
#   2. pull the matching records out of "$file"_R2.extracted.reformat.fastq
#      with seqtk subseq,
#   3. write the read names and the reverse-complemented sequences of those
#      records side by side into <file>_<code>.names.seq.txt,
#   4. hand both tables to seq.R, which writes
#      <file>_<code>.softclips.parsed.strand (its logic is not visible here).
#
# FASTQ lines are selected by their position inside the 4-line record rather
# than by searching for "@": "@" is also a valid quality character, so a
# pattern match would take quality lines for headers and shift names and
# sequences out of step. This assumes seqtk emits unwrapped 4-line records.
#
# The seqtk output file is shared by both codes, so after the loop it holds
# the subset for code 89 only. `code` is left set to "89" afterwards, as in
# the original straight-line version.
for code in 73 89; do
  set_prefix="${dir_results}/${file}_${code}"
  r2_subset="${dir_results}/${file}_R2.extracted.names.fastq.txt"

  cut -f 1 "${set_prefix}.softclips.parsed.formatted" > "${set_prefix}.names.txt"
  seqtk subseq "${file}_R2.extracted.reformat.fastq" "${set_prefix}.names.txt" \
    > "${r2_subset}"

  # Line 1 of each record: header; drop the leading "@" to get the read name.
  awk 'NR % 4 == 1 { sub(/^@/, ""); print }' "${r2_subset}" \
    > "${set_prefix}.names2.txt"

  # Line 2 of each record: sequence; complement the bases, then reverse.
  awk 'NR % 4 == 2' "${r2_subset}" \
    | tr ACGTacgt TGCAtgca \
    | rev \
    > "${set_prefix}.seq.txt"

  paste "${set_prefix}.names2.txt" "${set_prefix}.seq.txt" \
    > "${set_prefix}.names.seq.txt"

  Rscript seq.R \
    -f "${set_prefix}.softclips.parsed.formatted" \
    -n "${set_prefix}.names.seq.txt" \
    -c "${code}" \
    -o "${set_prefix}.softclips.parsed.strand"
done

#Merging all files
echo "Merging files"
# Concatenate exactly the four per-set tables of this sample. A glob on
# "$file"* would also pick up tables of other samples in the same results
# directory whose core name merely starts with "$file" (e.g. sample1 vs
# sample10). The order below is the one the glob produced (163, 73, 89, 99).
cat \
  "${dir_results}/${file}_163.softclips.parsed.strand" \
  "${dir_results}/${file}_73.softclips.parsed.strand" \
  "${dir_results}/${file}_89.softclips.parsed.strand" \
  "${dir_results}/${file}_99.softclips.parsed.strand" \
  > "${dir_results}/${file}.softclips.parsed.all.csv"

# NOTE: the message below is still printed, but every removal command is
# commented out, so no intermediate file is actually deleted by this script.
# NOTE(review): if the cleanup is re-enabled, quote the variables and check the
# last pattern (*positions*), which is relative to the current working
# directory rather than to the results directory.
echo "Deleting intermediate files"
#rm ${dir_results}/${file}*.parsed.strand
#rm ${dir_results}/${file}*.parsed.formatted
#rm ${dir_results}/${file}*.parsed
#rm ${dir_results}/${file}*.txt
#rm ${dir_results}/"$file"_163.sam
#rm ${dir_results}/"$file"_99.sam
#rm ${dir_results}/"$file"_73.sam
#rm ${dir_results}/"$file"_89.sam
#rm *positions*

# Final step: pass the merged table (-f) to poli_clip_all.R, which writes the
# result table to the path given with -o. The R script is referenced by
# relative name and must be in the current working directory. Paths are
# quoted so that a results directory containing spaces stays one argument.
Rscript poli_clip_all.R \
  -f "${dir_results}/${file}.softclips.parsed.all.csv" \
  -o "${dir_results}/${file}.softclips.results.csv"


######################