Rci-Variants / MacroBam.sh
MacroBam.sh
Raw
#! /bin/bash


#REQUEST DIRECTORY INPUT FROM USER
read -p "Enter directory name (exactly as appears in finder): " direc
# read -p "Please enter variant ID: " vnt
# read -p "Please enter the forward sfx nucleotide sequence: " SFX
# SFX_REV=$(seqkit seq -r "$FSX" )

#SIZE FILTER AND MERGE TO SINGLE FILES
cd /home/jankatalinic/Documents/RciVariants_analysis/Analysis/test_nov22/Rci7
cd $direc
let OG_NUM=0
READS=$(ls *.fastq)
mkdir Mapped
cd
cd /home/jankatalinic/Documents/RciVariants_analysis/Analysis/test_nov22/Rci7
cd $direc

for FILE in $READS
    do
        LINE_NUM=$(cat $FILE | wc -l)/4
        OG_NUM=$(($OG_NUM + $LINE_NUM))
        seqkit seq $FILE -m4000 -M5500 >> "TRIM_FILE".fastq
    done

FWD_LINES=$(cat TRIM_FILE.fastq | wc -l)
FWD_COUNT=$(echo "scale=0 ; $FWD_LINES / 4" | bc)

#GENERATE METADATA
echo "The original files contained:" $OG_NUM "reads." >> "Metadata".txt
echo "The new filtered reads file contains:" "$FWD_COUNT" "reads." >> "Metadata".txt

cd 
cd /home/jankatalinic/Documents/RciVariants_analysis/Analysis/test_nov22/Rci7
cd $direc

REFS=$(ls ~/"$direc"/*.fasta)
for FILE in $REFS
    do 
        chmod +x $FILE
done

MAP="map"

#FOR-LOOP MAPPING READS TO EACH REFERENCE SEQ - OUTPUT AS SAM FILE
for FILE in $REFS
    do
        STRIP=$(basename $FILE .fasta)
        minimap2 -ax map-ont --no-long-join -g100 --secondary=no -Y --sam-hit-only $FILE TRIM_FILE.fastq > ~/"$direc"/Mapped/$STRIP-$MAP.bam
done


#FILTER FOR BEST MATCHES AND WRITE TO NEW SAM FILE (FWD)
cd Mapped
MAPPED=$(ls *.bam)
FILT="_filtered"
mkdir Filtered
for FILE in $MAPPED
    do
        STRIP=$(basename $FILE .bam)
        samtools view -h $FILE | grep -E 'AS:i:7[5-9][0-9][0-9]' > ~/"$direc"/Mapped/Filtered/"$STRIP"$FILT.bam
    done 
cd
cd /home/jankatalinic/Documents/RciVariants_analysis/Analysis/test_nov22/Rci7

#WRITE TEXT FILE WITH SEQUENCE ID AS FILENAME AND REF ID AS STRING (FWD)
cd $direc
cd Mapped
cd Filtered
FILTERED=$(ls *_filtered.bam)

DUP="duplicated_"
sort $FILTERED | uniq -w37 -d >> "duplicates".bam
DUPID=$(cat duplicates.bam | cut -f1)
echo $DUPID

for DOC in $FILTERED
    do 
        NUM_LINES=$(cat -n $DOC)
        LINE_NO=$(cat $DOC | wc -l)
        i=1
        while [ "$i" -le "$LINE_NO" ]; do
            echo "$i"
            SEQID=$(awk -v var="$i" 'NR==var' $DOC | cut -f1)
            REFID=$(awk -v var="$i" 'NR==var' $DOC | cut -f3)
            i=$(($i + 1))
            if [[ "$DUPID" == *"$SEQID"* ]]; then
                echo "$REFID"  > "$DUP""$SEQID".txt
            else
                echo "$REFID"  > "$SEQID".txt
            fi
        done
    done
cd
cd /home/jankatalinic/Documents/RciVariants_analysis/Analysis/test_nov22/Rci7

echo "Alignment completed."