RACE-Seq / 02_tails_identification / get_softclipped_reads_from_sam.pl
get_softclipped_reads_from_sam.pl
Raw
#!/usr/bin/perl

#######################################################################################
###																					###
###    Copyright (C) 2017  Pawel Krawczyk (p.krawczyk@ibb.waw.pl)					###
###																					###
###    This program is free software: you can redistribute it and/or modify			###
###    it under the terms of the GNU General Public License as published by			###
###    the Free Software Foundation, either version 3 of the License, or			###
###    (at your option) any later version.											###
###																					###
###    This program is distributed in the hope that it will be useful,				###
###    but WITHOUT ANY WARRANTY; without even the implied warranty of				###
###    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the				###
###    GNU General Public License for more details.									###
###																					###
###    You should have received a copy of the GNU General Public License			###
###    along with this program. If not, see <http://www.gnu.org/licenses/>.			###
###																					###
#######################################################################################

#load required libraries
use strict;

#use warnings;
use Getopt::Long;
use File::Copy;
use File::Basename;
use Bio::SeqIO;
use Bio::Seq::Quality;
use Term::ProgressBar 2.00;

my $current_dir = `pwd`;

print $current_dir;    #print current dir

my $input_SAM   = '';
my $output_file = '';
my $is_R3       = 0;

#Get options from command line
GetOptions(
    'input=s'  => \$input_SAM,
    'output=s' => \$output_file,
    'R3=s'     => \$is_R3
);

if ( $input_SAM eq '' ) {
    die "no input file specified\n";
}
if ( $output_file eq '' ) {
    die "no output file specified\n";
}

#process input files:
get_softclipped_reads_with_ref( $input_SAM, $output_file );

### SUBROUTINES ###

sub get_softclipped_reads_with_ref {

#get mapping positions of mapped reads - find where transcript ends in RACE sequencing
#sam - mapping of reads with tails clipped

    my $input_sam_file    = shift;    #sam file with mappings
    my $output_fastq_file = shift;    #sam file with mappings
    my $input_fastq_file  = shift;

    #print "sam: $input_sam_file\n\n";

    #parameters read from SAM file
    my $query_name;
    my $bit_flag;
    my $ref_name;
    my $pos;
    my $mapq;
    my $cigar;
    my $rnext;
    my $pnext;
    my $tlen;
    my $seq;
    my $qual;

    my %reads_descriptions = ();

    my %positions              = ();
    my $positions_file         = $input_sam_file . "_positions.txt";
    my %positions_tails        = ();
    my $positions_file_tails   = $input_sam_file . "_positions_tails.txt";
    my %positions_notails      = ();
    my $positions_file_notails = $input_sam_file . "_positions_notails.txt";

    if ($input_fastq_file) {

        #specify input file:
        my $sequences_in = Bio::SeqIO->new(
            -file   => "<$input_fastq_file",
            -format => 'fastq',
        );
        while ( my $data = $sequences_in->next_dataset ) {
            my $read =
              Bio::Seq::Quality->new(%$data);   #read sequence with quality info
            my $read_seq  = $read->seq;         #get actual sequence
            my $read_qual = $read->qual;
            my $read_id   = $read->id;
            my $readdesc  = $read->desc;
            $reads_descriptions{$read_id} = $readdesc;
        }
    }

    #create fasta for output files
    my $sequences_out = Bio::SeqIO->new(
        -file   => ">$output_fastq_file",
        -format => 'fasta',
    );

    open( INPUT, "<$input_sam_file" );    #open SAM file for reading
    my $i = 0;

    my $no_sam_lines = `wc -l $input_sam_file`;
    $no_sam_lines =~ /(\d+)\s+/;
    $no_sam_lines = $1;

    my $progress = Term::ProgressBar->new(
        {
            name  => 'Reading SAM file',
            count => $no_sam_lines,
            ETA   => 'linear',
        }
    );
    my $next_update = 0;
    while ( my $line = <INPUT> ) {

        #read all lines of SAM file
        $i++;
        if ( !( $line =~ /^\@/ ) ) {    #ignore header lines
                                        #read all fileds in SAM:
            (
                $query_name, $bit_flag, $ref_name, $pos,
                $mapq, ($cigar), $rnext, $pnext,
                $tlen, $seq, $qual
              )
              = ( $line =~
/^([!-?A-~]{1,255})\s+(\d+)\s+(\*|[!-()+-<>-~][!-~]*)\s+(\d+)\s+(\d+)\s+(\*|[0-9MIDNSHPX]+)\s+(\*|=|.+)\s+(\d+)\s+(-{0,1}\d+)\s+(\*|[A-Za-z=.]+)\s+([!-~]+)\s+/
              );

            if ( $ref_name ne '*' )
            {    #if read was mapped - get position of mapping
                my $seq_length = length($seq);    #get length of mapped read
                if ( $cigar == "*" ) {

                    #if no clipping occured:
                    my $readdesc = $reads_descriptions{$query_name};
                    $readdesc .= "\tclip5: \tclip3: \tpos: -1\tref: -1";
                    my $seq_obj_out = Bio::Seq->new(
                        -seq  => $seq,
                        -id   => $query_name,
                        -desc => $readdesc
                    );
                    $sequences_out->write_seq($seq_obj_out);
                }
                else {
                    #get sequence parts from CIGAR:
                    my @cigar = ( $cigar =~ m/(\d+\D)/g );
                    my $nocigars =
                      scalar @cigar;    # get number of cigar operations
                    my $cum_length = 0;
                    my $clip5;
                    my $clip3;
                    my $matched_read = $seq;
                    my $cig;
                    my $cig_type;
                    my $cig_length;
                    my $match_start = 0;
                    my $match_end   = $seq_length;

                    if ( $nocigars > 1 ) {

               #if there is more than 1 cigar operation - look for soft clipping
                        for ( my $z = 0 ; $z < $nocigars ; $z++ ) {
                            $cig = $cigar[$z];
                            my ( $cig_length, $cig_type ) =
                              ( $cig =~ m/(\d+)(\D)/ );

                            if ( $z == 0 ) {

        #get first CIGAR operation - if S (softclipping) - output as 5'-clipping
                                if ( $cig_type eq "S" ) {
                                    $clip5 = substr( $seq, 0, $cig_length );
                                    $match_start = $cig_length;
                                    $match_end -= $cig_length;
                                }
                            }
                            if ( $z == $nocigars - 1 ) {

         #get last CIGAR operation - if S (softclipping) - output as 3'-clipping
                                if ( $cig_type eq "S" ) {
                                    $clip3 =
                                      substr( $seq, $cum_length, $cig_length );
                                    $match_end -= $cig_length;
                                }
                            }
                            $cum_length += $cig_length;
                        }
                        $matched_read = substr( $seq, $match_start, $match_end )
                          ;    #get matched read (without clipped parts)

                    }
                    my $mapping_pos =
                      $pos + $match_end - 1;    #get mapping position
                    ( $positions{$mapping_pos} )
                      and ( $positions{$mapping_pos}++ )
                      or ( $positions{$mapping_pos} = 1 );
                    if ($clip3) {
                        ( $positions_tails{$mapping_pos} )
                          and ( $positions_tails{$mapping_pos}++ )
                          or ( $positions_tails{$mapping_pos} = 1 );
                    }
                    else {
                        ( $positions_notails{$mapping_pos} )
                          and ( $positions_notails{$mapping_pos}++ )
                          or ( $positions_notails{$mapping_pos} = 1 );
                    }
                    my $readdesc = $reads_descriptions{$query_name}
                      ;    #create read description with clipping info
                    $readdesc .=
"\tclip5: $clip5\tclip3: $clip3\tpos: $mapping_pos\tref: $ref_name";

                    #my $seq_obj_out = Bio::Seq::Quality->new(
                    my $seq_obj_out = Bio::Seq->new(
                        -seq  => $matched_read,
                        -id   => $query_name,
                        -desc => $readdesc
                    );
                    $sequences_out->write_seq($seq_obj_out);
                }
            }    #end if
            else {
                #read was unmapped
                my $readdesc = $reads_descriptions{$query_name};
                $readdesc .= "\tclip5: \tclip3: \tpos: -1\tref: -1";
                my $seq_obj_out = Bio::Seq->new(
                    -seq  => $seq,
                    -id   => $query_name,
                    -desc => $readdesc
                );
                if ( $is_R3 != 0 ) {
                    $seq_obj_out = $seq_obj_out->revcom();
                }
                $sequences_out->write_seq($seq_obj_out);

            }
        }    #end if

        #manage progress bar
        $next_update = $progress->update($i)
          if $i >= $next_update;
    }    #end while

    open( POS, ">$positions_file" );
    foreach my $pos ( sort keys %positions ) {
        print POS "$pos\t$positions{$pos}\n";
    }
    close(POS);

    open( POS, ">$positions_file_tails" );
    foreach my $pos ( sort keys %positions_tails ) {
        print POS "$pos\t$positions_tails{$pos}\n";
    }
    close(POS);

    open( POS, ">$positions_file_notails" );
    foreach my $pos ( sort keys %positions_notails ) {
        print POS "$pos\t$positions_notails{$pos}\n";
    }
    close(POS);

}    # end sub

exit;