#!/usr/bin/perl
#######################################################################################
### ###
### Copyright (C) 2017 Pawel Krawczyk (p.krawczyk@ibb.waw.pl) ###
### ###
### This program is free software: you can redistribute it and/or modify ###
### it under the terms of the GNU General Public License as published by ###
### the Free Software Foundation, either version 3 of the License, or ###
### (at your option) any later version. ###
### ###
### This program is distributed in the hope that it will be useful, ###
### but WITHOUT ANY WARRANTY; without even the implied warranty of ###
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ###
### GNU General Public License for more details. ###
### ###
### You should have received a copy of the GNU General Public License ###
### along with this program. If not, see . ###
### ###
#######################################################################################
#load required libraries
use strict;
#use warnings;
use Getopt::Long;
use File::Copy;
use File::Basename;
use Bio::SeqIO;
use Bio::Seq::Quality;
use Term::ProgressBar 2.00;
my $current_dir = `pwd`;
print $current_dir; #print current dir
my $input_SAM = '';
my $output_file = '';
my $is_R3 = 0;
#Get options from command line
GetOptions(
'input=s' => \$input_SAM,
'output=s' => \$output_file,
'R3=s' => \$is_R3
);
if ( $input_SAM eq '' ) {
die "no input file specified\n";
}
if ( $output_file eq '' ) {
die "no output file specified\n";
}
#process input files:
get_softclipped_reads_with_ref( $input_SAM, $output_file );
### SUBROUTINES ###
sub get_softclipped_reads_with_ref {
#get mapping positions of mapped reads - find where transcript ends in RACE sequencing
#sam - mapping of reads with tails clipped
my $input_sam_file = shift; #sam file with mappings
my $output_fastq_file = shift; #sam file with mappings
my $input_fastq_file = shift;
#print "sam: $input_sam_file\n\n";
#parameters read from SAM file
my $query_name;
my $bit_flag;
my $ref_name;
my $pos;
my $mapq;
my $cigar;
my $rnext;
my $pnext;
my $tlen;
my $seq;
my $qual;
my %reads_descriptions = ();
my %positions = ();
my $positions_file = $input_sam_file . "_positions.txt";
my %positions_tails = ();
my $positions_file_tails = $input_sam_file . "_positions_tails.txt";
my %positions_notails = ();
my $positions_file_notails = $input_sam_file . "_positions_notails.txt";
if ($input_fastq_file) {
#specify input file:
my $sequences_in = Bio::SeqIO->new(
-file => "<$input_fastq_file",
-format => 'fastq',
);
while ( my $data = $sequences_in->next_dataset ) {
my $read =
Bio::Seq::Quality->new(%$data); #read sequence with quality info
my $read_seq = $read->seq; #get actual sequence
my $read_qual = $read->qual;
my $read_id = $read->id;
my $readdesc = $read->desc;
$reads_descriptions{$read_id} = $readdesc;
}
}
#create fasta for output files
my $sequences_out = Bio::SeqIO->new(
-file => ">$output_fastq_file",
-format => 'fasta',
);
open( INPUT, "<$input_sam_file" ); #open SAM file for reading
my $i = 0;
my $no_sam_lines = `wc -l $input_sam_file`;
$no_sam_lines =~ /(\d+)\s+/;
$no_sam_lines = $1;
my $progress = Term::ProgressBar->new(
{
name => 'Reading SAM file',
count => $no_sam_lines,
ETA => 'linear',
}
);
my $next_update = 0;
while ( my $line = ) {
#read all lines of SAM file
$i++;
if ( !( $line =~ /^\@/ ) ) { #ignore header lines
#read all fileds in SAM:
(
$query_name, $bit_flag, $ref_name, $pos,
$mapq, ($cigar), $rnext, $pnext,
$tlen, $seq, $qual
)
= ( $line =~
/^([!-?A-~]{1,255})\s+(\d+)\s+(\*|[!-()+-<>-~][!-~]*)\s+(\d+)\s+(\d+)\s+(\*|[0-9MIDNSHPX]+)\s+(\*|=|.+)\s+(\d+)\s+(-{0,1}\d+)\s+(\*|[A-Za-z=.]+)\s+([!-~]+)\s+/
);
if ( $ref_name ne '*' )
{ #if read was mapped - get position of mapping
my $seq_length = length($seq); #get length of mapped read
if ( $cigar == "*" ) {
#if no clipping occured:
my $readdesc = $reads_descriptions{$query_name};
$readdesc .= "\tclip5: \tclip3: \tpos: -1\tref: -1";
my $seq_obj_out = Bio::Seq->new(
-seq => $seq,
-id => $query_name,
-desc => $readdesc
);
$sequences_out->write_seq($seq_obj_out);
}
else {
#get sequence parts from CIGAR:
my @cigar = ( $cigar =~ m/(\d+\D)/g );
my $nocigars =
scalar @cigar; # get number of cigar operations
my $cum_length = 0;
my $clip5;
my $clip3;
my $matched_read = $seq;
my $cig;
my $cig_type;
my $cig_length;
my $match_start = 0;
my $match_end = $seq_length;
if ( $nocigars > 1 ) {
#if there is more than 1 cigar operation - look for soft clipping
for ( my $z = 0 ; $z < $nocigars ; $z++ ) {
$cig = $cigar[$z];
my ( $cig_length, $cig_type ) =
( $cig =~ m/(\d+)(\D)/ );
if ( $z == 0 ) {
#get first CIGAR operation - if S (softclipping) - output as 5'-clipping
if ( $cig_type eq "S" ) {
$clip5 = substr( $seq, 0, $cig_length );
$match_start = $cig_length;
$match_end -= $cig_length;
}
}
if ( $z == $nocigars - 1 ) {
#get last CIGAR operation - if S (softclipping) - output as 3'-clipping
if ( $cig_type eq "S" ) {
$clip3 =
substr( $seq, $cum_length, $cig_length );
$match_end -= $cig_length;
}
}
$cum_length += $cig_length;
}
$matched_read = substr( $seq, $match_start, $match_end )
; #get matched read (without clipped parts)
}
my $mapping_pos =
$pos + $match_end - 1; #get mapping position
( $positions{$mapping_pos} )
and ( $positions{$mapping_pos}++ )
or ( $positions{$mapping_pos} = 1 );
if ($clip3) {
( $positions_tails{$mapping_pos} )
and ( $positions_tails{$mapping_pos}++ )
or ( $positions_tails{$mapping_pos} = 1 );
}
else {
( $positions_notails{$mapping_pos} )
and ( $positions_notails{$mapping_pos}++ )
or ( $positions_notails{$mapping_pos} = 1 );
}
my $readdesc = $reads_descriptions{$query_name}
; #create read description with clipping info
$readdesc .=
"\tclip5: $clip5\tclip3: $clip3\tpos: $mapping_pos\tref: $ref_name";
#my $seq_obj_out = Bio::Seq::Quality->new(
my $seq_obj_out = Bio::Seq->new(
-seq => $matched_read,
-id => $query_name,
-desc => $readdesc
);
$sequences_out->write_seq($seq_obj_out);
}
} #end if
else {
#read was unmapped
my $readdesc = $reads_descriptions{$query_name};
$readdesc .= "\tclip5: \tclip3: \tpos: -1\tref: -1";
my $seq_obj_out = Bio::Seq->new(
-seq => $seq,
-id => $query_name,
-desc => $readdesc
);
if ( $is_R3 != 0 ) {
$seq_obj_out = $seq_obj_out->revcom();
}
$sequences_out->write_seq($seq_obj_out);
}
} #end if
#manage progress bar
$next_update = $progress->update($i)
if $i >= $next_update;
} #end while
open( POS, ">$positions_file" );
foreach my $pos ( sort keys %positions ) {
print POS "$pos\t$positions{$pos}\n";
}
close(POS);
open( POS, ">$positions_file_tails" );
foreach my $pos ( sort keys %positions_tails ) {
print POS "$pos\t$positions_tails{$pos}\n";
}
close(POS);
open( POS, ">$positions_file_notails" );
foreach my $pos ( sort keys %positions_notails ) {
print POS "$pos\t$positions_notails{$pos}\n";
}
close(POS);
} # end sub
exit;