1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
package main;
our $SEE;
package AlignGraph;
use strict;
use warnings;
use Carp;
use AlignNode;
use base qw (ReadCoverageGraph);
no warnings qw (recursion);
sub new {
my $packagename = shift;
my $self = $packagename->SUPER::new();
## Nodes exist in order of end5 -> end3 at all times. Strand is included in the node name just for safety reasons.
bless ($self, $packagename);
return($self);
}
sub add_alignment {
my $self = shift;
my ($read_acc, $scaffold, $strand, $genome_coords_aref) = @_;
my @align_positions;
foreach my $coordset (sort {$a->[0]<=>$b->[0]} @$genome_coords_aref) {
my ($lend, $rend) = sort {$a<=>$b} @$coordset;
for (my $i = $lend; $i <= $rend; $i++) {
push (@align_positions, $i);
}
}
$self->_add_ordered_positions_to_graph(\@align_positions, $strand, $read_acc);
return;
}
sub _add_ordered_positions_to_graph {
my $self = shift;
my ($ordered_positions_aref, $strand, $read_acc) = @_;
unless (ref $ordered_positions_aref eq 'ARRAY') {
confess "Error, require ordered position list";
}
unless ($strand =~ /^[\+\-]$/) {
confess "strand must be: + or - ";
}
my @align_positions = @$ordered_positions_aref;
if ($strand eq '-') {
@align_positions = reverse @align_positions;
}
my $prev_align_pos = shift @align_positions;
while (@align_positions) {
my $next_align_pos = shift @align_positions;
# print "\tadding $next_align_pos\n";
my $prev_align_node = $self->get_or_create_node("$prev_align_pos,$strand", $read_acc);
my $next_align_node = $self->get_or_create_node("$next_align_pos,$strand", $read_acc);
$self->link_adjacent_nodes($prev_align_node, $next_align_node);
$prev_align_pos = $next_align_pos;
}
return;
}
sub get_all_nodes {
my $self = shift;
return( sort {$a->{_value} cmp $b->{_value}} $self->SUPER::get_all_nodes());
}
1; #EOM
=CIGAR_format
from: http://bioperl.org/pipermail/bioperl-l/2003-March/011591.html
cigar line format (where CIGAR stands for Concise
Idiosyncratic Gapped Alignment Report).
In the cigar line format alignments are sotred as follows:
M: Match
D: Deletino
I: Insertion
An example of an alignment for a hypthetical protein match is shown
below:
Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
PG P G GP R PLGP
Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
protein_align_feature table as the following cigar line:
7M4D12M2I2MD7M
From SAM documentation:
Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
an example. Suppose the clipped alignment is:
REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
READ: gggGTGTAACC-GACTAGgggg
where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
this alignment is: 3S8M1D6M4S.
Spliced alignment. In cDNA-to-genome alignment, we may want to distinguish introns from deletions in exons. We
introduce operation ʻNʼ to represent long skip on the reference sequence. Suppose the spliced alignment is:
REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
READ: GTGTAACCC................................TCAGAATA
where ʻ...ʼ on the read sequence indicates the intron. The CIGAR for this alignment is: 9M32N8M.
=cut
|