File: bam2pairs

package info (click to toggle)
python-pairix 0.3.7-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 179,652 kB
  • sloc: ansic: 5,553; python: 1,197; sh: 613; perl: 464; makefile: 63
file content (135 lines) | stat: -rwxr-xr-x 4,030 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/perl
use Getopt::Long;
my $pos_is_5end=1;
my $chrsizefile;
&GetOptions( 'l|leftmost' => sub { $pos_is_5end=0 },
             'c|chromsize=s' => \$chrsizefile );  ## This defines ordering between mates

if(@ARGV<2){ &print_usage(); exit(); }
my $input = $ARGV[0];
my $prefix = $ARGV[1];
my $outpairs = "$prefix.bsorted.pairs";

my %chr_ord=();
my %chr_size=();
if(defined $chrsizefile){
  &read_chrsizefile($chrsizefile, \%chr_size, \%chr_ord);  ## chr_ord is a hash with chr name as key and order (0,1,2..) as value.
}


open $IN, "samtools view -F12 -h $input |" or die "Can't open input bam file $input\n";

open $OUT, ">$outpairs" or die "Can't write to $outpairs";
&print_headers($OUT, $pos_is_5end, $chrsizefile, @ARGV);

my @SQ=();
my $header_parsed=0;
while(<$IN>){ 

  chomp;
  if(/^\@SQ/){
    push @SQ, $_; 
    next;
  }
  if(!/^\@/ && !$header_parsed){
    &parse_header(\@SQ, \%chr_size, \%chr_ord, $OUT);
    close $OUT; 
    open $OUT, "| sort -k2,2 -k4,4 -k3,3n -k5,5n >>$outpairs";
    $header_parsed=1;
  }
  @x=(split/\t/)[0,2,3,6,7,1]; # get the columns
  @seq=(split/\t/)[9,10]; # sequences
  
  # if the two chromosomes are identical, bam represents the second mate as '='. Replace this with the actual chromosome name.
  if($x[3] eq "="){ $x[3]=$x[1]; }
  
  # extract strand information from the flag column
  if($x[5]&16){ $s1="-"; $x[2]+=$pos_is_5end?length($seq[0])-1:0; } else { $s1="+"; }
  if($x[5]&32){ $s2="-"; $x[4]+=$pos_is_5end?length($seq[1])-1:0; } else { $s2="+"; }
  
  pop @x; #remove the flag column
  
  $"="\t";
  if(%chr_ord){
    next if(!exists $chr_ord{$x[1]} || !exists $chr_ord{$x[3]}); # works as filter as well.
    if($chr_ord{$x[1]} < $chr_ord{$x[3]} || ($chr_ord{$x[1]} == $chr_ord{$x[3]} && $x[2] < $x[4]) ){  ## upper triangle filtering
      print $OUT "@x\t$s1\t$s2\n";  
    }
  } else {
    if($x[1] lt $x[3] || ($x[1] eq $x[3] && $x[2] < $x[4]) ){  ## upper triangle filtering
      print $OUT "@x\t$s1\t$s2\n";  
    }
  }

}
close $IN;
close $OUT;

system("bgzip -f $prefix.bsorted.pairs");
system("pairix -f $prefix.bsorted.pairs.gz");

sub print_usage {
  print "usage: $0 [-l][-c <chromsize_file>] <input_bam> <out_prefix>\n";
  print "  -l : position is left-most position (default 5'end).\n";
  print "  -c|--chromsize : chrom size file is provided to define mate ordering. (default alpha-numeric)\n";
}

sub read_chrsizefile {
  my $chrsizefile = shift @_;
  my $pChr_size = shift @_;
  my $pChr_ord = shift @_;
  open $CHRSIZE, $chrsizefile or die "Can't open chrsize file $chrsizefile";
  my $k=0;  # chr order index
  while(<$CHRSIZE>){
    chomp; 
    my @s = split/\s+/; # any file with chromosome name as first column and size as second column would work. chr name should not have space.
    $pChr_ord->{$s[0]}=$k++;
    $pChr_size->{$s[0]}=$s[1];
  }
  close $CHRSIZE;
}


sub print_headers {
  my $OUT = shift @_;
  my $pos_is_5end = shift @_;
  my $chrsizefile = shift @_;
  my @ARGV = @_;

  print $OUT "## pairs format v1.0\n";
  print $OUT "#sorted: chr1-chr2-pos1-pos2\n";
  print $OUT "#shape: upper triangle\n";
  print $OUT "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n";

  my $command_option = sprintf("%s %s", $pos_is_5end?'':'-l', $chrsizefile?"-c $chrsizefile":'');
  print $OUT "#command: bam2pairs $command_option @ARGV\n";
}

sub print_chr_size_header {
  my $OUT = shift @_;
  my $pChr_size = shift @_;
  my $pChr_ord = shift @_;
  for my $chr (sort {$pChr_ord->{$a}<=>$pChr_ord->{$b}} keys %$pChr_ord){
    print $OUT "#chromsize: $chr\t$pChr_size->{$chr}\n";
  }
}

sub parse_header {
  my $pSQ = $_[0];
  my $pChr_size=$_[1];
  my $pChr_ord=$_[2];
  my $fout=$_[3];

  if(scalar %$pChr_ord==0){
    for my $line (@$pSQ) {
      if($line=~/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){
        my $chrname = $1; my $chrlen = $2;
        $pChr_size->{$chrname}=$chrlen;
      }
    }
    my $k=0;
    %$pChr_ord = map { $_, $k++ } sort {$a cmp $b} keys %$pChr_size;
  }
  &print_chr_size_header($fout, $pChr_size, $pChr_ord);
}