File: estimate_insert_size.pl

package info (click to toggle)
sspace 2.1.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 6,752 kB
  • ctags: 165
  • sloc: perl: 2,382; python: 374; makefile: 19; sh: 18
file content (186 lines) | stat: -rwxr-xr-x 7,344 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env perl
###################################################################################################################
#Marten Boetzer BaseClear B.v. 14-07-2011                                                                         #
#SSPACE perl subscript samToTab_multi.pl                                                                          #
#This script;                                                                                                     #
#  -Estimates median insert size by mapping paired-reads on contigs                                               #
#  It goes through each contig and maps both reads, if a pair is mapped,                                          #
#  the orientation and insert size is estimated.                                                                  #
#  If sufficient pairs (given by the user) are found, the median insert size is                                   #
#  estimated, as well as a file with the distribution is generated which can be                                   #
#  used to visualize the insert size distribution.                                                                #
#                                                                                                                 #
#  To run this script;                                                                                            #
#  perl estimate_insert_size.pl <contigfile> <readfile1> <readfile2> <number_of_pairs> <orientation_of_pairs>     #
                                                                                                                  #
#  Output is the median insert size and a file with distribution of the insert size. Also, number of pairs for    #
#  each found orientation (FR, RF, FF and RR) are given.                                                          #
###################################################################################################################

use File::Path;
use strict;
my $contigfile = $ARGV[0];
my $fileA = $ARGV[1];
my $fileB = $ARGV[2];
my $numpairs = $ARGV[3];
my $orientation = $ARGV[4];

die "ERROR: Can't find contig file: $contigfile -- fatal\n" if(! -e $contigfile);
die "ERROR: Can't find read file: $fileA -- fatal\n" if(! -e $fileA);
die "ERROR: Can't find read file: $fileB -- fatal\n" if(! -e $fileB);
if($numpairs eq ''){
  print "WARNING: No number of pairs are given, using 10000 pairs instead\n";
  $numpairs = 10000;
}
if($orientation eq ''){
  print "WARNING: No orientation of the pairs is given, using orientation FR instead\n";
  $orientation = "FR";
}
die "ERROR: You've inserted $numpairs, which does not seem to be an valid number. Exiting.\n" if(!($numpairs>0) || !($numpairs =~ /^\d+$/));
die "ERROR: Orientation must have length of 2 characters and should contain one of the following; FR, FF, FR or RF. You've inserted orientation of $orientation ...Exiting.\n" if(!(length($orientation) == 2) || !($orientation =~ /[FR][FR]/));

print "\n";
my $paircount = 0;
my ($direction, $insertsize);
mkpath('bowtieoutput');
open (CONT, $contigfile) || die "Can't open contig file $contigfile\n";

my ($seq,$name, $maxctg, $maxseq, $maxname)=("","",0,"","");
my $contignum = 0;
CONTIG:
while (<CONT>) {
  chomp;
  $seq.=$_ if(eof(CONT));
  if (/\>(\S+)/ || eof(CONT)){
    if($seq ne ""){         
       $contignum++;
       if(length($seq) > $maxctg){
         $maxctg = length($seq);
         $maxseq = $seq;
         $maxname = $name;
       }
       if(eof(CONT)){
         $seq = $maxseq;
         $name = $maxname;
       }
       if(eof(CONT)){
         print "now at contig $name = size".length($seq)."\n";
         open (BOWCONT, ">bowtieoutput/bowtie_input.fa");
         print BOWCONT ">$name\n$seq\n";
         close BOWCONT;
         ($paircount) = &mapWithBowtie($contignum,"bowtieoutput/bowtie_input.fa", $fileA, $fileB);
         last CONTIG if($paircount>=$numpairs);
       }

       $name = "";
       $seq = "";
    }
    $name = $1;
  }
  else {
     $seq .= $_;
  }
}

foreach my $d (keys %$direction){
  print "direction $d is found $direction->{$d} times\n";
}
my ($median_ins,$record) = (0,0);
my $median_bin = int($paircount/2);
open (CSV, ">distribution.txt") || die "Can't open distribution.txt for writing -- fatal";
foreach my $is (sort {$a<=>$b} keys %$insertsize){
  for(my $i=0;$i<$insertsize->{$is};$i++){
    $record++;
    $median_ins = $is if($record >= $median_bin && $median_ins == 0);
  }
  print CSV "$is\t$insertsize->{$is}\n";
}

print "\nmedian = $median_ins\n\nSee the distribution in file 'distribution.txt'\n";


sub mapWithBowtie{
  my ($fname,$contig, $fileA, $fileB) = @_;
  my $bowtieout = "contig$fname.bowtieIndex";
  system("bowtie-build $contig bowtieoutput/$bowtieout --quiet --noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values

  my $fastq = 0;
  open(TEST, "< $fileA");
  $name = <TEST>;
  close TEST;
  $fastq = 1 if ($name =~ /^[@]/);

  open(FILEA, "< $fileA");
  open(FILEB, "< $fileB");

  my $count=0;
  open (BOWIN, ">bowtieoutput/bowtiein.$fname.fa") || die "Can't write to single file bowtieoutput/bowtiein.$fname.fa-- fatal\n";
  while(<FILEA>) {
    <FILEB>;
    $count++;
    my $seq1 = <FILEA>;
    chomp $seq1;
    my $seq2 = <FILEB>;
    chomp $seq2;
    #FASTQ FORMAT
    <FILEA>,<FILEA>,<FILEB>,<FILEB> if ($fastq);
    
    print BOWIN ">read$count\n$seq1>read$count\n$seq2";
    if($count > $numpairs){
      close BOWIN;
      open(IN, "bowtie -p 1 -v 0 -m 1 bowtieoutput/$bowtieout --suppress 6,7 -f bowtieoutput/bowtiein.$fname.fa --quiet|") || die "Can't open bowtie output -- fatal\n";
      my ($prevread, $prevline);
      while(my $line = <IN>){
        my @t1 = split(/\t/,$line);
        if($prevread eq $t1[0]){
          $paircount++;
          my @t2 = split(/\t/,$prevline);
          my ($start1, $start2, $end1,$end2);

          if($t1[1] eq "+"){
            $end1 = $t1[3] + length($t1[4]);
            $start1 = $t1[3];
          }
          else{
            $start1 = $t1[3] + length($t1[4]);
            $end1 = $t1[3];
          }
          if($t2[1] eq "+"){
            $end2 = $t2[3] + length($t2[4]);
            $start2 = $t2[3];
          }
          else{
            $start2 = $t2[3] + length($t2[4]);
            $end2 = $t2[3];
          }
          my ($dir1, $dir2);
          $dir1 = "F" if($start1 < $end1);
          $dir1 = "R" if($start1 > $end1);
          $dir2 = "F" if($start2 < $end2);
          $dir2 = "R" if($start2 > $end2);
          $direction->{"$dir1$dir2"}++ if($start1 < $start2);
          $direction->{"$dir2$dir1"}++ if($start2 < $start1);
          my $diff = abs($start2-$start1);
          if($orientation eq "$dir1$dir2" || $orientation eq "$dir2$dir1"){
            $insertsize->{$diff}++;
          }
          return $paircount if($paircount >= $numpairs);
        }
        $prevread = $t1[0];
        $prevline = $line;
     }

      close BOWIN;
      open (BOWIN, "bowtieoutput/bowtiein.$fname.fa") || die "Can't write to single file bowtieoutput/bowtiein.$name.fa-- fatal\n";
    }
  }
  print "count = $paircount\n";
  return $paircount;
}

###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
  my $countingMessager = shift;
  print "\r$countingMessager";
  $|++;
}