File: readLibFiles.pl

package info (click to toggle)
sspace 2.1.1%2Bdfsg-7
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, sid, trixie
  • size: 6,792 kB
  • sloc: perl: 2,382; python: 374; makefile: 27; sh: 17
file content (232 lines) | stat: -rwxr-xr-x 7,706 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env perl
  #############################################################
  #Marten Boetzer 13-06-2011                                  #
  #SSPACE perl subscript readLibFiles.pl                      #
  #This script;                                               #
  #  -reads, converts and filters original input sequences    #
  #############################################################

  use Storable;
  use File::Path;
  use File::Basename;
  use threads;

  my $seplines = ("-" x 60)."\n";
  my $maxlen = 0;

  my $libraryfile = $ARGV[0];
  my $base_name = $ARGV[1];
  my $extending = $ARGV[2];
  my $unpaired_file = $ARGV[3];
  my $min_overlap = $ARGV[4];
  my $thread = $ARGV[5];
  my $log = $base_name . ".logfile.txt";
  my $summaryfile = $base_name.".summaryfile.txt";

  open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
  open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";

  my $filenameOutFilt = "filtered.readpairs.fasta";
  my $filenameOutExt = $base_name . ".singlereads.fasta";

#-------------------------------------------------READ UNPAIRED FILE CONTAINING SINGLE READS
  &readUnpairedFile($unpaired_file) if ($unpaired_file);
#-------------------------------------------------LOOP THROUGH EACH LIBRARY IN LIBRARYFILE AND STORE AND FILTER READS
  open(FILELIB, "< $libraryfile");

  my ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse, $libResHash);
  my ($prevlibrary, $ctlib) = ("",0);
  &printMessage("\n=>".getDate().": Reading, filtering and converting input sequences of library file initiated\n");

  while(<FILELIB>){
    chomp;
    ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse) = split(/\s+/, $_);

    next if($library eq "");
    $ctlib=0 if($library ne $prevlibrary && $prevlibrary ne "");
    $ctlib++;

    my ($fileBaseName1, $dirName1, $fileExtension1) = fileparse($fileA);
    my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB);

    my $fname = "reads/$base_name.$library.filtered.readpairs.singles.fasta";
    my ($counter2, $Ncount2);
    #Process multiple files at the same time if multithreaded option is set (-T parameter larger than 1)
    if($fileA ne "TAB"  && $thread > 1){
       my $thr = threads->create(\&generateInputFiles, $library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib);
       if(!($ctlib % $thread)){
         foreach my $thr (threads->list()) {
           my @res = $thr->join();
           ($lib,$nreads,$ncount) = split(/,/,$res[0]);
           $libResHash->{$lib}{'reads'}+=$nreads;
           $libResHash->{$lib}{'N'}+=$ncount;
         }
       }
    #otherwise, process only one file at a time
    }elsif($fileA ne "TAB" && $thread <=1){
      my $out = &generateInputFiles($library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib);
      ($lib,$nreads,$ncount) = split(/,/,$out);
      $libResHash->{$lib}{'reads'}+=$nreads;
      $libResHash->{$lib}{'N'}+=$ncount;
    }
    #if user has inserted a TAB file, calculate read statistics
    if($fileA eq "TAB"){
      open FILE, "$fileB" or die $!;
      my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB);
      print "Reading tabfile: $fileBaseName2...\n";
      $counter2++ while(<FILE>);
      $libResHash->{$lib}{'reads'}+=$counter2;
      $libResHash->{$lib}{'N'} = 0;
      close FILE;
    }
    $prevlibrary = $library;
  }
  #Process remaining reads
  if($fileA ne "TAB"){
    foreach my $thr (threads->list()) {
      my @res = $thr->join();
      ($lib,$nreads,$ncount) = split(/,/,$res[0]);
      $libResHash->{$lib}{'reads'}+=$nreads;
      $libResHash->{$lib}{'N'}+=$ncount;
    }
  }
  #Print read statistics to the summary file
  &printMessage("\n$seplines");
  foreach my $libs (keys %$libResHash){
    my $totcounter = $libResHash->{$libs}{'reads'};
    my $totNcount = $libResHash->{$libs}{'N'};
    my $filt = $totcounter-$totNcount;
    print SUMFILE "READING READS $libs:\n";
    print SUMFILE "$seplines\tTotal inserted pairs = $totcounter \n";
    print SUMFILE "\tNumber of pairs containing N's = $totNcount \n\tRemaining pairs = $filt\n$seplines\n";
  }
  close FILELIB;
  close SUMFILE;
  close LOG;

  mkpath('process_OK'); #make directory, indicating that process has run OK

#--------------------------------------------------

###CONVERT INPUT SEQUENCES BY REMOVING PAIRED READS HAVING AN 'N'
sub generateInputFiles{
  my ($lib, $fileA, $fileB, $extension, $reverse, $fname, $libct) = @_;
  my ($name,$seq1,$seq2, $res1,$res2);
  my ($counterext, $Ncount, $countsinglet, $fastq, $step) = (0,0,0,0,1000000);
  open (OUTSINGLEFILE, ">reads/$base_name.$lib.file$libct.fa") || die "Can't write to single file file$fname-- fatal\n";

  #check if file is fastQ or fastA
  open(TEST, "< $fileA");
  $name = <TEST>;
  close TEST;
  $fastq = 1 if ($name =~ /^[@]/);

  open(FILEA, "< $fileA");
  open(FILEB, "< $fileB");
  CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet       ");
  while(<FILEA>) {
    <FILEB>;
    $seq1 = uc(<FILEA>), $seq1 =~ s/^\r\n/\n/;
    $seq2 = uc(<FILEB>), $seq2 =~ s/^\r\n/\n/;
    #FASTQ FORMAT
    <FILEA>,<FILEA>,<FILEB>,<FILEB> if ($fastq);

    $res1 = index($seq1,"N");
    $res2 = index($seq2,"N");
    #if both reads contain N's, do not use them for contig extension and for scaffolding
    if($res1 == -1 && $res2 == -1){
       print OUTSINGLEFILE ">read$countsinglet/1\n$seq1>read$countsinglet/2\n$seq2";
    }else{
      $Ncount++;
    }
    if(++$countsinglet == $step){   
      CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet         ");
      $step = $step + 1000000;
    }

  }
  CounterPrint("\n") if($thread <= 1);
  CounterPrint((" " x 40));
  close OUTSINGLEFILE;
  close FILEB;
  close FILEA;
  return "$lib,$countsinglet,$Ncount";
}

#------------------READ UNPAIRED SINGLE READS FILE WHEN -u IS SET

sub readUnpairedFile{
  my ($file) = @_;
  open(INUNPAIRED, "< $file") || die "Can't open $file -- fatal\n";
  open OUTFILEExt, "> reads/$filenameOutExt";

  &printMessage("\n=>".getDate().": Reading, filtering and converting unpaired input sequences initiated\n");

  my ($seq1, $name);
  my ($counterext, $counter, $step, $fastq) = (0,0, 100000,0);

  open(TEST, "< $file");
  $name = <TEST>;
  close TEST;
  $fastq = 1 if ($name =~ /^[@]/);
  while(<INUNPAIRED>) {
    $seq1 = uc(<INUNPAIRED>); $seq1 =~ s/\r\n/\n/; chomp $seq1;

    #FASTQ FORMAT
    if ($fastq){
      <INUNPAIRED>; <INUNPAIRED>;
    }
    # ELSE FASTA FORMAT
    if(index($seq1, "N") == -1){
       print OUTFILEExt ">$counterext\n$seq1\n";
       $counterext++;
    }
    if(++$counter == $step){
      CounterPrint($counter);
      $step = $step + 100000;
    }
  }
  CounterPrint("                ");

  print SUMFILE "READING UNPAIRED READS:\n";
  print SUMFILE "$seplines\tTotal inserted reads = $counter \n";
  print SUMFILE "\tNumber of reads containing N's = ".($counter-$counterext)."\n\tRemaining reads = $counterext\n";
  close OUTFILEext;
  close INUNPAIRED;
}

###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
sub reverseComplement{
   $_ = shift;
   tr/ATGC/TACG/;
   return (reverse());
}

###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
  my $countingMessager = shift;
  print "\r$countingMessager";
  $|++;
}

###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
sub printMessage{
  my $message = shift;
  print $message;
  print LOG $message;
}

###FUNCTION TO GET THE CURRENT DATE
sub getDate{
  my $date = scalar(localtime);
  return $date;
}

###FLUSHES THE SUMMARY AND LOG FILE
sub FlushFiles{
  select((select(SUMFILE), $| = 1)[0]);
  select((select(LOG), $| = 1)[0]);
  $|++;
}

#########END readLibFiles.pl