File: fastQ_rand_subset.reservoir_sampling_reqiures_high_mem.pl

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (112 lines) | stat: -rwxr-xr-x 3,398 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env perl

## util/fastQ_rand_subset.pl
## Purpose: Extracts out a specific number of random reads from an
##   input file, making sure that left and right ends of the selected
##   reads are paired
## Usage: $0 left.fq right.fq num_entries
##
## This code implements reservoir sampling, which produces an unbiased
## selection regardless of the number of entries (assuming an
## appropriately uniform random distribution function).

## See:
## * https://en.wikipedia.org/wiki/Reservoir_sampling
##
## The current implementation requires the selected entries to be
## stored in memory, but passes over the input file(s) only once. If
## the array loading / reading is too slow or memory intensive, this
## could be implemented in a two-pass fashion by first getting
## indexes, and then getting the reads

use strict;
use warnings;

use lib ("/usr/lib/trinityrnaseq/PerlLib");
use Fastq_reader;
use File::Basename;

my $usage = "usage: $0 left.fq right.fq num_entries\n\n";

my $left_fq = $ARGV[0] or die $usage;
my $right_fq = $ARGV[1] or die $usage;
my $num_entries = $ARGV[2] or die $usage;

my @selected_left_entries = ();
my @selected_right_entries = ();

main: {

  print STDERR "Selecting $num_entries entries...";

  my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
  my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;

  my $num_M_entries = $num_entries/1e6;
  $num_M_entries .= "M";
  my $base_left_fq = basename($left_fq);
  my $base_right_fq = basename($right_fq);

  open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
  open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;

  srand();

  my $num_skipped = 0;
  my $num_output_entries = 0;
  my $num_entries_read = 0;

  my $left_entry = 0;
  my $right_entry = 0;

  while ($left_entry = $left_fq_reader->next()) {
    $right_entry = $right_fq_reader->next();

    unless ($left_entry && $right_entry) {
      die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
    }
    unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
      die "Error, core read names don't match: "
        . "Left: " . $left_entry->get_core_read_name() . "\n"
          . "Right: " . $right_entry->get_core_read_name() . "\n";
    }

    $num_entries_read++;

    if($num_entries_read <= $num_entries){
      # Populate reservoir with entries up to num_entries
      push(@selected_left_entries, $left_entry);
      push(@selected_right_entries, $right_entry);
    } else {
      # Randomly replace elements in the reservoir
      # with a decreasing probability
      my $swapPos = int(rand($num_entries_read));

      if($swapPos < $num_entries){
        $selected_left_entries[$swapPos] = $left_entry;
        $selected_right_entries[$swapPos] = $right_entry;
      }
    }
  }

  if ($num_entries > $num_entries_read) {
    die "Error, num_entries $num_entries > total records available: $num_entries_read ";
  }

  # print selected entries to their respective files
  foreach my $entry (@selected_left_entries){
    print $left_ofh $entry->get_fastq_record();
  }
  foreach my $entry (@selected_right_entries){
    print $right_ofh $entry->get_fastq_record();
  }

  print STDERR " done.\n";

  close $left_ofh;
  close $right_ofh;

  exit(0);
}