1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
|
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use lib ("$FindBin::RealBin/../../PerlLib");
use Fastq_reader;
use File::Basename;
my $usage = "usage: $0 left.fq right.fq num_entries\n\n";
my $left_fq = $ARGV[0] or die $usage;
my $right_fq = $ARGV[1] or die $usage;
my $num_to_sample = $ARGV[2] or die $usage;
srand();
main: {
## do reservoir sampling on indices instead of the fastq records themselves, then just extract the selected ones.
my $num_fq_entries = &get_num_fq_entries($left_fq);
my @selected_entries = (1..$num_to_sample);
for (my $i = $num_to_sample + 1; $i <= $num_fq_entries; $i++) {
my $rand_pos = rand($i);
if ($rand_pos < $num_to_sample) {
$selected_entries[$rand_pos] = $i;
}
}
my %indices_want = map { + $_ => 1 } @selected_entries;
@selected_entries = (); # free
my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
my $num_M_entries = $num_to_sample/1e6;
$num_M_entries .= "M";
my $base_left_fq = basename($left_fq);
my $base_right_fq = basename($right_fq);
open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
my $fq_index = 0;
while (my $left_entry = $left_fq_reader->next()) {
my $right_entry = $right_fq_reader->next();
unless ($left_entry && $right_entry) {
die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
}
unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
die "Error, core read names don't match: "
. "Left: " . $left_entry->get_core_read_name() . "\n"
. "Right: " . $right_entry->get_core_read_name() . "\n";
}
$fq_index++;
if ($indices_want{$fq_index}) {
print $left_ofh $left_entry->get_fastq_record();
print $right_ofh $right_entry->get_fastq_record();
}
}
print STDERR " done.\n";
close $left_ofh;
close $right_ofh;
exit(0);
}
####
sub get_num_fq_entries {
my ($fastq_file) = @_;
unless (-s $fastq_file) {
die "Error, cannot locate file $fastq_file";
}
my $linecount;
if ($fastq_file =~ /\.gz$/) {
$linecount = `gunzip -c $fastq_file | wc -l`;
}
else {
$linecount = `cat $fastq_file | wc -l`;
}
chomp $linecount;
my $num_records = $linecount / 4;
return($num_records);
}
|