1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
|
#!/usr/bin/env perl
## util/fastQ_rand_subset.pl
## Purpose: Extracts out a specific number of random reads from an
## input file, making sure that left and right ends of the selected
## reads are paired
## Usage: $0 left.fq right.fq num_entries
##
use strict;
use warnings;
use lib ("/usr/lib/trinityrnaseq/PerlLib");
use Fastq_reader;
use File::Basename;
use List::Util qw(shuffle);
use Data::Dumper;
my $usage = "usage: $0 left.fq right.fq num_entries num_total_records\n\n";
my $left_fq = $ARGV[0] or die $usage;
my $right_fq = $ARGV[1] or die $usage;
my $num_entries = $ARGV[2] or die $usage;
my $num_total_records = $ARGV[3] or die $usage;
main: {
srand();
my @selected_indices = &get_random_indices($num_entries, $num_total_records);
if (scalar(@selected_indices) != $num_entries) {
die "Error, get_random_indices returned " . scalar(@selected_indices) . " instead of $num_total_records";
}
else {
print STDERR "-selected $num_entries indices. Now outputting selected records\n";
}
my %selected = map { + $_ => 1 } @selected_indices;
@selected_indices = ();
print STDERR "Selecting $num_entries entries...";
my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
my $num_M_entries = $num_entries/1e6;
$num_M_entries .= "M";
my $base_left_fq = basename($left_fq);
my $base_right_fq = basename($right_fq);
open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
my $counter = 0;
while (my $left_entry = $left_fq_reader->next()) {
my $right_entry = $right_fq_reader->next();
unless ($left_entry && $right_entry) {
die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
}
unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
die "Error, core read names don't match: "
. "Left: " . $left_entry->get_core_read_name() . "\n"
. "Right: " . $right_entry->get_core_read_name() . "\n";
}
if ($selected{$counter}) {
print $left_ofh $left_entry->get_fastq_record();
print $right_ofh $right_entry->get_fastq_record();
delete($selected{$counter});
}
$counter++;
if ($counter % 100000 == 0) {
print STDERR "\r[$counter] ";
}
}
print STDERR "\n\ndone.\n";
close $left_ofh;
close $right_ofh;
if (%selected) {
die "Error, missing indices: " . Dumper(\%selected);
}
else {
print STDERR "-all records located and output.\n";
}
exit(0);
}
####
sub get_random_indices {
my ($num_entries, $num_total_records) = @_;
return( (shuffle 0..$num_total_records)[0..($num_entries-1)]);
}
|