File: randomFQ

package info (click to toggle)
ea-utils 1.1.2%2Bdfsg-9
links: PTS, VCS
area: main
in suites: bookworm, sid, trixie
size: 1,212 kB
sloc: ansic: 11,459; cpp: 3,752; perl: 426; makefile: 74; sh: 71
file content (245 lines) | stat: -rwxr-xr-x 5,982 bytes
parent folder | download | duplicates (5)
#!/usr/bin/perl
use strict;
use Getopt::Long;

# Author: Erik Aronesty (earonesty@xpressionanalysis.com) 
# Outputs a random sampled fastq or fasta file, from an input fastq
# Copyright (c) 2011 Expression Analysis


#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


my $cnt;
my $fasta;
my $out;
my $seed=1;
my $window;
my $always;
my $append;
my $pct;
GetOptions("count=i"=>\$cnt, "pct=f"=>\$pct, "fasta"=>\$fasta, "out=s"=>\$out, "seed=i"=>\$seed, "window=i"=>\$window, "always|a"=>\$always, "append|A"=>\$append) || die usage();
if ($window < $cnt) {
	$window = $cnt * 5; 
	$window = 100000 if $window < 100000;
}

if ($seed) {
	srand($seed);
}

die usage() unless $cnt>0||$pct>0;

my $in = shift;
my $mate = shift;
my $mate2 = shift;

die "Can't see $in\n" unless -e $in;
die "Can't see $mate\n" unless !$mate || -e $mate;
die "Can't see $mate2\n" unless !$mate2 || -e $mate2;

my $s = -s $in;
my $sm = -s $mate;
my $sm2 = -s $mate2;

die "Need -out <prefix> for paired-end" if $mate && !$out;

my $suff;
if ($out =~ s/\%(.*)//) {
	$suff = $1;
}
$out =~ s/_$//;

my $gzmeth = $suff =~ /\.gz/ ? "|gzip -c " : "";

open(IN, $in=~/\.gz$/?"gunzip -c $in|":$in) || die;

$append = $append ? ">>" : ">";

if ($mate) {
	open(MI, $mate=~/\.gz$/?"gunzip -c $mate|":$mate) || die;
	open(MI2, $mate2=~/\.gz$/?"gunzip -c $mate2|":$mate2) if $mate2;
	open(O1, "$gzmeth$append${out}_1$suff") || die;
	open(O2, "$gzmeth$append${out}_2$suff") || die;
	if ($mate2) {
		open(O3, "$gzmeth$append${out}_3$suff") || die;
	}
} else {
	my $gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
	if ($out) {
		open(O1, "$gzmeth$append$out");
	} else {
		open(O1, ">&STDOUT");
	}
}

my $lc = 0+`ea-alc -o $in`;
my $stats = $in; $stats =~ s/\.gz$//;  $stats .= ".stats";
if (-e $stats) {
	my $rlc = `grep ^reads $stats | cut -f 2`+0;
	$lc = $rlc if $rlc;
} else {
    if ($in =~ /gz$/) {
        $lc *= .90;         # lower guess
    }
}

if (!$always && ($cnt > $lc/2)) {
	$cnt *= 4;
	warn "Source is too small relative to count requested, just returning tail -$cnt\n";
	if ($mate) {
		if ($in=~/gz$/) {
			system("gunzip -c $in | head -$cnt $gzmeth $append ${out}_1$suff");
		} else {
			system("tail -$cnt $in $gzmeth $append ${out}_1$suff");
		}
		if ($mate=~/gz$/) {
			system("gunzip -c $mate | head -$cnt $gzmeth $append ${out}_2$suff");
		} else {
			system("tail -$cnt $mate $gzmeth $append ${out}_2$suff");
		}
		if ($mate2=~/gz$/) {
			system("gunzip -c $mate2 | head -$cnt $gzmeth $append ${out}_3$suff");
		} else {
			system("tail -$cnt $mate2 $gzmeth $append ${out}_3$suff") if ($mate2);
		}
		exit 0;
	} else {
		if ($out) {
			$gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
			$out = "$gzmeth $append $out";
		}
		if ($in=~/gz$/) {
			exec("gunzip -c $in | head -$cnt $out");
			die("Exec failed : $!\n");
		} else {
			exec("tail -$cnt $in $out");
			die("Exec failed : $!\n");
		}
	}
}

# reads to sample from....whole file or a smaller part?
$window = $lc if $window > $lc;

my $fudge = 1.5;                # top weighted
$fudge = 1.2 if $always;        # less top-weighted

# larger = more chance to keep read
my $prob;

if ($pct) {
    $prob = $pct/100;
    $cnt = 10000000000;
} else {
    $prob = ($cnt*$fudge)/$window;
}

# we used to seek ... but this broke too much ... if you want to fix... fix randomFQ-broken ... 
if (!$always && !$pct) {
    # skip some reads at the beginning
    my $skip = ($lc-$window) / 10;
    $skip = 200000 if $skip > 200000;
    for (my $i=0;$i<$skip;++$i)	{
        scalar <IN>; scalar <IN>; scalar <IN>; scalar <IN>;
        scalar <MI>; scalar <MI>; scalar <MI>; scalar <MI>;
        scalar <MI2>; scalar <MI2>; scalar <MI2>; scalar <MI2>;
    }
}

while ($cnt > 0) {
	if (rand() > $prob) {
		# discard
		<IN>; <IN>; <IN>; <IN>;
		<MI>; <MI>; <MI>; <MI>;
		<MI2>; <MI2>; <MI2>; <MI2>;
		next;
	}

	# id
	my $i = <IN>;
    if (!$i) {
        $cnt = 0;
        last;
    }
	my $i2 = <MI> if $mate;
	my $i3 = <MI2> if $mate2;

	# read
	my $r = <IN>;
	my $r2 = <MI> if $mate;
	my $r3 = <MI2> if $mate2;

	if ($fasta) {
		# only need id and read
		$i=~ s/^\@/>/;
		$i2=~ s/^\@/>/;
		$i3=~ s/^\@/>/;
		print O1 "$i$r";
		print O2 "$i2$r2" if $mate;
		print O3 "$i3$r3" if $mate2;
		<IN>;<IN>;
		<MI>;<MI>;
	} else {
		# print id and read
		print O1 "$i$r";
		print O2 "$i2$r2";
		print O3 "$i3$r3";
		# copy comment and quality
		print O1 scalar <IN>;
		print O2 scalar <MI> if $mate;
		print O3 scalar <MI2> if $mate2;
		print O1 scalar <IN>;
		print O2 scalar <MI> if $mate;
		print O3 scalar <MI2> if $mate2;
	}
	--$cnt;
}

close IN;

sub usage() {
	return <<EOF
usage: $0 (-c <count> | -p <pct>) [-fasta] [-out <prefix>] [-seed <int>] <input-fastq> [<input-2> [<index-3>] ]

Returns <count> number of random entries from the input fastq.

Output is fastq, unless you specify -fasta

If the -out parameter ends in .gz, the result is gzipped in-place.

-p returns a % of total reads, -c returns a fixed count.

SINGLE END:

Outputs to standard output, unless -out <file> is specified.   

PAIRED END:

Pass 2 (or 3) files as input, -out is required.

If the paired-end output contains a "%" sign, it is replaced with the 1 & 2 for paired-end.

IE: -o output_%.fastq.gz

Otherwise it's jsut output_1 and output_2

*** If one file is an indexed read, it has to be the 3rd file (for now).
EOF
}

sub max {
	return $_[0] > $_[1] ? $_[0] : $_[1];
}