1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
|
#!/usr/bin/env bash
if [ $# -lt 3 ];
then
echo "usage: $0 [regions file] [ncpus] [freebayes arguments]"
echo
echo "Run freebayes in parallel over regions listed in regions file, using ncpus processors."
echo "Will merge and sort output, producing a uniform VCF stream on stdout. Flags to freebayes"
echo "which would write to e.g. a particular file will obviously cause problms, so caution is"
echo "encouraged when using this script."
echo
echo "examples:"
echo
echo "Run freebayes in parallel on 100000bp chunks of the ref (fasta_generate_regions.py is also"
echo "located in the scripts/ directory in the freebayes distribution). Use 36 threads."
echo
echo " freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 -f ref.fa aln.bam >out.vcf"
echo
echo "Generate regions that are equal in terms of data content, and thus have lower variance"
echo "in runtime. This will yield better resource utilization."
echo
echo " bamtools coverage -in aln.bam | coverage_to_regions.py ref.fa.fai 500 >ref.fa.500.regions"
echo " freebayes-parallel ref.fa.500.regions 36 -f ref.fa aln.bam >out.vcf"
echo
exit
fi
regionsfile=$1
shift
ncpus=$1
shift
command=("freebayes" "$@")
(
#$command | head -100 | grep "^#" # generate header
# iterate over regions using gnu parallel to dispatch jobs
cat "$regionsfile" | parallel -k -j "$ncpus" "${command[@]}" --region {}
) | vcffirstheader \
| vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges
|