#!/usr/bin/env perl # Read and merge the sequence for the chosen level use strict ; use warnings ; use threads ; use threads::shared ; use FindBin qw($Bin); use File::Basename; use File::Find; use Getopt::Long; use Cwd; use Cwd 'cwd' ; use Cwd 'abs_path' ; my $CWD = dirname( abs_path( $0 ) ) ; my $PWD = abs_path( "./" ) ; my $usage = "USAGE: perl ".basename($0)." path_to_download_files path_to_taxnonomy [-map header_to_taxid_map -o compressed -noCompress -t 1 -maxG 50000000 -noDustmasker]\n" ; my $level = "species" ; my $output = "compressed" ; my $bssPath = $CWD ; # take path of binary as script directory my $numOfThreads = 1 ; my $noCompress = 0 ; my $noDustmasker = 0 ; my $verbose = 0; my $maxGenomeSizeForCompression = 50000000 ; my $mapFile = "" ; GetOptions ("level|l=s" => \$level, "output|o=s" => \$output, "bss=s" => \$bssPath, "threads|t=i" => \$numOfThreads, "maxG=i" => \$maxGenomeSizeForCompression, "map=s" => \$mapFile, "verbose|v" => \$verbose, "noCompress" => \$noCompress, "noDustmasker" => \$noDustmasker) or die("Error in command line arguments. \n\n$usage"); die $usage unless @ARGV == 2; my $path = $ARGV[0] ; my $taxPath = $ARGV[1] ; my $i ; my %gidToFile ; my %gidUsed ; my %tidToGid ; # each tid can corresponds several gid my %gidToTid ; my %speciesList ; # hold the tid in the species my %species ; # hold the species tid my %genus ; # hold the genus tid my %speciesToGenus ; my %fileUsed : shared ; my $fileUsedLock : shared ; my %taxTree ; my @speciesListKey ; my $speciesUsed : shared ; my $debug: shared ; my %speciesIdToName : shared ; my %idToTaxId : shared ; my %newIdToTaxId : shared ; my %idToGenomeSize : shared ; my $idMapLock : shared ; my $step = 1 ; # check the depended softwares if ( `which jellyfish` eq "" ) { die "Could not find Jellyfish.\n" ; } else { my $version = `jellyfish --version` ; chomp $version ; if ( ( split /\s+/, $version )[1] lt 2 ) { die "Jellyfish on your system is ", $version, " , centrifuge-compress.pl requires at least 2.0.0\n" ; } } if ( `which nucmer` eq "" ) { die "Could not find Nucmer.\n" } if ( `which dustmasker` eq "" ) { print STDERR "Could not find dustmasker. And will turn on -noDustmasker option.\n" ; $noDustmasker = 1 ; } #Extract the gid we met in the file if ( $mapFile ne "" ) { print STDERR "Step $step: Reading in the provided mapping list of ids to taxionomy ids.\n"; ++$step ; open FP1, $mapFile ; while ( ) { chomp ; my @cols = split ; $idToTaxId{ $cols[0] } = $cols[1] ; if ( $noCompress == 1 ) { $newIdToTaxId{ $cols[0] } = $cols[1] ; } } } print STDERR "Step $step: Collecting all .fna files in $path and getting gids\n"; ++$step ; if ( $noCompress == 1 ) { `rm tmp_output.fa` ; } find ( { wanted=>sub { return unless -f ; # Must be a file return unless -s; # Must be non-zero if ( !( /\.f[nf]?a$/ || /\.ffn$/ || /\.fasta$/ ) ) { return ; } my $fullfile = $File::Find::name; ## file with full path, but the CWD is actually the file's path my $file = $_; ## file name open FP2, $file or die "Error opening $file: $!"; my $head = ; close FP2 ; chomp $head ; my $headId = substr( ( split /\s+/, $head )[0], 1 ) ; if ( $noCompress == 1 ) { # it seems the find will change the working directory system_call( "cat $PWD/$fullfile >> $PWD/tmp_output.fa" ) ; if ( defined $idToTaxId{ $headId } ) { $newIdToTaxId{ $headId } = $idToTaxId{ $headId } ; } else { $newIdToTaxId{ $headId } = -1 ; my @cols = split /\|/, $headId ; my $subHeader = $cols[0]."\|".$cols[1] ; if ( $headId =~ /gi\|([0-9]+)?\|/ ) { $newIdToTaxId{ $subHeader } = -1 ; #print STDERR "$headId $subHeader\n" if $verbose ; } elsif ( $headId =~ /taxid\|([0-9]+)?[\|\s]/ ) { $newIdToTaxId{ $headId } = $1 ; $newIdToTaxId{ $subHeader } = $1 ; } elsif ( scalar( @cols ) > 2 ) { $newIdToTaxId{ $subHeader } = -1 ; } } } if ( defined $idToTaxId{ $headId } ) { my $tid = $idToTaxId{ $headId } ; my $dummyGid = "centrifuge_gid_".$fullfile."_$tid" ; $gidUsed{ $dummyGid } = 1 ; $gidToFile{ $dummyGid } = $fullfile ; $fileUsed{ $fullfile } = 0 ; push @{ $tidToGid{ $tid } }, $dummyGid ; print STDERR "tid=$tid $file\n" if $verbose; } elsif ( $head =~ /^>gi\|([0-9]+)?\|/ ) { my $gid = $1 ; print STDERR "gid=$gid $file\n" if $verbose; if ( defined $gidUsed{ $gid } ) { print "Repeated gid $gid\n" if $verbose ; $fileUsed{ $fullfile } = 1 ; } else { $fileUsed{ $fullfile } = 0 ; $gidToFile{ $gid } = $fullfile ; } $gidUsed{ $gid } = 1 ; } elsif ( $head =~ /taxid\|([0-9]+)?[\|\s]/ ) { my $tid = $1 ; my $dummyGid = "centrifuge_gid_".$fullfile."_$1" ; $gidUsed{ $dummyGid } = 1 ; $gidToFile{ $dummyGid } = $fullfile ; $fileUsed{ $fullfile } = 0 ; push @{ $tidToGid{ $tid } }, $dummyGid ; print STDERR "tid=$tid $file\n" if $verbose; } else { print STDERR "Excluding $fullfile: Wrong header.\n"; } }, follow=>1 }, $path ); if ( $noCompress == 1 ) { # Remove the Ns from the file if ( $noDustmasker == 1 ) { system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ; } else { system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ; system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ; system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ; } } # Extract the tid that are associated with the gids print STDERR "Step $step: Extract the taxonomy ids that are associated with the gids\n"; ++$step ; open FP1, "$taxPath/gi_taxid_nucl.dmp" ; while ( ) { chomp ; my @cols = split ; #print $cols[0], "\n" ; next if ( @ARGV < 2 ) ; if ( defined( $gidUsed{ $cols[0] } ) ) { push @{ $tidToGid{ $cols[1] } }, $cols[0] ; $gidToTid{ $cols[0] } = $cols[1] ; print STDERR "gid=", $cols[0], " tid=", $cols[1], " ", $gidToFile{ $cols[0] }, "\n" if $verbose; } } close FP1 ; if ( $noCompress == 1 ) { open FP1, ">tmp_$output.map" ; foreach my $key (keys %newIdToTaxId ) { if ( $newIdToTaxId{$key} != -1 ) { print FP1 "$key\t", $newIdToTaxId{$key}, "\n" ; } elsif ( $key =~ /gi\|([0-9]+)?/ ) { #if ( defined $gidToTid{ $1 } ) #{ # $newIdToTaxId{ $key } = $gidToTid{ $1 } ; #} my $taxId = 1 ; if (defined $gidToTid{ $1 } ) { $taxId = $gidToTid{ $1 } ; } print FP1 "$key\t", $taxId, "\n" ; } else { print FP1 "$key\t1\n" ; } } close FP1 ; system_call( "sort tmp_$output.map | uniq > $output.map" ) ; exit ; } # Organize the tree print STDERR "Step $step: Organizing the taxonomical tree\n"; ++$step ; open FP1, "$taxPath/nodes.dmp" or die "Couldn't open $taxPath/nodes.dmp: $!"; while ( ) { chomp ; my @cols = split ; #next if ( !defined $tidToGid{ $cols[0] } ) ; my $tid = $cols[0] ; my $parentTid = $cols[2] ; my $rank = $cols[4] ; #print "subspecies: $tid $parentTid\n" ; #push @{ $species{ $parentTid } }, $tid ; #$tidToSpecies{ $tid } = $parentTid ; $taxTree{ $cols[0] } = $cols[2] ; #print $cols[0], "=>", $cols[2], "\n" ; if ( $rank eq "species" ) { $species{ $cols[0] } = 1 ; } elsif ( $rank eq "genus" ) { $genus{ $cols[0] } = 1 ; } } close FP1 ; # Put the sub-species taxonomy id into the corresponding species. print STDERR "Step $step: Putting the sub-species taxonomy id into the corresponding species\n"; ++$step ; for $i ( keys %tidToGid ) { my $p = $i ; my $found = 0 ; while ( 1 ) { last if ( $p <= 1 ) ; if ( defined $species{ $p } ) { $found = 1 ; last ; } if ( defined $taxTree{ $p } ) { $p = $taxTree{ $p } ; } else { last ; } } if ( $found == 1 ) { push @{ $speciesList{ $p } }, $i ; } } print STDERR "Step $step: Reading the name of the species\n"; ++$step ; open FP1, "$taxPath/names.dmp" or die "Could not open $taxPath/names.dmp: $!"; while ( ) { next if (!/scientific name/ ) ; my @cols = split /\t/ ; if ( defined $species{ $cols[0] } ) { $speciesIdToName{ $cols[0] } = $cols[2] ; } } close FP1 ; #exit ; sub GetGenomeSize { open FPfa, $_[0] ; my $size = 0 ; while ( ) { next if ( /^>/ ) ; $size += length( $_ ) - 1 ; } close FPfa ; return $size ; } # Compress one species. sub solve { # Extracts the files #print "find $pwd -name *.fna > tmp.list\n" ; my $tid = threads->tid() - 1 ; #system_call("find $pwd -name *.fna > tmp_$tid.list") ; #system_call("find $pwd -name *.fa >> tmp.list") ; # Just in case # Build the header my $genusId ; my $speciesId = $_[0] ; my $speciesName ; my $i ; my $file ; my @subspeciesList = @{ $speciesList{ $speciesId } } ; $genusId = $taxTree{ $speciesId } ; while ( 1 ) { if ( !defined $genusId || $genusId <= 1 ) { $genusId = 0 ; last ; } if ( defined $genus{ $genusId } ) { last ; } else { $genusId = $taxTree{ $genusId } ; } } my $FP1 ; open FP1, ">tmp_$tid.list" ; my $genomeSize = 0 ; my $avgGenomeSize = 0 ; foreach $i ( @subspeciesList ) { foreach my $j ( @{$tidToGid{ $i } } ) { #{ # lock( $debug ) ; # print "Merge ", $gidToFile{ $j }, "\n" ; #} $file = $gidToFile{ $j } ; { lock( $fileUsedLock ) ; $fileUsed{ $file } = 1 ; } print FP1 $file, "\n" ; my $tmp = GetGenomeSize( $file ) ; if ( $tmp > $genomeSize ) { $genomeSize = $tmp ; } $avgGenomeSize += $tmp ; } } close FP1 ; #$genomeSize = int( $genomeSize / scalar( @subspeciesList ) ) ; $avgGenomeSize = int( $avgGenomeSize / scalar( @subspeciesList ) ) ; #print $file, "\n" ; #if ( $file =~ /\/(\w*?)uid/ ) #{ # $speciesName = ( split /_/, $1 )[0]."_". ( split /_/, $1 )[1] ; #} if ( defined $speciesIdToName{ $speciesId } ) { $speciesName = $speciesIdToName{ $speciesId } ; $speciesName =~ s/ /_/g ; } else { $speciesName = "Unknown_species_name" ; } my $id = $speciesId ;#( $speciesId << 32 ) | $genusId ; my $header = ">cid|$id $speciesName $avgGenomeSize ".scalar( @subspeciesList ) ; print STDERR "$header\n" ; { lock( $idMapLock ) ; $newIdToTaxId{ "cid|$id" } = $speciesId ; $idToGenomeSize{ "cid|$id" } = $avgGenomeSize ; } #return ; # Build the sequence my $seq = "" ; if ( $noCompress == 0 && ( $maxGenomeSizeForCompression < 0 || $genomeSize <= $maxGenomeSizeForCompression ) ) #$genomeSize < 50000000 ) { system_call("perl $bssPath/centrifuge-BuildSharedSequence.pl tmp_$tid.list -prefix tmp_${tid}_$id" ) ; # Merge all the fragmented sequence into one big chunk. system_call("cat tmp_${tid}_${id}_*.fa > tmp_${tid}_$id.fa"); open FP1, "tmp_${tid}_$id.fa" ; while ( ) { chomp ; next if ( /^>/ ) ; $seq .= $_ ; } close FP1 ; } else { foreach $i ( @subspeciesList ) { foreach my $j ( @{$tidToGid{ $i } } ) { $file = $gidToFile{ $j } ; open FP1, $file ; while ( ) { #chomp ; next if ( /^>/ ) ; $seq .= $_ ; } close FP1 ; } } } open fpOut, ">>${output}_${tid}" ; print fpOut "$header\n$seq\n" ; close fpOut ; unlink glob("tmp_${tid}_*"); } sub system_call { print STDERR "SYSTEM CALL: ".join(" ",@_); system(@_) == 0 or die "system @_ failed: $?"; print STDERR " finished\n"; } sub threadWrapper { my $tid = threads->tid() - 1 ; unlink("${output}_${tid}"); while ( 1 ) { my $u ; { lock $speciesUsed ; $u = $speciesUsed ; ++$speciesUsed ; } last if ( $u >= scalar( @speciesListKey ) ) ; solve( $speciesListKey[ $u ] ) ; } } print STDERR "Step $step: Merging sub-species\n"; ++$step ; my @threads ; @speciesListKey = keys %speciesList ; $speciesUsed = 0 ; for ( $i = 0 ; $i < $numOfThreads ; ++$i ) { push @threads, $i ; } foreach (@threads) { $_ = threads->create( \&threadWrapper ) ; } foreach (@threads) { $_->join() ; } # merge the files generated from each threads system_call("cat ${output}_* > tmp_output.fa"); unlink glob("${output}_*"); #print unused files foreach $i ( keys %fileUsed ) { if ( $fileUsed{ $i } == 0 ) { #print $i, "\n" ; #`cat $i >> tmp_output.fa` ; print "Unused file: $i\n" ; } } # Remove the Ns from the file if ( $noDustmasker == 1 ) { system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ; } else { system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ; system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ; system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ; } # Output the mapping of the ids to species open FP1, ">$output.map" ; foreach my $key (keys %newIdToTaxId ) { print FP1 "$key\t", $newIdToTaxId{ $key }, "\n" ; } close FP1 ; # Output the genome sizem map open FP1, ">$output.size" ; foreach my $key ( keys %newIdToTaxId ) { print FP1 $newIdToTaxId{ $key }, "\t", $idToGenomeSize{ $key }, "\n" ; } close FP1 ; unlink glob("tmp_*") ;