#!/usr/bin/perl


########################################################################
# Perl script to download the Placnet RefDB database of genomes and    #
# plasmids from NCBI databases. The script downloads all complete      #
# genomes from RefSeq bacteria and all isolated plasmids (without an   #
# associated chromosome). Additionally, the script creates a           #
# headersRefDB.txt file to import description information into Placnet #
# networks.                                                            #
#                                                                      #
#                                                                      #
# Just run: ./makeRefDB                                                #
#                                                                      #
# outputs: RefDB.XX.nXX (blast nucleotide database)                    #
#          headersRefDB.txt (TAB file with genome description)         #
########################################################################

# Step 1: fetch the RefSeq bacteria assembly index and write down.list,
# one download URL per line, for every record that mentions
# "Complete Genome".
print("\n\nDownloading index of RefSeq Bacteria Database\n");
system("wget -nv --show-progress ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt") == 0
	or die "Could not download assembly_summary.txt (wget exit status " . ($? >> 8) . ")\n";

# Zero-based index of the column holding the assembly's FTP directory.
# The path is always read by position: using the last column ($c[-1])
# only works while the FTP path happens to be the final field of a row.
my $ftp_path_column = 19;

open(my $summary_fh, '<', 'assembly_summary.txt')
	or die "Cannot open assembly_summary.txt: $!\n";
open(my $list_fh, '>', 'down.list')
	or die "Cannot create down.list: $!\n";

while (my $record = <$summary_fh>)
{
	next if $record =~ /^#/;                     # comment / column-header lines
	next unless $record =~ /Complete Genome/;
	chomp $record;

	my @fields   = split(/\t/, $record);
	my $ftp_path = $fields[$ftp_path_column];

	# Skip rows without a usable path ('na' marks a missing value).
	next if !defined($ftp_path) || $ftp_path eq '' || $ftp_path eq 'na';

	# The sequence file is named after the last component of the
	# assembly directory: <dir>/<last component>_genomic.fna.gz
	my $assembly_dir = (split(/\//, $ftp_path))[-1];
	print $list_fh "$ftp_path/${assembly_dir}_genomic.fna.gz\n";
}

close($summary_fh);
close($list_fh) or die "Error while writing down.list: $!\n";

# Step 2: download every genome listed in down.list plus the RefSeq plasmid
# files, decompress them, concatenate the plasmid files and collect the
# sequence identifiers found in the genome files (acc.txt).
# Failures here are reported but not fatal: a bulk download can lose a few
# files and still leave a usable data set.
print ("\nDownloading complete genomes...\n");
system("wget -nv --show-progress -i down.list") == 0
	or warn "WARNING: some genomes could not be downloaded (wget exit status " . ($? >> 8) . ")\n";

print ("\nDownloading complete plasmids...\n");
# The URL is quoted so the wildcard reaches wget untouched instead of
# being offered to the shell for local filename expansion.
system("wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plasmid/*genomic.fna*'") == 0
	or warn "WARNING: plasmid download reported errors (wget exit status " . ($? >> 8) . ")\n";

print("Decompressing files...\n");
system("gzip -d *.gz") == 0
	or warn "WARNING: gzip reported errors (exit status " . ($? >> 8) . ")\n";
system("cat plasmid*.fna > all_plasmid_tmp.fna") == 0
	or warn "WARNING: could not concatenate plasmid files (exit status " . ($? >> 8) . ")\n";

# First whitespace-delimited token after '>' of every genome header line.
system("grep '>' GC*fna | cut -f2 --delimiter='>' | cut -f1 --delimiter=' ' > acc.txt") == 0
	or warn "WARNING: could not build acc.txt (exit status " . ($? >> 8) . ")\n";

#### Removing duplicates between plasmid*.fna and GC*.fna:
#### copy all_plasmid_tmp.fna to all_plasmid_nr.fna, dropping every record
#### whose accession is already listed in acc.txt (i.e. present in a genome).

# Reduce a FASTA sequence identifier (the first token after '>') to its bare
# accession so identifiers written in different styles can be compared:
#   gi|GI|db|ACCESSION|  -> ACCESSION   (4th field, as the original code read it)
#   db|ACCESSION|        -> ACCESSION
#   ACCESSION            -> ACCESSION
sub _refdb_accession_of
{
	my ($seq_id) = @_;
	my @parts = split(/\|/, $seq_id);
	return $parts[3] if @parts >= 4;
	return $parts[1] if @parts >= 2;
	return $seq_id;
}

# Accessions already provided by the genome files.
my %genome_accession;
open(my $acc_fh, '<', 'acc.txt')
	or die "Cannot open acc.txt: $!\n";
while (my $entry = <$acc_fh>)
{
	chomp $entry;
	next if $entry eq '';      # a blank line must not become a hash key
	$genome_accession{ _refdb_accession_of($entry) } = 1;
}
close($acc_fh);

open(my $plasmid_in, '<', 'all_plasmid_tmp.fna')
	or die "Cannot open all_plasmid_tmp.fna: $!\n";
open(my $plasmid_out, '>', 'all_plasmid_nr.fna')
	or die "Cannot create all_plasmid_nr.fna: $!\n";

# State carried from a header line to the sequence lines that follow it:
# true while the current record has to be copied to the output.
my $keep_record = 1;
while (my $line = <$plasmid_in>)
{
	# Only a line *starting* with '>' opens a new record.
	if ($line =~ /^>/)
	{
		my ($seq_id) = $line =~ /^>(\S+)/;
		if (defined($seq_id) && exists($genome_accession{ _refdb_accession_of($seq_id) }))
		{
			$keep_record = 0;
		}
		else
		{
			$keep_record = 1;
		}
	}

	print $plasmid_out $line if $keep_record;
}

close($plasmid_in);
# Close (and therefore flush) the output before later steps read the file.
close($plasmid_out) or die "Error while writing all_plasmid_nr.fna: $!\n";


# Step 4: merge genomes and non-redundant plasmids, tag every header with
# the "refDB|" prefix, build the BLAST nucleotide database and export the
# header table. Each step feeds the next one, so a failure stops the script
# instead of reporting success on an incomplete database.
system("cat GC* all_plasmid_nr.fna > all.fasta") == 0
	or die "Could not build all.fasta (cat exit status " . ($? >> 8) . ")\n";

print ("Making Blast Datadase...\n");
system("sed -i 's/>/>refDB|/' all.fasta") == 0
	or die "Could not tag headers in all.fasta (sed exit status " . ($? >> 8) . ")\n";
system("makeblastdb -in all.fasta -out RefDB -dbtype nucl") == 0
	or die "makeblastdb failed (exit status " . ($? >> 8) . ")\n";

# headersRefDB.txt: one line per sequence, "<id><TAB><description>".
# The \t is expanded by Perl, so sed receives a literal tab character.
system("grep '>' all.fasta | sed 's/ /\t/' | sed 's/>//' > headersRefDB.txt") == 0
	or die "Could not write headersRefDB.txt (exit status " . ($? >> 8) . ")\n";

# Optional clean-up of intermediate files (left disabled):
#system("rm all_plasmid_tmp.fna acc.txt plasmid*.fna");


print("\n\nFINISHED\n");
