File: makeRefDB.pl

package info (click to toggle)
placnet 1.04-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 140 kB
  • sloc: perl: 691; sh: 21; makefile: 8
file content (94 lines) | stat: -rw-r--r-- 2,601 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl


########################################################################
# Perl script to download the placnet RefDB database of genomes and    #
# plasmids from NCBI databases. The script downloads all complete      #
# genomes from RefSeq bacteria and all isolated plasmids (without an   #
# associated chromosome). Additionally, the script creates a           #
# headersRefDB.txt file to import description information into         #
# Placnet networks.                                                    #
#                                                                      #
# Just run: ./makeRefDB                                                #
#                                                                      #
# outputs: RefDB.XX.nXX (blast nucleotide database)                    #
#          headersRefDB.txt (TAB file with genome description)         #
########################################################################

use strict;
use warnings;

# Zero-based index of the "ftp_path" column in NCBI's assembly_summary.txt.
# Newer versions of that file append further columns after it, so the path
# must be addressed by position rather than as "the last column".
my $FTP_PATH_COLUMN = 19;

# Run a shell command and report (but do not abort on) a failure.
# The pipeline is deliberately best-effort: a single failed download out of
# thousands should not throw away the rest of the run.
# Returns the raw status from system().
sub run_step {
    my ($command) = @_;
    my $status = system($command);
    if ($status == -1) {
        warn "WARNING: could not run '$command': $!\n";
    }
    elsif ($status != 0) {
        warn "WARNING: command exited with status " . ($status >> 8)
            . ": $command\n";
    }
    return $status;
}

# Extract the sequence accession from a FASTA header (with or without the
# leading '>').  Handles both the legacy "gi|N|ref|ACCESSION|" identifiers
# and the current "ACCESSION description" style.
# Returns '' when the header carries no identifier at all.
sub header_accession {
    my ($header) = @_;
    $header =~ s/^>//;
    my ($id) = split ' ', $header;    # first whitespace-delimited token
    return '' unless defined $id;
    my @parts = split /\|/, $id;
    return $parts[3] if @parts >= 4 && $parts[3] ne '';
    return $id;
}

print "\n\nDownloading index of RefSeq Bacteria Database\n";
run_step("wget -nv --show-progress ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt");

# Build the list of genome FASTA URLs (one per complete assembly).
open(my $summary_fh, '<', 'assembly_summary.txt')
    or die "Cannot read assembly_summary.txt: $!\n";
open(my $list_fh, '>', 'down.list')
    or die "Cannot write down.list: $!\n";

while (my $row = <$summary_fh>) {
    next if $row =~ /^#/;                     # column-description header lines
    next unless $row =~ /Complete Genome/;
    chomp $row;

    my @fields   = split /\t/, $row;
    my $ftp_path = $fields[$FTP_PATH_COLUMN];
    next if !defined $ftp_path || $ftp_path eq '' || $ftp_path eq 'na';

    # The FASTA file is named after the last component of the assembly path.
    my ($assembly_dir) = $ftp_path =~ m{([^/]+)/*\z};
    next unless defined $assembly_dir;

    print {$list_fh} "$ftp_path/${assembly_dir}_genomic.fna.gz\n";
}
close $summary_fh;
close $list_fh or die "Cannot finish writing down.list: $!\n";

print "\nDownloading complete genomes...\n";
run_step("wget -nv --show-progress -i down.list");

print "\nDownloading complete plasmids...\n";
# The URL is quoted so the wildcard is expanded by wget on the FTP server,
# never by the local shell.
run_step("wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plasmid/*genomic.fna*'");

print "Decompressing files...\n";
run_step("gzip -d *.gz");
run_step("cat plasmid*.fna > all_plasmid_tmp.fna");
# acc.txt: the identifier (text between '>' and the first space) of every
# sequence found in the downloaded genome assemblies.
run_step("grep '>' GC*fna | cut -f2 --delimiter='>' | cut -f1 --delimiter=' ' > acc.txt");

#### Remove plasmid records that are already present in the genome files.
open(my $acc_fh, '<', 'acc.txt') or die "Cannot read acc.txt: $!\n";
my %in_genomes;
while (my $acc = <$acc_fh>) {
    chomp $acc;
    next if $acc eq '';
    # Register both the raw identifier and its bare accession so that the
    # lookup works whichever header style each file uses.
    $in_genomes{$acc} = 1;
    $in_genomes{ header_accession($acc) } = 1;
}
close $acc_fh;

open(my $plasmid_fh, '<', 'all_plasmid_tmp.fna')
    or die "Cannot read all_plasmid_tmp.fna: $!\n";
open(my $nr_fh, '>', 'all_plasmid_nr.fna')
    or die "Cannot write all_plasmid_nr.fna: $!\n";

my $keep_record = 1;    # whether lines of the current FASTA record are copied
while (my $line = <$plasmid_fh>) {
    if ($line =~ /^>/) {
        $keep_record = exists $in_genomes{ header_accession($line) } ? 0 : 1;
    }
    print {$nr_fh} $line if $keep_record;
}
close $plasmid_fh;
# Must be closed (flushed) before the shell concatenates the file below.
close $nr_fh or die "Cannot finish writing all_plasmid_nr.fna: $!\n";

run_step("cat GC* all_plasmid_nr.fna > all.fasta");

print "Making Blast Datadase...\n";
run_step("sed -i 's/>/>refDB|/' all.fasta");
run_step("makeblastdb -in all.fasta -out RefDB -dbtype nucl");
# "\t" is interpolated by Perl, so sed receives a literal TAB character.
run_step("grep '>' all.fasta | sed 's/ /\t/' | sed 's/>//' > headersRefDB.txt");

#system("rm all_plasmid_tmp.fna acc.txt plasmid*.fna");


print "\n\nFINISHED\n";