1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
|
package prok;
import java.util.ArrayList;
import java.util.HashMap;
import fileIO.FileFormat;
import fileIO.TextStreamWriter;
import server.ServerTools;
import shared.Parse;
import shared.Tools;
import template.ThreadWaiter;
/** Crawls NCBI's FTP site and writes wget commands for downloading genomes and annotations. */
public class FetchProks {
/**
 * Program entry point.  Lists the directory at the base address, divides the
 * entries among worker threads, waits for them, and prints summary totals to
 * stderr.  The download commands themselves are written to the output writer.
 *
 * @param args args[0]: base address to crawl (required), for example
 *             ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/ ;
 *             args[1]: output destination (optional, defaults to "stdout");
 *             args[2]: value for maxSpeciesPerGenus (optional integer);
 *             args[3]: value for findBest (optional boolean).
 * @throws IllegalArgumentException if the base address is missing
 */
public static void main(String[] args){
	//Fail with a usage message instead of a bare ArrayIndexOutOfBoundsException
	if(args==null || args.length<1){
		throw new IllegalArgumentException(
				"Usage: FetchProks <baseAddress> [out] [maxSpeciesPerGenus] [findBest]");
	}

	//ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/
	String baseAddress=args[0];
	String out=args.length>1 ? args[1] : "stdout";
	if(args.length>2){
		maxSpeciesPerGenus=Integer.parseInt(args[2]);
		System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus);
	}
	if(args.length>3){
		findBest=Parse.parseBoolean(args[3]);
		System.err.println("Set findBest="+findBest);
	}

	TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT);
	tsw.start();

	ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);

	//Every thread receives the full listing and picks its own share by hash
	int threads=7;
	ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
	for(int i=0; i<threads; i++){
		alpt.add(new ProcessThread(contents, tsw, i, threads));
	}
	for(ProcessThread pt : alpt){pt.start();}
	boolean success=ThreadWaiter.waitForThreadsToFinish(alpt);

	//Fold the per-thread counters into the static totals
	for(ProcessThread pt : alpt){
		totalSpecies+=pt.totalSpeciesT;
		totalGenus+=pt.totalGenusT;
		totalGenomes+=pt.totalGenomesT;
	}
	System.err.println("Total Genomes: "+totalGenomes);
	System.err.println("Total Species: "+totalSpecies);
	System.err.println("Total Genuses: "+totalGenus);

	//Close the writer before checking success so buffered output is not lost
	tsw.poisonAndWait();
	assert(success);
}
/**
 * Worker that walks the complete species listing and handles only the entries
 * whose hash maps to this thread's id.
 */
static class ProcessThread extends Thread {

	/**
	 * @param speciesList_ Full directory listing; every worker gets the same list
	 * @param tsw_ Writer that receives the generated commands
	 * @param tid_ This worker's id, compared against the hash modulo threads_
	 * @param threads_ Total number of workers
	 */
	ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){
		speciesList=speciesList_;
		tsw=tsw_;
		tid=tid_;
		threads=threads_;
	}

	/** Processes each non-null entry that is assigned to this worker. */
	@Override
	public void run(){
		for(final String entry : speciesList){
			if(entry==null){continue;}
			final String genus=getGenus(entry);
			//Hash the genus when there is one, so one thread handles an entire genus;
			//otherwise fall back to hashing the entry itself.
			final String key=(genus==null ? entry : genus);
			final int owner=(key.hashCode()&Integer.MAX_VALUE)%threads;
			if(owner==tid){
				processSpecies(entry);
			}
		}
	}

	/**
	 * Examines one species unless its genus is unparseable or the genus has
	 * already reached maxSpeciesPerGenus, and updates this thread's counters.
	 * @param species Path of the species directory
	 */
	void processSpecies(String species){
		final String genus=getGenus(species);
		if(genus==null){
			if(verbose){System.err.println("bad species: "+species+"\n"+genus);}
			return;
		}

		final int count=seen(genus, seen);
		final boolean underLimit=(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus);
		if(!underLimit){
			if(verbose){System.err.println("same genus: "+species+"\n"+genus);}
			return;
		}

		final int found=examineSpecies(species, tsw);
		if(found<1){return;}

		totalSpeciesT++;
		totalGenomesT+=found;
		if(count==0){totalGenusT++;}
		put(genus, found, seen);
	}

	/** Full listing shared by all workers */
	final ArrayList<String> speciesList;
	/** Id of this worker */
	final int tid;
	/** Total number of workers */
	final int threads;
	/** Per-genus tally for this worker; unsynchronized, which is OK now that threads work on a per-genus basis */
	HashMap<String, Integer> seen=new HashMap<String, Integer>();
	/** Destination for generated commands */
	final TextStreamWriter tsw;
	/** Species for which at least one genome was found by this worker */
	int totalSpeciesT=0;
	/** Genera first encountered by this worker */
	int totalGenusT=0;
	/** Sum of the values returned by examineSpecies for this worker */
	int totalGenomesT=0;
}
/**
 * Extracts the genus from a species path: takes the text after the last '/',
 * drops a leading "Candidatus_", and returns everything before the first
 * underscore.
 * @param path Species path or directory name, e.g. ".../Candidatus_Hamiltonella_defensa"
 * @return The genus, or null when no underscore follows a non-empty genus
 */
static String getGenus(String path){
	final String marker="Candidatus_";
	String leaf=path.substring(path.lastIndexOf('/')+1);
	if(leaf.startsWith(marker)){
		leaf=leaf.substring(marker.length());
	}
	final int cut=leaf.indexOf('_');
	return (cut>0 ? leaf.substring(0, cut) : null);
}
/**
 * Extracts the species name from a path: the text after the last '/',
 * with a leading "Candidatus_" removed if present.
 * @param path Species path or directory name
 * @return The trailing path element without the "Candidatus_" prefix
 */
static String getSpecies(String path){
	final String marker="Candidatus_";
	final String leaf=path.substring(path.lastIndexOf('/')+1);
	return leaf.startsWith(marker) ? leaf.substring(marker.length()) : leaf;
}
/**
 * Lists a species directory and examines its assembly subdirectories in
 * priority order: entries containing "reference" first, then
 * "latest_assembly_versions", then "all_assembly_versions".  A later tier is
 * only consulted when every earlier tier produced nothing.
 * @param baseAddress Path of the species directory
 * @param tsw Writer that receives the generated commands
 * @return Sum of the examineAssemblies results for the first tier that found anything, else 0
 */
static int examineSpecies(String baseAddress, TextStreamWriter tsw){
	if(verbose){System.err.println("examineSpecies: "+baseAddress);}
	final String speciesName=getSpecies(baseAddress);
	final ArrayList<String> listing=ServerTools.listDirectory(baseAddress, retries);

	final String[] tiers={"reference", "latest_assembly_versions", "all_assembly_versions"};
	for(final String tier : tiers){
		int found=0;
		for(final String entry : listing){
			if(entry.contains(tier)){
				found+=examineAssemblies(entry, tsw, speciesName);
			}
		}
		if(found>0){return found;}
	}
	return 0;
}
/**
 * Examines the assemblies under one directory and stops at the first one that
 * yields output.  When findBest is set, the assembly chosen by
 * findBestAssembly is tried first; if that yields nothing, the directory is
 * scanned in listing order.
 * @param baseAddress Path of the directory holding the assemblies
 * @param tsw Writer that receives the generated commands
 * @param speciesName Name assigned to each Stats and used in output file names
 * @return The examineAssembly result of the first successful assembly, or 0
 */
static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){
	if(verbose){System.err.println("examineAssemblies: "+baseAddress);}

	if(findBest){
		final Stats best=findBestAssembly(baseAddress);
		if(best!=null){
			best.name=speciesName;
			final int printed=examineAssembly(best, tsw, speciesName);
			if(printed>0){return printed;}
		}
	}

	final ArrayList<String> listing=ServerTools.listDirectory(baseAddress, retries);
	for(final String candidate : listing){
		final Stats stats=calcStats(candidate);
		if(stats==null){continue;}
		stats.name=speciesName;
		final int printed=examineAssembly(stats, tsw, speciesName);
		if(printed>0){return printed;}
	}
	return 0;
}
/**
 * Computes Stats for every entry of a directory and returns the one that
 * ranks highest under Stats.compareTo.  Entries for which calcStats returns
 * null are skipped; on ties the earlier entry is kept.
 * @param baseAddress Path of the directory holding the assemblies
 * @return The highest-ranked Stats, or null if no entry produced any
 */
static Stats findBestAssembly(String baseAddress){
	if(verbose){System.err.println("findBestAssembly: "+baseAddress);}
	final ArrayList<String> listing=ServerTools.listDirectory(baseAddress, retries);

	Stats winner=null;
	for(final String entry : listing){
		final Stats candidate=calcStats(entry);
		if(candidate==null){continue;}
		if(winner==null || candidate.compareTo(winner)>0){
			winner=candidate;
		}
	}
	return winner;
}
/**
 * Builds summary statistics for one assembly directory from the file in it
 * whose name ends with "_assembly_report.txt".
 *
 * Lines starting with '#' are treated as header lines; the one starting with
 * "# Taxid:" supplies the taxonomy id.  Every other non-empty line counts as
 * one contig, whose length is read from the ninth tab-separated field.  A
 * missing or non-numeric length is counted as 1.
 *
 * @param baseAddress Path of the assembly directory
 * @return Stats for the assembly, or null if the directory lists no report
 *         or the report could not be read
 */
static Stats calcStats(String baseAddress){
	if(verbose){System.err.println("calcStats: "+baseAddress);}
	ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);

	//Locate the report; the first match wins
	String report=null;
	for(String s : contents){
		if(s.endsWith("_assembly_report.txt")){
			report=s;
			break;
		}
	}
	if(report==null){
		if(verbose){System.err.println("Could not find report for "+baseAddress);}
		return null;
	}
	if(verbose){System.err.println("Report: "+report);}

	//Fetch the report, retrying after failures
	ArrayList<String> data=null;
	for(int i=0; i<=retries && data==null; i++){
		try {
			data = ServerTools.readFTPFile(report);
		} catch (Exception e) {
			e.printStackTrace();
			try {
				//Delay grows with the attempt number; presumably Tools.mid clamps
				//it to the 1000-10000 ms range - confirm against Tools.mid
				Thread.sleep(Tools.mid(10000, i*1000, 1000));
			} catch (InterruptedException e1) {
				//Preserve the interrupt for callers and stop retrying,
				//rather than swallowing it and looping on
				Thread.currentThread().interrupt();
				break;
			}
		}
	}
	if(data==null){return null;}

	int contigs=0;
	long size=0;
	long max=0;
	int taxid=-1;
	for(String s : data){
		if(s!=null && s.length()>0){
			if(s.charAt(0)=='#'){
				if(s.startsWith("# Taxid:")){
					String[] split=Tools.whitespacePlus.split(s);
					try {
						taxid=Integer.parseInt(split[split.length-1]);
					} catch (NumberFormatException e) {
						e.printStackTrace();
					}
					assert(taxid>-1) : "Bad TaxID: '"+s+"'";
				}
			}else{
				String[] split=s.split("\t");
				contigs++;
				//Default covers both a short row and an unparseable length;
				//previously a short row threw ArrayIndexOutOfBoundsException
				long len=1;
				if(split.length>8){
					try {
						len=Long.parseLong(split[8]);
					} catch (NumberFormatException e) {
						len=1;
					}
				}
				size+=len;
				max=Tools.max(max, len);
			}
		}
	}
	return new Stats(baseAddress, max, size, contigs, taxid);
}
/**
 * Looks in one assembly directory for a genomic fasta ("genomic.fna.gz") and
 * a genomic gff ("genomic.gff.gz"), ignoring names containing
 * "_from_genomic".  When both exist, writes a pair of wget commands followed
 * by a blank line; the command form depends on renameSequences and
 * renameFiles.
 * @param stats Supplies the directory path and the taxID used in file names
 * @param tsw Writer that receives the commands; locked while the group is written
 * @param speciesName Base name for renamed output files
 * @return 1 if commands were written, 0 if either file was missing
 */
static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){
	if(verbose){System.err.println("examineAssembly: "+stats.path);}
	final ArrayList<String> listing=ServerTools.listDirectory(stats.path, retries);

	//If several names match, the last one in the listing is used
	String fna=null;
	String gff=null;
	for(final String file : listing){
		if(file.contains("_from_genomic")){continue;}
		if(file.endsWith("genomic.fna.gz")){
			fna=file;
		}else if(file.endsWith("genomic.gff.gz")){
			gff=file;
		}
	}
	if(fna==null || gff==null){return 0;}

	System.err.println("Printing: "+fna);
	final String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : "");
	final String outBase=prefix+speciesName;

	final String fnaLine;
	final String gffLine;
	if(renameSequences){
		fnaLine="wget -q -O - "+fna+" | "
				+"gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+outBase+".fna.gz";
		gffLine="wget -q -O - "+gff+" | "
				+"gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+outBase+".gff.gz";
	}else if(renameFiles){
		fnaLine="wget -q -O - "+fna+" > "+outBase+".fna.gz";
		gffLine="wget -q -O - "+gff+" > "+outBase+".gff.gz";
	}else{
		fnaLine="wget -q "+fna;
		gffLine="wget -q "+gff;
	}

	//Hold the writer so the lines of one assembly stay together
	synchronized(tsw){
		tsw.println(fnaLine);
		tsw.println(gffLine);
		tsw.println();
	}
	return 1;
}
/**
 * Joins a base address and the part of an extension that follows its first
 * '/', inserting a '/' separator after the base when it lacks one.  An
 * extension without any '/' is appended whole.
 * @param baseAddress Parent address, with or without a trailing '/'
 * @param extension Relative path whose leading element is discarded
 * @return The combined address
 */
static String makeSubAddress(String baseAddress, String extension){
	final String parent=(baseAddress.endsWith("/") ? baseAddress : baseAddress+"/");
	final int firstSlash=extension.indexOf('/');
	final String tail=extension.substring(firstSlash+1);
	return parent+tail;
}
/**
 * Looks up the tally stored for a key.  No locking is performed here.
 * @param s Key to look up
 * @param map Tally map
 * @return The stored value, or 0 when the key is absent or mapped to null
 */
static int seen(String s, HashMap<String, Integer> map){
	final Integer tally=map.get(s);
	if(tally==null){return 0;}
	return tally.intValue();
}
/**
 * Adds an amount to the tally stored for a key, treating an absent key as 0.
 * No locking is performed here.
 * @param s Key to update
 * @param found Amount to add
 * @param map Tally map
 */
static void put(String s, int found, HashMap<String, Integer> map){
	final int updated=seen(s, map)+found;
	map.put(s, updated);
}
/**
 * Summary of one assembly, ordered so that a greater value means a more
 * desirable assembly.
 */
static class Stats implements Comparable<Stats>{

	/**
	 * @param path_ Path of the assembly directory
	 * @param maxContig_ Length of the longest contig
	 * @param size_ Sum of all contig lengths
	 * @param contigs_ Number of contigs
	 * @param taxID_ Taxonomy id; values below 1 are treated as missing by compareTo
	 */
	public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){
		path=path_;
		maxContig=maxContig_;
		size=size_;
		contigs=contigs_;
		taxID=taxID_;
	}

	/**
	 * Ranks this assembly against another.  Criteria, in order: having a
	 * positive taxID; being more than twice the other's size; having the
	 * longer maximum contig; having fewer contigs.
	 *
	 * The size test is applied symmetrically, so a.compareTo(b) and
	 * b.compareTo(a) always have opposite signs.  Because of the 2x
	 * threshold the ordering is not guaranteed to be transitive.
	 *
	 * @param b Assembly to compare against; null ranks below everything
	 * @return Positive if this is better, negative if b is better, 0 if tied
	 */
	@Override
	public int compareTo(Stats b) {
		if(b==null){return 1;}
		if(taxID>0 && b.taxID<1){return 1;}
		if(b.taxID>0 && taxID<1){return -1;}

		//Only a greater-than-2x size difference is decisive, in either direction
		if(size>2*b.size){return 1;}
		if(b.size>2*size){return -1;}

		if(maxContig>b.maxContig){return 1;}
		if(maxContig<b.maxContig){return -1;}

		//Fewer contigs is better
		if(contigs<b.contigs){return 1;}
		if(contigs>b.contigs){return -1;}
		return 0;
	}

	/** Path of the assembly directory */
	String path;
	/** Species name; not set by the constructor */
	String name;
	/** Length of the longest contig */
	long maxContig;
	/** Sum of all contig lengths */
	long size;
	/** Number of contigs */
	int contigs;
	/** Taxonomy id */
	int taxID;
}
/** When true, progress and diagnostic messages are printed to stderr. */
static boolean verbose=true;
// static boolean allowSameGenus=false;
/** Per-genus limit checked in processSpecies; a value below 1 disables the limit.  Set from args[2]. */
static int maxSpeciesPerGenus=1;
/** When renameSequences is false and this is true, emitted wget commands redirect into files named after the species. */
static boolean renameFiles=true;
/** When true, emitted wget commands are piped through gi2taxid.sh; checked before renameFiles. */
static boolean renameSequences=true;
/** Passed to ServerTools.listDirectory, and bounds the report-fetch retry loop in calcStats. */
static int retries=40;
/** When true, examineAssemblies first tries the assembly chosen by findBestAssembly.  Set from args[3]. */
static boolean findBest=false;
/** When true, output file names are prefixed with "tid_" plus the taxID and an underscore. */
static boolean tidInFilename=true;
// private static HashMap<String, Integer> seen=new HashMap<String, Integer>();
/** Sum of the per-thread species counters, accumulated in main. */
static int totalSpecies=0;
/** Sum of the per-thread genus counters, accumulated in main. */
static int totalGenus=0;
/** Sum of the per-thread genome counters, accumulated in main. */
static int totalGenomes=0;
/** Boxed constant 1; NOTE(review): not referenced anywhere in the visible code. */
private static final Integer one=1;
}
|