//****************************************************************************** // // File: FindProteinSeq.java // Package: edu.rit.compbio.seq // Unit: Class edu.rit.compbio.seq.FindProteinSeq // // This Java source file is copyright (C) 2008 by Alan Kaminsky. All rights // reserved. For further information, contact the author, Alan Kaminsky, at // ark@cs.rit.edu. // // This Java source file is part of the Parallel Java Library ("PJ"). PJ is free // software; you can redistribute it and/or modify it under the terms of the GNU // General Public License as published by the Free Software Foundation; either // version 3 of the License, or (at your option) any later version. // // PJ is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU General Public License for more details. // // Linking this library statically or dynamically with other modules is making a // combined work based on this library. Thus, the terms and conditions of the // GNU General Public License cover the whole combination. // // As a special exception, the copyright holders of this library give you // permission to link this library with independent modules to produce an // executable, regardless of the license terms of these independent modules, and // to copy and distribute the resulting executable under terms of your choice, // provided that you also meet, for each linked independent module, the terms // and conditions of the license of that module. An independent module is a // module which is not derived from or based on this library. If you modify this // library, you may extend this exception to your version of the library, but // you are not obligated to do so. If you do not wish to do so, delete this // exception statement from your version. // // A copy of the GNU General Public License is provided in the file gpl.txt. You // may also obtain a copy of the GNU General Public License on the World Wide // Web at http://www.gnu.org/licenses/gpl.html. // //****************************************************************************** package edu.rit.compbio.seq; import edu.rit.pj.Comm; import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * Class FindProteinSeq is a sequential program that finds matches for a protein * in a protein sequence database. The query sequence is stored in a file in * FASTA format; for further information, see class {@linkplain * ProteinSequence}. The protein sequence database is stored in two files, a * database file in FASTA format and an index file; for further information, see * class {@linkplain ProteinDatabase}. The program uses the Smith-Waterman * algorithm to compute a local alignment between the query sequence and each * subject sequence in the database. The program uses the BLOSUM-62 protein * substitution matrix. The program uses affine gap penalties with a gap * existence penalty of −11 and a gap extension penalty of −1. The * program prints on the standard output the resulting alignments from highest * to lowest score. The program only prints alignments with an E-value * below the given threshold; if not specified, the default is 10. *

* The program does all the alignments sequentially, in a single thread. The * program uses class {@linkplain ProteinLocalAlignmentSeq} to do the * alignments. *

* Usage: java edu.rit.compbio.seq.FindProteinSeq queryfile * databasefile indexfile [ expect ] *
queryfile = Query sequence file *
databasefile = Protein sequence database file *
indexfile = Protein sequence index file *
expect = E-value threshold (default: 10) * * @author Alan Kaminsky * @version 01-Jul-2008 */ public class FindProteinSeq { // Prevent construction. private FindProteinSeq() { } // Global variables. // Command line arguments. static File queryfile; static File databasefile; static File indexfile; static double expect; // Main program. /** * Main program. */ public static void main (String[] args) throws Exception { // Start timing. long t1 = System.currentTimeMillis(); Comm.init (args); // Parse command line arguments. if (3 > args.length || args.length > 4) usage(); queryfile = new File (args[0]); databasefile = new File (args[1]); indexfile = new File (args[2]); expect = 10.0; if (args.length == 4) expect = Double.parseDouble (args[3]); // Set up query sequence. ProteinSequence query = new ProteinSequence (queryfile); // Set up protein sequence database. ProteinDatabase database = new ProteinDatabase (databasefile, indexfile); // Set up object to compute alignment statistics. AlignmentStats stats = new DefaultAlignmentStats (database.getDatabaseLength()); // Set up list to hold alignments. List alignments = new ArrayList(); // Set up object to perform alignments. ProteinLocalAlignment aligner = new ProteinLocalAlignmentSeq(); aligner.setQuerySequence (query, 0); long t2 = System.currentTimeMillis(); // Align query sequence against every subject sequence. for (long id = 0; id < database.getProteinCount(); ++ id) { ProteinSequence subject = database.getProteinSequence (id); aligner.setSubjectSequence (subject, id); Alignment a = aligner.align(); if (stats.eValue (a) <= expect) { alignments.add (a); } } long t3 = System.currentTimeMillis(); // Sort alignments into descending order of score. Collections.sort (alignments); // Set up alignment printer. AlignmentPrinter printer = new AlignmentPrinter (System.out, stats); // Print query sequence. System.out.println ("Query Description:"); System.out.println (query.description()); System.out.println ("Length = "+query.length()); System.out.println(); // Print summary of each alignment. System.out.println (" Bit E-"); System.out.println ("Subject Description Score Value"); for (Alignment a : alignments) { printer.printSummary (a, database.getProteinSequence (a.getSubjectId())); } System.out.println(); // Print details of each alignment. for (Alignment a : alignments) { printer.printDetails (a, query, database.getProteinSequence (a.getSubjectId())); } // Print various information about the alignment procedure. System.out.println ("Query file: "+queryfile); System.out.println ("Database file: "+databasefile); System.out.println ("Database index file: "+indexfile); System.out.println ("Number of sequences: "+database.getProteinCount()); System.out.println ("Number of matches: "+alignments.size()); System.out.println ("Query length: "+query.length()); System.out.println ("Database length: "+database.getDatabaseLength()); stats.print (System.out); System.out.println(); // All done. database.close(); long t4 = System.currentTimeMillis(); System.out.println ((t2-t1)+" msec pre"); System.out.println ((t3-t2)+" msec calc"); System.out.println ((t4-t3)+" msec post"); System.out.println ((t4-t1)+" msec total"); } // Hidden operations. /** * Print a usage message and exit. */ private static void usage() { System.err.println ("Usage: java edu.rit.compbio.seq.FindProteinSeq []"); System.err.println (" = Query sequence file"); System.err.println (" = Protein sequence database file"); System.err.println (" = Protein sequence index file"); System.err.println (" = E-value threshold (default: 10)"); System.exit (1); } }