//***************************************************************************** // // File: DnaSequenceList.java // Package: edu.rit.compbio.phyl // Unit: Class edu.rit.compbio.phyl.DnaSequenceList // // This Java source file is copyright (C) 2008 by Alan Kaminsky. All rights // reserved. For further information, contact the author, Alan Kaminsky, at // ark@cs.rit.edu. // // This Java source file is part of the Parallel Java Library ("PJ"). PJ is free // software; you can redistribute it and/or modify it under the terms of the GNU // General Public License as published by the Free Software Foundation; either // version 3 of the License, or (at your option) any later version. // // PJ is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU General Public License for more details. // // Linking this library statically or dynamically with other modules is making a // combined work based on this library. Thus, the terms and conditions of the // GNU General Public License cover the whole combination. // // As a special exception, the copyright holders of this library give you // permission to link this library with independent modules to produce an // executable, regardless of the license terms of these independent modules, and // to copy and distribute the resulting executable under terms of your choice, // provided that you also meet, for each linked independent module, the terms // and conditions of the license of that module. An independent module is a // module which is not derived from or based on this library. If you modify this // library, you may extend this exception to your version of the library, but // you are not obligated to do so. If you do not wish to do so, delete this // exception statement from your version. // // A copy of the GNU General Public License is provided in the file gpl.txt. 
You // may also obtain a copy of the GNU General Public License on the World Wide // Web at http://www.gnu.org/licenses/gpl.html. // //****************************************************************************** package edu.rit.compbio.phyl; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.Arrays; import java.util.Iterator; import java.util.Scanner; /** * Class DnaSequenceList provides a list of {@linkplain DnaSequence}s. Methods * for reading and writing textual files of DNA sequences are provided. *

* Each DNA sequence consists of a sequence of sites. Each site has a * state, which is a set of bases. The four bases are adenine, * cytosine, guanine, and thymine. For textual I/O, each state is represented by * a single character as follows: *

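* <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>
* <TR><TD><I>Char.</I>&nbsp;&nbsp;</TD><TD><I>Meaning</I>&nbsp;&nbsp;</TD><TD><I>Set</I></TD></TR>
* <TR><TD>A</TD><TD>Adenine</TD><TD>(A)</TD></TR>
* <TR><TD>C</TD><TD>Cytosine</TD><TD>(C)</TD></TR>
* <TR><TD>G</TD><TD>Guanine</TD><TD>(G)</TD></TR>
* <TR><TD>T</TD><TD>Thymine</TD><TD>(T)</TD></TR>
* <TR><TD>Y</TD><TD>pYrimidine</TD><TD>(C or T)</TD></TR>
* <TR><TD>R</TD><TD>puRine</TD><TD>(A or G)</TD></TR>
* <TR><TD>W</TD><TD>"Weak"</TD><TD>(A or T)</TD></TR>
* <TR><TD>S</TD><TD>"Strong"</TD><TD>(C or G)</TD></TR>
* <TR><TD>K</TD><TD>"Keto"</TD><TD>(G or T)</TD></TR>
* <TR><TD>M</TD><TD>"aMino"</TD><TD>(A or C)</TD></TR>
* <TR><TD>B</TD><TD>not A</TD><TD>(C or G or T)</TD></TR>
* <TR><TD>D</TD><TD>not C</TD><TD>(A or G or T)</TD></TR>
* <TR><TD>H</TD><TD>not G</TD><TD>(A or C or T)</TD></TR>
* <TR><TD>V</TD><TD>not T</TD><TD>(A or C or G)</TD></TR>
* <TR><TD>X</TD><TD>unknown</TD><TD>(A or C or G or T)</TD></TR>
* <TR><TD>-</TD><TD>deletion</TD><TD>()</TD></TR>
* </TABLE>
* <P>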

* The DNA sequence file format is that used by Joseph Felsenstein's Phylogeny * Inference Package (PHYLIP). While the file is a plain text file, it often has * the extension ".phy" to indicate that it is in PHYLIP format. For * further information, see: *

* PHYLIP -- http://evolution.genetics.washington.edu/phylip/phylip.html *
* Input file format -- http://evolution.genetics.washington.edu/phylip/doc/sequence.html *

* Here is an example of an input file: *

* <PRE>
* 5 42
* Turkey     AAGCTNGGGC ATTTCAGGGT
* Salmo gair AAGCCTTGGC AGTGCAGGGT
* H. Sapiens ACCGGTTGGC CGTTCAGGGT
* Chimp      AAACCCTTGC CGTTACGCTT
* Gorilla    AAACCCTTGC CGGTACGCTT
*
*            GAGCCCGGGC AATACAGGGT AT
*            GAGCCGTGGC CGGGCACGGT AT
*            ACAGGTTGGC CGTTCAGGGT AA
*            AAACCGAGGC CGGGACACTC AT
*            AAACCATTGC CGGTACGCTT AA
* </PRE>
* <P>

* The first line contains the number of species S and the number of * sites N in each sequence. S must be >= 2. N must be * >= 1. *

* The next S lines contain the initial data for each species. The first * ten characters contain the sequence name. This must be exactly ten * characters, padded with blanks if necessary. Then comes one character for * each site in the sequence. Uppercase and lowercase are considered the same. * Characters other than those for the states listed above are ignored. Often, a * blank is inserted every ten characters for readability, but this is not * necessary. After these S lines come zero or more blank lines for * readability, which are ignored. If there is more sequence data, the next * S lines give the states for the next sites in the sequences. This * continues for the rest of the file. *

* This is known as the "interleaved" file format. There is also a "sequential" * file format, but the sequential file format is not supported. *

* Thus, the complete sequence for each species in the example is: *

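* <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>
* <TR><TD><I>Species</I>&nbsp;&nbsp;</TD><TD><I>Sequence</I></TD></TR>
* <TR><TD>Turkey</TD><TD><TT>AAGCTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT</TT></TD></TR>
* <TR><TD>Salmo gair</TD><TD><TT>AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT</TT></TD></TR>
* <TR><TD>H. Sapiens</TD><TD><TT>ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA</TT></TD></TR>
* <TR><TD>Chimp</TD><TD><TT>AAACCCTTGCCGTTACGCTTAAACCGAGGCCGGGACACTCAT</TT></TD></TR>
* <TR><TD>Gorilla</TD><TD><TT>AAACCCTTGCCGGTACGCTTAAACCATTGCCGGTACGCTTAA</TT></TD></TR>
* </TABLE>
* <P>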

* In the input file, the following alternate characters can be used: X, N, and * ? all mean "unknown." O (capital letter O) and - (hyphen) both mean * "deletion." The character . (period) means "the same as the corresponding * site in the first species." Here is another input file with the same * sequences as the one above: *

* <PRE>
* 5 42
* Turkey     AAGCTNGGGC ATTTCAGGGT
* Salmo gair ..G.CTT... AG.G......
* H. Sapiens .CCGGTT... .G........
* Chimp      ..A.CCTT.. .G..AC.CT.
* Gorilla    ..A.CCTT.. .GG.AC.CT.
*
*            GAGCCCGGGC AATACAGGGT AT
*            .....GT... CGGG..C... ..
*            ACAGGTT... CG.T...... .A
*            A.A..GA... CGGGACACTC ..
*            A.A..ATT.. CGGTAC.CT. .A
* </PRE>
* <P>

* Here are some more example DNA sequence files: *

*
* @author  Alan Kaminsky
* @version 20-Jul-2008
*/
public class DnaSequenceList implements Iterable<DnaSequence> {

    // Hidden constants.

    // Width in characters of the sequence name field in a PHYLIP file.
    private static final int NAME_WIDTH = 10;

    // A blank is written before every site whose index is a multiple of this.
    private static final int SITE_GROUP = 10;

    // Marker values returned by stateForChar() for characters that are not
    // themselves a state.
    private static final int IGNORED = -1;
    private static final int SAME_AS_FIRST = -2;

    // Hidden data members.

    // DNA sequences.
    DnaSequence[] mySequence;

    // Mapping from site (index) to whether site is informative (true/false).
    // If null, must be recomputed.
    private boolean[] isInformative;

    // Number of informative sites.
    private int nInformative;

    // Number of state changes in uninformative sites.
    private int nChanges;

    // Hidden constructors.

    /**
     * Construct a new DNA sequence list. The sequence array is left unset;
     * the caller is responsible for assigning it.
     */
    DnaSequenceList() {
    }

    /**
     * Construct a new DNA sequence list that is a copy of the given DNA
     * sequence list.
     * <P>
     * <I>Note:</I> The DNA sequences in the new list are copies of (not
     * references to) the DNA sequences in the given list. The cached
     * informative-site information, if any, is copied as well.
     *
     * @param  list  DNA sequence list to copy.
     *
     * @exception  NullPointerException
     *     (unchecked exception) Thrown if {@code list} is null.
     */
    public DnaSequenceList(DnaSequenceList list) {
        int N = list.mySequence.length;
        this.mySequence = new DnaSequence[N];
        for (int i = 0; i < N; ++i) {
            this.mySequence[i] = new DnaSequence(list.mySequence[i]);
        }
        if (list.isInformative != null) {
            this.isInformative = list.isInformative.clone();
        }
        this.nInformative = list.nInformative;
        this.nChanges = list.nChanges;
    }

    // Exported operations.

    /**
     * Obtain this DNA sequence list's length.
     *
     * @return  Length <I>N</I> (number of DNA sequences).
     */
    public int length() {
        return mySequence.length;
    }

    /**
     * Get the DNA sequence at the given index in this DNA sequence list.
     *
     * @param  i  Index, {@code 0 <= i <= N-1}.
     *
     * @return  DNA sequence.
     *
     * @exception  ArrayIndexOutOfBoundsException
     *     (unchecked exception) Thrown if {@code i} is out of bounds.
     */
    public DnaSequence seq(int i) {
        return mySequence[i];
    }

    /**
     * Read a DNA sequence list from the given input file. The input file must
     * be in interleaved PHYLIP format.
     * <P>
     * The DNA sequences' sites and names are read from the input file. Each
     * DNA sequence is constructed with a score argument of 0.
     *
     * @param  file  File.
     *
     * @return  DNA sequence list.
     *
     * @exception  NullPointerException
     *     (unchecked exception) Thrown if {@code file} is null.
     * @exception  IOException
     *     Thrown if an I/O error occurred. Thrown if the input file's contents
     *     were invalid.
     */
    public static DnaSequenceList read(File file) throws IOException {
        Scanner filescanner = new Scanner(file);
        try {
            // Read number of species and number of sites from first line.
            if (!filescanner.hasNextLine()) {
                throw readError(file, "Empty file");
            }
            Scanner linescanner = new Scanner(filescanner.nextLine());
            if (!linescanner.hasNextInt()) {
                throw readError(file, "Number of species invalid or missing");
            }
            int S = linescanner.nextInt();
            if (S < 2) {
                throw readError(file, "Number of species must be >= 2");
            }
            if (!linescanner.hasNextInt()) {
                throw readError(file, "Number of sites invalid or missing");
            }
            int N = linescanner.nextInt();
            if (N < 1) {
                throw readError(file, "Number of sites must be >= 1");
            }

            // Set up DNA sequence list and site count array.
            DnaSequenceList list = new DnaSequenceList();
            list.mySequence = new DnaSequence[S];
            int[] sitecount = new int[S];

            // Read sequence data from groups of S lines until EOF.
            fileloop:
            for (;;) {
                for (int s = 0; s < S; ++s) {
                    // Get a line of sequence data for species s. EOF is only
                    // legal at the start of a group, and only once at least
                    // one site has been read for the first species.
                    if (!filescanner.hasNextLine()) {
                        if (s != 0 || sitecount[s] == 0) {
                            throw readError(file,
                                "Missing a line of sequence data for species "
                                + (s + 1));
                        }
                        break fileloop;
                    }
                    String line = filescanner.nextLine();

                    // Ignore blank lines: stay on the same species.
                    if (line.trim().equals("")) {
                        --s;
                        continue;
                    }

                    // The first time, extract sequence name and create
                    // DnaSequence object.
                    if (sitecount[s] == 0) {
                        if (line.length() < NAME_WIDTH) {
                            throw readError(file,
                                "Name must be 10 characters for species "
                                + (s + 1));
                        }
                        list.mySequence[s] = new DnaSequence
                            (N, 0, line.substring(0, NAME_WIDTH).trim());
                        line = line.substring(NAME_WIDTH);
                    }

                    // Parse characters in sequence data.
                    byte[] seq = list.mySequence[s].mySites;
                    byte[] seq0 = list.mySequence[0].mySites;
                    int count = sitecount[s];
                    int len = line.length();
                    for (int i = 0; i < len; ++i) {
                        int state = stateForChar(line.charAt(i));
                        if (state == IGNORED) {
                            continue;
                        }
                        verifyCount(count, N, file, s);
                        if (state == SAME_AS_FIRST) {
                            if (s == 0) {
                                throw readError(file,
                                    "'.' not allowed in species 1");
                            }
                            if (count >= sitecount[0]) {
                                throw readError(file,
                                    "'.' in species " + (s + 1)
                                    + " has no corresponding site in species 1");
                            }
                            seq[count] = seq0[count];
                        } else {
                            seq[count] = (byte) state;
                        }
                        ++count;
                    }
                    sitecount[s] = count;
                }
            }

            // Verify correct site count for all species.
            for (int s = 0; s < S; ++s) {
                if (sitecount[s] < N) {
                    throw readError(file,
                        "Too few sites for species " + (s + 1));
                } else if (sitecount[s] > N) {
                    throw readError(file,
                        "Too many sites for species " + (s + 1));
                }
            }

            // Return DNA sequence list.
            return list;
        } finally {
            filescanner.close();
        }
    }

    /**
     * Map an input character to the state it denotes. A state is a 4-bit set
     * with bit 0 = A, bit 1 = C, bit 2 = G, bit 3 = T. Uppercase and lowercase
     * are treated alike.
     *
     * @param  c  Input character.
     *
     * @return  State in the range 0 .. 15; or {@code SAME_AS_FIRST} for a
     *          period; or {@code IGNORED} for any other character.
     */
    private static int stateForChar(char c) {
        switch (c) {
            case 'O': case 'o': case '-':
                return 0;  // ----
            case 'A': case 'a':
                return 1;  // ---A
            case 'C': case 'c':
                return 2;  // --C-
            case 'M': case 'm':
                return 3;  // --CA
            case 'G': case 'g':
                return 4;  // -G--
            case 'R': case 'r':
                return 5;  // -G-A
            case 'S': case 's':
                return 6;  // -GC-
            case 'V': case 'v':
                return 7;  // -GCA
            case 'T': case 't':
                return 8;  // T---
            case 'W': case 'w':
                return 9;  // T--A
            case 'Y': case 'y':
                return 10; // T-C-
            case 'H': case 'h':
                return 11; // T-CA
            case 'K': case 'k':
                return 12; // TG--
            case 'D': case 'd':
                return 13; // TG-A
            case 'B': case 'b':
                return 14; // TGC-
            case 'X': case 'x': case 'N': case 'n': case '?':
                return 15; // TGCA
            case '.':
                return SAME_AS_FIRST;
            default:
                return IGNORED;
        }
    }

    /**
     * Build the exception reported for invalid input in read().
     *
     * @param  file  File being read.
     * @param  msg   Description of the problem.
     *
     * @return  I/O exception whose detail message names the file.
     */
    private static IOException readError(File file, String msg) {
        return new IOException
            ("DnaSequenceList.read(\"" + file + "\"): " + msg);
    }

    /**
     * Verify that another site may be stored for species s.
     *
     * @param  count  Number of sites already stored for the species.
     * @param  N      Number of sites declared in the file header.
     * @param  file   File being read.
     * @param  s      Species index (0-based).
     *
     * @exception  IOException
     *     Thrown if {@code count >= N}.
     */
    private static void verifyCount(int count, int N, File file, int s)
            throws IOException {
        if (count >= N) {
            throw new IOException
                ("DnaSequenceList.read(\"" + file + "\"): "
                 + "Too many sites for species " + (s + 1));
        }
    }

    /**
     * Write this DNA sequence list to the given output file. The output file is
     * in interleaved PHYLIP format. The {@code sites} parameter of the general
     * write method is 70. Periods are not used. Informative sites are not
     * marked in bold.
     *
     * @param  file  File.
     *
     * @exception  NullPointerException
     *     (unchecked exception) Thrown if {@code file} is null.
     * @exception  IOException
     *     Thrown if an I/O error occurred.
     */
    public void write(File file) throws IOException {
        write(file, 70, false, false);
    }

    /**
     * Write this DNA sequence list to the given output file. The output file is
     * in interleaved PHYLIP format.
     *
     * @param  file     File.
     * @param  sites    Number of sites per output line.
     * @param  periods  True to use periods, false not to use periods.
     * @param  bold     True to mark informative sites in bold, false not to.
     *
     * @exception  NullPointerException
     *     (unchecked exception) Thrown if {@code file} is null.
     * @exception  IllegalArgumentException
     *     (unchecked exception) Thrown if {@code sites <= 10}.
     * @exception  IOException
     *     Thrown if an I/O error occurred.
     */
    public void write(File file, int sites, boolean periods, boolean bold)
            throws IOException {
        PrintStream ps = new PrintStream
            (new BufferedOutputStream(new FileOutputStream(file)));
        try {
            write(ps, sites, periods, bold);
        } finally {
            ps.close();
        }
    }

    /**
     * Write this DNA sequence list to the given print stream in interleaved
     * PHYLIP format.
     * <P>
     * The first group of lines carries the 10-character sequence name followed
     * by up to {@code sites-10} sites; each later group carries up to
     * {@code sites} sites. A blank is written before every site whose index is
     * a multiple of 10, except at the start of a line that has no name. A
     * blank line follows each group.
     *
     * @param  ps       Print stream.
     * @param  sites    Number of sites per output line.
     * @param  periods  True to print a period for a site whose state equals
     *                  that of the same site in the first sequence, false not
     *                  to.
     * @param  bold     True to mark informative sites in bold (wrapped in
     *                  {@code <B>} ... {@code </B>}), false not to.
     *
     * @exception  NullPointerException
     *     (unchecked exception) Thrown if {@code ps} is null.
     * @exception  IllegalArgumentException
     *     (unchecked exception) Thrown if {@code sites <= 10}.
     * @exception  IOException
     *     Thrown if an I/O error occurred.
     */
    public void write(PrintStream ps, int sites, boolean periods, boolean bold)
            throws IOException {
        if (sites <= NAME_WIDTH) {
            throw new IllegalArgumentException
                ("DnaSequenceList.write(): sites = " + sites + " illegal");
        }

        // Determine informative sites if necessary.
        if (bold) {
            computeInformativeSites();
        }

        // Print number of species and number of sites.
        int S = mySequence.length;
        int N = mySequence[0].myLength;
        ps.print(S);
        ps.print(' ');
        ps.print(N);
        ps.println();

        // Print groups of sites for each species. On the first line, print
        // sequence name, padded or truncated to 10 characters.
        int lb = 0;
        int ub = Math.min(sites - NAME_WIDTH, N);
        byte[] seq0 = mySequence[0].mySites;
        while (lb < N) {
            for (int s = 0; s < S; ++s) {
                byte[] seq = mySequence[s].mySites;
                if (lb == 0) {
                    ps.print(padName(mySequence[s].myName));
                }
                for (int i = lb; i < ub; ++i) {
                    if ((lb == 0 || i > lb) && i % SITE_GROUP == 0) {
                        ps.print(' ');
                    }
                    if (periods && s > 0 && seq[i] == seq0[i]) {
                        printSite(ps, i, '.', bold);
                    } else {
                        printSite(ps, i, DnaSequence.state2char[seq[i]], bold);
                    }
                }
                ps.println();
            }
            ps.println();
            lb = ub;
            ub = Math.min(ub + sites, N);
        }

        // Check for I/O errors; a PrintStream does not throw them itself.
        if (ps.checkError()) {
            throw new IOException("DnaSequenceList.write(): I/O error");
        }
    }

    /**
     * Pad with trailing blanks, or truncate, the given sequence name to
     * exactly 10 characters, the width read() expects for the name field.
     *
     * @param  name  Sequence name; null is treated as an empty name.
     *
     * @return  Name of exactly 10 characters.
     */
    private static String padName(String name) {
        String text = (name == null) ? "" : name;
        if (text.length() >= NAME_WIDTH) {
            return text.substring(0, NAME_WIDTH);
        }
        StringBuilder buf = new StringBuilder(NAME_WIDTH);
        buf.append(text);
        while (buf.length() < NAME_WIDTH) {
            buf.append(' ');
        }
        return buf.toString();
    }

    /**
     * Print the character for one site, wrapped in bold tags if requested and
     * the site is informative.
     *
     * @param  ps    Print stream.
     * @param  i     Site index.
     * @param  c     Character to print.
     * @param  bold  True to mark informative sites in bold.
     */
    private void printSite(PrintStream ps, int i, char c, boolean bold) {
        // NOTE(review): the tag text was empty in the reviewed copy, which
        // made bold a no-op; <B>...</B> is assumed from the documented "mark
        // informative sites in bold" -- confirm against the original.
        if (bold && isInformative[i]) {
            ps.print("<B>");
            ps.print(c);
            ps.print("</B>");
        } else {
            ps.print(c);
        }
    }

    /**
     * Truncate this DNA sequence list to the given length. If this list is
     * already no longer than {@code len}, the {@code truncate()} method does
     * nothing.
     * <P>
     * When sequences are removed, the cached informative-site information is
     * discarded, because whether a site is informative depends on which
     * sequences are in the list.
     *
     * @param  len  Length.
     *
     * @exception  NegativeArraySizeException
     *     (unchecked exception) Thrown if {@code len < 0}.
     */
    public void truncate(int len) {
        if (len < mySequence.length) {
            DnaSequence[] newSequence = new DnaSequence[len];
            System.arraycopy(mySequence, 0, newSequence, 0, len);
            mySequence = newSequence;

            // Force recomputation on next use.
            isInformative = null;
        }
    }

    /**
     * Excise uninformative sites from the DNA sequences in this DNA sequence
     * list.
     * <P>
     * Each site in the DNA sequences is either "uninformative" or
     * "informative." A site is informative if at least two different states
     * each appear in at least two sequences at that site; otherwise it is
     * uninformative. An uninformative site contributes the same number of
     * state changes to the parsimony score in every possible phylogenetic
     * tree, namely the number of different states that appear at that site,
     * minus 1. Uninformative sites therefore do not affect the outcome of a
     * maximum parsimony tree search and can be omitted from tree scoring.
     * <P>
     * Each DNA sequence in this list is replaced by a new DNA sequence that
     * holds only the informative sites and has the same score and name.
     * Afterwards every remaining site is marked as informative.
     *
     * @return  Number of state changes the (excised) uninformative sites
     *          contribute to the parsimony score.
     */
    public int exciseUninformativeSites() {
        int S = mySequence.length;
        int N = mySequence[0].length();

        // Determine which sites are informative.
        computeInformativeSites();

        // Excise uninformative sites from sequences.
        for (int s = 0; s < S; ++s) {
            byte[] oldSites = mySequence[s].mySites;
            mySequence[s] = new DnaSequence
                (nInformative, mySequence[s].myScore, mySequence[s].myName);
            byte[] excSites = mySequence[s].mySites;
            int j = 0;
            for (int i = 0; i < N; ++i) {
                if (isInformative[i]) {
                    excSites[j++] = oldSites[i];
                }
            }
        }

        // Mark all sites as informative.
        isInformative = new boolean[nInformative];
        Arrays.fill(isInformative, true);

        // Return number of state changes.
        return nChanges;
    }

    /**
     * Returns the number of informative sites in this DNA sequence list.
     *
     * @return  Number of informative sites.
     */
    public int informativeSiteCount() {
        computeInformativeSites();
        return nInformative;
    }

    /**
     * Determine the number of absent states after adding each sequence in this
     * DNA sequence list to a tree. The return value <I>A</I> is an
     * <I>N</I>-element array, where <I>N</I> is the length of this DNA
     * sequence list. It is computed by taking the union of the states at each
     * site over all sequences, then removing each sequence's states in order
     * from <I>i</I> = 0 to <I>N</I>-1; <I>A</I>[<I>i</I>] is the total number
     * of states remaining over all sites after sequence <I>i</I> has been
     * removed, i.e. the number of character states that do not yet appear
     * among sequences 0 through <I>i</I>. This can be used to prune a
     * branch-and-bound search.
     *
     * @return  Array <I>A</I>.
     */
    public int[] countAbsentStates() {
        int N = mySequence.length;
        int L = mySequence[0].length();
        int[] A = new int[N];

        // Compute the union of all the DNA sequences.
        byte[] sites = new byte[L];
        for (int i = 0; i < N; ++i) {
            byte[] mysites_i = mySequence[i].mySites;
            for (int j = 0; j < L; ++j) {
                sites[j] |= mysites_i[j];
            }
        }

        // Subtract each sequence from the union, count and record states.
        for (int i = 0; i < N; ++i) {
            byte[] mysites_i = mySequence[i].mySites;
            int count = 0;
            for (int j = 0; j < L; ++j) {
                sites[j] &= ~mysites_i[j];
                count += DnaSequence.state2bitCount[sites[j]];
            }
            A[i] = count;
        }

        return A;
    }

    /**
     * Create a DNA sequence tree from this DNA sequence list and the given tree
     * signature. The tree signature is an array of indexes of length <I>N</I>,
     * where <I>N</I> is the length of this list. To construct the tree, for all
     * <I>i</I> from 0 to <I>N</I>-1, the DNA sequence at index <I>i</I> in this
     * list is added to the tree at index {@code signature[i]} using the
     * {@code DnaSequenceTree.add()} method. For all <I>i</I>,
     * {@code signature[i]} must be in the range 0 .. 2(<I>i</I>-1), except
     * {@code signature[0]} is 0.
     * <P>
     * <I>Note:</I> The returned tree has references to (not copies of) the DNA
     * sequences in this list.
     *
     * @param  signature  Tree signature (array of tree indexes).
     *
     * @return  Tree.
     */
    public DnaSequenceTree toTree(int[] signature) {
        int N = mySequence.length;
        DnaSequenceTree tree = new DnaSequenceTree(2 * N - 1);
        for (int i = 0; i < N; ++i) {
            tree.add(signature[i], mySequence[i]);
        }
        return tree;
    }

    /**
     * Returns an iterator for the DNA sequences in this list. The iterator
     * does not support removal.
     *
     * @return  Iterator.
     */
    public Iterator<DnaSequence> iterator() {
        return new Iterator<DnaSequence>() {
            private int i = 0;

            public boolean hasNext() {
                return i < mySequence.length;
            }

            public DnaSequence next() {
                // Honor the Iterator contract instead of leaking an
                // ArrayIndexOutOfBoundsException.
                if (i >= mySequence.length) {
                    throw new java.util.NoSuchElementException();
                }
                return mySequence[i++];
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    // Hidden operations.

    /**
     * Compute information about informative sites: which sites are
     * informative, how many there are, and how many state changes the
     * uninformative sites contribute. Does nothing if the information is
     * already cached.
     */
    private void computeInformativeSites() {
        if (isInformative != null) {
            return;
        }

        int S = mySequence.length;
        int N = mySequence[0].length();

        // Allocate storage to remember each site's category: true =
        // informative, false = uninformative. Also count number of informative
        // sites and number of state changes in uninformative sites.
        isInformative = new boolean[N];
        nInformative = 0;
        nChanges = 0;

        // Allocate storage to count states at each site.
        int[] stateCount = new int[16];

        // Examine all sites.
        for (int i = 0; i < N; ++i) {
            Arrays.fill(stateCount, 0);

            // Examine current site in all sequences.
            for (int s = 0; s < S; ++s) {
                ++stateCount[mySequence[s].mySites[i]];
            }

            // Count how many values in stateCount are 2 or greater.
            int x = 0;
            for (int j = 0; j < 16; ++j) {
                if (stateCount[j] >= 2) {
                    ++x;
                }
            }

            // Categorize current site.
            if (x >= 2) {
                // Informative site.
                isInformative[i] = true;
                ++nInformative;
            } else {
                // Uninformative site. Increase number of state changes by
                // (number of different states - 1).
                isInformative[i] = false;
                for (int j = 0; j < 16; ++j) {
                    if (stateCount[j] > 0) {
                        ++nChanges;
                    }
                }
                --nChanges;
            }
        }
    }
}

Char.	Meaning	Set
A	Adenine	(A)
C	Cytosine	(C)
G	Guanine	(G)
T	Thymine	(T)
Y	pYrimidine	(C or T)
R	puRine	(A or G)
W	"Weak"	(A or T)
S	"Strong"	(C or G)
K	"Keto"	(G or T)
M	"aMino"	(A or C)
B	not A	(C or G or T)
D	not C	(A or G or T)
H	not G	(A or C or T)
V	not T	(A or C or G)
X	unknown	(A or C or G or T)
-	deletion	()

Species		Sequence
Turkey		`AAGCTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT`
Salmo gair		`AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT`
H. Sapiens		`ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA`
Chimp		`AAACCCTTGCCGTTACGCTTAAACCGAGGCCGGGACACTCAT`
Gorilla		`AAACCCTTGCCGGTACGCTTAAACCATTGCCGGTACGCTTAA`