//****************************************************************************** // // File: DnaSequence.java // Package: edu.rit.compbio.phyl // Unit: Class edu.rit.compbio.phyl.DnaSequence // // This Java source file is copyright (C) 2008 by Alan Kaminsky. All rights // reserved. For further information, contact the author, Alan Kaminsky, at // ark@cs.rit.edu. // // This Java source file is part of the Parallel Java Library ("PJ"). PJ is free // software; you can redistribute it and/or modify it under the terms of the GNU // General Public License as published by the Free Software Foundation; either // version 3 of the License, or (at your option) any later version. // // PJ is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU General Public License for more details. // // Linking this library statically or dynamically with other modules is making a // combined work based on this library. Thus, the terms and conditions of the // GNU General Public License cover the whole combination. // // As a special exception, the copyright holders of this library give you // permission to link this library with independent modules to produce an // executable, regardless of the license terms of these independent modules, and // to copy and distribute the resulting executable under terms of your choice, // provided that you also meet, for each linked independent module, the terms // and conditions of the license of that module. An independent module is a // module which is not derived from or based on this library. If you modify this // library, you may extend this exception to your version of the library, but // you are not obligated to do so. If you do not wish to do so, delete this // exception statement from your version. // // A copy of the GNU General Public License is provided in the file gpl.txt. You // may also obtain a copy of the GNU General Public License on the World Wide // Web at http://www.gnu.org/licenses/gpl.html. // //****************************************************************************** package edu.rit.compbio.phyl; /** * Class DnaSequence encapsulates a DNA sequence. The DNA sequence consists of a * sequence of sites. Each site has a state, which is a set of * bases. The four bases are adenine, cytosine, guanine, and thymine. For * textual I/O, each state is represented by a single character as follows: *

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Char. Meaning Set
A Adenine (A)
C Cytosine (C)
G Guanine (G)
T Thymine (T)
Y pYrimidine (C or T)
R puRine (A or G)
W "Weak" (A or T)
S "Strong" (C or G)
K "Keto" (G or T)
M "aMino" (A or C)
B not A (C or G or T)
D not C (A or G or T)
H not G (A or C or T)
V not T (A or C or G)
X unknown (A or C or G or T)
- deletion ()
*

* The DNA sequence has an associated score, an integer. The score can be * set to anything and later retrieved. *

* The DNA sequence has a name, a string. The name can be set to anything * and later retrieved. * * @author Alan Kaminsky * @version 23-Jul-2008 */ public class DnaSequence { // Hidden constants. // Amount of extra padding in byte array. private static final int PAD = 128; // Mapping from the state of a site to the corresponding output character. // A=1, C=2, G=4, T=8. static final char[] state2char = new char[] {/*----*/ '-', /*---A*/ 'A', /*--C-*/ 'C', /*--CA*/ 'M', /*-G--*/ 'G', /*-G-A*/ 'R', /*-GC-*/ 'S', /*-GCA*/ 'V', /*T---*/ 'T', /*T--A*/ 'W', /*T-C-*/ 'Y', /*T-CA*/ 'H', /*TG--*/ 'K', /*TG-A*/ 'D', /*TGC-*/ 'B', /*TGCA*/ 'X'}; // Mapping from the state of a site to the number of bits turned on at that // site. A=1, C=2, G=4, T=8. static final int[] state2bitCount = new int[] {/*----*/ 0, /*---A*/ 1, /*--C-*/ 1, /*--CA*/ 2, /*-G--*/ 1, /*-G-A*/ 2, /*-GC-*/ 2, /*-GCA*/ 3, /*T---*/ 1, /*T--A*/ 2, /*T-C-*/ 2, /*T-CA*/ 3, /*TG--*/ 2, /*TG-A*/ 3, /*TGC-*/ 3, /*TGCA*/ 4}; // Hidden data members. // Sequence data. Each site's set of bases is stored as a bitmap in one // byte. A=1, C=2, G=4, T=8. PAD bytes of padding are added to avert cache // interference. byte[] mySites; // Length. int myLength; // Score. int myScore; // Name. String myName; // 128 bytes of extra padding to avert cache interference. private transient long p0, p1, p2, p3, p4, p5, p6, p7; private transient long p8, p9, pa, pb, pc, pd, pe, pf; // Exported constructors. /** * Construct a new zero-length DNA sequence. The score is initially 0. The * name is initially null. */ public DnaSequence() { this (0, 0, null); } /** * Construct a new DNA sequence with the given length. The score is * initially 0. The name is initially null. * * @param N Length (number of sites). * * @exception IllegalArgumentException * (unchecked exception) Thrown if N < 0. */ public DnaSequence (int N) { this (N, 0, null); } /** * Construct a new DNA sequence with the given length and score. The name is * initially null. * * @param N Length (number of sites). * @param score Score. * * @exception IllegalArgumentException * (unchecked exception) Thrown if N < 0. */ public DnaSequence (int N, int score) { this (N, score, null); } /** * Construct a new DNA sequence with the given length, score, and name. * * @param N Length (number of sites). * @param score Score. * @param name Name. May be null. * * @exception IllegalArgumentException * (unchecked exception) Thrown if N < 0. */ public DnaSequence (int N, int score, String name) { if (N < 0) { throw new IllegalArgumentException ("DnaSequence(): N (= "+N+") < 0, illegal)"); } this.mySites = new byte [N+PAD]; this.myLength = N; this.myScore = score; this.myName = name; } /** * Construct a new DNA sequence that is a copy of the given DNA sequence. * * @param seq DNA sequence to copy. * * @exception NullPointerException * (unchecked exception) Thrown if seq is null. */ public DnaSequence (DnaSequence seq) { this.mySites = (byte[]) seq.mySites.clone(); this.myLength = seq.myLength; this.myScore = seq.myScore; this.myName = seq.myName; } // Exported operations. /** * Get this DNA sequence's length. * * @return Length (number of sites). */ public int length() { return myLength; } /** * Get this DNA sequence's score. * * @return Score. */ public int score() { return myScore; } /** * Set this DNA sequence's score. * * @param score Score. */ public void score (int score) { myScore = score; } /** * Get this DNA sequence's name. * * @return Name. May be null. */ public String name() { return myName; } /** * Set this DNA sequence's name. * * @param name Name. May be null. */ public void name (String name) { myName = name; } /** * Make this DNA sequence's sites be the same as the given DNA sequence. It * is assumed that this DNA sequence and the given DNA sequence are the same * length. This DNA sequence's score and name are unchanged. * * @param seq DNA sequence to copy. * * @exception NullPointerException * (unchecked exception) Thrown if seq is null. */ public void copySites (DnaSequence seq) { System.arraycopy (seq.mySites, 0, this.mySites, 0, myLength); } /** * Compute the distance between this DNA sequence and the given DNA * sequence. It is assumed that this DNA sequence and the given DNA sequence * are the same length. The distance is the number of differing sites * between the two sequences (the Hamming distance). * * @param seq DNA sequence. * * @return Distance. */ public double distance (DnaSequence seq) { byte[] site1 = this.mySites; byte[] site2 = seq.mySites; int diff = 0; int N = myLength; for (int i = 0; i < N; ++ i) { if (site1[i] != site2[i]) ++ diff; } return diff; } /** * Make this DNA sequence be the ancestor of the two given DNA sequences in * the Fitch parsimony score algorithm. This DNA sequence's sites are set * based on seq1's and seq2's sites. It is assumed that * this DNA sequence and the given DNA sequences are the same length. This * DNA sequence's score is set to the sum of seq1's score, * seq2's score, and the number of state changes at the ancestor. * This DNA sequence's name is unchanged. * * @param seq1 First child DNA sequence. * @param seq2 Second child DNA sequence. */ public void setFitchAncestor (DnaSequence seq1, DnaSequence seq2) { // Get references to sites. byte[] ancestor = this.mySites; byte[] descendent1 = seq1.mySites; byte[] descendent2 = seq2.mySites; int N = myLength; // Process all sites. Count state changes. int nChanges = 0; for (int i = 0; i < N; ++ i) { // Compute intersection of states. int state1 = descendent1[i]; int state2 = descendent2[i]; int state3 = state1 & state2; // If intersection is not empty, record intersection, otherwise // record union and note one state change. if (state3 == 0) { state3 = state1 | state2; ++ nChanges; } // Update site. ancestor[i] = (byte) state3; } // Record number of state changes. this.myScore = seq1.myScore + seq2.myScore + nChanges; } /** * Returns a string version of this DNA sequence. The string consists of * just the sequence of states (the score and name are not included). */ public String toString() { StringBuilder buf = new StringBuilder(); byte[] site = mySites; int N = myLength; for (int i = 0; i < N; ++ i) { buf.append (state2char [site[i]]); } return buf.toString(); } }

Char.	Meaning	Set
A	Adenine	(A)
C	Cytosine	(C)
G	Guanine	(G)
T	Thymine	(T)
Y	pYrimidine	(C or T)
R	puRine	(A or G)
W	"Weak"	(A or T)
S	"Strong"	(C or G)
K	"Keto"	(G or T)
M	"aMino"	(A or C)
B	not A	(C or G or T)
D	not C	(A or G or T)
H	not G	(A or C or T)
V	not T	(A or C or G)
X	unknown	(A or C or G or T)
-	deletion	()