//****************************************************************************** // // File: ProteinSequence.java // Package: edu.rit.compbio.seq // Unit: Class edu.rit.compbio.seq.ProteinSequence // // This Java source file is copyright (C) 2008 by Alan Kaminsky. All rights // reserved. For further information, contact the author, Alan Kaminsky, at // ark@cs.rit.edu. // // This Java source file is part of the Parallel Java Library ("PJ"). PJ is free // software; you can redistribute it and/or modify it under the terms of the GNU // General Public License as published by the Free Software Foundation; either // version 3 of the License, or (at your option) any later version. // // PJ is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU General Public License for more details. // // Linking this library statically or dynamically with other modules is making a // combined work based on this library. Thus, the terms and conditions of the // GNU General Public License cover the whole combination. // // As a special exception, the copyright holders of this library give you // permission to link this library with independent modules to produce an // executable, regardless of the license terms of these independent modules, and // to copy and distribute the resulting executable under terms of your choice, // provided that you also meet, for each linked independent module, the terms // and conditions of the license of that module. An independent module is a // module which is not derived from or based on this library. If you modify this // library, you may extend this exception to your version of the library, but // you are not obligated to do so. If you do not wish to do so, delete this // exception statement from your version. // // A copy of the GNU General Public License is provided in the file gpl.txt. 
You // may also obtain a copy of the GNU General Public License on the World Wide // Web at http://www.gnu.org/licenses/gpl.html. // //****************************************************************************** package edu.rit.compbio.seq; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; /** * Class ProteinSequence encapsulates a protein sequence. A protein sequence * object may be constructed from a string or read from a file. Protein sequence * objects may also be read from a protein sequence database using class * {@linkplain ProteinDatabase}. *

* In a file, a protein sequence is stored in FASTA format. A FASTA format * protein sequence consists of one description line and one or more sequence * lines. The description line consists of an initial '>' character * followed by zero or more characters (the protein's description). A sequence * line consists of one or more characters 'A' through 'Z', * 'a' through 'z', '*', or '-'. *

* In a program, a protein sequence is represented as a byte array (type * byte[]). For a protein sequence of length L, the byte array * contains L+1 bytes. The byte at index 0 is unused and contains a value * of -1. The bytes at indexes 1 through L contain the amino acids. Amino * acids 'A' through 'Z' (case insensitive) are represented by * the values 0 through 25; '*' is represented as 26; '-' is * represented as 27. *

* The amino acid letters and values are:
* <PRE>
* Letter  Value  Amino Acid
* A       0      Alanine
* B       1      Aspartate or asparagine
* C       2      Cysteine
* D       3      Aspartate
* E       4      Glutamate
* F       5      Phenylalanine
* G       6      Glycine
* H       7      Histidine
* I       8      Isoleucine
* J       9      unused
* K       10     Lysine
* L       11     Leucine
* M       12     Methionine
* N       13     Asparagine
* O       14     unused
* P       15     Proline
* Q       16     Glutamine
* R       17     Arginine
* S       18     Serine
* T       19     Threonine
* U       20     Selenocysteine
* V       21     Valine
* W       22     Tryptophan
* X       23     Any, unknown
* Y       24     Tyrosine
* Z       25     Glutamate or glutamine
* *       26     Translation stop
* -       27     Gap of indeterminate length
* </PRE>
*
* @author  Alan Kaminsky
* @version 01-Jul-2008
*/
public class ProteinSequence extends Sequence {

// Hidden constants.

    /** Encoded value of the translation stop symbol {@code '*'}. */
    private static final int STOP_CODE = 26;

    /** Encoded value of the gap symbol {@code '-'}. */
    private static final int GAP_CODE = 27;

// Exported constructors.

    /**
     * Construct a new protein sequence from the given strings.
     *
     * @param  description
     *     Description string. Must be non-empty and start with a
     *     {@code '>'} character.
     * @param  sequence
     *     Sequence string. Must consist of the characters {@code 'A'}
     *     through {@code 'Z'}, {@code 'a'} through {@code 'z'},
     *     {@code '*'}, and {@code '-'}.
     *
     * @exception  IllegalArgumentException
     *     Thrown if {@code description} is empty or does not start with
     *     {@code '>'}, or if {@code sequence} contains a character outside
     *     the set listed above.
     */
    public ProteinSequence(String description, String sequence) {
        // An empty description cannot start with '>'. Reject it with the same
        // exception as any other malformed description instead of letting
        // charAt(0) fail with a StringIndexOutOfBoundsException.
        if (description.length() == 0 || description.charAt(0) != '>') {
            throw new IllegalArgumentException
                ("ProteinSequence(): Invalid description");
        }
        myDescription = description;

        // Encode the amino acids into indexes 1 .. L; index 0 is the unused
        // slot and holds -1.
        myLength = sequence.length();
        mySequence = new byte [myLength + 1];
        mySequence[0] = (byte)(-1);
        for (int i = 0; i < myLength; ++ i) {
            char b = sequence.charAt(i);
            int code = aminoAcidCode(b);
            if (code < 0) {
                throw new IllegalArgumentException
                    ("ProteinSequence(): Invalid amino acid '"+b+"'");
            }
            mySequence[i+1] = (byte) code;
        }
    }

    /**
     * Construct a new protein sequence read from the given file. The file is
     * opened, read through a buffered stream, and closed again before this
     * constructor returns or throws.
     *
     * @param  file  File.
     *
     * @exception  IOException
     *     Thrown if an I/O error occurred, or if the file's contents are not
     *     a valid description line followed by valid amino acid characters.
     */
    public ProteinSequence(File file) throws IOException {
        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(file));
            read(in);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Deliberate best-effort close: the stream was only read
                    // from, and a failure here must not mask an exception
                    // already propagating out of read().
                }
            }
        }
    }

    /**
     * Construct a new protein sequence read from the given input stream. The
     * stream is not closed by this constructor.
     *
     * @param  in  Input stream.
     *
     * @exception  IOException
     *     Thrown if an I/O error occurred, or if the stream's contents are
     *     not a valid description line followed by valid amino acid
     *     characters.
     */
    ProteinSequence(InputStream in) throws IOException {
        read(in);
    }

// Hidden operations.

    /**
     * Read this protein sequence from the given input stream.
     * <P>
     * The first byte must be {@code '>'}. Bytes up to (not including) the
     * first carriage return, line feed, or end of stream form the
     * description, which is decoded with the platform default charset. The
     * following bytes are encoded as amino acids until end of stream or a
     * {@code '>'} byte is read; carriage returns and line feeds are skipped.
     * A terminating {@code '>'} is consumed from the stream.
     *
     * @param  in  Input stream.
     *
     * @exception  IOException
     *     Thrown if an I/O error occurred, if the first byte is not
     *     {@code '>'}, or if an invalid amino acid character is read.
     */
    private void read(InputStream in) throws IOException {
        // Make sure description starts with '>'. End of stream (-1) fails
        // this check as well.
        int b = in.read();
        if (b != '>') {
            throw new IOException
                ("ProteinSequence(): Invalid description line");
        }

        // Read bytes of the description up until end of line or end of file.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        baos.write(b);
        while ((b = in.read()) != -1 && b != '\r' && b != '\n') {
            baos.write(b);
        }
        myDescription = new String(baos.toByteArray());

        // Read bytes of the protein sequence. The buffer is reused; its first
        // byte is the unused index-0 slot (write(-1) stores the byte 0xFF,
        // i.e. -1 as a signed byte).
        baos.reset();
        baos.write(-1);
        for (;;) {
            b = in.read();
            if (b == -1 || b == '>') {
                break;
            }
            if (b == '\r' || b == '\n') {
                // Line terminators separate sequence lines; skip them.
                continue;
            }
            int code = aminoAcidCode(b);
            if (code < 0) {
                throw new IOException
                    ("ProteinSequence(): Invalid amino acid '"+((char) b)+"'");
            }
            baos.write(code);
        }
        mySequence = baos.toByteArray();
        myLength = mySequence.length - 1;
    }

    /**
     * Map an amino acid character to its encoded value.
     *
     * @param  ch  Character (or byte value) to encode.
     *
     * @return  0 through 25 for {@code 'A'} through {@code 'Z'} (case
     *          insensitive), 26 for {@code '*'}, 27 for {@code '-'}, or -1
     *          if {@code ch} is none of these.
     */
    private static int aminoAcidCode(int ch) {
        if ('A' <= ch && ch <= 'Z') {
            return ch - 'A';
        }
        if ('a' <= ch && ch <= 'z') {
            return ch - 'a';
        }
        if (ch == '*') {
            return STOP_CODE;
        }
        if (ch == '-') {
            return GAP_CODE;
        }
        return -1;
    }

// Exported operations.

    /**
     * Returns a character version of this protein sequence's element at the
     * given index. Values 0 through 25 map to {@code 'A'} through
     * {@code 'Z'}, 26 maps to {@code '*'}, and any other value maps to
     * {@code '-'}.
     *
     * @param  i  Index in the range 1 .. <I>L</I>.
     *
     * @return  Character corresponding to element <I>i</I>.
     *
     * @exception  IndexOutOfBoundsException
     *     Thrown if {@code i} is not in the range 1 .. <I>L</I>.
     */
    public char charAt(int i) {
        if (1 > i || i > myLength) {
            throw new IndexOutOfBoundsException
                ("ProteinSequence.charAt(): Index "+i+" out of bounds");
        }
        int aa = mySequence[i];
        if (0 <= aa && aa <= 25) {
            return (char)('A' + aa);
        } else if (aa == STOP_CODE) {
            return '*';
        } else {
            return '-';
        }
    }

    /**
     * Returns a string version of this protein sequence. The string is
     * {@code "ProteinSequence(<description>)"}, where the description
     * includes its leading {@code '>'}.
     *
     * @return  String version.
     */
    @Override
    public String toString() {
        return "ProteinSequence(" + myDescription + ")";
    }
}

Letter	Value	Amino Acid
A	0	Alanine
B	1	Aspartate or asparagine
C	2	Cysteine
D	3	Aspartate
E	4	Glutamate
F	5	Phenylalanine
G	6	Glycine
H	7	Histidine
I	8	Isoleucine
J	9	unused
K	10	Lysine
L	11	Leucine
M	12	Methionine
N	13	Asparagine
O	14	unused
P	15	Proline
Q	16	Glutamine
R	17	Arginine
S	18	Serine
T	19	Threonine
U	20	Selenocysteine
V	21	Valine
W	22	Tryptophan
X	23	Any, unknown
Y	24	Tyrosine
Z	25	Glutamate or glutamine
*	26	Translation stop
-	27	Gap of indeterminate length