File: FastaReader.java

package info (click to toggle)
libjaba-client-java 2.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 2,052 kB
  • sloc: java: 17,308; makefile: 12
file content (189 lines) | stat: -rw-r--r-- 6,514 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* Copyright (c) 2011 Peter Troshin
 *  
 *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0
 * 
 *  This library is free software; you can redistribute it and/or modify it under the terms of the
 *  Apache License version 2 as published by the Apache Software Foundation
 * 
 *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache 
 *  License for more details.
 * 
 *  A copy of the license is in apache_license.txt. It is also available here:
 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
 * 
 * Any republication or derived work distributed in source code form
 * must include this copyright and license notice.
 */
package compbio.data.sequence;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Scanner;
import java.util.regex.MatchResult;

import compbio.util.Util;

/**
 * Reads files with FASTA formatted sequences. All the information in the FASTA
 * header is preserved including trailing white spaces. All the white spaces are
 * removed from the sequence.
 * 
 * Examples of the correct input:
 * 
 * <pre>
 * 
 * >zedpshvyzg
 * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
 * 
 * >xovkactesa
 * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
 * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
 * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
 * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
 * 
 * >ntazzewyvv
 * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
 * EASINM	AQQWRSLPPSRIMKLNG	HGCDCMHSHMEAD	DTKQSGIKGTFWNG	HDAQWLCRWG	
 * EFITEA	WWGRWGAITFFHAH	ENKNEIQECSDQNLKE	SRTTCEIID   TCHLFTRHLDGW 
 *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
 *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
 *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
 * 
 *    > 12 d t y wi 		k	jbke  	
 *   KLSHHDCD
 *    N
 *     H
 *     HSKCTEPHCGNSHQMLHRDP
 *     CCDQCQSWEAENWCASMRKAILF
 * 
 * </pre>
 * 
 * @author Peter Troshin
 * @version 1.0 April 2011
 * 
 */
public class FastaReader implements Iterator<FastaSequence>, Closeable {

	/**
	 * Delimiter for the scanner: every FASTA record starts with this symbol.
	 */
	private static final String DELIM = ">";

	/**
	 * Character set used to decode the input, whatever its source. Header
	 * data can contain non-ASCII symbols, so the platform default is not
	 * relied upon.
	 */
	private static final String CHARSET = "UTF8";

	private final Scanner input;

	/**
	 * Hook that closes the input on JVM shutdown, or {@code null} if this
	 * reader did not register one. It is kept so that {@link #close()} can
	 * deregister it; otherwise every reader ever created would stay
	 * reachable from the runtime until the JVM exits.
	 */
	private final Thread shutdownHook;

	/**
	 * Opens the given file for reading. Header data can contain non-ASCII
	 * symbols and is read in UTF8.
	 * <p>
	 * A JVM shutdown hook is registered to close the file in case
	 * {@link #close()} is never called. Calling {@link #close()} removes
	 * that hook again.
	 * 
	 * @param inputFile
	 *            the file containing the list of FASTA formatted sequences to
	 *            read from
	 * @throws FileNotFoundException
	 *             if the input file is not found
	 */
	public FastaReader(final String inputFile) throws FileNotFoundException {
		input = new Scanner(new File(inputFile), CHARSET);
		input.useDelimiter(DELIM);
		shutdownHook = new Thread() {

			@Override
			public void run() {
				input.close();
			}
		};
		Runtime.getRuntime().addShutdownHook(shutdownHook);
	}

	/**
	 * Reads FASTA records from the given stream, decoding it as UTF8 in the
	 * same way as the file based constructor does.
	 * <p>
	 * No shutdown hook is registered for a stream, so this class will not
	 * close the incoming stream on its own; the client should do so, either
	 * directly or by calling {@link #close()}, which closes the underlying
	 * stream as well.
	 * 
	 * @param inputStream
	 *            the stream of FASTA formatted sequences to read from
	 * @throws FileNotFoundException
	 *             never thrown by this constructor; the declaration is kept
	 *             so that existing callers continue to compile
	 */
	public FastaReader(final InputStream inputStream)
			throws FileNotFoundException {
		input = new Scanner(inputStream, CHARSET);
		input.useDelimiter(DELIM);
		shutdownHook = null;
	}

	/**
	 * {@inheritDoc}
	 * 
	 * @throws IllegalStateException
	 *             if the close method was called on this instance
	 */
	@Override
	public boolean hasNext() {
		return input.hasNext();
	}

	/**
	 * Reads the next FastaSequence from the input
	 * 
	 * @throws AssertionError
	 *             if the header or the sequence is missing
	 * @throws IllegalStateException
	 *             if the close method was called on this instance
	 * @throws java.util.NoSuchElementException
	 *             if there were no more FastaSequence's.
	 */
	@Override
	public FastaSequence next() {
		final StringBuilder entry = new StringBuilder(input.next());
		// A token without a line break means the scanner split the record on
		// a '>' that is part of the header line itself. Put the delimiter
		// back and keep reading until the end of the header line is reached.
		while (entry.indexOf("\n") < 0 && input.hasNext()) {
			entry.append(DELIM).append(input.next());
		}
		return toFastaSequence(entry.toString());
	}

	/**
	 * Not implemented
	 */
	@Override
	public void remove() {
		throw new UnsupportedOperationException();
	}

	/**
	 * Call this method to close the connection to the input if you want to
	 * free up the resources. For a reader created from a file name the
	 * connection is closed on the JVM shutdown if this method was not called
	 * explicitly. No further reading on this instance of the FastaReader will
	 * be possible after calling this method. Calling it more than once is
	 * harmless.
	 */
	@Override
	public void close() {
		input.close();
		if (shutdownHook != null) {
			try {
				Runtime.getRuntime().removeShutdownHook(shutdownHook);
			} catch (IllegalStateException ignored) {
				// The JVM is already shutting down, so hooks can no longer
				// be removed. The input is closed above, nothing else to do.
			}
		}
	}

	/**
	 * Splits a single FASTA record (without the leading '>') into the header,
	 * which is everything up to the first line break, and the sequence, which
	 * is the remainder of the record.
	 * 
	 * @param singleFastaEntry
	 *            the text of one FASTA record
	 * @return the FastaSequence built from the header and the sequence text
	 * @throws AssertionError
	 *             if the record contains no line break or nothing but the
	 *             header
	 */
	private static FastaSequence toFastaSequence(final String singleFastaEntry) {

		assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";

		final int nlineidx = singleFastaEntry.indexOf("\n");
		if (nlineidx < 0) {
			throw new AssertionError(
					"The FASTA sequence must contain the header information"
							+ " separated by the new line from the sequence. Given sequence does not appear to "
							+ "contain the header! Given data:\n "
							+ singleFastaEntry);
		}

		// Get rid of the carriage returns left over from Windows style line
		// ends (should cover common cases)
		final String header = singleFastaEntry.substring(0, nlineidx).replace(
				"\r", "");

		// The sequence part still starts with the line break; it is passed on
		// to FastaSequence as is.
		final String sequence = singleFastaEntry.substring(nlineidx);

		if (Util.isEmpty(sequence)) {
			throw new AssertionError(
					"Empty sequences are not allowed! Please make sure the "
							+ " data is in the FASTA format! Given data:\n "
							+ singleFastaEntry);
		}
		return new FastaSequence(header, sequence);
	}
}