File: RNAStructReader.java

package info (click to toggle)
libjaba-client-java 2.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 2,052 kB
  • sloc: java: 17,308; makefile: 12
file content (278 lines) | stat: -rw-r--r-- 10,172 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
package compbio.data.sequence;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;
import java.util.TreeSet;
import java.util.regex.Pattern;

/**
 * Utility class for reading alifold output.
 *
 * <p>Parsing is line oriented: each line of the alifold stdout stream is
 * classified with the regular expressions held by {@link AlifoldLine}; the
 * leading token of the line and the numbers found on it are collected into the
 * two parallel lists that are handed to the {@code RNAStructScoreManager}
 * constructor.
 *
 * <p>Format checks are written as {@code assert} statements, so they are only
 * active when the JVM runs with assertions enabled ({@code -ea}). Conditions
 * that would otherwise end in a {@code NullPointerException} or in silently
 * wrong values are reported as {@link IOException} regardless of that flag.
 */
public class RNAStructReader {

	// Prefix shared by every parse error message raised for the stdout stream
	private static final String ERROR = "Error in parsing alifold stdout file: ";

	// Separator pattern: a run of whitespace, '+' and '=' characters
	static String s = "[+\\s=]+";
	// A single bracket character; not referenced elsewhere in this class
	static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
	// Same character set as 's'; not referenced elsewhere in this class
	static String notData = "[\\s=+]+";

	// RNAOut data type patterns 
	static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output as well (not just ACGU_)
	static String structP = "[\\.)({}\\[\\],]{2,}";
	// Decimal number with a mandatory '.', optional sign and optional exponent
	static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
	static String energyP = "-?[0-9]*\\.?[0-9]{2}";
	static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
	
	// alifold out line patterns (whole-line expressions built from the pieces above)
	static String ps = "\\s*";
	static String alignmentP = "^"+seqP+ps+"$";
	static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
	static String justStructP = "^"+structP+ps+"$";
	static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
	static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
	static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
	static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
	static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
	static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
	
	
	/**
	 * Parses an alifold stdout stream into a score manager.
	 *
	 * <p>The first line is stored (trimmed) with an empty
	 * {@code consensusAlignment} score. The second line contributes its first
	 * whitespace-delimited token and three numbers as the {@code mfeStructure}
	 * score. Every further line is classified by {@link #identifyLine(String)}
	 * and handled according to its type; lines of an unexpected type trip an
	 * assertion when assertions are enabled and are skipped otherwise.
	 *
	 * <p>The stream is read through a {@link BufferedReader} that is not closed
	 * here; closing {@code stdout} is left to the caller.
	 *
	 * @param stdout stream holding the alifold standard output
	 * @return a manager built from the collected structure strings and scores
	 * @throws IOException if reading fails, if the stream ends before the two
	 *         leading lines were read, if a line that has to carry a number
	 *         does not, or if a line that needs a following
	 *         "frequency of mfe structure" line is the last line of the stream
	 */
	public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
			throws IOException {
		
		// The Lists required to construct a ScoreManager Using the new constructor
		List<String> structs = new ArrayList<String>();
		List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();

		// Scratch list; newSetScore copies it, so it can be cleared and reused
		ArrayList<Float> scores = new ArrayList<Float>();

		BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));

		// The first 2 lines of the alifold stdout file are always the same format
		String fline = reader.readLine();
		if (fline == null) {
			// Empty stream: fail with a parse error instead of an NPE
			throw new IOException(ERROR + "Sequence Alignment Expected");
		}
		assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
			ERROR + "Sequence Alignment Expected";
		structs.add(fline.trim());
		data.add(newEmptyScore(AlifoldResult.consensusAlignment));
		
		fline = reader.readLine();
		if (fline == null) {
			throw new IOException(ERROR + "Consensus Structure and Energy Expected");
		}
		assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :
			ERROR + "Consensus Structure and Energy Expected";
		Scanner mfeScanner = new Scanner(fline);
		structs.add(mfeScanner.next());
		for (int i = 0; i < 3; i++) {
			scores.add(readFloat(mfeScanner, fline));
		}
		mfeScanner.close();
		data.add(newSetScore(AlifoldResult.mfeStructure, scores));
		
		// Now the alifold stdout file formats diverge based on arguments
		fline = reader.readLine();
		while (fline != null) {
			scores.clear();
			AlifoldLine ftype = identifyLine(fline);
			String sline = reader.readLine(); // Look ahead
			Scanner sc = new Scanner(fline);
			// Rebuilt on every pass so that a scanner over an earlier line can
			// never be mistaken for the look-ahead line once the stream ended
			Scanner nsc = (sline != null) ? new Scanner(sline) : null;

			switch (ftype) {
			case PStruct:
				// The -p or --MEA option is specified
				// The next line should always be frequency of mfe structure
				assert (sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
					ERROR + "Expected frequency of mfe structure";
				if (nsc == null) {
					throw new IOException(ERROR + "Expected frequency of mfe structure");
				}
				structs.add(sc.next());
				scores.add(readFloat(sc, fline));
				scores.add(readFloat(nsc, sline));
				data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));
				// Jump line: the look-ahead line has been consumed as well
				sline = reader.readLine();
				break;
			case centStruct:
				structs.add(sc.next());
				for (int i = 0; i < 3; i++) {
					scores.add(readFloat(sc, fline));
				}
				data.add(newSetScore(AlifoldResult.centroidStructure, scores));
				break;
			case MEAStruct:
				structs.add(sc.next());
				for (int i = 0; i < 2; i++) {
					scores.add(readFloat(sc, fline));
				}
				data.add(newSetScore(AlifoldResult.MEAStucture, scores));
				break;
			case justStruct:
				structs.add(sc.next());
				data.add(newEmptyScore(AlifoldResult.stochBTStructure));
				break;
			case stochBTStruct:
				structs.add(sc.next());
				// Extracted by pattern rather than Scanner.nextFloat(), which
				// depends on the default locale's decimal separator
				scores.add(readFloat(sc, fline));
				scores.add(readFloat(sc, fline));
				data.add(newSetScore(AlifoldResult.stochBTStructure, scores));
				break;
			case freeEnergy:
				assert (sline != null 
						&& Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
						ERROR + "Found 'freeEnergy' line on its own";
				if (nsc == null) {
					throw new IOException(ERROR + "Found 'freeEnergy' line on its own");
				}
				structs.add("Free energy of ensemble (kcal/mol) followed by frequency of mfe structure in ensemble");
				scores.add(readFloat(sc, fline));
				scores.add(readFloat(nsc, sline));
				data.add(newSetScore(AlifoldResult.ensembleValues, scores));
				// jump line
				sline = reader.readLine();
				break;
			case ensembleFreq:
				assert false : ERROR + "Wasn't expecting 'frequency of mfe structure'!";
				break;
			case mfeStruct:
				assert false : ERROR + "'Standard output' line at a place other than line 2!";
				break;
			case alignment:
				assert false : ERROR + "Wasn't expecting an alignment sequence!";
				break;
			case OTHER:
				assert false : ERROR + "Wasn't expecting this whatever it is: " + fline;
				break;
			}

			// Both scanners wrap Strings only; closing them is plain tidiness
			sc.close();
			if (nsc != null) nsc.close();

			fline = sline;
		}
		
		return new RNAStructScoreManager(structs, data);
	}

	// Returns the next match of floatP on the scanner's current line as a float.
	// 'line' is only used for the error message raised when no number is left.
	private static float readFloat(Scanner sc, String line) throws IOException {
		String number = sc.findInLine(floatP);
		if (number == null) {
			throw new IOException(ERROR + "Number expected in line: " + line);
		}
		return Float.parseFloat(number);
	}
	
	// Just for the purpose of creating new TreeSet<Score> objects of length one
	// for adding to a 'data' list to make a ScoreManager
	private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
		// first convert List<Float> to float[]; null entries become NaN
		float[] scoresf = new float[scores.size()];
		for (int i = 0; i < scoresf.length; i++) {
			Float f = scores.get(i);
			scoresf[i] = ( f != null ? f : Float.NaN);
		}
		return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
	}

	/**
	 * Creates a one-element set holding a {@code Score} for {@code res} that
	 * carries a zero-length score array, for entries that have no numbers but
	 * must not be {@code null}.
	 *
	 * @param res the result type the score is created for
	 * @return a new set containing exactly that score
	 */
	public static TreeSet<Score> newEmptyScore(Enum<?> res) {
		return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
	}

	/**
	 * Parses the alifold stdout stream as
	 * {@link #readRNAStructStream(InputStream)} does and then replaces the
	 * first data element with the values read from the second stream.
	 *
	 * <p>The second stream is tokenised on runs of whitespace and '%'. Its
	 * first two lines are skipped. For every following line, columns one and
	 * two form a {@code Range}, column three is skipped and column four is
	 * stored as the score; the rest of the line is ignored. Reading stops at a
	 * token made up only of '.', '(' and ')' (two or more characters) or when
	 * the input is exhausted. Closing the scanner also closes {@code alifold}.
	 *
	 * @param stdout stream holding the alifold standard output
	 * @param alifold stream holding the contents of alifold.out
	 * @return a manager whose first data element holds one
	 *         {@code contactProbabilities} score per parsed line
	 * @throws IOException if parsing {@code stdout} fails
	 */
	public static RNAStructScoreManager readRNAStructStream(InputStream stdout, 
			InputStream alifold) throws IOException {
		
		// Get a ScoreManager that takes the std output but ignores alifold.out (-p)
		RNAStructScoreManager stdSM = readRNAStructStream(stdout);
		
		// Unpack this into the structs and data lists
		List<String> structs = stdSM.getStructs();
		List<TreeSet<Score>> data = stdSM.getData();
		
		// Read the first, second and fourth columns. Ignoring everything else.
		ArrayList<Float> scores = new ArrayList<Float>();
		List<Range> rangeHolder = new ArrayList<Range>();

		// Now parse alifold.out
		Scanner sc = new Scanner(alifold);
		try {
			// nextInt()/nextFloat() follow the scanner's locale; pin it so that
			// '.' is the decimal separator whatever the platform default is
			sc.useLocale(java.util.Locale.ROOT);
			sc.useDelimiter("[\\s%]+");
			
			// jump two lines to the data 
			sc.nextLine(); sc.nextLine();
			
			// hasNext() ends the loop cleanly when the file stops without a
			// closing structure line
			while (sc.hasNext()) {
				String token = sc.next();
				if (Pattern.matches("^[\\.)(]{2,}$", token)) break;
				if (!sc.hasNextLine()) break;
				int t = sc.nextInt();
				rangeHolder.add(new Range(Integer.parseInt(token), t));
				sc.next();
				scores.add(sc.nextFloat());
				// Drop the remaining columns; the last line may end right here
				if (sc.hasNextLine()) sc.nextLine();
			}
		} finally {
			sc.close();
		}
		
		// Update the first ScoreHolder TreeSet<Score> element
		assert (rangeHolder.size() == scores.size());
		TreeSet<Score> sHolder = new TreeSet<Score>();
		for (int i = 0; i < rangeHolder.size(); i++) {
			ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
			TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
			sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));
		}
		
		data.set(0, sHolder);
		
		return new RNAStructScoreManager(structs, data);
	}

	// Classifies a single token by trying seqP, structP, energyP and freqP in
	// that order. Not called from anywhere else in this class.
	private static RNAOut identify(String token) {
		if (Pattern.matches(seqP, token)) {
			return RNAOut.SEQ;
		} else if (Pattern.matches(structP, token)) {
			return RNAOut.STRUCT;
		} else if (Pattern.matches(energyP, token)) {
			return RNAOut.ENERGY;
		} else if (Pattern.matches(freqP, token)) {
			return RNAOut.FREQ;
		}

		return RNAOut.OTHER;
	}
	
	// Returns the first AlifoldLine constant, in declaration order, whose regex
	// matches the whole line; OTHER when none does.
	private static AlifoldLine identifyLine(String line) {
		
		for (AlifoldLine il : AlifoldLine.values()) {
			if (Pattern.matches(il.regex, line)) return il;
		}
		return AlifoldLine.OTHER;
	}
	
	// The kinds of line recognised in the stdout file. identifyLine tries the
	// constants in declaration order, so OTHER (".*") has to stay last.
	static enum AlifoldLine {
		mfeStruct (mfeStructP),
		justStruct (justStructP),
		stochBTStruct (stochBTStructP),
		PStruct (PStructP),
		centStruct (centStructP),
		MEAStruct (MEAStructP),
		freeEnergy (freeEnergyP),
		ensembleFreq (ensembleFreqP),
		alignment (alignmentP), 
		OTHER (".*");
		
		// Whole-line regular expression identifying this kind of line
		String regex;
		AlifoldLine(String regex) { this.regex = regex; }

	}
	
	//The types of data in an RNAalifold stdout file
	static enum RNAOut {
		SEQ, STRUCT, ENERGY, FREQ, OTHER
	}

	//Something to put in the Score objects of the alifold result which gives information
	//about what kind of sequence it is holding in its String Id.

	public static enum AlifoldResult {
		mfeStructure, contactProbabilityStructure, MEAStucture, centroidStructure, stochBTStructure, consensusAlignment, ensembleValues, contactProbabilities
	}

	// Print the full regex Strings for testing 
	public static void main(String[] args) {
		for (AlifoldLine l : AlifoldLine.values()) {
			System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
		}
	}

}