1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
|
package compbio.data.sequence;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;
import java.util.TreeSet;
import java.util.regex.Pattern;
// Utility class that parses RNAalifold output (its standard output and,
// optionally, the alifold.out file) into an RNAStructScoreManager
public class RNAStructReader {
// Separator and bracket patterns
// One or more separator characters between the fields of a line:
// whitespace, '+' or '='
static String s = "[+\\s=]+";
// Any single bracket character; not referenced by the code visible in this file
static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
// Same character class as 's'; not referenced by the code visible in this file
static String notData = "[\\s=+]+";
// RNAOut data type patterns (matched against single tokens by identify())
// Two or more letters, '_' or '-'
static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output as well (not just ACGU_)
// Two or more of the characters . ) ( { } [ ] ,
static String structP = "[\\.)({}\\[\\],]{2,}";
// Optional '-', digits, a mandatory '.', optional digits and an optional
// exponent written as 'e', a mandatory sign and digits
static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
// Optional '-', any number of digits, an optional '.', then exactly two digits
static String energyP = "-?[0-9]*\\.?[0-9]{2}";
// Optional '-', one digit, '.', six or more digits and an optional signed exponent
static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
// alifold out line patterns: each one is anchored with ^ and $ and describes
// one complete line. They are the regexes wrapped by the AlifoldLine enum.
// Optional whitespace
static String ps = "\\s*";
// A sequence token followed by optional trailing whitespace
static String alignmentP = "^"+seqP+ps+"$";
// A structure followed by three numbers inside parentheses
static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
// A structure on its own
static String justStructP = "^"+structP+ps+"$";
// A structure followed by two numbers
static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
// A structure followed by one number inside square brackets
static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
// A structure, one number, then two numbers inside braces
static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
// A structure, then braces holding a number and "MEA=" followed by a number
static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
// The text "free energy of ensemble = <number> kcal/mol"
static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
// The text "frequency of mfe structure in ensemble <number>"
static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
/**
 * Parses the standard output of RNAalifold into an RNAStructScoreManager.
 * <p>
 * The first line is taken as the consensus alignment and the second as the
 * mfe structure followed by three numbers. Every later line is classified
 * with identifyLine() and, depending on its type, contributes one entry to
 * the parallel 'structs' / 'data' lists handed to the ScoreManager.
 * <p>
 * The format expectations are still checked with 'assert' (active with -ea).
 * Conditions that would otherwise end in a NullPointerException, or in a
 * number being read from the wrong line, are reported as IOException
 * whether or not assertions are enabled.
 *
 * @param stdout stream carrying the RNAalifold standard output; it is read
 *               to the end but not closed here
 * @return a ScoreManager built from the parsed structures and their scores
 * @throws IOException if reading fails, if one of the two leading lines is
 *                     missing, or if an expected number or follow-up line
 *                     is absent
 */
public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
        throws IOException {
    String error = "Error in parsing alifold stdout file: ";
    // The Lists required to construct a ScoreManager Using the new constructor
    List<String> structs = new ArrayList<String>();
    List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();
    // Scratch list: cleared and refilled for every parsed line
    List<Float> scores = new ArrayList<Float>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));

    // The first 2 lines of the alifold stdout file are always the same format
    String fline = reader.readLine();
    if (fline == null) {
        throw new IOException(error + "Sequence Alignment Expected");
    }
    assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
        error + "Sequence Alignment Expected";
    structs.add(fline.trim());
    data.add(newEmptyScore(AlifoldResult.consensusAlignment));

    fline = reader.readLine();
    if (fline == null || fline.trim().length() == 0) {
        throw new IOException(error + "Consensus Structure and Energy Expected");
    }
    assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :
        error + "Consensus Structure and Energy Expected";
    Scanner sc = new Scanner(fline);
    try {
        structs.add(sc.next());
        readFloats(sc, 3, scores, error + "Consensus Structure and Energy Expected");
    } finally {
        sc.close();
    }
    data.add(newSetScore(AlifoldResult.mfeStructure, scores));

    // Now the alifold stdout file formats diverge based on arguments
    fline = reader.readLine();
    while (fline != null) {
        scores.clear();
        AlifoldLine ftype = identifyLine(fline);
        String sline = reader.readLine(); // Look ahead
        sc = new Scanner(fline);
        // Created fresh on every pass so that a scanner left over from an
        // earlier line can never be read by mistake at the end of the input
        Scanner nsc = (sline == null) ? null : new Scanner(sline);
        try {
            switch (ftype) {
            case PStruct:
                // The -p or --MEA option is specified
                // The next line should always be frequency of mfe structure
                if (sline == null) {
                    throw new IOException(error + "Expected frequency of mfe structure");
                }
                assert (Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
                    error + "Expected frequency of mfe structure";
                structs.add(sc.next());
                readFloats(sc, 1, scores, error + "Expected frequency of mfe structure");
                readFloats(nsc, 1, scores, error + "Expected frequency of mfe structure");
                data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));
                // Jump line: the look-ahead line has been consumed
                sline = reader.readLine();
                break;
            case centStruct:
                structs.add(sc.next());
                readFloats(sc, 3, scores, error + "Expected three values on centroid structure line");
                data.add(newSetScore(AlifoldResult.centroidStructure, scores));
                break;
            case MEAStruct:
                structs.add(sc.next());
                readFloats(sc, 2, scores, error + "Expected two values on MEA structure line");
                data.add(newSetScore(AlifoldResult.MEAStucture, scores));
                break;
            case justStruct:
                structs.add(sc.next());
                data.add(newEmptyScore(AlifoldResult.stochBTStructure));
                break;
            case stochBTStruct:
                structs.add(sc.next());
                // Read through floatP like every other branch; Scanner.nextFloat()
                // depends on the default locale's decimal separator
                readFloats(sc, 2, scores, error + "Expected two values on stochastic backtrack line");
                data.add(newSetScore(AlifoldResult.stochBTStructure, scores));
                break;
            case freeEnergy:
                if (sline == null) {
                    throw new IOException(error + "Found 'freeEnergy' line on its own");
                }
                assert (Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
                    error + "Found 'freeEnergy' line on its own";
                structs.add("Free energy of ensemble (kcal/mol) followed by frequency of mfe structure in ensemble");
                readFloats(sc, 1, scores, error + "Found 'freeEnergy' line on its own");
                readFloats(nsc, 1, scores, error + "Found 'freeEnergy' line on its own");
                data.add(newSetScore(AlifoldResult.ensembleValues, scores));
                // jump line: the look-ahead line has been consumed
                sline = reader.readLine();
                break;
            default:
                // Line types that are not expected here are only flagged by
                // the assertions below; without -ea they are skipped
                break;
            }
        } finally {
            sc.close();
            if (nsc != null) {
                nsc.close();
            }
        }
        assert (!ftype.equals(AlifoldLine.ensembleFreq)) :
            error + "Wasn't expecting 'frequency of mfe structure'!";
        assert (!ftype.equals(AlifoldLine.mfeStruct)) :
            error + "'Standard output' line at a place other than line 2!";
        assert (!ftype.equals(AlifoldLine.alignment)) :
            error + "Wasn't expecting an alignment sequence!";
        assert (!ftype.equals(AlifoldLine.OTHER)) :
            error + "Wasn't expecting this whatever it is: " + fline;
        fline = sline;
    }
    return new RNAStructScoreManager(structs, data);
}

// Appends the next 'count' floatP matches on the scanner's current line to
// 'into'; throws IOException with 'message' when a number is missing
private static void readFloats(Scanner in, int count, List<Float> into, String message)
        throws IOException {
    for (int i = 0; i < count; i++) {
        String token = in.findInLine(floatP);
        if (token == null) {
            throw new IOException(message);
        }
        into.add(Float.parseFloat(token));
    }
}
/**
 * Builds a one-element TreeSet holding a single Score made from 'res' and
 * the given values, ready to be added to a 'data' list for a ScoreManager.
 *
 * @param res    the enum constant passed to the Score constructor
 * @param scores the values for the Score; a null entry becomes Float.NaN
 * @return a new TreeSet containing exactly that Score
 */
private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
    // Unbox the List<Float> into the float[] the Score constructor takes
    float[] values = new float[scores.size()];
    int idx = 0;
    while (idx < values.length) {
        Float boxed = scores.get(idx);
        if (boxed != null) {
            values[idx] = boxed;
        } else {
            values[idx] = Float.NaN;
        }
        idx++;
    }
    Score only = new Score(res, values);
    return new TreeSet<Score>(Arrays.asList(only));
}
/**
 * Creates a non-null placeholder entry: a TreeSet whose only element is a
 * Score built from 'res' and a zero-length float array.
 *
 * @param res the enum constant passed to the Score constructor
 * @return a new TreeSet containing exactly that Score
 */
public static TreeSet<Score> newEmptyScore(Enum<?> res) {
    float[] noValues = new float[0];
    Score placeholder = new Score(res, noValues);
    return new TreeSet<Score>(Arrays.asList(placeholder));
}
/**
 * Parses the RNAalifold standard output together with the alifold.out file.
 * <p>
 * The standard output is parsed by readRNAStructStream(InputStream). From
 * alifold.out the two leading lines are skipped; of every following row
 * the first, second and fourth columns are read (a trailing '%' is treated
 * as a delimiter) until a token made only of '.', '(' and ')' or the end of
 * the input is reached. Each row becomes one Score holding the fourth
 * column and a Range built from the first two; the resulting set replaces
 * element 0 of the data list obtained from the standard-output parse.
 * <p>
 * Numbers are converted with Integer.parseInt / Float.parseFloat so the
 * result does not depend on the default locale. The scanner, and with it
 * the 'alifold' stream, is closed even when parsing fails.
 *
 * @param stdout  stream carrying the RNAalifold standard output
 * @param alifold stream carrying the contents of alifold.out; closed on return
 * @return a ScoreManager holding the structures and the updated data list
 * @throws IOException if reading 'stdout' fails or 'alifold' has fewer than
 *                     two leading lines
 * @throws NumberFormatException if a column that should hold a number does not
 */
public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
        InputStream alifold) throws IOException {
    // Get a ScoreManager that takes the std output but ignores alifold.out (-p)
    RNAStructScoreManager stdSM = readRNAStructStream(stdout);
    // Unpack this into the structs and data lists
    List<String> structs = stdSM.getStructs();
    List<TreeSet<Score>> data = stdSM.getData();

    // One entry per data row: the range (columns 1 and 2) and its value (column 4)
    ArrayList<Float> scores = new ArrayList<Float>();
    List<Range> rangeHolder = new ArrayList<Range>();

    // Now parse alifold.out
    Scanner sc = new Scanner(alifold);
    try {
        sc.useDelimiter("[\\s%]+");
        // jump two lines to the data
        for (int skipped = 0; skipped < 2; skipped++) {
            if (!sc.hasNextLine()) {
                throw new IOException(
                        "Error in parsing alifold.out file: two header lines expected");
            }
            sc.nextLine();
        }
        // Read the first, second and fourth columns. Ignoring everything else.
        while (sc.hasNext()) {
            String first = sc.next();
            // A dot-bracket token marks the end of the data rows
            if (Pattern.matches("^[\\.)(]{2,}$", first)) break;
            if (!sc.hasNextLine()) break;
            int from = Integer.parseInt(first);
            int to = Integer.parseInt(sc.next());
            sc.next(); // third column is not used
            float value = Float.parseFloat(sc.next());
            rangeHolder.add(new Range(from, to));
            scores.add(value);
            // Drop the rest of the row, if there is any
            if (sc.hasNextLine()) {
                sc.nextLine();
            }
        }
    } finally {
        sc.close();
    }

    // Update the first ScoreHolder TreeSet<Score> element
    assert (rangeHolder.size() == scores.size());
    TreeSet<Score> sHolder = new TreeSet<Score>();
    for (int i = 0; i < rangeHolder.size(); i++) {
        ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
        TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
        sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));
    }
    data.set(0, sHolder);
    return new RNAStructScoreManager(structs, data);
}
/**
 * Classifies a single token by the first of seqP, structP, energyP and
 * freqP (tried in that order) that matches the token in full.
 *
 * @param token the token to classify
 * @return the matching RNAOut category, or RNAOut.OTHER when none matches
 */
private static RNAOut identify(String token) {
    // Parallel tables: regexes[i] decides whether kinds[i] is returned
    String[] regexes = { seqP, structP, energyP, freqP };
    RNAOut[] kinds = { RNAOut.SEQ, RNAOut.STRUCT, RNAOut.ENERGY, RNAOut.FREQ };
    for (int i = 0; i < regexes.length; i++) {
        if (Pattern.matches(regexes[i], token)) {
            return kinds[i];
        }
    }
    return RNAOut.OTHER;
}
/**
 * Finds the first AlifoldLine constant, in declaration order, whose regex
 * matches the whole line.
 *
 * @param line one line of alifold output
 * @return the first matching constant, or AlifoldLine.OTHER if none matches
 */
private static AlifoldLine identifyLine(String line) {
    AlifoldLine[] candidates = AlifoldLine.values();
    for (int i = 0; i < candidates.length; i++) {
        AlifoldLine candidate = candidates[i];
        boolean wholeLineMatches =
                Pattern.compile(candidate.regex).matcher(line).matches();
        if (wholeLineMatches) {
            return candidate;
        }
    }
    return AlifoldLine.OTHER;
}
/**
 * The kinds of line recognised in alifold output, each paired with the
 * regex that a whole line of that kind has to match.
 * <p>
 * The declaration order matters: identifyLine() walks values() and returns
 * the first constant whose regex matches, and OTHER (".*") has to stay last
 * because it matches every line.
 */
static enum AlifoldLine {
    mfeStruct (mfeStructP),
    justStruct (justStructP),
    stochBTStruct (stochBTStructP),
    PStruct (PStructP),
    centStruct (centStructP),
    MEAStruct (MEAStructP),
    freeEnergy (freeEnergyP),
    ensembleFreq (ensembleFreqP),
    alignment (alignmentP),
    OTHER (".*");

    // Regex for a complete line of this kind. Final so that the shared enum
    // constants cannot be re-pointed at a different pattern after start-up.
    final String regex;

    AlifoldLine(String regex) {
        this.regex = regex;
    }
}
//The types of data in an RNAalifold stdout file
// These are the categories returned by identify(): a token is classified by
// the first of seqP, structP, energyP and freqP that matches it in full
// (SEQ, STRUCT, ENERGY, FREQ respectively), and as OTHER when none does.
static enum RNAOut {
SEQ, STRUCT, ENERGY, FREQ, OTHER
}
//Something to put in the Score objects of the alifold result which gives information
//about what kind of sequence it is holding in its String Id.
// Each constant is handed to the Score constructor as its first argument by
// the readRNAStructStream methods (directly, or through newSetScore and
// newEmptyScore).
// NOTE: 'MEAStucture' is spelled this way where readRNAStructStream refers to
// it, so the identifier is left unchanged here.
public static enum AlifoldResult {
mfeStructure, contactProbabilityStructure, MEAStucture, centroidStructure, stochBTStructure, consensusAlignment, ensembleValues, contactProbabilities
}
/**
 * Prints every AlifoldLine constant followed by its regex, with all '^' and
 * '$' characters removed, one per line. Intended for testing the patterns.
 *
 * @param args not used
 */
public static void main(String[] args) {
    AlifoldLine[] all = AlifoldLine.values();
    for (int i = 0; i < all.length; i++) {
        AlifoldLine current = all[i];
        // Literal (non-regex) replacement of the two anchor characters
        String withoutStart = current.regex.replace("^", "");
        String bare = withoutStart.replace("$", "");
        System.out.println(current.toString() + ": " + bare);
    }
}
}
|