1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
|
package prok;
import java.util.ArrayList;
import java.util.HashMap;
import dna.AminoAcid;
import gff.GffLine;
import shared.Shared;
import shared.Tools;
import structures.ByteBuilder;
/**
* ORF means Open Reading Frame.
* It starts at the first base of a start codon and ends at the last base of a stop codon.
* The length is divisible by 3.
* @author Brian Bushnell
* @date Sep 20, 2018
*
*/
public class Orf extends PFeature {
/*--------------------------------------------------------------*/
/*---------------- Initialization ----------------*/
/*--------------------------------------------------------------*/
/**
* Bases and coordinates are assumed to be the correct strand.
* Minus-strand ORFs can be flipped at the end of the constructor.
* @param scafName_
* @param start_
* @param stop_
* @param strand_
* @param frame_
* @param bases
* @param flip
*/
public Orf(String scafName_, int start_, int stop_, int strand_, int frame_, byte[] bases, boolean flip, int type_) {
super(scafName_, start_, stop_, strand_, bases.length);
frame=frame_;
startCodon=getCodon(start, bases);
stopCodon=getCodon(stop-2, bases);
type=type_;
if(flip && strand==Shared.MINUS){flip();}
}
/*--------------------------------------------------------------*/
/*---------------- Init Helpers ----------------*/
/*--------------------------------------------------------------*/
/**
* Grab the codon starting at from.
* Assumes bases are in the correct strand
* @param from
* @param bases
* @return
*/
private static int getCodon(int from, byte[] bases){
int codon=0;
for(int i=0; i<3; i++){
// assert(i+from<bases.length) : i+", "+from+", "+bases.length;
byte b=bases[i+from];
int x=AminoAcid.baseToNumber[b];
codon=(codon<<2)|x;
}
return codon;
}
public float calcOrfScore(){
return calcOrfScore(0);
}
/**
* The score of an ORF alone is a factor of the length, start score, stop score, and kmer score.
* The score of an ORF in the context of an overlapping gene also includes a penalty for the overlap length.
* @param overlap
* @return Calculated score
*/
public float calcOrfScore(int overlap){
double a=Math.sqrt(Tools.max(f1, e1+startScore));
// double b=Math.sqrt(f2/*Tools.max(f2, e2+stopScore)*/);//This is better, ignoring stopscore completely
double b=Math.sqrt(Tools.max(f2, e2+0.35f*stopScore));
double c=Tools.max(f3, e3+averageKmerScore());
assert(a!=Double.NaN);
assert(b!=Double.NaN);
assert(c!=Double.NaN);
c=4*Math.pow(c, 2.2);
double d=(0.1*a*b*c*(Math.pow(length()-overlap, 2.5)-(overlap<1 ? 0 : Math.pow(overlap+50, 2))));//TODO: Adjust these constants
if(d>0){d=Math.sqrt(d);}
assert(d!=Double.NaN);
return (float)d;
}
public float averageKmerScore(){
return kmerScore/(length()-GeneModel.kInnerCDS-2); //This slightly affects score if kInnerCDS is changed
}
/*--------------------------------------------------------------*/
/*---------------- Public Methods ----------------*/
/*--------------------------------------------------------------*/
public boolean isValidPrev(Orf prev, int maxOverlap){
if(prev.stop>=stop || prev.stop>=start+maxOverlap || prev.start>=start){return false;}
if(prev.frame==frame && prev.strand==strand && prev.stop>=start){return false;}
return true;
}
public float pathScore() {return Tools.max(pathScorePlus, pathScoreMinus);}
public float pathScore(int prevStrand) {return prevStrand==0 ? pathScorePlus : pathScoreMinus;}
public Orf prev(){return pathScorePlus>=pathScoreMinus ? prevPlus : prevMinus;}
public Orf prev(int prevStrand){return prevStrand==0 ? prevPlus : prevMinus;}
public int pathLength(int prevStrand){return prevStrand==0 ? pathLengthPlus : pathLengthMinus;}
public int pathLength(){return pathScorePlus>=pathScoreMinus ? pathLengthPlus : pathLengthMinus;}
/*--------------------------------------------------------------*/
/*---------------- ToString ----------------*/
/*--------------------------------------------------------------*/
/**
* @param orfs A list of called features
* @param types Types of features to retain, e.g. "CDS,rRNA,tRNA"
* @return GffLines subdivided by type
*/
public static ArrayList<GffLine>[] toGffLinesByType(ArrayList<Orf> orfs, String types){
String[] typeArray=types.split(",");
@SuppressWarnings("unchecked")
ArrayList<GffLine>[] lists=new ArrayList[typeArray.length];
HashMap<String, ArrayList<GffLine>> map=new HashMap<String, ArrayList<GffLine>>(3*typeArray.length);
for(int i=0; i<typeArray.length; i++){
String type=typeArray[i];
lists[i]=new ArrayList<GffLine>();
map.put(type, lists[i]);
}
for(Orf orf : orfs){
String type=ProkObject.typeStrings2[orf.type];
ArrayList<GffLine> glist=map.get(type);
if(glist!=null) {
glist.add(new GffLine(orf));
}
}
return lists;
}
public static ArrayList<GffLine> toGffLines(ArrayList<Orf> orfs){
if(orfs==null) {return null;}
ArrayList<GffLine> list=new ArrayList<GffLine>(orfs.size());
for(Orf orf : orfs){list.add(new GffLine(orf));}
return list;
}
public String toStringFlipped(){
if(strand==flipped()){
return toString();
}
flip();
String s=toString();
flip();
return s;
}
@Override
public String toString(){
return toGff();
}
public String toGff(){
ByteBuilder bb=new ByteBuilder();
appendGff(bb);
return bb.toString();
}
public ByteBuilder appendGff(ByteBuilder bb){
if(scafName==null){
bb.append('.').tab();
}else{
for(int i=0, max=scafName.length(); i<max; i++){
char c=scafName.charAt(i);
if(c==' ' || c=='\t'){break;}
bb.append(c);
}
bb.tab();
}
bb.append("BBTools").append('\t');
bb.append(typeStrings2[type]).append('\t');
bb.append(start+1).append('\t');
bb.append(stop+1).append('\t');
if(orfScore<0){bb.append('.').append('\t');}
else{bb.append(orfScore, 2).append('\t');}
bb.append(strand<0 ? '.' : Shared.strandCodes2[strand]).append('\t');
bb.append('0').append('\t');
//bb.append('.');
bb.append(typeStrings[type]).append(',');
if(type==0){
bb.append("fr").append(frame).append(',');
}
// bb.append(startCodon).append(',');
// bb.append(stopCodon).append(',');
bb.append("startScr:").append(startScore, 3).append(',');
bb.append("stopScr:").append(stopScore, 3).append(',');
bb.append("innerScr:").append(averageKmerScore(), 3).append(',');
bb.append("len:").append(length());
if(type==0){
bb.append(',');
bb.append("start:").append(AminoAcid.codonToString(startCodon)).append(',');
bb.append("stop:").append(AminoAcid.codonToString(stopCodon));
}
return bb;
}
public boolean isSSU(){return type==r16S || type==r18S;}
public boolean is16S(){return type==r16S;}
public boolean is18S(){return type==r18S;}
public boolean isCDS(){return type==CDS;}
public boolean isRRNA(){return type==r18S || type==r16S || type==r5S || type==r23S;}
public boolean isTRNA(){return type==tRNA;}
/*--------------------------------------------------------------*/
/*---------------- Overrides ----------------*/
/*--------------------------------------------------------------*/
@Override
public float score() {
return orfScore;
}
/*--------------------------------------------------------------*/
/*---------------- Fields ----------------*/
/*--------------------------------------------------------------*/
public final int frame;
//These are not needed but nice for printing
public final int startCodon;
public final int stopCodon;
public float startScore;
public float stopScore;
public float kmerScore;
public float orfScore;
//Path scores are for pathfinding phase
public float pathScorePlus;
public int pathLengthPlus=1;
public Orf prevPlus;
public float pathScoreMinus;
public int pathLengthMinus=1;
public Orf prevMinus;
public final int type;
/*--------------------------------------------------------------*/
/*---------------- Static Fields ----------------*/
/*--------------------------------------------------------------*/
/* for kinnercds=6 */
// static float e1=0.1f;
// static float e2=-0.04f;
// static float e3=0.01f;//Decreasing this decreases TP but increases SNR
//
// static float f1=0.08f;
// static float f2=0.06f;
// static float f3=0.09f;
/* for kinnercds=7 */
static float e1=0.35f;
static float e2=-0.1f;
static float e3=-0.01f;//Decreasing this decreases TP but increases SNR
static float f1=0.08f;
static float f2=0.02f;
static float f3=0.09f;
}
|