File: RepeatSet.java

package info (click to toggle)
bbmap 39.20%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,024 kB
  • sloc: java: 312,743; sh: 18,099; python: 5,247; ansic: 2,074; perl: 96; makefile: 39; xml: 38
file content (199 lines) | stat: -rwxr-xr-x 5,859 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
package repeat;

import java.util.ArrayList;
import java.util.Collections;

import repeat.Repeat.PosComparator2;
import shared.Tools;
import stream.Read;
import structures.CRange;
import tracker.EntropyTracker;

public class RepeatSet {
	
	RepeatSet(int k_, int minDepth_, int maxDepth_, int minRepeat_, int maxGap_, boolean weakSubsumes_, boolean amino_, int ek_, int ew_){
		k=k_;
		minDepth=minDepth_;
		maxDepth=maxDepth_;
		minRepeat=minRepeat_;
		maxGap=maxGap_;
		weakSubsumes=weakSubsumes_;
		amino=amino_;
		ek=ek_;
		ew=ew_;
		eta=new EntropyTracker[ew+1];
	}
	
	EntropyTracker getET(int len) {
		int window=Tools.mid(ek, ew, len);
		if(eta[window]==null){eta[window]=new EntropyTracker(ek, window, amino, 0, true);}
		return eta[window];
	}
	
	//Replaced by collectResidual
//	void flushOpen() {
//		for(Repeat r : openRepeats) {
//			if(r.contigNum>=0 && r.depth>1 && r.length()>=minRepeat){
//				assert(false);
//				oldRepeats.add(r.clone());
//			}
//		}
//	}
	
	void retireClosed() {
		oldRepeats.addAll(closedRepeats);
		closedRepeats.clear();
	}
	
	void addRepeat(Repeat r){
		if(lastRepeat!=null && lastRepeat.subsumes(r, weakSubsumes)) {
			//for more efficiency I could prevent r from being created in the first place
			//but that would require adding complexity to Repeat
			return;
		}
		//Actually this can happen in rare cases due to lazy collection.
//		assert(lastRepeat==null || !r.subsumes(lastRepeat)) : "\n"+lastRepeat+"\n"+r;
		
		r.calcStats(getET(r.length()));
		closedRepeats.add(r);
		recent.add(r);
		lastRepeat=r;
	}
	
	void collectResidual(int maxDepthSeen) {
		maxDepthSeen=Tools.min(maxDepthSeen, openRepeats.size()-1); //In case maxDepth is set.
		assert(maxDepthSeen<openRepeats.size()) : maxDepthSeen+", "+openRepeats.size();
		for(int i=maxDepthSeen; i>=minDepth; i--) {
			Repeat r=openRepeats.get(i);
			if(r.contigNum>=0 && r.depth>1 && r.length()>=minRepeat){
				addRepeat(r.clone());
				r.clear();
			}
		}
	}
	
	/** TODO: This can go from O(N^2) to O(N) if it follows contours instead of incrementing all.
	 * Also, make sure min depth is implemented. */
	void increment(Read contig, int pos, final int depth) {
		assert(pos>=k-1) : pos;
		int depth2=Tools.min(depth, maxDepth);
		while(depth2>=openRepeats.size()){
			openRepeats.add(new Repeat(null, -2, openRepeats.size(), k, maxGap, minRepeat, 'R'));
		}
		for(int i=depth2; i>=minDepth; i--){//Potentially quadratic
			Repeat r=openRepeats.get(i);
			Repeat old=r.increment(contig, pos, depth);
			if(old!=null){addRepeat(old);}
		}
	}
	
	int subsumeClosed(boolean weak) {return subsume(closedRepeats, weak);}
	
	ArrayList<CRange> closedToRanges(boolean merge){
		return toRanges(closedRepeats, merge, rangeBuffer);
	}
	
	ArrayList<CRange> recentToRanges(boolean merge){
		return toRanges(recent, merge, rangeBuffer);
	}
	
	ArrayList<Read> fetchRepeatSequence(){return fetchRepeatSequence(closedRepeats, maxGap, k);}
	
	static ArrayList<Read> fetchRepeatSequence(ArrayList<Repeat> repeats0, int maxGap, int k){
		ArrayList<Read> reads=new ArrayList<Read>();
		if(repeats0.isEmpty()){return reads;}
		ArrayList<Repeat> repeats=(ArrayList<Repeat>) repeats0.clone();
		removeFullyContained(repeats, maxGap, k);
		for(Repeat pete : repeats) {
			Read r=pete.toRead();
			reads.add(r);
		}
		return reads;
	}
	
	public static int subsume(ArrayList<Repeat> list, boolean weak) {
		if(list.isEmpty()) {return 0;}
		list.sort(Repeat.PosComparator.comparator);
		
		int removed=0;
		Repeat current=list.get(0);
		for(int i=1; i<list.size(); i++) {
			Repeat r=list.get(i);
			assert(current!=r);
			if(current.subsumes(r, weak)) {
//				if(printSubsumes) {
//					System.err.println(current+"\n"+r);
//				}
				list.set(i, null);
				removed++;
			}else{
				current=r;
			}
		}
		if(removed>0){Tools.condenseStrict(list);}
		return removed;
	}
	
	public static int removeFullyContained(final ArrayList<Repeat> repeats, int maxGap, int k) {//maxGap and k are just for an assertion
		if(repeats.size()<2) {return 0;}
		repeats.sort(PosComparator2.comparator);
		Repeat current=repeats.get(0);
		int removed=0;
		for(int i=1; i<repeats.size(); i++) {
			final Repeat r=repeats.get(i);
			if(current.spans(r)){
				repeats.set(i, null);
				removed++;
			}else{
				assert(!current.overlaps(r) || maxGap<=k) : "\n"+current+"\n"+r;
				current=r;
			}
		}
		if(removed>0){
			Tools.condenseStrict(repeats);
		}
		return removed;
	}
	
	public static ArrayList<CRange> toRanges(ArrayList<Repeat> repeats, boolean merge, ArrayList<CRange> ranges){
		if(ranges==null){ranges=new ArrayList<CRange>();}
		ranges.clear();
		if(repeats.size()==1) {ranges.add(repeats.get(0).toRange());}
		if(repeats.size()<2){return ranges;}
		{
			Repeat current=repeats.get(repeats.size()-1);
			for(int i=repeats.size()-2; i>=0; i--) {
				Repeat r=repeats.get(i);
				if(current.spans(r)) {
					//do nothing
				}else {
					ranges.add(current.toRange());
					current=r;
				}
			}
			ranges.add(current.toRange());
			Collections.sort(ranges);
		}
		if(merge){CRange.mergeList(ranges, false);}
		return ranges;
	}
	
	final int k;
	final int minDepth;
	final int maxDepth;
	final int minRepeat;
	final int maxGap;
	final boolean weakSubsumes;
	boolean amino;
	final int ek, ew;
	
	final ArrayList<Repeat> openRepeats=new ArrayList<Repeat>();
	final ArrayList<Repeat> closedRepeats=new ArrayList<Repeat>();
	final ArrayList<Repeat> recent=new ArrayList<Repeat>();
	final ArrayList<Repeat> oldRepeats=new ArrayList<Repeat>();
	private final ArrayList<CRange> rangeBuffer=new ArrayList<CRange>();
	Repeat lastRepeat=null;
	final EntropyTracker[] eta;
//	final EntropyTracker et;
	
}