File: BasicPhaseStates.java

package info (click to toggle)
beagle 250227-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 9,364 kB
  • sloc: java: 17,684; sh: 55; makefile: 11
file content (351 lines) | stat: -rw-r--r-- 13,823 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/*
 * Copyright (C) 2014-2021 Brian L. Browning
 *
 * This file is part of Beagle
 *
 * Beagle is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Beagle is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package phase;

import vcf.Steps;
import blbutil.BitArray;
import blbutil.Utilities;
import ints.IntIntMap;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Random;
import java.util.stream.IntStream;
import beagleutil.CompHapSegment;
import vcf.Markers;
import vcf.XRefGT;

/**
 * <p>Class {@code BasicPhaseStates} has methods for constructing a Li and
 * Stephens HMM for a target haplotype or target sample.
 * </p>
 * <p>Instances of {@code BasicPhaseStates} are not thread-safe.
 * </p>
 *
 * @author Brian L. Browning {@code <browning@uw.edu>}
 */
public final class BasicPhaseStates {

    private static final int NIL = -103;
    private final PbwtPhaseIbs ibsHaps;
    private final PhaseData phaseData;
    private final Steps steps;
    private final XRefGT allHaps;
    private final Markers markers;
    private final int nMarkers;
    private final int maxStates;
    private final int minSteps;

    private final IntIntMap hapToLastIbsStep;
    private final PriorityQueue<CompHapSegment> q;

    private final BitArray[] compHaps;

    /**
     * Constructs a new {@code BasicPhaseStates} object from the specified data.
     * @param ibsHaps the IBS haplotype segments
     * @param maxStates the maximum number of composite reference
     * haplotypes that will be constructed
     * @throws IllegalArgumentException if {@code maxStates < 1}
     * @throws NullPointerException if {@code ibsHaps == null}
     */
    public BasicPhaseStates(PbwtPhaseIbs ibsHaps, int maxStates) {
        if (maxStates < 1) {
            throw new IllegalArgumentException(String.valueOf(maxStates));
        }
        this.ibsHaps = ibsHaps;
        this.phaseData = ibsHaps.phaseData();
        this.steps = phaseData.fpd().stage1Steps();
        this.allHaps = ibsHaps.allHaps();
        this.markers = allHaps.markers();
        this.nMarkers = allHaps.nMarkers();
        this.maxStates = maxStates;
        float phaseStep = phaseData.fpd().ibsStep();
        this.minSteps = Math.max(200, (int) Math.ceil(1.0f/phaseStep)); // 200 steps and 1 cM
        this.hapToLastIbsStep = new IntIntMap(maxStates);
        this.q = new PriorityQueue<>(maxStates);

        int nBits = markers.sumHapBits();
        this.compHaps = IntStream.range(0, maxStates)
                .mapToObj(j -> new BitArray(nBits))
                .toArray(BitArray[]::new);
    }

    /**
     * Returns the number of target samples.
     * @return the number of target samples
     */
    public int nTargSamples() {
        return phaseData.fpd().targGT().nSamples();
    }

    /**
     * Returns the number of markers.
     * @return the number of markers
     */
    public int nMarkers() {
        return phaseData.fpd().targGT().nMarkers();
    }

    /**
     * Returns the maximum number of HMM states at a marker.
     * @return the maximum number of HMM states at a marker
     */
    public int maxStates() {
        return maxStates;
    }

    /**
     * Stores the Li and Stephens HMM for the specified target sample in
     * the specified arrays.The {@code nMismatches} parameter is an array of
     * three two-dimensional arrays: {@code nMismatches[0]} stores
     * stores the allele mismatch data between the reference haplotypes and
     * the haplotype composed of homozygous target genotypes,
     * {@code nMismatches[1]} stores the allele mismatch data between
     * the reference haplotypes and the first target haplotype, and
     * {@code nMismatches[2]} stores the allele mismatch data between
     * the reference haplotypes and the first target haplotype.  Each
     * two-dimensional array must have at least {@code mc.nClusters()}
     * rows, and a column for each HMM state. An element of the
     * two-dimensional array is 0 if the target and reference allele match
     * and is 1 otherwise.
     *
     * @param mc the marker clusters
     * @param refAtMissingGT a list of arrays in which HMM state alleles
     * at markers for which one or both target haplotypes have a missing allele
     * @param nMismatches arrays for storing present or absence of mismatches
     * between target and reference alleles
     * @return the number of state alleles at each marker
     *
     * @throws IndexOutOfBoundsException if
     * {@code sample < 0 || sample >= this.nTargSamples()}
     * @throws IndexOutOfBoundsException if {@code nMismatches.length < 3}
     * @throws IndexOutOfBoundsException if there exists {@code j} such that
     * {@code 0 < j && j < 3 && nMismatches[j].length < mc.nClusters()}
     * @throws IndexOutOfBoundsException if there exists {@code j, m} such that
     * {@code (0 < j && j < 3 && 0 < m && m < mc.nClusters())}, and
     * {@code nMismatches[j][m].length} is less than the number of model
     * states at marker {@code m}
     * @throws IndexOutOfBoundsException if {@code missAlleles.get(j)}
     * is less than the number of model states for any {@code j}
     * that indexes the missing genotypes
     * @throws NullPointerException if
     * {@code (samplePhase == null || mc == null || refAtMissingGT == null)}
     * or if any array is {@code null}
     */
    public int ibsStates(MarkerCluster mc, List<int[]> refAtMissingGT,
            byte[][][] nMismatches) {
        int nCompHaps = setCompRefHaps(mc.samplePhase().sample());
        copyData(mc, nCompHaps, refAtMissingGT, nMismatches);
        return nCompHaps;
    }

    /**
     * Stores the Li and Stephens HMM for the specified target sample in
     * the specified arrays.  The number of allele mismatches (0 or 1)
     * between {@code hap1[m]} and {@code hap2[m]} for the {@code j}-th state
     * are stored in {@code nMismatchs[0][m][j]} and
     * {@code nMismatches[1][m][j]} respectively.
     *
     * @param sample the target sample index
     * @param nMismatches an array of two two-dimensional arrays in which the
     * number of allele mismatches with reference haplotypes for the first
     * haplotype and the second haplotype will be stored
     * @return the number of state alleles at each marker
     *
     * @throws IndexOutOfBoundsException if
     * {@code sample < 0 || sample >= this.nTargSamples()}
     * @throws IndexOutOfBoundsException if {@code nMismatches.length < 2}
     * @throws IndexOutOfBoundsException if there exists {@code j} such that
     * {@code (0 < j && j < 2 && nMismatches[j].length < this.nMarkers())}
     * @throws IndexOutOfBoundsException if there exists {@code j, m} such that
     * {@code (0 < j && j < 2 && 0 < m && m < this.nMarkers())}, and
     * {@code nMismatches[j][m].length} is less than the total number of model
     * states
     * @throws NullPointerException if any array is {@code null}
     */
    public int ibsStates(int sample, byte[][][] nMismatches) {
        int nCompHaps = setCompRefHaps(sample);
        copyData(sample, nCompHaps, nMismatches);
        return nCompHaps;
    }

    private int setCompRefHaps(int sample) {
        int h1 = sample << 1;
        int h2 = (h1 | 0b1);
        q.clear();
        hapToLastIbsStep.clear();
        for (int step=0, n=steps.size(); step<n; ++step) {
            int ibsHap1 = ibsHaps.ibsHap(h1, step);
            if (ibsHap1>=0) {
                addIbsHap(ibsHap1, step);
            }
            int ibsHap2 = ibsHaps.ibsHap(h2, step);
            if (ibsHap2>=0) {
                addIbsHap(ibsHap2, step);
            }
        }
        if (q.isEmpty()) {
            fillQWithRandomHaps(sample);
        }
        int nCompHaps = copyFinalRefSegs();
        return nCompHaps;
    }

    private void addIbsHap(int ibsHap, int step) {
        if (hapToLastIbsStep.get(ibsHap, NIL)==NIL) { // hap not currently in q
            updateHeadOfQ();
            if (q.size()==maxStates
                    || (q.isEmpty()==false && step - q.peek().lastIbsStep() >= minSteps)) {
                CompHapSegment head = q.poll();
                int index = head.compHapIndex();
                int prevHap = head.hap();
                int prevStart = head.startMarker();
                int nextStart = steps.start((head.lastIbsStep() + step) >>> 1);
                hapToLastIbsStep.remove(head.hap());
                allHaps.copyTo(prevHap, prevStart, nextStart, compHaps[index]);

                head.updateSegment(ibsHap, nextStart, step);
                q.offer(head);
            }
            else {
                int index = q.size();
                int start = 0;
                q.offer(new CompHapSegment(ibsHap, start, step, index));
            }
        }
        hapToLastIbsStep.put(ibsHap, step);
    }

    private void updateHeadOfQ() {
        CompHapSegment head = q.peek();
        if (head!=null) {
            int lastIbsStep = hapToLastIbsStep.get(head.hap(), NIL);
            while (head.lastIbsStep()!=lastIbsStep) {
                head = q.poll();
                head.setLastIbsStep(lastIbsStep);
                q.offer(head);
                head = q.peek();
                lastIbsStep = hapToLastIbsStep.get(head.hap(), NIL);
            }
        }
    }

    private int copyFinalRefSegs() {
        int nCompHaps = q.size();
        CompHapSegment head = q.poll();
        while (head!=null) {
            int index = head.compHapIndex();
            int hap = head.hap();
            int startMarker = head.startMarker();
            allHaps.copyTo(hap, startMarker, nMarkers, compHaps[index]);
            head = q.poll();
        }
        return nCompHaps;
    }

    private void copyData(MarkerCluster mc, int nCompHaps,
            List<int[]> refAtMissingGT, byte[][][] nMismatches) {
        SamplePhase phase = mc.samplePhase();
        BitArray hap1 = phase.hap1();
        BitArray hap2 = phase.hap2();
        int missIndex = 0;
        int nClusters = mc.nClusters();
        for (int c=0; c<nClusters; ++c) {
            Arrays.fill(nMismatches[0][c], 0, nCompHaps, (byte) 0);
            Arrays.fill(nMismatches[1][c], 0, nCompHaps, (byte) 0);
            Arrays.fill(nMismatches[2][c], 0, nCompHaps, (byte) 0);
            int mStart = mc.clusterStart(c);
            int mEnd = mc.clusterEnd(c);
            if (mc.isMissingGtOrMaskedHet(c)) {
                assert mEnd-mStart==1;
                int[] refAlleles = refAtMissingGT.get(missIndex++);
                for (int j=0; j<nCompHaps; ++j) {
                    refAlleles[j] = markers.allele(compHaps[j], mStart);
                }
            }
            else {
                int bStart = markers.sumHapBits(mStart);
                int bEnd = markers.sumHapBits(mEnd);
                if (hap1.equal(hap2, bStart, bEnd)) {
                    for (int j=0; j<nCompHaps; ++j) {
                        if (hap1.equal(compHaps[j], bStart, bEnd)==false) {
                            nMismatches[0][c][j] = 1;
                            nMismatches[1][c][j] = 1;
                            nMismatches[2][c][j] = 1;
                        }
                    }
                }
                else {
                    // cluster contains a heterozygote genotype
                    for (int j=0; j<nCompHaps; ++j) {
                        if (hap1.equal(compHaps[j], bStart, bEnd)==false) {
                            nMismatches[1][c][j] = 1;
                        }
                        if (hap2.equal(compHaps[j], bStart, bEnd)==false) {
                            nMismatches[2][c][j] = 1;
                        }
                    }
                }
            }
        }
    }

    private int copyData(int sample, int nCompHaps, byte[][][] nMismatches) {
        byte[][] nMismatches1 = nMismatches[0];
        byte[][] nMismatches2 = nMismatches[1];
        int h1 = sample << 1;
        int h2 = h1 | 0b1;
        for (int m=0; m<nMarkers; ++m) {
            int a1 = allHaps.allele(m, h1);
            int a2 = allHaps.allele(m, h2);
            for (int j=0; j<nCompHaps; ++j) {
                int refAllele = markers.allele(compHaps[j], m);
                nMismatches1[m][j] = (refAllele==a1 ? (byte) 0 : (byte) 1);
                nMismatches2[m][j] = (refAllele==a2 ? (byte) 0 : (byte) 1);
            }
        }
        return nCompHaps;
    }

    private void fillQWithRandomHaps(int sample) {
        assert q.isEmpty();
        int nHaps = allHaps.nHaps();
        int nStates = Math.min(nHaps-2, maxStates);
        if (nStates<=0) {
            Utilities.exit("ERROR: there is only one sample");
        }
        else {
            Random rand = new Random(phaseData.seed() + sample);
            int ibsStep = 0;
            int startMarker = 0;
            int compHapIndex = 0;
            for (int j=0; j<nStates; ++j) {
                int h = rand.nextInt(nHaps);
                while ((h>>1)==sample) {
                    h = rand.nextInt(nHaps);
                }
                if (hapToLastIbsStep.get(h, NIL)==NIL) {
                    q.add(new CompHapSegment(h, startMarker, ibsStep, compHapIndex++));
                    hapToLastIbsStep.put(h, startMarker);
                }
            }
        }
    }
}