File: CompressBref3Writer.java

package info (click to toggle)
beagle 220722-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 9,644 kB
  • sloc: java: 17,045; sh: 55; makefile: 11
file content (130 lines) | stat: -rw-r--r-- 4,397 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
 * Copyright (C) 2014-2021 Brian L. Browning
 *
 * This file is part of Beagle
 *
 * Beagle is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Beagle is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package bref;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import vcf.RefGTRec;
import vcf.Samples;

/**
 * <p>Class {@code CompressBref3Writer} writes phased, non-missing genotypes
 * to a binary reference format v3 (bref) file, routing eligible records
 * through a {@code SeqCoder3} before they are written.
 * </p>
 * <p>Records are buffered until the sequence coder cannot accept another
 * record or until {@code close()} is invoked.  The {@code close()} method
 * must therefore be called after the last invocation of the
 * {@code write()} method in order to ensure that all buffered data is
 * written to the output binary reference file.
 * </p>
 * <p>Instances of class {@code CompressBref3Writer} are not thread-safe.</p>
 *
 * @author Brian L. Browning {@code <browning@uw.edu>}
 */
public class CompressBref3Writer implements BrefWriter {

    /** Records with more alleles than this are never sequence coded. */
    private final int maxNAlleles;

    /**
     * Pending records in input order.  A {@code null} element is a
     * placeholder for a record that was handed to {@code seqCoder}.
     */
    private final List<RefGTRec> buffer;

    /** Accumulates the records selected for sequence coding. */
    private final SeqCoder3 seqCoder;

    /** Delegate that performs the actual output. */
    private final AsIsBref3Writer brefWriter;

    /**
     * Minimum total count of non-major alleles that a record must have
     * in order to be sequence coded.
     */
    private final int nonMajThreshold;

    /**
     * Constructs a new {@code CompressBref3Writer} for the specified data.
     * The Java virtual machine will exit with an error message if an I/O
     * error occurs.
     *
     * @param program the name of the program which is creating the
     * binary reference file
     * @param samples the list of samples whose genotype data will
     * be written in binary reference format
     * @param maxNSeq the maximum number of distinct allele sequences
     * in a compressed block
     * @param brefFile name of the output binary reference file or
     * {@code null} if the output should be directed to standard output
     * @throws IllegalArgumentException if
     * {@code maxNSeq < 0 || maxNSeq >= Character.MAX_VALUE}
     * @throws NullPointerException if {@code program == null || samples == null}
     */
    public CompressBref3Writer(String program, Samples samples, int maxNSeq,
            File brefFile) {
        this.maxNAlleles = SeqCoder3.MAX_NALLELES;
        this.buffer = new ArrayList<>(500);
        // The sequence coder is created before the output writer, as in
        // the original statement order.
        this.seqCoder = new SeqCoder3(samples, maxNSeq);
        this.brefWriter = new AsIsBref3Writer(program, samples, brefFile);
        this.nonMajThreshold = 1 + (maxNSeq / 4);
    }

    /**
     * Returns the samples reported by the underlying writer.
     *
     * @return the list of samples
     */
    @Override
    public Samples samples() {
        return brefWriter.samples();
    }

    /**
     * Buffers the specified record for output.  The record is first
     * converted to an allele-coded record if it is not already one.
     * A record that qualifies for sequence coding is added to the sequence
     * coder and represented in the buffer by a {@code null} placeholder;
     * any other record is buffered unchanged.  If the sequence coder
     * rejects the record, the buffer is flushed and the record is
     * added again.
     *
     * @param rec the record to be written
     * @throws NullPointerException if {@code rec == null}
     */
    @Override
    public void write(RefGTRec rec) {
        RefGTRec coded = rec.isAlleleCoded()
                ? rec
                : RefGTRec.alleleCodedInstance(rec);
        if (buffer.size() == Integer.MAX_VALUE) {
            flushBuffer();
        }
        if (!convertToSeqCoding(coded)) {
            buffer.add(coded);
            return;
        }
        if (!seqCoder.add(coded)) {
            // The coder refused the record: write out everything that is
            // pending and retry (presumably the coder is emptied by the
            // flush, so the retry is expected to succeed).
            flushBuffer();
            boolean added = seqCoder.add(coded);
            assert added;
        }
        buffer.add(null);
    }

    /**
     * Returns {@code true} if the specified allele-coded record should be
     * passed to the sequence coder.  A record qualifies if its marker has
     * at most {@code maxNAlleles} alleles and the summed count of all
     * alleles other than the major allele is at least
     * {@code nonMajThreshold}.
     *
     * @param rec an allele-coded record
     * @return {@code true} if the record should be sequence coded
     */
    private boolean convertToSeqCoding(RefGTRec rec) {
        assert rec.isAlleleCoded();
        int nAlleles = rec.marker().nAlleles();
        if (nAlleles > maxNAlleles) {
            return false;
        }
        int major = rec.majorAllele();
        int nonMajorTotal = 0;
        for (int allele = 0; allele < nAlleles; ++allele) {
            if (allele == major) {
                continue;
            }
            nonMajorTotal += rec.alleleCount(allele);
        }
        return nonMajorTotal >= nonMajThreshold;
    }

    /**
     * Writes all buffered records to the underlying writer in the order
     * in which they were received and then empties the buffer.  Each
     * {@code null} placeholder is replaced by the next record of the list
     * obtained from the sequence coder.
     */
    private void flushBuffer() {
        List<RefGTRec> compressed = seqCoder.getCompressedList();
        int next = 0;
        for (RefGTRec pending : buffer) {
            if (pending == null) {
                brefWriter.write(compressed.get(next));
                ++next;
            }
            else {
                brefWriter.write(pending);
            }
        }
        // every compressed record must have been matched to a placeholder
        assert next == compressed.size();
        buffer.clear();
    }

    /**
     * Writes any buffered records and then closes the underlying writer.
     */
    @Override
    public void close() {
        flushBuffer();
        brefWriter.close();
    }
}