/** * The MIT License (MIT) * Copyright (c) 2016-2017 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in * the Software without restriction, including without limitation the rights to * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of * the Software, and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; import org.genomicsdb.exception.GenomicsDBException; import org.genomicsdb.importer.GenomicsDBImporter; import org.genomicsdb.importer.model.SampleInfo; import org.json.simple.parser.ContainerFactory; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; import java.io.FileReader; import java.io.IOException; import java.util.*; /** * Wrapper class to maintain stream state for the test driver program * The class can maintains * (a) VCFHeader * (b) CloseableTribbleIterator * (c) mNextVC the next VariantContext object to be sent to GenomicsDBImporter iff the buffer * interface of GenomicsDBImporter is used (addBufferStream()) and not the * Iterator interface (addSortedVariantContextIterator()) */ public final class TestBufferStreamGenomicsDBImporter { private static class VCFFileStreamInfo { public int mStreamIdx = -1; public VCFHeader mVCFHeader = null; public Iterator mIterator = null; public VariantContext mNextVC = null; /** * Constructor * @param fileName path to VCF file */ public VCFFileStreamInfo(final String fileName, final String loaderJSONFile, final int rank, final boolean useMultiChromosomeIterator) throws IOException, ParseException { AbstractFeatureReader reader = AbstractFeatureReader.getFeatureReader(fileName, new VCFCodec(), false); mVCFHeader = (VCFHeader)(reader.getHeader()); if(useMultiChromosomeIterator) mIterator = GenomicsDBImporter.columnPartitionIterator(reader, loaderJSONFile, rank); else mIterator = reader.iterator(); } } /** * Factory object to maintain order of keys in simple JSON parsing - use LinkedHashMap */ private static class LinkedHashFactory implements ContainerFactory { @Override public List creatArrayContainer() { return new ArrayList(); } @Override public Map createObjectContainer() { return new LinkedHashMap(); } } /** * Static function that reads sample names from the vcfHeader and adds entries to the map. * The function assumes that the samples will be assigned row indexes beginning at rowIdx * and that the sample names specified in the header * are globally unique (across all streams/files) * * @param sampleIndexToInfo map: key=sampleIndex in vcfHeader: value=SampleInfo * @param vcfHeader VCF header * @param rowIdx Starting row index from which to assign * @return rowIdx+#samples in the header */ static long initializeSampleInfoMapFromHeader(Map sampleIndexToInfo, final VCFHeader vcfHeader, final long rowIdx) { final List headerSampleNames = vcfHeader.getGenotypeSamples(); final int numSamplesInHeader = headerSampleNames.size(); for (int i = 0; i < numSamplesInHeader; ++i) sampleIndexToInfo.put(i, new SampleInfo(headerSampleNames.get(i), rowIdx + i)); return rowIdx + numSamplesInHeader; } /** * Sample driver code for testing Java VariantContext write API for GenomicsDB * The code shows two ways of using the API * (a) Iterator * (b) Directly adding VariantContext objects * If "-iterators" is passed as the second argument, method (a) is used. */ public static void main(final String[] args) throws IOException, GenomicsDBException, ParseException { if(args.length < 2) { System.err.println("For loading: [-iterators] " +" [bufferCapacity rank lbRowIdx ubRowIdx useMultiChromosomeIterator]"); System.exit(-1); } int argsLoaderFileIdx = 0; if(args[0].equals("-iterators")) argsLoaderFileIdx = 1; //Buffer capacity long bufferCapacity = (args.length >= argsLoaderFileIdx+3) ? Integer.parseInt(args[argsLoaderFileIdx+2]) : 1024; //Specify rank (or partition idx) of this process int rank = (args.length >= argsLoaderFileIdx+4) ? Integer.parseInt(args[argsLoaderFileIdx+3]) : 0; //Specify smallest row idx from which to start loading. // This is useful for incremental loading into existing array long lbRowIdx = (args.length >= argsLoaderFileIdx+5) ? Long.parseLong(args[argsLoaderFileIdx+4]) : 0; //Specify largest row idx up to which loading should be performed - for completeness long ubRowIdx = (args.length >= argsLoaderFileIdx+6) ? Long.parseLong(args[argsLoaderFileIdx+5]) : Long.MAX_VALUE-1; //Boolean to use MultipleChromosomeIterator boolean useMultiChromosomeIterator = (args.length >= argsLoaderFileIdx + 7) && Boolean.parseBoolean(args[argsLoaderFileIdx + 6]); // first arg String loaderJSONFile = args[argsLoaderFileIdx]; GenomicsDBImporter loader = new GenomicsDBImporter(loaderJSONFile, rank); // - useful for the driver only //JSON file that contains "stream_name": "vcf_file_path" entries FileReader mappingReader = new FileReader(args[argsLoaderFileIdx+1]); JSONParser parser = new JSONParser(); LinkedHashMap streamNameToFileName = (LinkedHashMap)parser.parse(mappingReader, new LinkedHashFactory()); ArrayList streamInfoVec = new ArrayList(); long rowIdx = 0; for(Object currObj : streamNameToFileName.entrySet()) { Map.Entry entry = (Map.Entry)currObj; VCFFileStreamInfo currInfo = new VCFFileStreamInfo(entry.getValue(), loaderJSONFile, rank, useMultiChromosomeIterator); /** The following 2 lines are not mandatory - use initializeSampleInfoMapFromHeader() * iff you know for sure that sample names in the VCF header are globally unique * across all streams/files. If not, you have 2 options: * (a) specify your own mapping from sample index in the header to SampleInfo object * (unique_name, rowIdx) OR * (b) specify the mapping in the callset_mapping_file (JSON) and pass null to * addSortedVariantContextIterator() */ LinkedHashMap sampleIndexToInfo = new LinkedHashMap(); rowIdx = initializeSampleInfoMapFromHeader(sampleIndexToInfo, currInfo.mVCFHeader, rowIdx); int streamIdx = -1; if(args[0].equals("-iterators")) streamIdx = loader.addSortedVariantContextIterator(entry.getKey(), currInfo.mVCFHeader, currInfo.mIterator, bufferCapacity, VariantContextWriterBuilder.OutputType.BCF_STREAM, sampleIndexToInfo); //pass sorted VC iterators else //use buffers - VCs will be provided by caller streamIdx = loader.addBufferStream(entry.getKey(), currInfo.mVCFHeader, bufferCapacity, VariantContextWriterBuilder.OutputType.BCF_STREAM, null, sampleIndexToInfo); currInfo.mStreamIdx = streamIdx; streamInfoVec.add(currInfo); } if(args[0].equals("-iterators")) { //Much simpler interface if using Iterator loader.doSingleImport(); assert loader.isDone(); } else { //Must be called after all iterators/streams added - no more iterators/streams // can be added once this function is called loader.setupGenomicsDBImporter(); //Counts and tracks buffer streams for which new data must be supplied //Initialized to all the buffer streams int numExhaustedBufferStreams = streamInfoVec.size(); int[] exhaustedBufferStreamIdxs = new int[numExhaustedBufferStreams]; for(int i=0;i