/**
Protocol Buffers schema for the org.campagnelab.dl.varanalysis.protobuf package: per-position base information
records, their per-sample genotype counts, and somatic output records.
*/
/** This schema uses proto2: the messages below rely on explicit `required` and `optional` field labels. */
syntax = "proto2";
/** Protobuf package that namespaces every message declared in this file. */
package org.campagnelab.dl.varanalysis.protobuf;
/** Java package of the generated classes; identical to the protobuf package name. */
option java_package = "org.campagnelab.dl.varanalysis.protobuf";
/** Ask the code generator to favor serialization/parsing speed over generated code size. */
option optimize_for = SPEED;
// A collection of BaseInformation records. One record is produced per
// observed position in the genome; records are written to collections such as
// this one and packed in a binary file.
message BaseInformationCollection {
  // The per-position records held by this collection.
  repeated BaseInformation records = 1;
}
// Describes a single base position of the genome together with the samples
// observed at that position.
message BaseInformation {
  // Index of the reference sequence where this base is observed in the
  // genome.
  required uint32 reference_index = 1;

  // Identifier of the reference sequence (typically the chromosome name).
  optional string reference_id = 10;

  // Position of the base within the reference sequence.
  required uint32 position = 2;

  // Whether this base was mutated in simulation.
  optional bool mutated = 3;

  // Base introduced by the simulation. Always of length 1.
  optional string mutatedBase = 4;

  // Index, in the counts array, of the element where the mutation was
  // introduced.
  optional uint32 indexOfMutatedBase = 7;

  // Frequency with which the mutator mutated this base.
  optional float frequencyOfMutation = 6;

  // Base present at this position in the genome/reference sequence.
  optional string referenceBase = 5;

  // The set of samples (i.e. germline, somatic) observed at this position.
  repeated SampleInfo samples = 8;

  // Genotype at this base position, formatted as A/B where A and B are
  // alleles (single bases or indels). A homozygote is written, for instance,
  // A/A, while A/T-- is a heterozygote genotype with one allele A and another
  // allele T--.
  optional string trueGenotype = 9;

  // Generally corresponds to the reference base, but differs for some indels.
  // E.g.: A (SNP or no variant), A--- (insertion) or ACAG (deletion).
  optional string trueFrom = 11;

  // Genomic context centered on the base, i.e. the bases around it. When 11
  // bases are stored, 5 bases of context are stored on either side.
  optional string genomicSequenceContext = 15;

  // gobyGenotypeIndex corresponding to each called allele of the true
  // genotype.
  repeated uint32 calledCountIndices = 16;
}
// Data about one sample, including the count information for each genotype.
// There are usually two SampleInfo messages per position: one germline and
// one somatic.
message SampleInfo {
  // Count information for the bases/genotypes observed at the position.
  repeated CountInfo counts = 1;

  // Flag telling whether this sample comes from a tumor (i.e. is the somatic
  // sample).
  optional bool isTumor = 2;

  // String encoding a nicely formatted version of the counts.
  optional string formattedCounts = 3;

  // Indicates whether this sample has a non-reference allele. Only filled in
  // once true labels have been added (using add-true-genotypes.sh from
  // variationanalysis).
  optional bool isVariant = 4;

  // Genotype at this base position, in this sample, formatted as A/B where A
  // and B are alleles (single bases or indels). A homozygote is written, for
  // instance, A/A, while A/T-- is a heterozygote genotype with one allele A
  // and another allele T--.
  optional string trueGenotype = 5;

  // Genotype before segment post-processing. Used to debug the
  // post-processing step of segment construction.
  optional string prePostProcessingGenotype = 100;

  // gobyGenotypeIndex corresponding to each called allele of the true
  // genotype.
  repeated uint32 calledCountIndices = 9;
}
// Counts and per-read statistics for one genotype observed in a sample.
message CountInfo {
  // True iff the genotype sequence observed in the sample matches the
  // reference sequence.
  required bool matchesReference = 1;

  // Sequence of the genotype as observed in the reference genome. Typically a
  // single base, or an indel sequence.
  required string fromSequence = 2;

  // Sequence of the genotype as observed in the sample. Typically a single
  // base, or an indel sequence.
  required string toSequence = 3;

  // Number of times the genotype is observed in the sample in a read matching
  // the forward strand.
  required uint32 genotypeCountForwardStrand = 4;

  // Number of times the genotype is observed in the sample in a read matching
  // the reverse strand.
  required uint32 genotypeCountReverseStrand = 5;

  // Indicates whether this genotype is an indel.
  optional bool isIndel = 15;

  // Quality scores (Phred scale) of all bases matching this genotype on the
  // forward strand.
  repeated NumberWithFrequency qualityScoresForwardStrand = 16;

  // Quality scores (Phred scale) of all bases matching this genotype on the
  // reverse strand.
  repeated NumberWithFrequency qualityScoresReverseStrand = 17;

  // Index in the read of all bases matching this genotype on the forward
  // strand.
  repeated NumberWithFrequency readIndicesForwardStrand = 18;

  // Index in the read of all bases matching this genotype on the reverse
  // strand.
  repeated NumberWithFrequency readIndicesReverseStrand = 19;

  // Mapping qualities of the forward-strand reads carrying this genotype.
  repeated NumberWithFrequency readMappingQualityForwardStrand = 21;

  // Mapping qualities of the reverse-strand reads carrying this genotype.
  repeated NumberWithFrequency readMappingQualityReverseStrand = 22;

  // Number of variations found in the reads that support this genotype.
  repeated NumberWithFrequency numVariationsInReads = 23;

  // Insert sizes found in the reads that support this genotype.
  repeated NumberWithFrequency insertSizes = 24;

  // Target aligned lengths of the aligned reads that support this genotype.
  repeated NumberWithFrequency targetAlignedLengths = 25;

  // Query aligned lengths of the aligned reads that support this genotype.
  repeated NumberWithFrequency queryAlignedLengths = 26;

  // Query positions of the aligned reads that support this genotype.
  repeated NumberWithFrequency queryPositions = 30;

  // Pair flags of the aligned reads that support this genotype.
  repeated NumberWithFrequency pairFlags = 27;

  // NOTE(review): the previous comment here duplicated the description of
  // readIndicesForwardStrand. Judging by the field name, this presumably holds
  // distances to the variations found in forward-strand reads -- confirm.
  repeated NumberWithFrequency distancesToReadVariationsForwardStrand = 28;

  // NOTE(review): the previous comment here duplicated the description of
  // readIndicesReverseStrand. Judging by the field name, this presumably holds
  // distances to the variations found in reverse-strand reads -- confirm.
  repeated NumberWithFrequency distancesToReadVariationsReverseStrand = 29;

  // Number of bases between the base where the variation is observed and the
  // start of the read.
  repeated NumberWithFrequency distanceToStartOfRead = 31;

  // Number of bases between the base where the variation is observed and the
  // end of the read.
  repeated NumberWithFrequency distanceToEndOfRead = 32;

  // True iff the genotype is called in this sample, i.e. the genotype caller
  // believes that this genotype is present in the sample.
  optional bool isCalled = 20;

  // Index of the genotype in the sample count info array. Used to restore the
  // original genotype order after the counts have been sorted.
  optional uint32 gobyGenotypeIndex = 40;

  // Offset used when creating the record copy during post-processing. Used to
  // debug the post-processing step of segment construction.
  optional uint32 offset = 101;
}
// Pairs a number with a frequency. CountInfo uses repeated lists of this
// message for its per-read statistics (quality scores, read indices, ...).
message NumberWithFrequency {
  // The number itself.
  required int32 number = 1;

  // NOTE(review): presumably how many times `number` was observed -- confirm.
  required uint32 frequency = 2;
}
// Somatic output for one base position. Apart from isSomatic, its fields use
// the same names and field numbers as the matching BaseInformation fields.
message SomaticOutput {
  // NOTE(review): undocumented in the original; presumably tells whether the
  // position is considered somatic -- confirm.
  required bool isSomatic = 200;

  // Index of the reference sequence where this base is observed in the
  // genome.
  required uint32 reference_index = 1;

  // Identifier of the reference sequence (typically the chromosome name).
  optional string reference_id = 10;

  // Position of the base within the reference sequence.
  required uint32 position = 2;

  // Base introduced by the simulation. Always of length 1.
  optional string mutatedBase = 4;

  // Index, in the counts array, of the element where the mutation was
  // introduced.
  optional uint32 indexOfMutatedBase = 7;

  // Frequency with which the mutator mutated this base.
  optional float frequencyOfMutation = 6;

  // Base present at this position in the genome/reference sequence.
  optional string referenceBase = 5;
}