File: BaseInformationRecords.proto

package info (click to toggle)
libgoby-java 3.3.1%2Bdfsg2-11
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 58,108 kB
  • sloc: java: 78,105; cpp: 5,011; xml: 3,170; python: 2,108; sh: 1,575; ansic: 277; makefile: 114
file content (206 lines) | stat: -rwxr-xr-x 8,448 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
syntax = "proto2";

package org.campagnelab.dl.varanalysis.protobuf;

option java_package = "org.campagnelab.dl.varanalysis.protobuf";

option optimize_for = SPEED;

/**
One BaseInformation message is written per position in the genome that we observe. These records are written to
collections (BaseInformationCollection) and packed in a binary file.
*/


/** A collection of BaseInformation records. */
message BaseInformationCollection {
    /** The BaseInformation records held by this collection. */
    repeated BaseInformation records = 1;
}

/**
Information recorded for one observed position of the genome: the location of the base in the reference,
optional details about a simulated mutation, per-sample data, and true genotype labels.
*/
message BaseInformation {
    /** Index of the reference sequence where this base is observed in the genome. */
    required uint32 reference_index = 1;
    /** Identifier for the reference sequence (typically chromosome name). */
    optional string reference_id = 10;
    /** The position in the reference sequence of the genome. */
    required uint32 position = 2;

    /** Whether this base was mutated in simulation. */
    optional bool mutated = 3;

    /** The base we have introduced in the simulation. Always length 1. */
    optional string mutatedBase = 4;
    /** The index of the counts array where the mutated element was introduced. */
    optional uint32 indexOfMutatedBase = 7;

    /** The frequency with which this base was mutated by the mutator. */
    optional float frequencyOfMutation = 6;
    /**
    The base present at the position in the genome/reference sequence.
    */
    optional string referenceBase = 5;

    /** A set of samples (i.e., germline, somatic). */
    repeated SampleInfo samples = 8;

    /** The genotype at this base position, in the format A/B where A and B are alleles (single bases or indels).
    Homozygotes are written, for instance, A/A, while A/T-- is a heterozygote genotype with one allele A and
    another allele T--. */
    optional string trueGenotype = 9;

    /** This should generally correspond with the reference base, but in the case of some indels will be different.
    E.g.: A (SNP or no variant), A--- (insertion) or ACAG (deletion). */
    optional string trueFrom = 11;

    /** Genomic context centered on the base: bases around the base. If 11 bases are stored, 5 bases of context are
    stored on either side. */
    optional string genomicSequenceContext = 15;

    /** The gobyGenotypeIndex (see CountInfo.gobyGenotypeIndex) corresponding to each called allele of the true
    genotype. */
    repeated uint32 calledCountIndices = 16;
}

/**
Data about one sample, including the count info for each genotype. There are usually two SampleInfo messages,
one germline and one somatic.
*/
message SampleInfo {
    /** Count info for the genotypes observed at the position in this sample (one CountInfo per genotype). */
    repeated CountInfo counts = 1;

    /** Flag to specify whether the relevant sample is from a tumor (i.e., somatic sample). */
    optional bool isTumor = 2;

    /** String encoding a nicely formatted version of the counts. */
    optional string formattedCounts = 3;

    /** Indicates whether this sample has a non-reference allele. Only filled in
     when true labels have been added (using add-true-genotypes.sh from variationanalysis).
     */
    optional bool isVariant = 4;
    /** The genotype at this base position, in this sample, in the format A/B where A and B are alleles
        (single bases or indels). Homozygotes are written, for instance, A/A, while A/T-- is a heterozygote
        genotype with one allele A and another allele T--.
        */
    optional string trueGenotype = 5;

    /** The genotype before segment post-processing. Used for debugging post-processing of segment construction. */
    optional string prePostProcessingGenotype=100;

    /** The gobyGenotypeIndex (see CountInfo.gobyGenotypeIndex) corresponding to each called allele of the true
        genotype. */
    repeated uint32 calledCountIndices = 9;


}

/**
Counts and per-read statistics for one genotype observed in a sample. Strand-specific counts are stored directly;
most other statistics are stored as lists of NumberWithFrequency entries.
*/
message CountInfo {
    /**
      True iff the genotype sequence observed in the sample matches the reference sequence. */
    required bool matchesReference = 1;
    /**
      Sequence of the genotype observed in the reference genome. Typically a single base, or an indel sequence. */
    required string fromSequence = 2;

    /**
      Sequence of the genotype observed in the sample. Typically a single base, or an indel sequence. */
    required string toSequence = 3;

    /** The number of times the genotype is observed in the sample in a read matching the forward strand. */
    required uint32 genotypeCountForwardStrand = 4;

    /** The number of times the genotype is observed in the sample in a read matching the reverse strand. */
    required uint32 genotypeCountReverseStrand = 5;

    /**
    Indicates whether this genotype is an indel. */
    optional bool isIndel = 15;

    /** The quality scores of all bases matching this genotype on the forward strand. Phred scale. */
    repeated NumberWithFrequency qualityScoresForwardStrand = 16;

    /** The quality scores of all bases matching this genotype on the reverse strand. Phred scale. */
    repeated NumberWithFrequency qualityScoresReverseStrand = 17;

    /** The index in the read of all bases matching this genotype on the forward strand. */
    repeated NumberWithFrequency readIndicesForwardStrand = 18;

    /** The index in the read of all bases matching this genotype on the reverse strand. */
    repeated NumberWithFrequency readIndicesReverseStrand = 19;

    /** Mapping qualities for reads on the forward strand with this genotype. */
    repeated NumberWithFrequency readMappingQualityForwardStrand = 21;

    /** Mapping qualities for reads on the reverse strand with this genotype. */
    repeated NumberWithFrequency readMappingQualityReverseStrand = 22;
    /**
    Number of variations found in the reads that support this genotype. */
    repeated NumberWithFrequency numVariationsInReads = 23;
    /**
    Insert sizes found in reads that support this genotype. */
    repeated NumberWithFrequency insertSizes = 24;

    /**
    Target aligned lengths for aligned reads that support this genotype. */
    repeated NumberWithFrequency targetAlignedLengths = 25;

    /**
    Query aligned lengths for aligned reads that support this genotype. */
    repeated NumberWithFrequency queryAlignedLengths = 26;

    /**
    Query positions for aligned reads that support this genotype. */
    repeated NumberWithFrequency queryPositions = 30;

    /**
    Pair flags for aligned reads that support this genotype. */
    repeated NumberWithFrequency pairFlags = 27;

    /** Distances to read variations, forward strand.
    NOTE(review): the previous comment was a copy of the readIndicesForwardStrand description. Presumably these are
    distances between this genotype's base and other variations in the supporting forward-strand reads; confirm
    the exact definition against the code that fills this field. */
    repeated NumberWithFrequency distancesToReadVariationsForwardStrand = 28;

    /** Distances to read variations, reverse strand.
    NOTE(review): the previous comment was a copy of the readIndicesReverseStrand description. Presumably these are
    distances between this genotype's base and other variations in the supporting reverse-strand reads; confirm
    the exact definition against the code that fills this field. */
    repeated NumberWithFrequency distancesToReadVariationsReverseStrand = 29;

    /** Number of bases between the base where the variation is observed and the start of the read. */
    repeated NumberWithFrequency distanceToStartOfRead = 31;
    /** Number of bases between the base where the variation is observed and the end of the read. */
    repeated NumberWithFrequency distanceToEndOfRead = 32;

    /** True iff the genotype is called in this sample. In other words, this field is true when the genotype caller believes
     that this genotype is present in the sample. */
    optional bool isCalled = 20;
    /**
    The index of the genotype in the sample count info array. Used when counts are sorted and we need to restore
    the original genotype order.
    */
    optional uint32 gobyGenotypeIndex = 40;

    /** The offset used when creating the record copy during post-processing. Used for debugging post-processing of segment construction. */
    optional uint32 offset=101;
}

/**
A number paired with a frequency. CountInfo uses lists of these entries to store per-read values such as
quality scores, read indices and mapping qualities.
*/
message NumberWithFrequency {
    /** The value being recorded (signed). */
    required int32 number = 1;
    /** Frequency associated with the number. NOTE(review): presumably the number of times the value was
    observed; confirm against the code that writes these entries. */
    required uint32 frequency = 2;
}

/**
Output record for one position, carrying an isSomatic flag. The remaining fields have the same names, types and
field numbers as the corresponding fields of BaseInformation.
*/
message SomaticOutput {

    /** NOTE(review): presumably true when the position is reported as somatic; the producer of this flag is not
    visible in this file, so confirm its exact meaning. */
    required bool isSomatic = 200;
    /** Index of the reference sequence where this base is observed in the genome. */
    required uint32 reference_index = 1;
    /** Identifier for the reference sequence (typically chromosome name). */
    optional string reference_id = 10;
    /** The position in the reference sequence of the genome. */
    required uint32 position = 2;

    /** The base we have introduced in the simulation. Always length 1. */
    optional string mutatedBase = 4;

    /** The index of the counts array where the mutated element was introduced. */
    optional uint32 indexOfMutatedBase = 7;

    /** The frequency with which this base was mutated by the mutator. */
    optional float frequencyOfMutation = 6;
    /**
    The base present at the position in the genome/reference sequence.
    */
    optional string referenceBase = 5;

}