File: Genome.h

package info (click to toggle)
snap-aligner 1.0.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,988 kB
  • sloc: cpp: 36,500; ansic: 5,239; python: 227; makefile: 85; sh: 28
file content (333 lines) | stat: -rw-r--r-- 10,939 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/*++

Module Name:

    Geonome.h

Abstract:

    Genome class for the SNAP sequencer

Authors:

    Bill Bolosky, August, 2011

Environment:

    User mode service.

Revision History:

    Adapted from Matei Zaharia's Scala implementation.

--*/

#pragma once
#include "Compat.h"
#include "GenericFile.h"
#include "GenericFile_map.h"

//
// We have two different classes to represent a place in a genome and a distance between places in a genome.
// In reality, they're both just 64 bit ints, but the classes are set up to encourage the user to keep
// in mind the difference.  So, a genome location might be something
// like "chromosome 12, base 12345" which would be represented in (0-based) genome coordinates as some 
// 64 bit int that's the base of cheomosome 12 + 12344 (one less because we're 0-based and the nomenclature
// uses 1-based).
// In the non-debug build, GenomeLocation is just defined as an _int64, so that no matter how dumb the compiler
// can't possibly screw it up.  However, in the debug build you get the happy type checking.
//

typedef _int64 GenomeDistance;

#ifdef _DEBUG

class GenomeLocation {
public:
    GenomeLocation(_int64 i_location) : location(i_location) {}
    GenomeLocation(const GenomeLocation &peer) : location(peer.location) {}
    GenomeLocation() {location = -1;}

    inline GenomeLocation operator=(const GenomeLocation &peer) {
        location = peer.location;
        return *this;
    }

    inline GenomeLocation operator=(const _int64 value) {
       location = value;
       return *this;
    }

    inline GenomeLocation operator++() {
        location++;
        return *this;
    }

    inline GenomeLocation operator--() {
        location--;
        return *this;
    }

    // The postfix versions
    inline GenomeLocation operator++(int foo) {
        location++;
        return *this - 1;
    }

    inline GenomeLocation operator--(int foo) {
        location--;
        return *this + 1;
    }

    inline bool operator==(const GenomeLocation &peer) const {
        return location == peer.location;
    }

    inline bool operator>=(const GenomeLocation &peer) const {
        return location >= peer.location;
    }

    inline bool operator>(const GenomeLocation &peer) const {
        return location > peer.location;
    }

    inline bool operator<=(const GenomeLocation &peer) const {
        return location <= peer.location;
    }

    inline bool operator<(const GenomeLocation &peer) const {
        return location < peer.location;
    }

    inline bool operator!=(const GenomeLocation &peer) const {
        return location != peer.location;
    }

    inline GenomeLocation operator+(const GenomeDistance distance) const {
        GenomeLocation retVal(location + distance);
        return retVal;
    }

    inline GenomeDistance operator-(const GenomeLocation &otherLoc) const {
        return location - otherLoc.location;
    }

    inline GenomeLocation operator-(const GenomeDistance distance) const {
        return location - distance;
    }

    inline GenomeLocation operator+=(const GenomeDistance distance)  {
        location += distance;
        return *this;
    }

    inline GenomeLocation operator-=(const GenomeDistance distance) {
        location -= distance;
        return *this;
    }

    _int64         location;
};

inline _int64 GenomeLocationAsInt64(GenomeLocation genomeLocation) {
    return genomeLocation.location;
}

inline unsigned GenomeLocationAsInt32(GenomeLocation genomeLocation) {
    _ASSERT(genomeLocation.location <= 0xffffffff && genomeLocation.location >= 0);
    return (unsigned)genomeLocation.location;
}
#else   // _DEBUG
typedef _int64 GenomeLocation;

inline _int64 GenomeLocationAsInt64(GenomeLocation genomeLocation)
{
    return genomeLocation;
}

inline unsigned GenomeLocationAsInt32(GenomeLocation genomeLocation) {
    _ASSERT(genomeLocation <= 0xffffffff && genomeLocation >= 0);    // One might wonder about the value of an _ASSERT in code that's only non-_DEBUG.  Think of it as an uppity comment.  :-)
    return (unsigned)genomeLocation;
}

#endif // _DEBUG

typedef _int64 GenomeDistance;

extern GenomeLocation InvalidGenomeLocation;

class Genome {
public:
        //
        // Methods for building a genome.
        //

        //
        // Create a new genome.  It's got a limit on the number of bases, but there's no need to
        // store that many, it's just an upper bound.  It will, of course, use memory proportional
        // to the bound.
        //
        Genome(
            GenomeDistance          i_maxBases,
            GenomeDistance          nBasesStored,
            unsigned                i_chromosomePadding,
            unsigned                maxContigs,
            bool                    i_areALTContigsMarked);

		void startContig(
			const char          *contigName);

		void markContigALT(
			const char			*contigName);

        void addData(
            const char          *data);

        void addData(const char *data, GenomeDistance len);

        const unsigned getChromosomePadding() const {return chromosomePadding;}

        ~Genome();

        //
        // Methods for loading a genome from a file, and saving one to a file.  When you save and
        // then load a genome the space used is reduced from the max that was reserved when it was
        // first created to the amount actually used (rounded up to a page size).  However, saved
        // and loaded genomes can't be added to, they're read only.
        //
        // minOffset and length are used to read in only a part of a whole genome.
        //
        static const Genome *loadFromFile(
								const char *fileName, 
								unsigned chromosomePadding, 
								GenomeLocation i_minLocation = 0, 
								GenomeDistance length = 0, 
								bool map = false);
                                                                  // This loads from a genome save
                                                                  // file, not a FASTA file.  Use
                                                                  // FASTA.h for FASTA loads.

        static bool getSizeFromFile(const char *fileName, GenomeDistance *nBases, unsigned *nContigs, bool *hasAltContigsMarked);

        bool saveToFile(const char *fileName) const;

        //
        // Methods to read the genome.
        //
		inline const char *getSubstring(GenomeLocation location, GenomeDistance lengthNeeded) const {
			if (location > nBases || location + lengthNeeded > nBases + N_PADDING) {
				// The first part of the test is for the unsigned version of a negative offset.
				return NULL;
			}

			// If we're in the padding, then the base will be an n, and we can't short circuit.  Recall that we use lower case n in the reference so it won't match with N in the read.
			if (lengthNeeded <= chromosomePadding && bases[GenomeLocationAsInt64(location)] != 'n') {
				return bases + (location - minLocation);
			}

			_ASSERT(location >= minLocation && location + lengthNeeded <= maxLocation + N_PADDING); // If the caller asks for a genome slice, it's only legal to look within it.

			if (lengthNeeded == 0) {
				return bases + (location - minLocation);
			}

			const Contig *contig = getContigAtLocation(location);
			if (NULL == contig) {
				return NULL;
			}

			_ASSERT(contig->beginningLocation <= location && contig->beginningLocation + contig->length >= location);
			if (contig->beginningLocation + contig->length <= location + lengthNeeded) {
				return NULL;
			}

			return bases + (location - minLocation);
		}

        inline GenomeDistance getCountOfBases() const {return nBases;}

        bool getLocationOfContig(const char *contigName, GenomeLocation *location, int* index = NULL) const;

        inline void prefetchData(GenomeLocation genomeLocation) const {
            _mm_prefetch(bases + GenomeLocationAsInt64(genomeLocation), _MM_HINT_T2);
            _mm_prefetch(bases + GenomeLocationAsInt64(genomeLocation) + 64, _MM_HINT_T2);
        }

        struct Contig {
            Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), isALT(false) {}
            GenomeLocation     beginningLocation;
            GenomeDistance     length;
            unsigned           nameLength;
            char              *name;
			bool			   isALT;
        };

        inline const Contig *getContigs() const { return contigs; }

        inline int getNumContigs() const { return nContigs; }

		int getNumALTContigs() const;

        const Contig *getContigAtLocation(GenomeLocation location) const;
        const Contig *getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const;
        const Contig *getNextContigAfterLocation(GenomeLocation location) const;
        int getContigNumAtLocation(GenomeLocation location) const;    // Returns the contig number, which runs from 0 .. getNumContigs() - 1.
		inline bool isGenomeContigAware() 
		{
			return isContigAware;
		}

        //
        // These are only public so creators of new genomes (i.e., FASTA) can use them.
        //
        void    fillInContigLengths();
        void    sortContigsByName();

        inline bool isGenomeLocationALT(GenomeLocation location) const {
            return location >= genomeLocationOfFirstALTContig;
        }

        char* genomeLocationInStringForm(GenomeLocation location, char *buffer, size_t bufferSize) const;

private:

        static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it

        //
        // The actual genome.
        char                *bases;       // Will point to offset N_PADDING in an array of nBases + 2 * N_PADDING
        GenomeDistance       nBases;
        GenomeLocation       maxBases;

        GenomeLocation       minLocation;
        GenomeLocation       maxLocation;

        //
        // A genome is made up of a bunch of contigs, typically chromosomes.  Contigs have names,
        // which are stored here.
        //
        int          nContigs;
        int          maxContigs;

		bool		areALTContigsMarked;

        Contig      *contigs;    // This is always in order (it's not possible to express it otherwise in FASTA).

        Contig      *contigsByName;
 
        static bool openFileAndGetSizes(const char *filename, GenericFile **file, GenomeDistance *nBases, unsigned *nContigs, bool map, bool *hasALTContigsMarked);

        const unsigned chromosomePadding;

		GenericFile_map *mappedFile;

		bool isContigAware;

        GenomeLocation  genomeLocationOfFirstALTContig;
};

GenomeDistance DistanceBetweenGenomeLocations(GenomeLocation locationA, GenomeLocation locationB);
inline bool genomeLocationIsWithin(GenomeLocation locationA, GenomeLocation locationB, GenomeDistance distance)
{
    return DistanceBetweenGenomeLocations(locationA, locationB) <= distance;
}