File: IndexBase.h

package info (click to toggle)
libstatgen 1.0.15-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,588 kB
  • sloc: cpp: 49,624; ansic: 1,408; makefile: 320; sh: 60
file content (165 lines) | stat: -rw-r--r-- 4,967 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/*
 *  Copyright (C) 2011-2012  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __INDEX_BASE_H__
#define __INDEX_BASE_H__

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <map>
#include <new>
#include <vector>

#include "InputFile.h"
#include "StatGenStatus.h"


// A chunk described by the offsets of its start and its end.
// Deliberately has no constructors: the members are left unset until the
// caller assigns them.
class Chunk
{
public:
    // Largest value a chunk offset can take (all 64 bits set).
    static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL;

    uint64_t chunk_beg; // offset of the start of the chunk
    uint64_t chunk_end; // offset of the end of the chunk

    // Orders chunks by their start offset only; chunk_end plays no part,
    // so two chunks with the same start compare as equivalent.
    bool operator< (const Chunk& otherChunk) const
    {
        const uint64_t otherStart = otherChunk.chunk_beg;
        return chunk_beg < otherStart;
    }
};


// Holds chunks sorted by their beginning position.
// How the chunks are actually stored (map, list, etc.) is hidden behind
// this interface so the storage can be interchanged without touching callers.
class SortedChunkList
{
public:
    // Adds chunkToInsert to the list.
    // NOTE(review): the meaning of the returned bool is set by the
    // implementation, which is not in this header - presumably success.
    bool insert(const Chunk& chunkToInsert);

    // Returns the first chunk in the list and removes it.
    Chunk pop();

    // NOTE(review): presumably combines chunks that overlap one another;
    // confirm the result semantics against the implementation.
    bool mergeOverlapping();

    // NOTE(review): presumably reports whether the list holds no chunks.
    bool empty();

    // NOTE(review): presumably removes every chunk from the list.
    void clear();

private:
    // Backing storage.
    // NOTE(review): the key is presumably the chunk's start offset - verify.
    std::map<uint64_t, Chunk> chunkList;
};

class IndexBase
{
public:

    IndexBase();
    virtual ~IndexBase();

    /// Reset the member data for a new index file.
    virtual void resetIndex();

    // Read & parse the specified index file.
    /// \param filename the bam index file to be read.
    /// \return the status of the read.
    virtual StatGenStatus::Status readIndex(const char* filename) = 0;

    /// Get the number of references in this index.
    /// \return number of references
    int32_t getNumRefs() const;

    // Returns the minimum offset of records that cross the 16K block that
    // contains the specified position for the given reference id.
    bool getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
                                     uint64_t& minOffset) const;

protected:
    const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins

    // Maximum allowed position (inclusive 512MB - 1)
    // NOTE: CSI index may not have this same max position.
    const static uint32_t MAX_POSITION = 536870911;

    // Number of bits in 1 linear index - how much to shift a position by
    // to determine which offset into the linear index to look for it.
    const static uint32_t LINEAR_INDEX_SHIFT = 14;

    class Bin
    {
    public:
        Bin(){chunks = NULL; reset();}
        ~Bin() {reset();}
        void reset()
        {
            if(chunks != NULL)
            {
                free(chunks);
                chunks = NULL;
            }
            n_chunk = 0; 
            bin = NOT_USED_BIN;
        }
        uint32_t bin; // The bin id.
        int32_t n_chunk; // The number of chunks.
        Chunk* chunks; // The chunks for this bin.
        static const uint32_t NOT_USED_BIN = 0xFFFFFFFF;
    };

    class Reference
    {
        // Add one to the max since there may now be an extra bin containing
        // the mapped/unmapped counts.
    public:
        static const int32_t UNKNOWN_MAP_INFO = -1;
        Reference(){ioffsets = NULL; reset();}
        ~Reference(){reset();}
        void reset()
        { 
            bins.clear(); 
            if(ioffsets != NULL)
            {
                free(ioffsets);
                ioffsets = NULL;
            }
            n_bin = 0; 
            n_intv = 0;
            minChunkOffset = UNSET_MIN_CHUNK_OFFSET;
            maxChunkOffset = 0;
            n_mapped = UNKNOWN_MAP_INFO;
            n_unmapped = UNKNOWN_MAP_INFO;
        }
        int32_t n_bin; // The number of bins.
        int32_t n_intv; // Number of intervals.
        std::vector<Bin> bins;  // The bins for this reference.
        uint64_t* ioffsets; // Offsets of intervals first alignments
        uint64_t minChunkOffset;
        uint64_t maxChunkOffset;
        int32_t n_mapped; // Number of mapped reads.
        int32_t n_unmapped; // Number of unmapped reads.

        static const uint64_t UNSET_MIN_CHUNK_OFFSET = 0xFFFFFFFFFFFFFFFFULL;
    };

    // Set bins in the region to 1 and all other bins to 0.
    // start is incluive, end is exclusive.
    static void getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1]);

    // Number of reference sequences.
    int32_t n_ref;

    // The references.
    std::vector<Reference> myRefs;
};


#endif