File: IndexBase.cpp

package info (click to toggle)
libstatgen 1.0.15-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,588 kB
  • sloc: cpp: 49,624; ansic: 1,408; makefile: 320; sh: 60
file content (241 lines) | stat: -rw-r--r-- 7,407 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/*
 *  Copyright (C) 2010-2012  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "IndexBase.h"
#include <iomanip>

Chunk SortedChunkList::pop()
{
    Chunk newChunk = chunkList.begin()->second;
    chunkList.erase(chunkList.begin());
    return(newChunk);
}


bool SortedChunkList::insert(const Chunk& chunkToInsert)
{
    std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
    // Insert the passed in chunk.
    insertRes = 
        chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
                                                    chunkToInsert));

    if(!insertRes.second)
    {
        // Failed to insert the chunk.
        std::cerr << "Failed to insert into the SortedChunkList.\n";
        std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
                  << insertRes.first->second.chunk_beg 
                  << "\tend = "
                  << insertRes.first->second.chunk_end
                  << "\nnew chunk:\tbeg = " << std::hex
                  << chunkToInsert.chunk_beg 
                  << "\tend = "
                  << chunkToInsert.chunk_end
                  << std::endl;
    }
    // return the result that comes from insertRes.
    return(insertRes.second);
}

void SortedChunkList::clear()
{
    chunkList.clear();
}

bool SortedChunkList::empty()
{
    return(chunkList.empty());
}


// Merge overlapping chunks found in this list.
bool SortedChunkList::mergeOverlapping()
{
    // Start at the beginning of the list and iterate through.
    std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
    std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
    if(nextPos != chunkList.end())
    {
        ++nextPos;
    }
    
    // Loop until the end is reached.
    while(nextPos != chunkList.end())
    {
        // If the next chunk is completely contained within the current 
        // chunk (its end is less than the current chunk's end), 
        // delete it since its position is already covered.
        if(nextPos->second.chunk_end < currentPos->second.chunk_end)
        {
            chunkList.erase(nextPos);
            nextPos = currentPos;
            ++nextPos;
            continue;
        }
        
        // If the next chunk's start position's BGZF block is less than or
        // equal to the BGZF block of the current chunk's end position,
        // combine the two chunks into the current chunk.
        if((nextPos->second.chunk_beg >> 16) <= 
           (currentPos->second.chunk_end >> 16))
        {
            currentPos->second.chunk_end = nextPos->second.chunk_end;
            // nextPos has now been included in the current pos, so
            // remove it.
            chunkList.erase(nextPos);
            nextPos = currentPos;
            ++nextPos;
            continue;
        }
        else
        {
            // Nothing to combine.  So try combining at the next 
            currentPos = nextPos;
            ++nextPos;
        }
    }
    return(true);
}


IndexBase::IndexBase()
    : n_ref(0)
{
    myRefs.clear();
}



IndexBase::~IndexBase()
{
}


// Reset the member data for a new index file.
void IndexBase::resetIndex()
{
    n_ref = 0;
    // Clear the references.
    myRefs.clear();
}

    
// Get the number of references in this index.
int32_t IndexBase::getNumRefs() const
{
    // Return the number of references.
    return(myRefs.size());
}


// The basic logic is from samtools reg2bins and the samtools format specification pdf.
// Set bins in the region to 1 and all other bins to 0.
void IndexBase::getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1])
{
    for(uint32_t index = 0; index < MAX_NUM_BINS+1; index++)
    {
        binMap[index] = false;
    }

    uint32_t binNum = 0;
    --end;
    
    // Check if beg/end go too high, set to max position.
    if(start > MAX_POSITION)
    {
        start = MAX_POSITION;
    }
    if(end > MAX_POSITION)
    {
        end = MAX_POSITION;
    }
    
    // Turn on bins.
    binMap[binNum] = true;
    for (binNum =    1 + (start>>26); binNum <=    1 + (end>>26); ++binNum) 
        binMap[binNum] = true;
    for (binNum =    9 + (start>>23); binNum <=    9 + (end>>23); ++binNum) 
        binMap[binNum] = true;
    for (binNum =   73 + (start>>20); binNum <=   73 + (end>>20); ++binNum)
        binMap[binNum] = true;
    for (binNum =  585 + (start>>17); binNum <=  585 + (end>>17); ++binNum)
        binMap[binNum] = true;
    for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
        binMap[binNum] = true;
}


// Returns the minimum offset of records that cross the 16K block that
// contains the specified position for the given reference id.
bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
                                            uint64_t& minOffset) const
{
    int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;

    minOffset = 0;

    if(refID > n_ref)
    {
        // out of range of the references, return false.
        return(false);
    }
    // Check to see if the position is out of range of the linear index.
    int32_t linearOffsetSize = myRefs[refID].n_intv;

    // If there are no entries in the linear index, return false.
    // Or if the linear index is not large enough to include
    // the start block, then there can be no records that cross
    // our region, so return false.
    if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))

    {
        return(false);
    }

    // The linear index is specified for this block, so return that value.
    minOffset = myRefs[refID].ioffsets[linearIndex];
    
    // If the offset is 0, go to the previous block that has an offset.
    // This is due to a couple of bugs in older sam tools indexes.
    // 1) they add one to the index location (so when reading those, you
    // may be starting earlier than necessary)
    // 2) (the bigger issue) They did not include bins 4681-37449 in
    // the linear index.
    while((minOffset == 0) && (--linearIndex >= 0))
    {
        minOffset = myRefs[refID].ioffsets[linearIndex]; 
    }


    // If the minOffset is still 0 when moving forward,
    // check later indices to find a non-zero since we don't want to return
    // an offset of 0 since the record can't start at 0 we want to at least
    // return the first record position for this reference.
    linearIndex = 0;
    while((minOffset == 0) && (linearIndex < linearOffsetSize))
    {
         minOffset = myRefs[refID].ioffsets[linearIndex]; 
         linearIndex++;
    }
    if(minOffset == 0)
    {
        // Could not find a valid start position for this reference.
        return(false);
    }
    return(true);
}