1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
|
/*
Author: Shane Neph & Alex Reynolds
Date: Mon Jan 23 06:29:10 PST 2012
*/
//
// BEDOPS
// Copyright (C) 2011-2022 Shane Neph, Scott Kuehn and Alex Reynolds
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
#ifndef BED_RANGE_FINDER_ALGORITHM_H
#define BED_RANGE_FINDER_ALGORITHM_H
#include <cstdio>
#include <iterator>
#include <map>
#include <utility>
#include "data/bed/BedCompare.hpp"
#include "data/bed/BedDistances.hpp"
#include "data/bed/BedTypes.hpp"
#include "suite/BEDOPS.Constants.hpp"
#include "utility/PooledMemory.hpp"
namespace Bed {
typedef Bed::SignedCoordType ByteOffset;
template <class BedType, std::size_t SZ>
class bed_check_iterator;
namespace extract_details {
template <typename BedType>
struct CompBed {
bool operator()(const BedType& b1, const BedType& b2) const {
static Bed::GenomicCompare<BedType, BedType> gc;
return gc(&b1,&b2);
}
};
template <typename BT1, typename BT2>
bool check_overlap(BT1 const* b1, BT2 const* b2) {
static Bed::Overlapping overlap;
return overlap(b1, b2) == 0;
}
typedef Bed::B3Rest QueryBedType; // file 1
typedef QueryBedType TargetBedType; // file 2 -> must be same type as QueryBedType
typedef std::map<QueryBedType, ByteOffset, CompBed<QueryBedType>> MType;
template <typename BedType, std::size_t Sz>
inline
void remove(Bed::bed_check_iterator<BedType*, Sz>& b, BedType* p)
{ b.get_pool().release(p); }
template <typename Iter, typename BedType>
inline
void remove(Iter, BedType* b)
{ delete b; }
} // namespace extract_details
//==================
// find_bed_range() : think bedops -e -1 with file1 = qfile and titer pointing at file2
//==================
template <typename TargetIter, typename Op>
std::pair<bool, ByteOffset> find_bed_range(FILE* qfile, TargetIter titer, TargetIter teof, Op op) {
TargetIter orig = titer;
auto reference = new extract_details::TargetBedType;
auto last = new extract_details::QueryBedType;
auto current = new extract_details::QueryBedType;
Bed::Overlapping overlap, lessthan, greaterthan; // any overlap
ByteOffset prev_pos = 0, cur_pos = 0, start_pos = 0, end_pos = 0;
extract_details::MType bounds;
cur_pos = std::ftell(qfile); // may not be start of file if, for ex., qfile has headers we skipped
std::fseek(qfile, 0, SEEK_END); // apparently dangerous on some platforms in binary mode -> padded nulls;
const ByteOffset at_end = std::ftell(qfile); // I'll assume msft is the problem until I know better
std::fseek(qfile, cur_pos, SEEK_SET);
prev_pos = cur_pos;
std::iterator_traits<FILE*>::difference_type count, step;
bool didWork = false, first = true, donequery = false;
while ( titer != teof ) {
extract_details::TargetBedType* const refelement = *titer++;
if ( donequery ) { // read through and delete items in [titer,teof) for posterity
extract_details::remove(orig, refelement);
continue;
} else if ( !first && (greaterthan(last, refelement) > 0) ) { // last lower_bound is still applicable
extract_details::remove(orig, refelement);
continue;
}
// only use reference where you need to compare to starting base. Otherwise, use refelement.
*reference = *refelement;
reference->end(reference->start()+1);
start_pos = std::ftell(qfile);
while ( true ) {
extract_details::MType::iterator miter = bounds.upper_bound(*refelement); // define end_pos
if ( miter == bounds.end() ) {
end_pos = at_end;
bounds.clear();
break;
} else {
// it's possible that miter->first and refelement overlap but are not picked up in
// bounds.upper_bound() search b/c bounds is ordered by the analog of "<", where
// start coord takes precedence. Consider when refelement is less than miter->first
// but refelement->end() > miter->first.start(). In such a case miter->first would
// not be a proper upper bound. Remove it and try again.
bounds.erase(bounds.begin(), miter);
end_pos = miter->second;
if ( extract_details::check_overlap(refelement, &miter->first) )
bounds.erase(miter);
else
break;
}
} // while
count = end_pos - start_pos; // how many bytes between positions
didWork = false;
while ( count > 0 ) {
std::fseek(qfile, start_pos, SEEK_SET);
step = count/2;
std::fseek(qfile, step, SEEK_CUR);
// find beginning of current line
while ( std::ftell(qfile) != start_pos ) {
if ( static_cast<char>(std::fgetc(qfile)) != '\n' ) {
std::fseek(qfile, -2, SEEK_CUR);
} else {
break;
}
} // while
// read in the line; incrementing to start of next QueryBedType element.
prev_pos = std::ftell(qfile);
current->readline(qfile);
cur_pos = std::ftell(qfile);
bounds.insert(std::make_pair(*current, prev_pos));
// compare 'current' to starting base
if ( lessthan(current, reference) < 0 ) {
count = (end_pos - cur_pos);
start_pos = cur_pos;
if ( 0 == count ) {
if ( end_pos != at_end ) {
prev_pos = cur_pos;
current->readline(qfile);
cur_pos = std::ftell(qfile);
bounds.insert(std::make_pair(*current, prev_pos));
} else {
prev_pos = at_end;
}
}
} else {
count = (prev_pos - start_pos);
end_pos = prev_pos;
}
didWork = true;
} // while
// spit elements in range
if ( didWork ) {
while ( prev_pos != at_end && extract_details::check_overlap(current, refelement) ) {
op(current);
prev_pos = std::ftell(qfile);
if ( prev_pos != at_end )
current->readline(qfile);
} // while
}
if ( refelement )
extract_details::remove(orig, refelement);
std::fseek(qfile, prev_pos, SEEK_SET); // because start_pos = std::ftell(qfile); on next go-around
if ( prev_pos == at_end )
donequery = true;
*last = *current;
first = false;
} // while
if ( reference )
delete reference;
if ( last )
delete last;
if ( current )
delete current;
return std::make_pair(!first, prev_pos);
}
} // namespace Bed
#endif // BED_RANGE_FINDER_ALGORITHM_H
|