File: SelectiveAlignmentUtils.hpp

package info (click to toggle)
rapmap 0.15.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,228 kB
  • sloc: cpp: 48,810; ansic: 4,686; sh: 215; python: 82; makefile: 15
file content (378 lines) | stat: -rw-r--r-- 13,820 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#ifndef __SELECTIVE_ALIGNMENT_UTILS__
#define __SELECTIVE_ALIGNMENT_UTILS__


#include "ksw2pp/KSW2Aligner.hpp"
#include "metro/metrohash64.h"
#include "tsl/hopscotch_map.h"
#include "edlib.h"

namespace selective_alignment {
  namespace utils {

    enum class AlignmentPolicy : uint8_t { DEFAULT, BT2, BT2_STRICT };

    /// Get alignment score
    struct CacheEntry {
      uint64_t hashKey;
      int32_t alnScore;
      CacheEntry(uint64_t hashKeyIn, int32_t alnScoreIn) :
        hashKey(hashKeyIn), alnScore(alnScoreIn) {}
    };

    // Use a passthrough hash for the alignment cache, because
    // the key *is* the hash.
    struct PassthroughHash {
      std::size_t operator()(uint64_t const& u) const { return u; }
    };

    using AlnCacheMap = tsl::hopscotch_map<uint64_t, int32_t, PassthroughHash>;


#ifndef RAPMAP_SALMON_SUPPORT
    template <typename IndexT>
#endif
    inline bool recoverOrphans(std::string& leftRead,
                               std::string& rightRead,
                               std::string& rc1,
                               std::string& rc2,
#ifdef RAPMAP_SALMON_SUPPORT   // for salmon, we just have a vector of transcript objects
                               const std::vector<Transcript>& transcripts,
#else                          // otherwise, we have to pass in this information from the index
                               const std::string& seq,
                               const std::vector<std::string>& txpNames,
                               const std::vector<IndexT>& txpOffsets,
                               const std::vector<IndexT>& txpLens,
#endif
                               const std::vector<rapmap::utils::QuasiAlignment>& leftHits,
                               const std::vector<rapmap::utils::QuasiAlignment>& rightHits,
                               std::vector<rapmap::utils::QuasiAlignment>& jointHits) {
      using QuasiAlignment = rapmap::utils::QuasiAlignment;

      auto* r1 = leftRead.data();
      auto* r2 = rightRead.data();
      auto l1 = static_cast<int32_t>(leftRead.length());
      auto l2 = static_cast<int32_t>(rightRead.length());
      // We compute the reverse complements below only if we
      // need them and don't have them.
      char* r1rc = nullptr;
      char* r2rc = nullptr;

      const char* windowSeq = nullptr;
      int32_t windowLength = -1;

      int32_t maxDistRight = l2 / 4;
      int32_t maxDistLeft = l1 / 4;
      constexpr const int32_t signedZero{0};
      int32_t lreadLen = l1;
      //int32_t rreadLen = l2;

      auto recoverSingleOrphan = [&] (const QuasiAlignment& anchorHit, bool anchorIsLeft) -> bool {
        bool recovered = false;
        auto txpID = anchorHit.tid;
#ifdef RAPMAP_SALMON_SUPPORT
        auto& txp = transcripts[txpID];
        const char* tseq = txp.Sequence();
#else
        const char* tseq = seq.data() + txpOffsets[txpID];
#endif

        int32_t anchorPos{anchorHit.allPositions.front()};
        bool anchorFwd{anchorHit.fwd};
        bool lfwd = false, rfwd = false;
        int32_t startPos = -1, maxDist = -1, lpos = -1, rpos = -1, anchorLen = -1, otherLen = -1, rlen = -1;
        const char* rptr{nullptr};
        std::string* otherReadPtr{nullptr};
        const char* otherRead{nullptr};
        char* otherReadRC{nullptr};
        std::string* otherRCSpace{nullptr};
        rapmap::utils::ChainStatus leftChainStatus = rapmap::utils::ChainStatus::REGULAR;
        rapmap::utils::ChainStatus rightChainStatus = rapmap::utils::ChainStatus::REGULAR;

        if (anchorIsLeft) {
          anchorLen = l1;
          otherLen = l2;
          maxDist = maxDistRight;
          lpos = anchorPos;
          rpos = -1;
          lfwd = anchorFwd;
          rfwd = !lfwd;
          otherReadPtr = &rightRead;
          otherRCSpace = &rc2;
          otherRead = r2;
          otherReadRC = r2rc;
          leftChainStatus = anchorHit.chainStatus.getLeft();
        } else {
          anchorLen = l2;
          otherLen = l1;
          maxDist = maxDistLeft;
          lpos = -1;
          rpos = anchorPos;
          rfwd = anchorFwd;
          lfwd = !rfwd;
          otherReadPtr = &leftRead;
          otherRCSpace = &rc1;
          otherRead = r1;
          otherReadRC = r1rc;
          rightChainStatus = anchorHit.chainStatus.getRight();
        }

        // if this hit is forward, look downstream, else upstream
#ifdef RAPMAP_SALMON_SUPPORT
        auto refLength = txp.RefLength;
#else
        auto refLength = txpLens[txpID];
#endif
        if (anchorFwd) {
          if (!otherReadRC){
            rapmap::utils::reverseRead(*otherReadPtr, *otherRCSpace);
            otherReadRC = const_cast<char*>(otherRCSpace->data());
          }
          rptr = otherReadRC;
          rlen = otherLen;
          startPos = std::max(signedZero, static_cast<int32_t>(anchorPos));

          windowLength = std::min(1000, static_cast<int32_t>(refLength - startPos));
        } else {
          rptr = otherRead;
          rlen = otherLen;
          int32_t endPos = std::min(static_cast<int32_t>(refLength), static_cast<int32_t>(anchorPos) + anchorLen);
          startPos = std::max(signedZero,  endPos - 1000);
          windowLength = std::min(1000, endPos);
        }
        windowSeq = tseq + startPos;

        EdlibAlignResult result = edlibAlign(rptr, rlen, windowSeq, windowLength,
                                             edlibNewAlignConfig(maxDist, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE));
        if (result.editDistance > -1) {
          if (anchorIsLeft) {
            rpos = startPos + result.endLocations[0] - otherLen;
          } else {
            lpos = startPos + result.endLocations[0] - otherLen;
          }
          // If we consider only a single position per transcript
          int32_t startRead1 = std::max(lpos, signedZero);
          int32_t startRead2 = std::max(rpos, signedZero);
          bool read1First{(startRead1 < startRead2)};
          int32_t fragStartPos = read1First ? startRead1 : startRead2;
          int32_t fragEndPos = read1First ?
            (startRead2 + l2) : (startRead1 + l1);
          uint32_t fragLen = fragEndPos - fragStartPos;
          jointHits.emplace_back(txpID,
                                 lpos,
                                 lfwd,
                                 lreadLen,
                                 fragLen, true);
          // Fill in the mate info
          auto& qaln = jointHits.back();
          qaln.mateLen = rlen;
          qaln.matePos = rpos;
          qaln.mateIsFwd = rfwd;
          jointHits.back().mateStatus = rapmap::utils::MateStatus::PAIRED_END_PAIRED;
          jointHits.back().chainStatus = rapmap::utils::FragmentChainStatus(leftChainStatus, rightChainStatus);
          recovered = true;
        }
        edlibFreeAlignResult(result);
        return recovered;
      };

      {
        auto leftIt = leftHits.begin();
        auto leftEnd = leftHits.end();
        auto leftLen = std::distance(leftIt, leftEnd);
        auto rightIt = rightHits.begin();
        auto rightEnd = rightHits.end();
        auto rightLen = std::distance(rightIt, rightEnd);
        jointHits.reserve(std::min(leftLen, rightLen));

        bool didRecover{false};

        while ((leftIt != leftEnd) and (rightIt != rightEnd)) {
          uint32_t leftTxp, rightTxp;
          leftTxp = leftIt->tid;
          rightTxp = rightIt->tid;
          if (leftTxp < rightTxp) {
            didRecover = recoverSingleOrphan(*leftIt, true);
            ++leftIt;
          } else if (rightTxp < leftTxp) {
            didRecover = recoverSingleOrphan(*rightIt, false);
            ++rightIt;
          } else if (rightTxp == leftTxp) {
            /*
            didRecover = recoverSingleOrphan(*leftIt, true);
            ++leftIt;
            if(!didRecover) {
              didRecover = recoverSingleOrphan(*rightIt, false);
            }
            ++rightIt;
            */
            //++leftIt; ++rightIt;
            // Should not happen!
            auto log = spdlog::get("jointLog");

            log->error("Found a transcript in common between leftHits and rightHits while "
                       "trying to recover orphans.  This should not happen. "
                       "Please report this via GitHub. ");
            std::stringstream ss;
            ss << "left hits : [";
            for (auto& lh : leftHits) {
              ss << "(" <<
#ifdef RAPMAP_SALMON_SUPPORT
                transcripts[lh.tid].RefName
#else
                txpNames[lh.tid]
#endif
                 << ", " << lh.pos << ", " << lh.fwd << ") ; ";
            }
            ss << "]\n";
            ss << "right hits : [";
            for (auto& lh : rightHits ) {
              ss << "(" <<
#ifdef RAPMAP_SALMON_SUPPORT
                transcripts[lh.tid].RefName
#else
                txpNames[lh.tid]
#endif
                 << ", " << lh.pos << ", " << lh.fwd << ") ; ";
            }
            log->error(ss.str());
            log->flush();
            spdlog::drop_all();
            std::exit(1);
          }
        }

        while (leftIt != leftEnd) {
          didRecover = recoverSingleOrphan(*leftIt, true);
          ++leftIt;
        }

        while(rightIt != rightEnd) {
          didRecover = recoverSingleOrphan(*rightIt, false);
          ++rightIt;
        }
        (void)didRecover;
      } // union / merge left and right orphans
      return true;
    }


inline int32_t getAlnScore(
                           ksw2pp::KSW2Aligner& aligner,
                           ksw_extz_t& ez,
                           int32_t pos, const char* rptr, int32_t rlen,
                           char* tseq, int32_t tlen,
                           int8_t mscore,
                           int8_t mmcost,
                           int32_t maxScore,
                           rapmap::utils::ChainStatus chainStat,
                           bool multiMapping, // was there > 1 hit for this read
                           AlignmentPolicy ap,
                           uint32_t buf,
                           AlnCacheMap& alnCache) {
  // If this was a perfect match, don't bother to align or compute the score
  if (chainStat == rapmap::utils::ChainStatus::PERFECT) {
    return maxScore;
  }

  auto ungappedAln = [mscore, mmcost](char* ref, const char* query, int32_t len) -> int32_t {
    int32_t ungappedScore{0};
    for (int32_t i = 0; i < len; ++i) {
      char c1 = *(ref + i);
      char c2 = *(query + i);
      c1 = (c1 == 'N' or c2 == 'N') ? c2 : c1;
      ungappedScore += (c1 == c2) ? mscore : mmcost;
    }
    return ungappedScore;
  };

  int32_t s{std::numeric_limits<int32_t>::lowest()};
  // TODO : Determine what is the most "appropriate" penalty for
  // an overhang (based on the scoring function).
  bool invalidStart = (pos < 0);
  bool invalidEnd = (pos + rlen > tlen);
  if (invalidStart) { rptr += -pos; rlen += pos; pos = 0; }

  // if we are trying to mimic Bowtie2 with RSEM params
  if (invalidStart or invalidEnd) {
    switch (ap) {
    case AlignmentPolicy::BT2:
    case AlignmentPolicy::BT2_STRICT:
      return s;
    case AlignmentPolicy::DEFAULT:
    default:
      break;
    }
  }

  if (pos < tlen) {
    bool doUngapped{(!invalidStart) and (chainStat == rapmap::utils::ChainStatus::UNGAPPED)};
    buf = (doUngapped) ? 0 : buf;
    auto lnobuf = static_cast<uint32_t>(tlen - pos);
    auto lbuf = static_cast<uint32_t>(rlen+buf);
    auto useBuf = (lbuf < lnobuf);
    uint32_t tlen1 = std::min(lbuf, lnobuf);
    char* tseq1 = tseq + pos;
    ez.max_q = ez.max_t = ez.mqe_t = ez.mte_q = -1;
    ez.max = 0, ez.mqe = ez.mte = KSW_NEG_INF;
    ez.n_cigar = 0;

    uint64_t hashKey{0};
    bool didHash{false};
    if (!alnCache.empty()) {
      // hash the reference string
      uint32_t keyLen = useBuf ? tlen1 - buf : tlen1;
      MetroHash64::Hash(reinterpret_cast<uint8_t*>(tseq1), keyLen, reinterpret_cast<uint8_t*>(&hashKey), 0);
      didHash = true;
      // see if we have this hash
      auto hit = alnCache.find(hashKey);
      // if so, we know the alignment score
      if (hit != alnCache.end()) {
        s = hit->second;
      }
    }
    // If we got here with s == -1, we don't have the score cached
    if (s == std::numeric_limits<int32_t>::lowest()) {
      if (doUngapped) {
        // signed version of tlen1
        int32_t tlen1s = tlen1;
        int32_t alnLen = rlen < tlen1s ? rlen : tlen1s;
        s = ungappedAln(tseq1, rptr, alnLen);
      } else {
        /*
        auto startBuf = std::min(pos, static_cast<int32_t>(buf));
        int32_t bpos = pos - startBuf;
        char* tseqTemp = tseq + bpos;
        uint32_t tlenTemp = tlen1 + startBuf;
        EdlibAlignResult result = edlibAlign(rptr, rlen, tseqTemp, tlenTemp,
                                             edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH));
        auto spos = result.startLocations[0];
        tseq1 = tseq + bpos + spos;
        auto* aln = result.alignment;
        if (!aln or (aln[0] == 1 or aln[0] == 2)) {
          edlibFreeAlignResult(result);
          return s;
        }
        edlibFreeAlignResult(result);
        */
        aligner(rptr, rlen, tseq1, tlen1, &ez, ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
        s = std::max(ez.mqe, ez.mte);
      }

      if (multiMapping) { // don't bother to fill up a cache unless this is a multi-mapping read
        if (!didHash) {
          uint32_t keyLen = useBuf ? tlen1 - buf : tlen1;
          MetroHash64::Hash(reinterpret_cast<uint8_t*>(tseq1), keyLen, reinterpret_cast<uint8_t*>(&hashKey), 0);
        }
        alnCache[hashKey] = s;
      } // was a multi-mapper

    }
  }
  return s;
}

  } // namespace utils
} // namespace selective_alignment

#endif //__SELECTIVE_ALIGNMENT_UTILS__