1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
|
// Copyright ©2012 The bíogo Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sam
import (
"bytes"
"fmt"
)
// Cigar is a set of CIGAR operations.
type Cigar []CigarOp
// IsValid returns whether the CIGAR string is valid for a record of the given
// sequence length. Validity is defined by the sum of query consuming operations
// matching the given length, clipping operations only being present at the ends
// of alignments, and that CigarBack operations only result in query-consuming
// positions at or right of the start of the alignment.
func (c Cigar) IsValid(length int) bool {
var pos int
for i, co := range c {
ct := co.Type()
if ct == CigarHardClipped && i != 0 && i != len(c)-1 {
return false
}
if ct == CigarSoftClipped && i != 0 && i != len(c)-1 {
if c[i-1].Type() != CigarHardClipped && c[i+1].Type() != CigarHardClipped {
return false
}
}
con := ct.Consumes()
if pos < 0 && con.Query != 0 {
return false
}
length -= co.Len() * con.Query
pos += co.Len() * con.Reference
}
return length == 0
}
// String returns the CIGAR string for c.
func (c Cigar) String() string {
if len(c) == 0 {
return "*"
}
var b bytes.Buffer
for _, co := range c {
fmt.Fprint(&b, co)
}
return b.String()
}
// Lengths returns the number of reference and read bases described by the Cigar.
func (c Cigar) Lengths() (ref, read int) {
var con Consume
for _, co := range c {
con = co.Type().Consumes()
if co.Type() != CigarBack {
ref += co.Len() * con.Reference
}
read += co.Len() * con.Query
}
return ref, read
}
// CigarOp is a single CIGAR operation including the operation type and the
// length of the operation.
type CigarOp uint32
// NewCigarOp returns a CIGAR operation of the specified type with length n.
// Due to a limitation of the BAM format, CIGAR operation lengths are limited
// to 2^28-1, and NewCigarOp will panic if n is above this or negative.
func NewCigarOp(t CigarOpType, n int) CigarOp {
if uint64(n) > 1<<28-1 {
panic("sam: illegal CIGAR op length")
}
return CigarOp(t) | (CigarOp(n) << 4)
}
// Type returns the type of the CIGAR operation for the CigarOp.
func (co CigarOp) Type() CigarOpType { return CigarOpType(co & 0xf) }
// Len returns the number of positions affected by the CigarOp CIGAR operation.
func (co CigarOp) Len() int { return int(co >> 4) }
// String returns the string representation of the CigarOp
func (co CigarOp) String() string { return fmt.Sprintf("%d%s", co.Len(), co.Type().String()) }
// A CigarOpType represents the type of operation described by a CigarOp.
type CigarOpType byte
const (
CigarMatch CigarOpType = iota // Alignment match (can be a sequence match or mismatch).
CigarInsertion // Insertion to the reference.
CigarDeletion // Deletion from the reference.
CigarSkipped // Skipped region from the reference.
CigarSoftClipped // Soft clipping (clipped sequences present in SEQ).
CigarHardClipped // Hard clipping (clipped sequences NOT present in SEQ).
CigarPadded // Padding (silent deletion from padded reference).
CigarEqual // Sequence match.
CigarMismatch // Sequence mismatch.
CigarBack // Skip backwards.
lastCigar
)
var cigarOps = []string{"M", "I", "D", "N", "S", "H", "P", "=", "X", "B", "?"}
// Consumes returns the CIGAR operation alignment consumption characteristics for the CigarOpType.
//
// The Consume values for each of the CigarOpTypes is as follows:
//
// Query Reference
// CigarMatch 1 1
// CigarInsertion 1 0
// CigarDeletion 0 1
// CigarSkipped 0 1
// CigarSoftClipped 1 0
// CigarHardClipped 0 0
// CigarPadded 0 0
// CigarEqual 1 1
// CigarMismatch 1 1
// CigarBack 0 -1
//
func (ct CigarOpType) Consumes() Consume { return consume[ct] }
// String returns the string representation of a CigarOpType.
func (ct CigarOpType) String() string {
if ct < 0 || ct > lastCigar {
ct = lastCigar
}
return cigarOps[ct]
}
// Consume describes how CIGAR operations consume alignment bases.
type Consume struct {
Query, Reference int
}
// A few years ago, Complete Genomics (CG) proposed to add a new CIGAR
// operator 'B' for an operation of moving backward along the reference
// genome. It is the opposite of the 'N', the reference skip. In a later
// discussion on a separate issue, Fred expressed his preference to a
// negative reference skip which is equivalent to a positive 'B' operation.
// Now the SRA group from NCBI intends to archive the CG alignment in
// the SAM format and raises this request again. I think it may be the
// time to add this backward operation.
//
// The backward operation is designed to describe such an alignment:
//
// REF:: GCATACGATCGACTAGTCACGT
// READ: --ATACGATCGA----------
// READ: ---------CGACTAGTCAC--
//
// i.e. there is an overlap between two segments of a read, which is quite
// frequent in CG data. We are unable to fully describe such an alignment
// with the original CIGAR. In the current spec, we suggest using a CIGAR
// 18M and storing the overlap information in optional tags. This is a
// little clumsy and is not compressed well for the purpose of archiving.
// With 'B', the new CIGAR is "10M3B11M" with no other optional tags.
//
// Using "B" in this case is cleaner, but the major concern is that it breaks
// the compatibility and is also likely to complicate SNP calling and many
// other applications. As I think now, the solution is to implement a
// "remove_B()" routine in samtools. This routine collapses overlapping
// sequences, recalculates base quality in the overlap and gives a CIGAR
// without 'B'. For the example above, remove_B() gives CIGAR 18M. For SNP
// calling, we may call remove_B() immediately after the alignment loaded
// into memory. The downstream pileup engine does not need any changes. Other
// SNP callers can do the same. A new option will be added to "samtools view"
// as a way to remove 'B' operations on the command-line.
//
// The implementation of remove_B() may be quite complicated in the generic
// case - we may be dealing with a multiple-sequence alignment, but it should
// be straightforward in the simple cases such as the example above. Users may
// not need to care too much about how remove_B() is implemented.
//
// http://sourceforge.net/p/samtools/mailman/message/28463294/
var consume = []Consume{
CigarMatch: {Query: 1, Reference: 1},
CigarInsertion: {Query: 1, Reference: 0},
CigarDeletion: {Query: 0, Reference: 1},
CigarSkipped: {Query: 0, Reference: 1},
CigarSoftClipped: {Query: 1, Reference: 0},
CigarHardClipped: {Query: 0, Reference: 0},
CigarPadded: {Query: 0, Reference: 0},
CigarEqual: {Query: 1, Reference: 1},
CigarMismatch: {Query: 1, Reference: 1},
CigarBack: {Query: 0, Reference: -1}, // See notes above.
lastCigar: {},
}
var cigarOpTypeLookup [256]CigarOpType
func init() {
for i := range cigarOpTypeLookup {
cigarOpTypeLookup[i] = lastCigar
}
for op, c := range []byte{'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B'} {
cigarOpTypeLookup[c] = CigarOpType(op)
}
}
var powers = []int64{1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12}
// atoi returns the integer interpretation of b which must be an ASCII decimal number representation.
func atoi(b []byte) (int, error) {
if len(b) > len(powers) {
return 0, fmt.Errorf("sam: integer overflow: %q", b)
}
var n int64
k := len(b) - 1
for i, v := range b {
n += int64(v-'0') * powers[k-i]
if int64(int(n)) != n {
return 0, fmt.Errorf("sam: integer overflow: %q at %d", b, i)
}
}
return int(n), nil
}
// ParseCigar returns a Cigar parsed from the provided byte slice.
// ParseCigar will break CIGAR operations longer than 2^28-1 into
// multiple operations summing to the same length.
func ParseCigar(b []byte) (Cigar, error) {
if len(b) == 1 && b[0] == '*' {
return nil, nil
}
var (
c Cigar
op CigarOpType
n int
err error
)
for i := 0; i < len(b); i++ {
for j := i; j < len(b); j++ {
if b[j] < '0' || '9' < b[j] {
n, err = atoi(b[i:j])
if err != nil {
return nil, err
}
op = cigarOpTypeLookup[b[j]]
i = j
break
}
}
if op == lastCigar {
return nil, fmt.Errorf("sam: failed to parse cigar string %q: unknown operation %q", b, op)
}
for {
c = append(c, NewCigarOp(op, minInt(n, 1<<28-1)))
n -= 1<<28 - 1
if n <= 0 {
break
}
}
}
return c, nil
}
func minInt(a, b int) int {
if a < b {
return a
}
return b
}
|