1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
|
### test_readGAlignmentPairsFromBam.R
# Flag bits summary
# -----------------
# 0x1: template having multiple segments in sequencing
# 0x2: each segment properly aligned according to the aligner
# 0x4: segment unmapped
# 0x8: next segment in the template unmapped
# 0x10: SEQ being reverse complemented
# 0x20: SEQ of the next segment in the template being reversed
# 0x40: the first segment in the template
# 0x80: the last segment in the template
# 0x100: secondary alignment
# 0x200: not passing quality controls
# 0x400: PCR or optical duplicate
make_samline <- function(QNAME, mapped=0, RNAME="*", POS=0, strand="+",
primary=1, flagbits=0,
RNEXT="*", PNEXT=0, proper=0)
{
FLAG <- 0x4 * (!mapped) + 0x10 * (strand == "-") +
0x100 * (!primary) + 0x2 * proper + flagbits
if (mapped) {
CIGAR <- "14M"
} else {
CIGAR <- "*"
}
paste(QNAME, FLAG, RNAME, POS, "255", CIGAR,
RNEXT, PNEXT, "0", "*", "*", sep="\t")
}
make_mapped_pair <- function(QNAME,
RNAME1, POS1, CIGAR1, strand1,
RNAME2, POS2, CIGAR2, strand2,
primary, proper,
pair_id=NA)
{
flag0 <- 0x1 + 0x2 * proper + 0x100 * (!primary)
FLAG1 <- flag0 + 0x40 + 0x10 * (strand1 == "-") +
0x20 * (strand2 == "-")
FLAG2 <- flag0 + 0x80 + 0x10 * (strand2 == "-") +
0x20 * (strand1 == "-")
line1 <- paste(QNAME, FLAG1, RNAME1, POS1, "255", CIGAR1,
RNAME2, POS2, "0", "*", "*", sep="\t")
line2 <- paste(QNAME, FLAG2, RNAME2, POS2, "255", CIGAR2,
RNAME1, POS1, "0", "*", "*", sep="\t")
if (!identical(pair_id, NA)) {
pair_id_tag <- paste0("pi:Z:", pair_id)
line1 <- paste(line1, pair_id_tag, sep="\t")
line2 <- paste(line2, pair_id_tag, sep="\t")
}
c(line1, line2)
}
make_mapped_pairs <- function(mapped_pair_table)
{
if (!is.data.frame(mapped_pair_table)) {
if (is.character(mapped_pair_table))
mapped_pair_table <- textConnection(mapped_pair_table)
mapped_pair_table <- read.table(mapped_pair_table, header=TRUE,
stringsAsFactors=FALSE)
}
row_groups <- unname(split(seq_len(nrow(mapped_pair_table)),
mapped_pair_table$QNAME))
unlist(lapply(row_groups,
function(row_group) {
lines <- unlist(lapply(row_group, function(i)
do.call("make_mapped_pair", mapped_pair_table[i, ])))
if (length(row_group) == 3L) {
lines[c(2L, 4L, 6L)] <- lines[c(6L, 2L, 4L)]
} else {
lines[c(FALSE, TRUE)] <- rev(lines[c(FALSE, TRUE)])
}
lines
}))
}
make_toy_bamfile <- function(mapped_pair_table, filename)
{
lines0 <- c("@HD\tVN:1.3",
"@SQ\tSN:chr1\tLN:2450",
"@SQ\tSN:chr2\tLN:1882",
"@SQ\tSN:chrX\tLN:999")
## Single end reads
lines1 <- c(
## s001: 1 primary alignment
make_samline("s001", mapped=1, RNAME="chr1", POS=10, strand="+",
primary=1),
## s002: 1 primary alignment + 3 secondary alignments
make_samline("s002", mapped=1, RNAME="chr1", POS=20, strand="+",
primary=1),
make_samline("s002", mapped=1, RNAME="chr1", POS=21, strand="+",
primary=0),
make_samline("s002", mapped=1, RNAME="chr1", POS=22, strand="+",
primary=0),
make_samline("s002", mapped=1, RNAME="chr1", POS=20, strand="+",
primary=0),
## s003: unmapped
make_samline("s003")
)
## Paired end reads
lines2 <- c(
## Mapped pairs
make_mapped_pairs(mapped_pair_table),
## p991: 1 pair with a missing mate (can happen if file was subsetted
## with e.g. filterBam)
make_mapped_pair("p991",
"chr2", 150, "18M", "+", "chr2", 199, "18M", "-",
primary=1, proper=1)[2L],
## p992: 1 pair with 1st mate unmapped and 2nd mate mapped
make_samline("p992", flagbits=0x1 + 0x40,
RNEXT="chr2", PNEXT=150),
make_samline("p992", mapped=1, RNAME="chr2", POS=150, strand="+",
primary=1, flagbits=0x1 + 0x8 + 0x80),
## p993: 1 pair with both mates unmapped
make_samline("p993", flagbits=0x1 + 0x8 + 0x40),
make_samline("p993", flagbits=0x1 + 0x8 + 0x80)
)
## Reads with multiple segments
lines3 <- c(
## m001: 3 segments in the template (index of each segment in template
## is known)
make_samline("m001", mapped=1,, RNAME="chrX", POS=10, strand="+",
flagbits=0x1 + 0x40,
RNEXT="chrX", PNEXT=20, proper=1),
make_samline("m001", mapped=1, RNAME="chrX", POS=20, strand="+",
flagbits=0x1 + 0x40 + 0x80,
RNEXT="chrX", PNEXT=30, proper=1),
make_samline("m001", mapped=1, RNAME="chrX", POS=30, strand="+",
flagbits=0x1 + 0x80,
RNEXT="chrX", PNEXT=10, proper=1),
## m002: 3 segments in the template (index of each segment in template
## was lost)
make_samline("m002", mapped=1, RNAME="chrX", POS=10, strand="+",
flagbits=0x1,
RNEXT="chrX", PNEXT=20, proper=1),
make_samline("m002", mapped=1, RNAME="chrX", POS=20, strand="+",
flagbits=0x1,
RNEXT="chrX", PNEXT=30, proper=1),
make_samline("m002", mapped=1, RNAME="chrX", POS=30, strand="+",
flagbits=0x1,
RNEXT="chrX", PNEXT=10, proper=1)
)
samfile <- paste0(filename, ".sam")
cat(c(lines0, lines1, lines2, lines3), file=samfile, sep="\n")
bamfile <- asBam(samfile, filename, overwrite=TRUE)
## Should never happen.
if (bamfile != paste0(filename, ".bam"))
stop("asBam() returned an unexpected path")
bamfile
}
### 1 line per mapped pair. Each line will generate 2 lines/records in the
### SAM file. The pair_id field will be stored in the SAM/BAM file as a user
### defined tag ("pi" tag).
mapped_pair_table <- "
QNAME RNAME1 POS1 CIGAR1 strand1 RNAME2 POS2 CIGAR2 strand2 primary proper pair_id
# p001: 1 primary proper pair
p001 chr2 10 18M + chr2 110 18M - 1 1 p001
# p002: 1 primary non proper pair
p002 chr2 20 18M + chr2 120 18M - 1 0 p002
# p003: 2 proper pairs: 1 primary + 1 secondary
p003 chr2 30 18M + chr2 130 18M - 1 1 p003a
p003 chr2 31 18M + chr2 131 18M - 0 1 p003b
# p004: 2 non proper pairs: 1 primary + 1 secondary
p004 chr2 40 18M + chr2 140 18M - 1 0 p004a
p004 chr2 41 18M + chr2 141 18M - 0 0 p004b
# p005: 2 primary pairs (some aligners seem to produce that, even though they
# probably shouldn't)
p005 chr2 50 18M + chr2 150 18M - 1 1 p005a
p005 chr2 51 18M + chr2 151 18M - 1 1 p005b
# p006: 3 pairs: 1 primary proper + 1 secondary proper + 1 secondary non
# proper
p006 chr2 60 18M + chr2 160 18M - 1 1 p006a
p006 chr2 61 18M + chr2 161 18M - 0 1 p006b
p006 chr2 62 18M + chr2 60 18M - 0 0 p006c
# p007: 2 pairs mapped to the same position: 1 primary proper + 1 secondary
# proper
p007 chr2 70 9M1D9M + chr2 170 18M - 1 1 p007a
p007 chr2 70 18M + chr2 170 7M2I9M - 0 1 p007b
# p008: 3 pairs mapped to the same position: 1 primary proper + 1 secondary
# proper + 1 secondary non proper
p008 chr2 80 18M + chr2 180 18M - 1 1 p008a
p008 chr2 80 9M2D9M + chr2 180 7M2I9M - 0 1 p008b
p008 chr2 80 6M3I9M + chr2 180 9M3D9M - 0 0 p008c
# p009: 3 pairs mapped to the same position: 1 primary proper + 2 secondary
# proper. The secondary pairs can NOT be disambiguated.
p009 chr2 90 18M + chr2 190 18M - 1 1 p009a
p009 chr2 90 9M2D9M + chr2 190 7M2I9M - 0 1 p009b
p009 chr2 90 6M3I9M + chr2 190 9M3D9M - 0 1 p009c
"
toy_bamfile <- make_toy_bamfile(mapped_pair_table, tempfile())
test_readGAlignmentPairsFromBam <- function()
{
param <- ScanBamParam(tag="pi")
galp <- suppressWarnings(
readGAlignmentPairsFromBam(toy_bamfile, use.names=TRUE, param=param)
)
## Check the dumped alignments
dumped_gal <- getDumpedAlignments()
checkTrue(validObject(dumped_gal, complete=TRUE))
pi_target <- rep(c("p009b", "p009c"), each=2)
checkIdentical(pi_target, sort(mcols(dumped_gal)$pi))
## Check 'galp'
checkTrue(validObject(galp, complete=TRUE))
pi_target <- c("p001", "p002", "p003a", "p003b", "p004a", "p004b",
"p005a", "p005b", "p006a", "p006b", "p006c",
"p007a", "p007b", "p008a", "p008b", "p008c", "p009a")
checkIdentical(pi_target, mcols(first(galp))$pi)
checkIdentical(pi_target, mcols(last(galp))$pi)
}
### Starting with BioC 2.14, readGAlignmentPairsFromBam() behavior changed when
### using the 'which' argument. Old behavior: the same pair was returned once
### per each range in 'which' that had an overlap with the *two* segments in
### the pair. New behavior: the same pair is returned once per each range in
### 'which' that has an overlap with *any* of the 2 segments in the pair.
### The new behavior is a consequence of using
### scanBam(BamFile(asMates=TRUE), ...)
### behind the scene instead of
### findMateAlignment()
### for the pairing.
### The new behavior breaks the test below so I'm turning it off for now.
if (FALSE) {
test_readGAlignmentPairsFromBam_which <- function()
{
## 4 non-overlapping regions of interest: first two regions only overlap
## with first p001 mate and last two regions only with last p001 mate.
my_ROI <- GRanges("chr2", IRanges(c(10, 15, 110, 115), width=1))
my_ROI_labels <- c("chr2:10-10", "chr2:15-15",
"chr2:110-110", "chr2:115-115")
param <- ScanBamParam(tag="pi", which=my_ROI[c(1, 4)])
target1 <- readGAlignmentPairsFromBam(toy_bamfile, use.names=TRUE,
param=param, with.which_label=TRUE)
checkTrue(validObject(target1, complete=TRUE))
checkIdentical(1L, length(target1))
checkIdentical(Rle(factor(my_ROI_labels[1], levels=my_ROI_labels[c(1, 4)])),
mcols(first(target1))$which_label)
checkIdentical(Rle(factor(my_ROI_labels[4], levels=my_ROI_labels[c(1, 4)])),
mcols(last(target1))$which_label)
mcols(target1@first)$which_label <- mcols(target1@last)$which_label <- NULL
## Checking all possible combinations of ranges in 'which'.
check_my_ROI_subsets <- function(subset)
{
check_my_ROI_subset <- function(i)
{
#print(i)
param <- ScanBamParam(tag="pi", which=my_ROI[i])
current <- suppressWarnings(
readGAlignmentPairsFromBam(toy_bamfile, use.names=TRUE,
param=param)
)
if (sum(i <= 2L) == 1L && sum(i >= 3L) == 1L) {
checkIdentical(target1, current)
} else {
checkIdentical(0L, length(current))
}
}
check_my_ROI_subset(subset)
if (length(subset) >= 2L) {
check_my_ROI_subset(rev(subset))
if (length(subset) >= 4L)
check_my_ROI_subset(c(4L, 1:3))
}
TRUE
}
for (m in 1:length(my_ROI))
combn(length(my_ROI), m, FUN=check_my_ROI_subsets)
}
}
|