File: methods-SolexaExportQA.R

package info (click to toggle)
r-bioc-shortread 1.32.0-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 8,384 kB
  • ctags: 293
  • sloc: ansic: 2,718; cpp: 202; sh: 3; makefile: 2
file content (285 lines) | stat: -rw-r--r-- 11,109 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
.SolexaExportQA <- function(x, ...) 
{
    new("SolexaExportQA", .srlist=x, ...)
}

## qa

.qa_lst_as_data_frame <- function(lst) {
    if (length(lst)==0) return(data.frame())
    nms <- names(lst[[1]])
    sublst <- sapply(nms, function(nm) {
        subListExtract(lst, nm, simplify=TRUE)
    })
    names(sublst) <- nms
    do.call(data.frame, sublst)
}

.qa_Solexa_tileStats <- function(dirPath, pattern, ...)
{
    .qa_Solexa_tileStats_tile <- function(dirPath, pattern, ...)
    {
        lane <- as.numeric(sub("s_([0-9]+)_.*", "\\1", pattern))
        tile <- as.numeric(sub("s_[0-9]+_([0-9]+)_.*", "\\1",
                               pattern))
        dna <- readXStringColumns(dirPath, pattern,
                                  colClasses=list(NULL, NULL, NULL,
                                    NULL, "DNAString"))[[1]]
        list(lane=lane, tile=tile,
             slane=(lane-1)*3+trunc((tile-1)/100)+1,
             stile=1+pmin((tile-1)%%200, (200-tile)%%200),
             nReads=length(dna),
             nClean=sum(alphabetFrequency(dna, baseOnly=TRUE)[,"other"]==0))
    }
    if (length(pattern)==0) pattern=".*_seq.txt$"
    lst <- bplapply(list.files(dirPath, pattern, full.names=TRUE),
                    .qa_Solexa_tileStats_tile, dirPath=dirPath, ...)
    .qa_lst_as_data_frame(lst)
}

.qa_SolexaExport_lane <-
    function(dirPath, pattern, ..., type="SolexaExport",
             verbose=FALSE)
{
    if (verbose)
        message("qa 'SolexaExport' pattern:", pattern)
    readLbls <- c("read", "filtered", "aligned")
    rpt <- readAligned(dirPath, pattern, type, ...)
    doc <- .qa_depthOfCoverage(rpt, pattern)
    ac <- .qa_adapterContamination(rpt, pattern, ...)
    df <- pData(alignData(rpt))
    filterIdx <- df$filtering=="Y"
    mapIdx <- !is.na(position(rpt))

    nbins <- max(df$tile)
    tiles <- seq_len(nbins)
    nReadByTile <- tabulate(df$tile, nbins)
    nFilterByTile <- tabulate(df$tile[filterIdx], nbins)
    nMapByTile <- tabulate(df$tile[mapIdx], nbins)

    qualityScore <- alphabetScore(quality(rpt)) / width(quality(rpt))
    qualityDf <- function(qscore)
    {
        d <- density(qscore)
        data.frame(quality=d$x,
                   density=d$y,
                   lane=pattern)
    }
    qualityScoreRead <- qualityDf(qualityScore)
    qualityScoreFiltered <- qualityDf(qualityScore[filterIdx])
    qualityScoreAligned <- qualityDf(qualityScore[mapIdx])

    abc <- alphabetByCycle(rpt)
    baseQuality <- apply(abc, 2, sum)

    alignQuality <- table(quality(alignQuality(rpt))[mapIdx])

    tablesRead <- tables(sread(rpt))
    tablesFiltered <- tables(sread(rpt)[filterIdx])
    tablesAligned <- tables(sread(rpt)[mapIdx])
    frequentSequences <-
        data.frame(sequence=c(
                     names(tablesRead$top),
                     names(tablesFiltered$top),
                     names(tablesAligned$top)),
                   count=c(
                     as.integer(tablesRead$top),
                     as.integer(tablesFiltered$top),
                     as.integer(tablesAligned$top)),
                   type=rep(
                     readLbls,
                     c(length(tablesRead$top),
                       length(tablesFiltered$top),
                       length(tablesAligned$top))),
                   lane=pattern)
    sequenceDistribution <-
        cbind(rbind(tablesRead$distribution,
                    tablesFiltered$distribution,
                    tablesAligned$distribution),
              type=rep(
                readLbls,
                c(nrow(tablesRead$distribution),
                  nrow(tablesFiltered$distribution),
                  nrow(tablesAligned$distribution))),
              lane=pattern)
    perCycleBaseCall <- .qa_perCycleBaseCall(abc, pattern)
    perCycleQuality <- .qa_perCycleQuality(abc, quality(rpt), pattern)
    list(readCounts=data.frame(
           read=sum(nReadByTile),
           filtered=sum(nFilterByTile),
           aligned=sum(nMapByTile),
           row.names=pattern),
         baseCalls=local({
             n <- apply(abc, 1, sum)
             data.frame(A=n["A"], C=n["C"], G=n["G"], T=n["T"],
                        N=n["N"], row.names=pattern)
         }),
         readQualityScore=cbind(
           rbind(qualityScoreRead,
                 qualityScoreFiltered,
                 qualityScoreAligned),
           type=rep(
             readLbls,
             c(nrow(qualityScoreRead),
               nrow(qualityScoreFiltered),
               nrow(qualityScoreAligned)))),
         baseQuality=data.frame(
           score=as.vector(names(baseQuality)),
           count=as.vector(baseQuality),
           lane=pattern, row.names=NULL),
         alignQuality=data.frame(
           score=as.numeric(names(alignQuality)),
           count=as.vector(alignQuality),
           lane=pattern, row.names=NULL),
         frequentSequences=frequentSequences,
         sequenceDistribution=sequenceDistribution,

         perCycle=list(
           baseCall=perCycleBaseCall,
           quality=perCycleQuality),

         perTile=list(
           readCounts=data.frame(
             count=c(nReadByTile, nFilterByTile, nMapByTile),
             type=rep(
               readLbls,
               c(length(nReadByTile), length(nFilterByTile),
                 length(nMapByTile))),
             tile=rep(tiles, 3),
             lane=pattern, row.names=NULL),
           medianReadQualityScore=local({
               tidx <- as.character(tiles)
               data.frame(score=c(
                            tapply(qualityScore,
                                   df$tile, median)[tidx],
                            tapply(qualityScore[filterIdx],
                                   df$tile[filterIdx], median)[tidx],
                            tapply(qualityScore[mapIdx],
                                   df$tile[mapIdx], median)[tidx]),
                          type=rep(readLbls, each=length(tidx)),
                          tile=as.integer(tidx),
                          lane=pattern, row.names=NULL)
           })),
         depthOfCoverage=doc,
         adapterContamination=ac)
}

.qa_SolexaExport <-
    function(dirPath, pattern, type="SolexaExport", ...,
             verbose=FALSE)
{
    fls <- .file_names(dirPath, pattern)
    lst <- bplapply(basename(fls), .qa_SolexaExport_lane,
                   dirPath=dirPath, type=type, ..., verbose=verbose)

    ## collapse into data frames
    lst <-
        list(readCounts=.bind(lst, "readCounts"),
             baseCalls=.bind(lst, "baseCalls"),
             readQualityScore=.bind(lst, "readQualityScore"),
             baseQuality=.bind(lst, "baseQuality"),
             alignQuality=.bind(lst, "alignQuality"),
             frequentSequences=.bind(lst, "frequentSequences"),
             sequenceDistribution=.bind(lst, "sequenceDistribution"),
             perCycle=local({
                 lst <- subListExtract(lst, "perCycle")
                 list(baseCall=.bind(lst, "baseCall"),
                      quality=.bind(lst, "quality"))
             }),
             perTile=local({
                 lst <- subListExtract(lst, "perTile")
                 list(readCounts=.bind(lst, "readCounts"),
                      medianReadQualityScore=.bind(
                        lst, "medianReadQualityScore"))
             }),
             depthOfCoverage=.bind(lst, "depthOfCoverage"),
             adapterContamination=.bind(lst, "adapterContamination"))
    .SolexaExportQA(lst)
}

## report

setMethod(.report_pdf, "SolexaExportQA",
          function(x, dest, type, ...)
{
    qa <- x                             # mnemonic alias
    to <- tempfile()
    save(qa, file=to)
    res <- callGeneric(to, dest, type, ...)
    unlink(to)
    res
})

setMethod(report_html, "SolexaExportQA",
          function (x, dest, type, ...)
{
    qa <- .qa_sampleKey(x)
    dir.create(dest, recursive=TRUE)
    fls <- c("0000-Header.html", "1000-Overview.html",
             "2000-RunSummary.html", "3000-ReadDistribution.html",
             "4000-CycleSpecific.html", "5000-PerTile.html",
             "6000-Alignment.html", "8000-DepthOfCoverage.html",
             "9000-AdapterContamination.html", "9999-Footer.html")
    sections <- system.file("template", fls, package="ShortRead")
    perCycle <- qa[["perCycle"]]
    perTile <- qa[["perTile"]]
    readCnt <- perTile[["readCounts"]]
    values <-
        list(SAMPLE_KEY=hwrite(qa[["keyValue"]], border=0),
             PPN_COUNT=.html_img(
               dest, "readCount", .plotReadCount(qa)),
             PPN_COUNT_TBL=hwrite(
               .ppnCount(qa[["readCounts"]]),
               border=0),
             BASE_CALL_COUNT=.html_img(
               dest, "baseCalls", .plotNucleotideCount(qa)),
             READ_QUALITY_FIGURE=.htmlReadQuality(
               dest, "readQuality", qa),
             READ_OCCURRENCES_FIGURE=.htmlReadOccur(
               dest, "readOccurences", qa),
             FREQUENT_SEQUENCES_READ=hwrite(
               .freqSequences(qa, "read"),
               border=0),
             FREQUENT_SEQUENCES_FILTERED=hwrite(
               .freqSequences(qa, "filtered"),
               border=0),
             FREQUENT_SEQUENCES_ALIGNED=hwrite(
               .freqSequences(qa, "aligned"),
               border=0),
             CYCLE_BASE_CALL_FIGURE=.html_img(
               dest, "perCycleBaseCall",
               .plotCycleBaseCall(perCycle$baseCall)),
             CYCLE_QUALITY_FIGURE=.html_img(
               dest, "perCycleQuality",
               .plotCycleQuality(perCycle$quality)),
             PER_TILE_HISTOGRAM=local({
                 cnts <- readCnt[readCnt$type=="read", "count"]
                 hist <- histogram(cnts, breaks=40,
                                   xlab="Reads per tile",
                                   panel=function(x, ...) {
                                       panel.abline(v=quantile(x, .1),
                                                    col="red", lty=2)
                                       panel.histogram(x, ...)
                                   }, col="white")
                 .html_img(dest, "perTileHistogram", hist)
             }),
             PER_TILE_COUNT_FIGURE=.html_img(
               dest, "perTileCount",
               .plotTileCounts(readCnt[readCnt$type=="read",])),
             PER_TILE_QUALITY_FIGURE=local({
                 qscore <- perTile[["medianReadQualityScore"]]
                 score <- qscore[qscore$type=="read",]
                 .html_img(dest, "perTileQuality",
                           .plotTileQualityScore(score))
             }),
             ALIGN_QUALITY_FIGURE=.html_img(
               dest, "alignmentQuality",
               .plotAlignQuality(qa[["alignQuality"]])),
             DEPTH_OF_COVERAGE_FIGURE=.html_img(
               dest, "depthOfCoverage",
               .plotDepthOfCoverage(qa[["depthOfCoverage"]])),
             ADAPTER_CONTAMINATION=hwrite(
               .df2a(qa[["adapterContamination"]]),
               border=0))
    .report_html_do(dest, sections, values, ...)
})