File: bamHandler.py

package info (click to toggle)
python-deeptools 3.5.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 34,456 kB
  • sloc: python: 14,503; xml: 4,212; sh: 33; makefile: 5
file content (103 lines) | stat: -rw-r--r-- 3,345 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import pysam
from deeptools.mapReduce import mapReduce


def countReadsInInterval(args):
    chrom, start, end, fname, toEOF = args

    bam = openBam(fname)
    mapped = 0
    unmapped = 0
    for b in bam.fetch(chrom, start, end):
        if chrom == "*":
            unmapped += 1
            continue
        if b.pos < start:
            continue
        if not b.is_unmapped:
            mapped += 1
        else:
            unmapped += 1
    return mapped, unmapped, chrom


def getMappingStats(bam, nThreads):
    """
    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless

    This requires pysam > 0.13.0
    """
    header = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads)

    mapped = sum([x[0] for x in res])
    unmapped = sum([x[1] for x in res])
    stats = {x[0]: [0, 0] for x in header}
    for r in res:
        stats[r[2]][0] += r[0]
        stats[r[2]][1] += r[1]

    # We need to count the number of unmapped reads as well
    unmapped += bam.count("*")

    return mapped, unmapped, stats


def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
    """
    A wrapper for opening BAM/CRAM files.

    bamFile: str
        A BAM/CRAM file name

    returnStats: bool
        Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
        These additional values are needed by some downstream functions, since one
        can't use file_handle.mapped on CRAM files (or idxstats())

    nThreads: int
        If returnStats is True, number of threads to use for computing statistics

    minimalDecoding: Bool
        For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).

    Returns either the file handle or a tuple as described in returnStats
    """
    format_options = ["required_fields=0x1FF"]
    if sys.version_info.major >= 3:
        format_options = [b"required_fields=0x1FF"]
    if not minimalDecoding:
        format_options = None
    try:
        bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
    except IOError:
        sys.exit("The file '{}' does not exist".format(bamFile))
    except:
        sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))

    try:
        assert bam.check_index() is not False
    except:
        sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))

    if bam.is_cram and returnStats:
        mapped, unmapped, stats = getMappingStats(bam, nThreads)
    elif bam.is_bam:
        mapped = bam.mapped
        unmapped = bam.unmapped

        # Make the dictionary to hold the stats
        if returnStats:
            stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}

    if bam.is_bam or (bam.is_cram and returnStats):
        if mapped == 0:
            sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please "
                             "check that the file is properly indexed and "
                             "that it contains mapped reads.\n".format(bamFile))

    if returnStats:
        return bam, mapped, unmapped, stats
    else:
        return bam