File: reads_io_processor.py

package info (click to toggle)
htseq 2.0.9%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 103,476 kB
  • sloc: python: 6,280; sh: 211; cpp: 147; makefile: 80
file content (187 lines) | stat: -rw-r--r-- 6,364 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import itertools
import pysam
import HTSeq
import sys


class ReadsIO(object):
    """docstring for ReadsIO."""

    def __init__(
        self,
        sam_filename,
        samout_filename,
        samout_format,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):

        # Set by _prepare_bam_sam_file_parser function below.
        self.pe_mode = None
        self.read_seq = None
        self.read_seq_file = None
        self.template = None
        self.samoutfile = None
        self.samout_format = samout_format

        self._set_BAM_reader(sam_filename)
        self._set_output_template(samout_filename, samout_format)
        self._set_read_seq(
            supplementary_alignment_mode,
            secondary_alignment_mode,
            order,
            max_buffer_size,
        )

    def write_to_samout(self, read_sequence, assignment):
        if self.samoutfile is None:
            return
        if not self.pe_mode:
            # TODO not sure if this is good in all honesty..
            read_sequence = (read_sequence,)
        for read in read_sequence:
            if read is not None:
                read.optional_fields.append(("XF", assignment))
                if self.template is not None:
                    self.samoutfile.write(read.to_pysam_AlignedSegment(self.template))
                elif self.samout_format in ("SAM", "sam"):
                    self.samoutfile.write(read.get_sam_line() + "\n")
                else:
                    raise ValueError(
                        "BAM/SAM output: no template and not a test SAM file",
                    )

    def close_samoutfile(self):
        if self.samoutfile is not None:
            self.samoutfile.close()

    def _set_BAM_reader(self, sam_filename):
        """
        Convert the input SAM/BAM files into a parser.

        Parameters
        ----------
        sam_filename : str
            The name of SAM/BAM file to write out all SAM alignment records into.

        """
        if sam_filename == "-":
            self.read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            self.read_seq_file = HTSeq.BAM_Reader(sam_filename)

    def get_chromosome_names_header(self):
        """ Reads BAM header and returns a list of contigs, or None if no SQ in header """
        contigs = None
        sq = self.read_seq_file.get_header_dict().get("SQ")
        if sq is not None:
            contigs = []
            for sq_record in sq:
                sn = sq_record.get("SN")
                if sn:
                    contigs.append(sn)
        return contigs

    def _set_read_seq(
        self,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):

        """
        Prepare the BAM/SAM file iterator.
        Note, only run this after _set_BAM_reader as you need self.read_seq_file to be set.
        This will create a parser and prepare an iterator for it.
        Depending on whether we have paired-end reads or not, different iterator
        will be returned.

        Parameters
        ----------
        supplementary_alignment_mode : str
            Whether to score supplementary alignments (0x800 flag).
            Choices: score or ignore.
        secondary_alignment_mode : str
            Whether to score secondary alignments (0x100 flag).
            Choices: score or ignore.
        order : str
            Can only be either 'pos' or 'name'. Sorting order of <alignment_file>.
        max_buffer_size : int
            When <alignment_file> is paired end sorted by position, allow only so many reads to stay in memory
            until the mates are found (raising this number will use more memory).
            Has no effect for single end or paired end sorted by name.

        """

        read_seq_iter = iter(self.read_seq_file)
        # Catch empty BAM files
        try:
            first_read = next(read_seq_iter)
            self.pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs
        except:
            first_read = None
            self.pe_mode = False
        if first_read is not None:
            self.read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            self.read_seq = []

        if self.pe_mode:
            if (supplementary_alignment_mode == "ignore") and (
                secondary_alignment_mode == "ignore"
            ):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                self.read_seq = HTSeq.pair_SAM_alignments(
                    self.read_seq, primary_only=primary_only
                )
            elif order == "pos":
                self.read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    self.read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only,
                )
            else:
                raise ValueError("Illegal order specified.")

    def _set_output_template(self, samout_filename, samout_format):
        """
        Set up the SAM/BAM output files (and corresponding template) if possible.

        Parameters
        ----------
        samout_filename : str
            The name of SAM/BAM file to write out all SAM alignment records into.
        samout_format : str
            Format of the output files denoted by samouts.
            Choices: SAM, BAM, sam, bam.

        """
        if samout_filename is None:
            self.template = None
            self.samoutfile = None
        elif samout_format in ("bam", "BAM"):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "wb",
                template=self.template,
            )
        elif (samout_format in ("sam", "SAM")) and hasattr(
            self.read_seq_file, "get_template"
        ):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "w",
                template=self.template,
            )
        else:
            self.template = None
            self.samoutfile = open(samout_filename, "w")