File: count_features_per_file.py

package info (click to toggle)
htseq 2.0.9%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 103,476 kB
  • sloc: python: 6,280; sh: 211; cpp: 147; makefile: 80
file content (465 lines) | stat: -rw-r--r-- 15,773 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
import itertools
import random
import sys

from HTSeq.scripts.utils import invert_strand, UnknownChrom
from HTSeq.scripts.count_features.reads_io_processor import ReadsIO
from HTSeq.scripts.count_features.reads_stats import ReadsStatistics


def count_reads_single_file(
    isam,
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
):
    """
    The function that does the counting for each input BAM/SAM file.
    Fixme: there are some redundant parameters here.. feature_type, id_attribute, additional_attributes

    Parameters
    ----------
    isam : int
        input files' indexing for the purpose of parallel processing.
        This basically tell you which input file is being processed by this
        instance of function.
    sam_filename : str
        Path to the SAM/BAM file containing the mapped reads.
    features : array
        TODO check the type of this parameter.
        Supplied by HTSeq.make_feature_genomicarrayofsets
    feature_attr : array
        TODO check the type of this parameter.
        Supplied by HTSeq.make_feature_genomicarrayofsets
    order : str
        Can only be either 'pos' or 'name'. Sorting order of <alignment_file>.
    max_buffer_size : int
        The number of reads allowed to stay in memory until mates are found.
        Used when <alignment_file> is paired end sorted by position.
    stranded : str
        Whether the data to be aligned is from a strand-specific assay.
        Option is yes, no, reverse.
        reverse means yes with reversed strand interpretation.
    overlap_mode : str
        Mode to handle reads overlapping more than one feature.
        Choices: union, intersection-strict, intersection-nonempty.
    multimapped_mode : str
        Whether and how to score reads that are not uniquely aligned or
        ambiguously assigned to features.
        Choices: none, all, fraction, random.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.
    feature_type : str
        Feature type (3rd column in GTF file) to be used, all features of other
        type are ignored (default, suitable for Ensembl, GTF files: exon).
    id_attribute : str
        GTF attribute to be used as feature ID.
        Normally gene_id, suitable for Ensembl GTF files.
    additional_attributes : array
        Additional feature attributes.
        Commonly, gene_name is suitable for Ensembl GTF files.
    quiet : boolean
        Whether to suppress progress report.
    minaqual : int
        Value denoting the MAPQ alignment quality of reads to skip.
    samout_format : str
        Format of the output files denoted by samouts.
        Choices: SAM, BAM, sam, bam.
    samout_filename : str
        The name of SAM/BAM file to write out all SAM alignment records into.
    Returns
    -------
    Dictionary
        TODO update me when done refactoring

    """
    try:
        read_io_obj = ReadsIO(
            sam_filename=sam_filename,
            samout_filename=samout_filename,
            samout_format=samout_format,
            supplementary_alignment_mode=supplementary_alignment_mode,
            secondary_alignment_mode=secondary_alignment_mode,
            order=order,
            max_buffer_size=max_buffer_size,
        )

        # If the BAM header is available, check that at least one of the
        # chromosomes is also found in the GTF/GFF file, otherwise the user
        # is probably doing something wrong (e.g. "chr1" vs "1").
        bam_chroms = read_io_obj.get_chromosome_names_header()
        if bam_chroms is not None:
            bam_chroms = set(bam_chroms)
            feature_chroms = set(features.chrom_vectors.keys())
            if not (bam_chroms & feature_chroms):
                sys.stderr.write(
                    f"The alignment file has no chromosomes in common with the GFF/GTF "
                    "file. This will result in zero feature counts. Please check if the "
                    "references match, e.g. if you are using 'chr1' or '1' as "
                    "chromosome names.\n")

    except:
        sys.stderr.write("Error occurred when reading beginning of SAM/BAM file.\n")
        raise

    try:
        read_stats = ReadsStatistics(
            feature_attr=feature_attr, read_io_object=read_io_obj
        )
    except:
        sys.stderr.write(
            "Error occurred when preparing object to store the reads' assignments\n"
        )
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ("M", "=", "X")

    try:
        for r in read_io_obj.read_seq:
            read_stats.print_progress()
            read_stats.add_num_reads_processed()

            #  get the interval/read sequence.
            if not read_io_obj.pe_mode:
                skip_read = _assess_non_pe_read(
                    read_sequence=r,
                    read_stats=read_stats,
                    secondary_alignment_mode=secondary_alignment_mode,
                    supplementary_alignment_mode=supplementary_alignment_mode,
                    multimapped_mode=multimapped_mode,
                    minaqual=minaqual,
                )

                if skip_read:
                    continue
                iv_seq = _get_iv_seq_non_pe_read(com, r, stranded)
            else:

                # NOTE: the logic here is a little arbitrary and might benefit
                # from an optional arg. If the reads are paired-end but one of
                # the two is missing, ATM we rely on the other one for info,
                # however the data is technically inconsistent and we might
                # want to let the user choose.
                skip_read = _assess_pe_read(
                    minaqual,
                    multimapped_mode,
                    r,
                    read_stats,
                    secondary_alignment_mode,
                    supplementary_alignment_mode,
                )
                if skip_read:
                    continue

                iv_seq = _get_iv_seq_pe_read(com, r, stranded)

            # this bit updates the counts obtained from aligning reads to feature sets.
            try:
                fs = _align_reads_to_feature_set(features, iv_seq, overlap_mode)

                _update_feature_set_counts(fs, multimapped_mode, r, read_stats)

            except UnknownChrom:
                read_stats.add_empty_read(read_sequence=r)

    except:
        sys.stderr.write(
            "Error occured when processing input (%s):\n"
            % (read_io_obj.read_seq_file.get_line_number_string())
        )
        raise

    if not quiet:
        read_stats.print_progress(force_print=True)

    read_io_obj.close_samoutfile()

    res = read_stats.get_output(isam)
    return res


def _update_feature_set_counts(fs, multimapped_mode, read_sequence, read_stats):
    """
    Distribute the counts among the aligned feature set.

    Parameters
    ----------
    fs : array
        A list of feature set previously aligned to the read
    multimapped_mode : str
        How to handle read mapped to multiple features
    read_sequence : array
        Read sequence
    read_stats : ReadsStatistics object
        For updating bad reads

    """
    if fs is None or len(fs) == 0:
        read_stats.add_empty_read(read_sequence=read_sequence)
    elif len(fs) > 1:
        read_stats.add_ambiguous_read(
            read_sequence=read_sequence,
            assignment="__ambiguous[" + "+".join(sorted(fs)) + "]",
        )
    else:
        read_stats.add_good_read_assignment(
            read_sequence=read_sequence, assignment=list(fs)[0]
        )
    if fs is not None and len(fs) > 0:
        fs = list(fs)
        if multimapped_mode == "none":
            if len(fs) == 1:
                read_stats.add_to_count(feature=fs[0])
        elif multimapped_mode == "all":
            for fsi in fs:
                read_stats.add_to_count(feature=fsi)
        elif multimapped_mode == "fraction":
            val = 1.0 / len(fs)
            for fsi in fs:
                read_stats.add_to_count(feature=fsi, value=val)
        elif multimapped_mode == "random":
            fsi = random.choice(fs)
            read_stats.add_to_count(feature=fsi)
        else:
            sys.exit("Illegal multimap mode.")


def _align_reads_to_feature_set(features, iv_seq, overlap_mode):
    """
    Align reads to feature set.

    Parameters
    ----------
    features : array
        A set of features to align the reads to
        TODO not sure the type yet.
    iv_seq : array
        TODO not sure the type yet.
        Read (or interval?) sequence
    overlap_mode : str
        How to select the features for read that not 100% aligned to a feature.

    Returns
    -------
    fs : array
        A set of features to align the reads to
        TODO not sure the type yet.

    """
    if overlap_mode == "union":
        fs = set()
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for iv2, fs2 in features[iv].steps():
                fs = fs.union(fs2)
    elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
        fs = None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for iv2, fs2 in features[iv].steps():
                if (len(fs2) > 0) or (overlap_mode == "intersection-strict"):
                    if fs is None:
                        fs = fs2.copy()
                    else:
                        fs = fs.intersection(fs2)
    else:
        sys.exit("Illegal overlap mode.")
    return fs


def _get_iv_seq_pe_read(com, r, stranded):
    """
    Function to break down the read sequence into intervals which will
    subsequently be processed.

    Parameters
    ----------
    com : array
        CIGAR match characters (including alignment match, sequence match, and
        sequence mismatch
    r :
        todo update type
        Read sequence
    stranded : str
        Whether the data to be aligned is from a strand-specific assay.
        Option is yes, no, reverse.
        reverse means yes with reversed strand interpretation.

    Returns
    -------
    iv_seq :
        todo update type

    """
    if r[0] is not None and r[0].aligned:
        iv_seq = _get_iv_seq_pe_read_first(com, r[0], stranded)
    else:
        iv_seq = tuple()
    if r[1] is not None and r[1].aligned:
        iv_seq = _get_iv_seq_pe_read_second(com, iv_seq, r[1], stranded)
    return iv_seq


def _assess_pe_read(
    minaqual,
    multimapped_mode,
    read_sequence,
    read_stats,
    secondary_alignment_mode,
    supplementary_alignment_mode,
):
    """
    Function to check the read for paired end.

    Parameters
    ----------
    minaqual : int
        Value denoting the MAPQ alignment quality of reads to skip.
    multimapped_mode : str
        Whether and how to score reads that are not uniquely aligned or
        ambiguously assigned to features.
        Choices: none, all, fraction, random.
    read_sequence :
        todo update type
    read_stats : ReadsStatistics object
        Object which stores a bunch of statistics about the read sequences.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.

    Returns
    -------

    """
    # NOTE: Sometimes read1 is None or not aligned but read2 is fine, in that
    # case we should not exclude the entire pair but rather use the interval
    # of the second read
    read1_miss = (read_sequence[0] is None) or (not read_sequence[0].aligned)
    read2_miss = (read_sequence[1] is None) or (not read_sequence[1].aligned)
    if read1_miss and read2_miss:
        read_stats.add_not_aligned_read(read_sequence=read_sequence)
        return True

    if secondary_alignment_mode == "ignore":
        if (read_sequence[0] is not None) and read_sequence[0].not_primary_alignment:
            return True
        elif (read_sequence[1] is not None) and read_sequence[1].not_primary_alignment:
            return True
    if supplementary_alignment_mode == "ignore":
        if (read_sequence[0] is not None) and read_sequence[0].supplementary:
            return True
        elif (read_sequence[1] is not None) and read_sequence[1].supplementary:
            return True
    try:
        if (
            read_sequence[0] is not None and read_sequence[0].optional_field("NH") > 1
        ) or (
            read_sequence[1] is not None and read_sequence[1].optional_field("NH") > 1
        ):
            read_stats.add_not_unique_read(read_sequence=read_sequence)
            if multimapped_mode == "none":
                return True
    except KeyError:
        pass
    if (read_sequence[0] and read_sequence[0].aQual < minaqual) or (
        read_sequence[1] and read_sequence[1].aQual < minaqual
    ):
        read_stats.add_low_quality_read(read_sequence=read_sequence)
        return True
    return False


# Get GenomicInterval for each read, whether single-end or paired-end
def _get_iv_seq_non_pe_read(com, r, stranded):
    if stranded != "reverse":
        iv_seq = (co.ref_iv for co in r.cigar if co.type in com and co.size > 0)
    else:
        iv_seq = (
            invert_strand(co.ref_iv)
            for co in r.cigar
            if (co.type in com and co.size > 0)
        )
    return iv_seq


def _get_iv_seq_pe_read_first(com, read, stranded):
    if stranded != "reverse":
        iv_seq = (co.ref_iv for co in read.cigar if co.type in com and co.size > 0)
    else:
        iv_seq = (
            invert_strand(co.ref_iv)
            for co in read.cigar
            if co.type in com and co.size > 0
        )
    return iv_seq


def _get_iv_seq_pe_read_second(com, iv_seq, read, stranded):
    if stranded != "reverse":
        iv_seq = itertools.chain(
            iv_seq,
            (
                invert_strand(co.ref_iv)
                for co in read.cigar
                if co.type in com and co.size > 0
            ),
        )
    else:
        iv_seq = itertools.chain(
            iv_seq, (co.ref_iv for co in read.cigar if co.type in com and co.size > 0)
        )
    return iv_seq


def _assess_non_pe_read(
    read_sequence,
    read_stats,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    multimapped_mode,
    minaqual,
):
    if not read_sequence.aligned:
        read_stats.add_not_aligned_read(read_sequence=read_sequence)
        return True
    if (secondary_alignment_mode == "ignore") and read_sequence.not_primary_alignment:
        return True
    if (supplementary_alignment_mode == "ignore") and read_sequence.supplementary:
        return True
    try:
        if read_sequence.optional_field("NH") > 1:
            read_stats.add_not_unique_read(read_sequence=read_sequence)
            if multimapped_mode == "none":
                return True
    except KeyError:
        pass
    if read_sequence.aQual < minaqual:
        read_stats.add_low_quality_read(read_sequence=read_sequence)
        return True

    return False