File: inspection.py

package info (click to toggle)
python-gffutils 0.9-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 9,812 kB
  • sloc: python: 4,295; makefile: 5; sh: 5
file content (105 lines) | stat: -rw-r--r-- 2,924 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from gffutils import iterators
from gffutils import interface
from collections import Counter
import sys


def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
                            'feature_count'], limit=None, verbose=True):
    """
    Inspect a GFF or GTF data source.

    This function is useful for figuring out the different featuretypes found
    in a file (for potential removal before creating a FeatureDB).

    Returns a dictionary with a key for each item in `look_for` and
    a corresponding value that is a dictionary of how many of each unique item
    were found.

    There will always be a `feature_count` key, indicating how many features
    were looked at (if `limit` is provided, then `feature_count` will be the
    same as `limit`).

    For example, if `look_for` is ['chrom', 'featuretype'], then the result
    will be a dictionary like::

        {
            'chrom': {
                'chr1': 500,
                'chr2': 435,
                'chr3': 200,
                ...
                ...
            }.

            'featuretype': {
                'gene': 150,
                'exon': 324,
                ...
            },

            'feature_count': 5000

        }


    Parameters
    ----------
    data : str, FeatureDB instance, or iterator of Features
        If `data` is a string, assume it's a GFF or GTF filename.  If it's
        a FeatureDB instance, then its `all_features()` method will be
        automatically called. Otherwise, assume it's an iterable of Feature
        objects.

    look_for : list
        List of things to keep track of. Options are:

            - any attribute of a Feature object, such as chrom, source, start,
              stop, strand.

            - "attribute_keys", which will look at all the individual
              attribute keys of each feature

    limit : int
        Number of features to look at.  Default is no limit.

    verbose : bool
        Report how many features have been processed.

    Returns
    -------
    dict
    """

    results = {}
    obj_attrs = []
    for i in look_for:
        if i not in ['attribute_keys', 'feature_count']:
            obj_attrs.append(i)
        results[i] = Counter()

    attr_keys = 'attribute_keys' in look_for

    d = iterators.DataIterator(data)
    feature_count = 0
    for f in d:
        if verbose:
            sys.stderr.write('\r%s features inspected' % feature_count)
            sys.stderr.flush()

        for obj_attr in obj_attrs:
            results[obj_attr].update([getattr(f, obj_attr)])

        if attr_keys:
            results['attribute_keys'].update(f.attributes.keys())

        feature_count += 1
        if limit and feature_count == limit:
            break

    new_results = {}
    for k, v in results.items():
        new_results[k] = dict(v)

    new_results['feature_count'] = feature_count
    return new_results