File: segmentation.py

package info (click to toggle)
ont-fast5-api 4.1.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,556 kB
  • sloc: python: 3,799; makefile: 152; sh: 13
file content (193 lines) | stat: -rw-r--r-- 9,018 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
""" Helper class for working with segmentation type analyses.
"""
import numpy as np

from ont_fast5_api.analysis_tools.base_tool import BaseTool
from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools


class SegmentationTools(BaseTool):
    """ Provides helper methods specific to segmentation analyses.
    """
    group_id = 'Segmentation'
    analysis_id = 'segmentation'

    def get_results(self):
        """ Returns the segmentation summary data.
        
        This data is normalized, to eliminate differences in what is stored
        for different types of segmentation analyses.
        
        The following fields are output:

        * has_template - True if the segmentation found template data.
        * has_complement - True if the segmentation found complement data.
        * first_sample_template - The first sample of the template data in
            the raw data. Only present if has_template is True.
        * duration_template - The duration (in samples) of the template
            data. Only present if has_template is True.
        * first_sample_complement - The first sample of the complement data
            in the raw data. Only present if has_complement is True.
        * duration_complement - The duration (in samples) of the complement
            data. Only present if has_complement is True.
            
        """
        summary = self._get_summary_data()
        if summary is None:
            results = {'has_template': False,
                       'has_complement': False}
        else:
            results = {}
            if 'has_template' in summary:
                results['has_template'] = bool(summary['has_template'])
            else:
                results['has_template'] = True if summary['num_temp'] > 0 else False
            if 'has_complement' in summary:
                results['has_complement'] = bool(summary['has_complement'])
            else:
                results['has_complement'] = True if summary['num_comp'] > 0 else False
            need_raw_info = False
            if results['has_template']:
                if 'start_index_temp' in summary:
                    summary['start_event_template'] = summary['start_index_temp']
                    summary['end_event_template'] = summary['end_index_temp']
                if 'first_sample_template' not in summary:
                    need_raw_info = True
            if results['has_complement']:
                if 'start_index_comp' in summary:
                    summary['start_event_complement'] = summary['start_index_comp']
                    summary['end_event_complement'] = summary['end_index_comp']
                if 'first_sample_complement' not in summary:
                    need_raw_info = True
            if need_raw_info:
                self._get_raw_info(summary)
            if results['has_template']:
                results['first_sample_template'] = summary['first_sample_template']
                results['duration_template'] = summary['duration_template']
                if 'start_event_template' in summary:
                    results['start_event_template'] = summary['start_event_template']
                    results['end_event_template'] = summary['end_event_template']
            if results['has_complement']:
                results['first_sample_complement'] = summary['first_sample_complement']
                results['duration_complement'] = summary['duration_complement']
                if 'start_event_complement' in summary:
                    results['start_event_complement'] = summary['start_event_complement']
                    results['end_event_complement'] = summary['end_event_complement']
        return results

    def get_event_data(self, section, time_in_seconds=False):
        """ Get the template or complement event data.
        
        :param section: Either template, complement, or both.
        :param time_in_seconds: Return the start and length fields
            in seconds, rather than samples.
        :return: The event dataset for the section. If section=both
            then it returns a tuple with both sections. Returns None
            if the section does not exist.
        """
        if section not in ['template', 'complement', 'both']:
            raise Exception('Unrecognized section: {} Expected: "template", "complement" or "both"'.format(section))
        results = self.get_results()
        if results is None:
            return None, None if section == 'both' else None
        if section == 'both':
            sections = ['template', 'complement']
        else:
            sections = [section]
        evdet_group, _ = self._find_event_data()
        with EventDetectionTools(self.handle, group_name=evdet_group) as evdet:
            event_data, _ = evdet.get_event_data(time_in_seconds=time_in_seconds)
        datasets = [None, None]
        for n, this_section in enumerate(sections):
            if not results['has_{}'.format(this_section)]:
                continue
            ev1 = results['start_event_{}'.format(this_section)]
            ev2 = results['end_event_{}'.format(this_section)]
            datasets[n] = event_data[ev1:ev2]
        if section == 'both':
            return tuple(datasets)
        return datasets[0]

    def get_raw_data(self, section, scale=False):
        """ Get the template or complement raw data.
        
        :param section: Either template, complement, or both.
        :param scale: Scale the raw data to pA.
        :return:  The raw data for the section. If section=both
            then it returns a tuple with both sections. Returns None
            if the section does not exist.
        """
        results = self.get_results()
        datasets = [None, None]
        if section == 'both':
            sections = ['template', 'complement']
        else:
            sections = [section]
        for n, this_section in enumerate(sections):
            if not results['has_{}'.format(this_section)]:
                continue
            start = results['first_sample_{}'.format(this_section)]
            dur = results['duration_{}'.format(this_section)]
            datasets[n] = self.handle.get_raw_data(start=start, end=start+dur, scale=scale)
        if section == 'both':
            return tuple(datasets)
        return datasets[0]


    ##########################
    #
    #  Private methods below
    #
    ##########################
    
    def _get_summary_data(self):
        summary = self.handle.get_summary_data(self.group_name)
        if summary is None:
            return None
        if 'segmentation' in summary:
            results = summary['segmentation']
        elif 'split_hairpin' in summary:
            results = summary['split_hairpin']
        else:
            results = None
        return results

    def _find_event_data(self):
        attrs = self.handle.get_analysis_attributes(self.group_name)
        evdet_group = attrs.get('event_detection')
        if evdet_group is None:
            evdet_group = self.handle.get_latest_analysis('EventDetection')
        else:
            evdet_group = evdet_group[9:]
        if evdet_group is None:
            return None
        # We directly use the Fast5Read interface here, rather than the
        # EventDetectionTools one, because we don't want to load the entire
        # event table into memory.
        read_info = self.handle.status.read_info[0] # We assume only one read.
        read_number = read_info.read_number
        event_table_group = '{}/Reads/Read_{}'.format(evdet_group, read_number)
        dataset = self.handle.get_analysis_dataset(event_table_group, 'Events', skip_decoding=True)
        return evdet_group, dataset

    def _get_raw_info(self, summary):
        _, dataset = self._find_event_data()
        read_info = self.handle.status.read_info[0] # We assume only one read.
        if dataset is None:
            summary['first_sample_template'] = None
            summary['duration_template'] = None
            summary['first_sample_complement'] = None
            summary['duration_complement'] = None
            return
        if summary.get('start_event_template', -1) >= 0:
            ev1 = summary['start_event_template']
            ev2 = summary['end_event_template']
            summary['first_sample_template'] = dataset[ev1]['start'] - read_info.start_time
            end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
            summary['duration_template'] = end - summary['first_sample_template']
        if summary.get('start_event_complement', -1) >= 0:
            ev1 = summary['start_event_complement']
            ev2 = summary['end_event_complement']
            summary['first_sample_complement'] = dataset[ev1]['start'] - read_info.start_time
            end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
            summary['duration_complement'] = end - summary['first_sample_complement']