File: fast5_info.py

package info (click to toggle)
ont-fast5-api 4.1.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,548 kB
  • sloc: python: 3,799; makefile: 153; sh: 13
file content (174 lines) | stat: -rwxr-xr-x 8,401 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
""" Helper class for getting information about a fast5 file.
"""
import os
import h5py

from packaging import version as packaging_version

# This import is kept for backwards compatibility and may be removed in future.
# NOTE(review): `_clean` is also used by Fast5Info below -- confirm before removing.
from ont_fast5_api.data_sanitisation import _clean

class ReadInfo(object):
    """ Plain record holding the basic details of a single read.

    The raw-data and event-data fields start out as "nothing present";
    they are meant to be updated by whoever inspects the file contents.
    """

    def __init__(self, read_number, read_id, start_time, duration,
                 mux=0, median_before=-1.0):
        """ Builds the description of one read.

        :param read_number: A read number, unique for the channel.
        :param read_id: A globally unique read id.
        :param start_time: The start time of the read (in samples).
        :param duration: The duration of the read (in samples).
        :param mux: The mux of the channel when the read occurred.
            Stored as ``start_mux``.
        :param median_before: The median current before the read.
        """
        # Identity of the read.
        self.read_number, self.read_id = read_number, read_id
        # Nothing is known about the stored data yet.
        self.has_raw_data = False
        self.duration = duration
        self.has_event_data, self.event_data_count = False, 0
        # Timing and channel state supplied by the caller.
        self.start_time = start_time
        self.start_mux = mux
        self.median_before = median_before


class Fast5Info(object):
    """ This object provides some basic details about a read fast5 file.

    **Fields**
      * **valid:** Indicates whether the fast5 file is valid or not.
      * **version:** Indicates the version of the read fast5 file
        specification the file conforms to (if any). Set to 0.0 when the
        file has no ``file_version`` attribute.
      * **channel:** The ``channel_number`` attribute of
        ``UniqueGlobalKey/channel_id``, or None if it is not available.
      * **read_info:** A list of ReadInfo objects. One entry for each read.
      * **read_number_map:** A dictionary giving the index into the read_info
        list for each read number.
      * **read_id_map:** A dictionary giving the index into the read_info
        list for each read-id.
    """

    def __init__(self, fname):
        """ Constructs a status object from a file.

        A file that lacks the ``UniqueGlobalKey`` group or its ``channel_id``
        entry, or a non-legacy raw read without a ``read_id``, is reported
        through ``valid = False`` rather than through an exception.

        :param fname: Filename of fast5 file to read status from.
        :raises: Any error raised while opening or reading the file (for
            example a read group lacking ``read_number``, ``start_time`` or
            ``duration``) is propagated to the caller.
        """
        self.valid = True
        self.channel = None
        self.read_info = []
        self.read_number_map = {}
        self.read_id_map = {}
        try:
            with h5py.File(fname, 'r') as handle:
                # Order matters: the version must be known before any of
                # the legacy checks made by the later steps.
                self._read_version(handle)
                self._check_global_keys(handle)
                self._collect_raw_reads(handle, fname)
                self._collect_event_reads(handle, fname)
        except BaseException:
            # Same reach as a bare ``except``: flag the object, then let
            # the original error propagate unchanged.
            self.valid = False
            raise

        # A legacy file must have either raw data or event data (or both).
        if self._legacy_version() and not self.read_info:
            self.valid = False

    def _read_version(self, handle):
        """ Sets ``version`` from the file and flags files that are too old.

        :param handle: The open h5py file.
        """
        if 'file_version' not in handle.attrs:
            self.valid = False
            self.version = 0.0
            return
        self.version = _clean(handle.attrs['file_version'])
        minimum_valid_version = packaging_version.Version('0.6')
        if packaging_version.parse(str(self.version)) < minimum_valid_version:
            self.valid = False

    def _check_global_keys(self, handle):
        """ Checks the required ``UniqueGlobalKey`` groups and reads the channel.

        :param handle: The open h5py file.
        """
        # Default to "no keys" so that a missing UniqueGlobalKey group marks
        # the file invalid below instead of leaving the name unbound.
        global_keys = ()
        if 'UniqueGlobalKey' in handle.keys():
            global_keys = handle['UniqueGlobalKey'].keys()
        if 'tracking_id' not in global_keys and not self._legacy_version():
            self.valid = False
        if 'channel_id' in global_keys:
            channel_attrs = handle['UniqueGlobalKey/channel_id'].attrs
            self.channel = channel_attrs.get('channel_number')
        else:
            # Only look the group up when it exists; otherwise the lookup
            # would raise and the validity flag would never be seen.
            self.valid = False
        if self.channel is None and self._legacy_version():
            self.valid = False

    def _make_read_info(self, read_group, fname):
        """ Builds a ReadInfo from the attributes of one read group.

        :param read_group: The h5py group of a single read.
        :param fname: Filename of the fast5 file. Its basename is used as
            the read id of legacy reads that carry no ``read_id``.
        :returns: The ReadInfo, or None if the read has no ``read_id`` and
            the file is not a legacy file (``valid`` is cleared in that case).
        """
        read_attrs = read_group.attrs
        read_number = _clean(read_attrs['read_number'])
        if 'read_id' in read_attrs:
            read_id = _clean(read_attrs['read_id'])
        elif self._legacy_version():
            read_id = os.path.basename(fname)
        else:
            self.valid = False
            return None
        start_time = _clean(read_attrs['start_time'])
        duration = _clean(read_attrs['duration'])
        mux = _clean(read_attrs.get('start_mux', 0))
        median_before = _clean(read_attrs.get('median_before', -1.0))
        return ReadInfo(read_number, read_id, start_time, duration,
                        mux, median_before)

    def _register_read(self, read_info):
        """ Appends a read and indexes it by read number and read id.

        :param read_info: The ReadInfo to add.
        """
        self.read_info.append(read_info)
        index = len(self.read_info) - 1
        self.read_number_map[read_info.read_number] = index
        self.read_id_map[read_info.read_id] = index

    def _collect_raw_reads(self, handle, fname):
        """ Registers every read found under ``Raw/Reads``.

        :param handle: The open h5py file.
        :param fname: Filename of the fast5 file.
        """
        if 'Raw' not in handle.keys():
            # Only legacy files may omit the Raw group.
            if not self._legacy_version():
                self.valid = False
            return
        for read in handle['Raw/Reads'].keys():
            read_group = handle['Raw/Reads/{}'.format(read)]
            read_info = self._make_read_info(read_group, fname)
            if read_info is None:
                # No usable read id: skip the read, as the event loop does,
                # rather than registering it under an undefined or stale id.
                continue
            if 'Signal' in read_group:
                read_info.has_raw_data = True
            elif self._legacy_version():
                # Legacy files may hold the signal in a 'Data' entry.
                if 'Data' in read_group:
                    read_info.has_raw_data = True
                else:
                    self.valid = False
            self._register_read(read_info)

    def _collect_event_reads(self, handle, fname):
        """ Merges event information from one EventDetection analysis.

        The analysis names are walked in reverse sorted order and only the
        first ``EventDetection*`` group that has a ``Reads`` subgroup is used.

        :param handle: The open h5py file.
        :param fname: Filename of the fast5 file.
        """
        analyses = sorted(handle['Analyses'].keys()) if 'Analyses' in handle else []
        for ana in analyses[::-1]:
            if not ana.startswith('EventDetection'):
                continue
            reads_group_name = 'Analyses/{}/Reads'.format(ana)
            if reads_group_name not in handle:
                continue
            for read in handle[reads_group_name].keys():
                read_group = handle['{}/{}'.format(reads_group_name, read)]
                read_info = self._make_read_info(read_group, fname)
                if read_info is None:
                    continue
                if 'Events' in read_group:
                    read_info.has_event_data = True
                    read_info.event_data_count = len(read_group['Events'])
                if read_info.read_number in self.read_number_map:
                    # Read already known from the raw data: only copy the
                    # event details onto the existing entry.
                    known = self.read_info[self.read_number_map[read_info.read_number]]
                    known.has_event_data = read_info.has_event_data
                    known.event_data_count = read_info.event_data_count
                else:
                    # Reads that have events but no raw entry are only
                    # acceptable in legacy files; they are recorded either way.
                    if not self._legacy_version():
                        self.valid = False
                    self._register_read(read_info)
            break

    def _legacy_version(self):
        """ Tells whether the file version is older than 1.1.

        :returns: True if ``version`` parses to a version below 1.1.
        """
        legacy_cutoff = packaging_version.Version("1.1")
        return packaging_version.parse(str(self.version)) < legacy_cutoff