File: fast5_file.py

package info (click to toggle)
ont-fast5-api 4.1.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,548 kB
  • sloc: python: 3,799; makefile: 153; sh: 13
file content (288 lines) | stat: -rw-r--r-- 12,499 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
""" Basic API for reading/writing single-read fast5 files.
"""
import os
import h5py
import numpy as np

from ont_fast5_api import CURRENT_FAST5_VERSION
from ont_fast5_api.compression_settings import VBZ, raise_missing_vbz_error_read
from ont_fast5_api.fast5_read import Fast5Read
from ont_fast5_api.fast5_info import Fast5Info, ReadInfo
from ont_fast5_api.static_data import supported_modes, mode_docstring

# This unused import is included for backwards compatibilty and can be removed in future.
from ont_fast5_api.data_sanitisation import _sanitize_data_for_reading, _sanitize_data_for_writing


class Fast5FileTypeError(ValueError):
    pass


class Fast5File(Fast5Read):
    """ This object encapsulates a read fast5 file. It can be used
    instead of directly using the h5py API in order to help maintain
    consistency in fast5 file format, and simplify common tasks.

    The object will contain a field called **status** that is a
    Fast5Status object with details about the file.
    """

    def __init__(self, fname, mode='r'):
        """ Constructor. Opens the specified file.

        :param fname: Filename to open.
        :param mode: File open mode (r, r+, w, w-, x, a).
        """
        self.global_key = "UniqueGlobalKey/"
        if mode not in supported_modes:
            raise IOError("Unsupported file handle mode : '{}' {}".format(mode, mode_docstring))
        self.filename = fname
        self.handle = None
        self.mode = mode
        self._initialise_file()

    def get_reads(self):
        # We yield here for consistency with MultiFast5File
        yield self.get_read(self.get_read_id())

    def get_read_ids(self):
        return [self.get_read_id()]

    def get_read(self, read_id):
        if read_id != self.get_read_id():
            raise KeyError("read_id given: {} does not match read_id in file: {}"
                           "".format(read_id, self.get_read_id()))
        return self

    @property
    def read_id(self):
        return self.status.read_info[0].read_id

    def get_read_id(self):
        DeprecationWarning("'read.get_read_id()' will be deprecated. Use the property 'read.read_id' instead")
        return self.read_id

    @property
    def raw_dataset_group_name(self):
        read_number = self._get_only_read_number()
        return 'Raw/Reads/Read_{}'.format(read_number)

    @property
    def has_context_tags(self):
        return 'context_tags' in self.handle[self.global_key[:-1]]

    def add_channel_info(self, data, clear=False):
        """ Add channel info data to the channel_id group.

        :param data: A dictionary of key/value pairs. Keys must be strings.
            Values can be strings or numeric values.
        :param clear: If set, any existing channel info data will be removed. 
        """
        self.assert_writeable()
        self._add_attributes(self.global_key + 'channel_id', data, clear)

    def get_raw_data(self, read_number=None, start=None, end=None, scale=False):
        """ Pull raw data from the file.
        
        :param read_number: The read number you want raw data from. Pass None
            if there is only one read, and will grab that read's raw data.
        :param start: The first sample to pull. Default is to pull from the
            beginning.
        :param end: One past the last sample to pull. Default is to pull until
            the end.
        :param scale: If set, the returned data will be scaled floating point
            values, in pA. Otherwise raw DAQ values are returned as 16 bit
            integers.
        :returns: Raw data as either 32 bit floats, or 16 bit integers.
        """
        self.assert_open()
        if self.raw_dataset_name not in self.handle:
            msg = 'Fast5 file has no raw data for read {} in {}'.format(read_number, self.filename)
            raise KeyError(msg)
        return self._load_raw(self.raw_dataset_name, start, end, scale)

    def add_raw_data(self, data, attrs=None, compression=VBZ):
        """ Add raw data for a read.
        
        :param read_number: The number of the read the raw data is for.
        :param data: The raw data DAQ values (16 bit integers).
        
        The read must already exist in the file. It must not already
        have raw data.
        """
        self.assert_writeable()

        if self.raw_dataset_name in self.handle:
            msg = 'Fast5 file already has raw data for read {} in {}'
            raise KeyError(msg.format(self.raw_dataset_name, self.filename))

        group_name = self.raw_dataset_group_name
        if group_name not in self.handle:
            self.handle.create_group(group_name)

        try:
            self.handle[group_name].create_dataset('Signal', data=data, dtype='i2', **vars(compression))
        except ValueError as e:
            raise_missing_vbz_error_read(e)

        read_index = self.status.read_number_map[self._get_only_read_number()]
        self.status.read_info[read_index].has_raw_data = True
        if attrs is not None:
            self._add_attributes(group_name, attrs, clear=True)

    def add_read(self, read_number, read_id, start_time, duration, mux, median_before):
        """ Add a new read to the file.

        :param read_number: The read number to assign to the read.
        :param read_id: The unique read-id for the read.
        :param start_time: The start time (in samples) of the read.
        :param duration: The duration (in samples) of the read.
        :param mux: The mux set at the time of the read.
        :param median_before: The median level of the data before the read.

        Note that most tools assume a file contains only one read.
        Putting multiple reads into a file severely limits the
        ability to operate on those reads with standard tools.
        """
        self.assert_writeable()
        read_info = ReadInfo(read_number, read_id, start_time, duration, mux=mux, median_before=median_before)
        self.status.read_info.append(read_info)
        n = len(self.status.read_info) - 1
        self.status.read_number_map[read_number] = n
        self.status.read_id_map[read_id] = n
        group_name = self.raw_dataset_group_name
        attrs = {'read_number': read_number,
                 'read_id': read_id,
                 'start_time': start_time,
                 'duration': duration,
                 'start_mux': mux,
                 'median_before': median_before}
        self._add_group(group_name, attrs)

    #########################
    #
    #  Static methods below
    #
    #########################

    @staticmethod
    def update_legacy_file(fname):
        """ Update a fast5 file from an older version to the new standard.
        
        :param fname: The filename of the fast5 file.
        """
        status = Fast5Info(fname)
        if not status.valid:
            raise IOError('Cannot update invalid file: {}'.format(fname))
        with h5py.File(fname, 'r+') as handle:
            # Add Raw/Read/Read_## groups for reads if they are missing.
            for read_info in status.read_info:
                read_group_name = 'Raw/Reads/Read_{}'.format(read_info.read_number)
                if read_group_name in handle:
                    rgh = handle[read_group_name]
                else:
                    rgh = handle.create_group(read_group_name)
                rgh.attrs['read_number'] = read_info.read_number
                rgh.attrs['read_id'] = read_info.read_id
                rgh.attrs['duration'] = read_info.duration
                rgh.attrs['start_time'] = read_info.start_time
                rgh.attrs['start_mux'] = read_info.start_mux

            # Add the Analyses and tracking_id groups, if they are missing.
            if not 'Analyses' in handle:
                handle.create_group('Analyses')
            if not 'tracking_id' in handle['UniqueGlobalKey']:
                handle.create_group('UniqueGlobalKey/tracking_id')

            # Update the EventDetection_000 created by old versions of MinKNOW, if needed.
            if status.version < 1.1:
                if 'Analyses/EventDetection_000' in handle:
                    reads_group = handle['Analyses/EventDetection_000/Reads']
                    data_group_names = reads_group.keys()
                    for data_group_name in data_group_names:
                        read_group = reads_group[data_group_name]
                        read_number = read_group.attrs['read_number']
                        read_info = status.read_info[status.read_number_map[read_number]]
                        read_group.attrs['read_id'] = read_info.read_id
                        if 'Events' in read_group:
                            dataset = read_group['Events']
                            if 'variance' in dataset.dtype.names:
                                old_data = read_group['Events'][()]
                                new_data = np.empty(old_data.size, dtype=[('mean', float), ('stdv', float),
                                                                          ('start', int), ('length', int)])
                                new_data[:]['mean'] = old_data['mean']
                                new_data[:]['stdv'] = np.sqrt(old_data['variance'])
                                new_data[:]['start'] = old_data['start']
                                new_data[:]['length'] = old_data['length']
                                del read_group['Events']
                                read_group.create_dataset('Events', data=new_data, compression='gzip')

            # Update the version number.
            handle.attrs['file_version'] = CURRENT_FAST5_VERSION

    @staticmethod
    def read_summary_data(fname, component):
        """ Read summary data suitable to encode as a json packet.

        :param fname: The fast5 file to pull the summary data from.
        :param component: The component name to pull summary data for.

        :returns: A dictionary containing the summary data.
        """
        summary = {}
        with Fast5File(fname, mode='r') as fh:
            summary['tracking_id'] = fh.get_tracking_id()
            summary['channel_id'] = fh.get_channel_info()
            read_info = fh.status.read_info
            read_summary = []
            for read in read_info:
                read_summary.append({'read_number': read.read_number,
                                     'read_id': read.read_id,
                                     'start_time': read.start_time,
                                     'duration': read.duration,
                                     'start_mux': read.start_mux})
            summary['reads'] = read_summary
            analyses_list = fh.list_analyses(component)
            _, group_names = zip(*analyses_list)
            group_names = sorted(group_names)
            group = group_names[-1]
            summary['software'] = fh.get_analysis_attributes(group)
            summary['software']['component'] = group[:-4]

            summary['data'] = fh.get_summary_data(group)
            summary['filename'] = os.path.basename(fname)
        return summary

    ##########################
    #
    #  Private methods below
    #
    ##########################

    def _get_only_read_number(self):
        read_number = self.status.read_info[0].read_number
        return read_number

    def _initialise_file(self):
        try:
            if self.mode in ['w', 'w-', 'x']:
                with h5py.File(self.filename, self.mode) as fh:
                    fh.attrs['file_version'] = CURRENT_FAST5_VERSION
                    fh.create_group('Analyses')
                    fh.create_group('Raw/Reads')
                    fh.create_group(self.global_key + 'channel_id')
                    fh.create_group(self.global_key + 'context_tags')
                    fh.create_group(self.global_key + 'tracking_id')
                self.mode = 'r+'
            self.status = Fast5Info(self.filename)
            if self.status.valid:
                self.handle = h5py.File(self.filename, self.mode)
        except Exception:
            raise Fast5FileTypeError("Failed to initialise single-read Fast5File: '{}'".format(self.filename))


class EmptyFast5(Fast5File):
    def _initialise_file(self):
        # Enable creation of Fast5File without metadata, which we will populate later
        self.handle = h5py.File(self.filename, self.mode)
        self.handle.attrs['file_version'] = CURRENT_FAST5_VERSION