File: column.pyx

package info (click to toggle)
obitools 3.0.1~b26%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,788 kB
  • sloc: ansic: 24,299; python: 657; sh: 27; makefile: 20
file content (731 lines) | stat: -rwxr-xr-x 25,555 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
#cython: language_level=3
 

# Registry of the column classes, filled by register_column_class():
#   key   : (obitype, multi_elts, tuples)
#   value : (obiclass, python_type)
__OBIDMS_COLUMN_CLASS__ = {}


from ..capi.obitypes cimport name_data_type, \
                             obitype_t, \
                             obiversion_t, \
                             OBI_QUAL, \
                             OBI_STR

from ..capi.obidms cimport obi_import_column

from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
                                 obi_close_column, \
                                 obi_get_elements_names, \
                                 obi_column_formatted_infos, \
                                 obi_column_write_comments
                                     
from ..capi.obiutils cimport obi_format_date
 
from ..capi.obiview cimport obi_view_add_column, \
                            obi_view_get_pointer_on_column_in_view, \
                            Obiview_p, \
                            NUC_SEQUENCE_COLUMN, \
                            QUALITY_COLUMN, \
                            REVERSE_SEQUENCE_COLUMN, \
                            REVERSE_QUALITY_COLUMN


from ..object cimport OBIDeactivatedInstanceError

from obitools3.utils cimport tobytes, \
                             bytes2str, \
                             str2bytes, \
                             str2bytes_object, \
                             bytes2str_object, \
                             clean_empty_values_from_object
 
from obitools3.dms.column import typed_column
 
from libc.stdlib cimport free
from libc.string cimport strcpy
  
import importlib
import inspect
import pkgutil
import json


cdef class Column(OBIWrapper) :
    '''
    The obitools3.dms.column.Column class wraps a C instance of a column in the context of a View 
    '''

    cdef inline OBIDMS_column_p pointer(self) :
        # self._pointer is read as a pointer to a column pointer
        # (OBIDMS_column_p*); dereference it once to get the column pointer.
        return <OBIDMS_column_p>(<OBIDMS_column_p*>(self._pointer))[0]


    @staticmethod
    cdef type get_column_class(obitype_t obitype, bint multi_elts, bint tuples):
        '''
        Internal function returning the python class registered for the
        (obitype, multi_elts, tuples) combination.

        Raises KeyError when no class was registered for that combination.
        '''
        registered = __OBIDMS_COLUMN_CLASS__[(obitype, multi_elts, tuples)]
        # Registry entries are (obiclass, python_type) pairs
        return registered[0]


    @staticmethod
    cdef type get_python_type(obitype_t obitype, bint multi_elts):  # TODO take the tuples flag into account
        '''
        Internal function returning the python type representing 
        an instance for a given obitype.

        The registry is keyed by (obitype, multi_elts, tuples), so a
        two-element key can never match. As this function receives no
        `tuples` flag, the entry of the non-tuple column is looked up.

        Raises KeyError when no class was registered for that combination.
        '''
        return __OBIDMS_COLUMN_CLASS__[(obitype, multi_elts, False)][1]


    @staticmethod
    def import_column(object dms_1, object dms_2, object column_name, obiversion_t version_number):
        '''
        Call obi_import_column() with the bytes form of `dms_1`, `dms_2` and
        `column_name`, and with `version_number`.

            @return: the version number returned by obi_import_column()
            @raise Exception: if obi_import_column() returns a negative value
        '''
        cdef obiversion_t imported_version = obi_import_column(tobytes(dms_1),
                                                               tobytes(dms_2),
                                                               tobytes(column_name),
                                                               version_number)
        if imported_version >= 0 :
            return imported_version
        raise Exception("Error importing a column")


    @staticmethod
    def new_column(View      view,
                   object    column_name,
                   obitype_t data_type,
                   index_t   nb_elements_per_line=1,
                   list      elements_names=None,
                   bint      dict_column=False,
                   bint      tuples=False,
                   bint      to_eval=False,
                   object    associated_column_name=b"",
                   int       associated_column_version=-1,
                   object    comments=None,
                   object    alias=b""):
        '''
        Create a new column in a view and return its Column wrapper.

            @param view: the (active) view in which the column is created
            @param column_name: name of the column (str or bytes)
            @param data_type: obitype of the column; forced to OBI_STR when
                              the column is named "SAMPLE" or "sample"
            @param nb_elements_per_line: passed to obi_view_add_column()
            @param elements_names: optional list of element names; names that
                                   are neither str nor bytes are converted
                                   with str(), then all are joined with ';'
            @param dict_column: passed to obi_view_add_column()
            @param tuples: passed to obi_view_add_column()
            @param to_eval: passed to obi_view_add_column()
            @param associated_column_name: name of the associated column; when
                                   empty and an OBI_QUAL column named
                                   QUALITY_COLUMN or REVERSE_QUALITY_COLUMN is
                                   created, the matching sequence column of
                                   the view is used
            @param associated_column_version: version of the associated column
            @param comments: object serialised to JSON and stored as the
                             column comments; None means no comments ({})
            @param alias: alias of the column in the view; defaults to
                          `column_name` when empty

            @raise OBIDeactivatedInstanceError: if the view is not active
            @raise RuntimeError: if a quality column has no sequence column to
                                 be associated with, or if the column can not
                                 be created
        '''
        # TODO indexer_name?

        cdef Column column
        cdef bytes  column_name_b = tobytes(column_name)
        cdef bytes  alias_b = tobytes(alias)
        cdef bytes  comments_b
        cdef bytes  associated_column_name_b = tobytes(associated_column_name)
        cdef list   elements_names_s
        cdef bytes  elements_names_b
        cdef char*  elements_names_p
        cdef object elt_name

        if not view.active() :
            raise OBIDeactivatedInstanceError()

        # A fresh dict is used when no comments are given, instead of a
        # mutable default argument shared between calls
        if comments is None:
            comments = {}
        comments_b = str2bytes(json.dumps(bytes2str_object(comments)))

        if alias_b == b"" :
            alias_b = column_name_b

        if elements_names is not None:
            elements_names_s = []
            for elt_name in elements_names :
                if isinstance(elt_name, (bytes, str)) :
                    elements_names_s.append(elt_name)
                else :
                    elements_names_s.append(str(elt_name))
            # elements_names_b must stay referenced while elements_names_p is used
            elements_names_b = b';'.join([tobytes(x) for x in elements_names_s])
            elements_names_p = elements_names_b
        else:
            elements_names_p = NULL

        if column_name_b == b"SAMPLE" or column_name_b == b"sample":
            # force str type
            data_type = OBI_STR

        # The bytes form of the name is compared with the C constants: a str
        # name never compares equal to bytes, which used to silently skip the
        # automatic association.
        if data_type == OBI_QUAL and associated_column_name_b == b"":
            if column_name_b == QUALITY_COLUMN:
                if NUC_SEQUENCE_COLUMN not in view:
                    raise RuntimeError("Cannot create column %s in view %s: trying to create quality column but no NUC_SEQ column to associate it with in the view" % (bytes2str(column_name_b),
                                                                                                                                                                  bytes2str(view.name)))
                associated_column_name_b = NUC_SEQUENCE_COLUMN
                associated_column_version = view[NUC_SEQUENCE_COLUMN].version
            elif column_name_b == REVERSE_QUALITY_COLUMN:
                if REVERSE_SEQUENCE_COLUMN not in view:
                    raise RuntimeError("Cannot create column %s in view %s: trying to create reverse quality column but no REVERSE_SEQUENCE column to associate it with in the view" % (bytes2str(column_name_b),
                                                                                                                                                                                  bytes2str(view.name)))
                associated_column_name_b = REVERSE_SEQUENCE_COLUMN
                associated_column_version = view[REVERSE_SEQUENCE_COLUMN].version

        if (obi_view_add_column(view                      = view.pointer(),
                                column_name               = column_name_b,
                                version_number            = -1,
                                alias                     = alias_b,
                                data_type                 = <obitype_t>data_type,
                                nb_lines                  = len(view),
                                nb_elements_per_line      = nb_elements_per_line,
                                elements_names            = elements_names_p,
                                elt_names_formatted       = False,
                                dict_column               = dict_column,
                                tuples                    = tuples,
                                to_eval                   = to_eval,
                                indexer_name              = NULL,
                                associated_column_name    = associated_column_name_b,
                                associated_column_version = associated_column_version,
                                comments                  = comments_b,
                                create                    = True)<0):
            raise RuntimeError("Cannot create column %s in view %s" % (bytes2str(column_name_b),
                                                                       bytes2str(view.name)))

        column = Column.open(view, alias_b)

        # Automatically associate nuc sequence column to quality column if necessary
        if data_type == OBI_QUAL:
            if column_name_b == QUALITY_COLUMN:
                view[NUC_SEQUENCE_COLUMN].associated_column_name = column.name
                view[NUC_SEQUENCE_COLUMN].associated_column_version = column.version
            elif column_name_b == REVERSE_QUALITY_COLUMN:
                view[REVERSE_SEQUENCE_COLUMN].associated_column_name = column.name
                view[REVERSE_SEQUENCE_COLUMN].associated_column_version = column.version

        return column
 
 
    @staticmethod
    def open(View view,
             object column_name):
        '''
        Return a Column wrapper on the column known as `column_name` in `view`.

        The wrapper class is chosen from the returned data type, the number
        of elements per line, the dict_column flag and the tuples flag found
        in the column header. The new wrapper is registered in the view.

            @raise OBIDeactivatedInstanceError: if the view is not active
            @raise KeyError: if the view gives no pointer for that column
        '''
        cdef bytes            alias_b = tobytes(column_name)
        cdef OBIDMS_column_p* column_pp
        cdef OBIDMS_column_p  column_p
        cdef Column           column
        cdef obitype_t        column_type
        cdef bint             multi_elts
        cdef type             column_class

        if not view.active() :
            raise OBIDeactivatedInstanceError()

        column_pp = obi_view_get_pointer_on_column_in_view(view.pointer(), alias_b)
        if column_pp == NULL:
            raise KeyError("Cannot access to column %s in view %s" % (
                    bytes2str(alias_b),
                    bytes2str(view.name)
                ))

        column_p = column_pp[0]
        column_type = column_p.header.returned_data_type
        # Several elements per line or a dict column: multi-elements class
        multi_elts = column_p.header.nb_elements_per_line > 1 or column_p.header.dict_column == True
        column_class = Column.get_column_class(column_type, multi_elts, column_p.header.tuples)

        column = OBIWrapper.new_wrapper(column_class, column_pp)
        column._view = view
        column._alias = alias_b
        column._elements_names = column.read_elements_names()
        view.register(column)

        return column

    
    @OBIWrapper.checkIsActive
    def add_to_view(self,
                    View   view,
                    object column_name=None) :
        '''
        Insert this already existing column into another view.

            @param view: the (active) view receiving the column
            @param column_name: alias of the column in that view; when None,
                                the current alias of the column is used

            @raise OBIDeactivatedInstanceError: if the view is not active
            @raise RuntimeError: if obi_view_add_column() returns a negative value
        '''
        
        cdef bytes alias   
        cdef OBIDMS_column_p column_p = self.pointer()
        
        if not view.active() :
            raise OBIDeactivatedInstanceError()
        
        if (column_name is None):
            alias = self._alias
        else:
            alias = tobytes(column_name)
                       
        # create=False: the column is identified by the name and version read
        # from its header, the creation-only arguments get placeholder values
        if (obi_view_add_column(view                      = view.pointer(),
                                column_name               = column_p.header.name,
                                version_number            = column_p.header.version,
                                alias                     = alias,
                                data_type                 = <obitype_t>0,
                                nb_lines                  = -1,
                                nb_elements_per_line      = -1,
                                elements_names            = NULL,
                                elt_names_formatted       = False,
                                dict_column               = False,
                                tuples                    = False,
                                to_eval                   = False,
                                indexer_name              = NULL,
                                associated_column_name    = NULL,
                                associated_column_version = -1,
                                comments                  = NULL,
                                create                    = False) < 0):
            raise RuntimeError("Cannot insert column %s (%s@%d) into view %s" %
                               ( bytes2str(alias),
                                 bytes2str(column_p.header.name),
                                 column_p.header.version,
                                 bytes2str(view.name)
                               ))
            
        view.register(self)
        
    
    @OBIWrapper.checkIsActive
    def __len__(self):
        '''
        implements the len() function for the Column class:
        returns the value of the lines_used property
         
            @rtype: `int`
        '''
        return self.lines_used
     
    
    @OBIWrapper.checkIsActive 
    def __sizeof__(self):
        '''
        returns the size of the C object wrapped by the Column instance,
        computed as header_size + data_size read from the column header
        '''
        cdef OBIDMS_column_header_p header = self.pointer().header
        return header.header_size + header.data_size
     
    
    @OBIWrapper.checkIsActive 
    def __iter__(self):
        '''
        Yield the result of get_line() for every line index
        from 0 to lines_used - 1.
        '''
        cdef index_t current = 0
        cdef index_t line_count = self.lines_used
        while current < line_count:
            yield self.get_line(current)
            current += 1

    
    # TODO check time efficiency with and without
    @OBIWrapper.checkIsActive
    def __setitem__(self, index_t line_nb, object value):
        # column[line_nb] = value is delegated to set_line()
        self.set_line(line_nb, value)


    @OBIWrapper.checkIsActive
    def __getitem__(self, index_t line_nb):
        # column[line_nb] is delegated to get_line()
        return self.get_line(line_nb)
  
    
    @OBIWrapper.checkIsActive             
    def __str__(self) :
        '''
        Return the str() of every line of the column, each followed by a
        newline character.
        '''
        # join() builds the result in one pass instead of re-copying the
        # growing string for every line
        return "".join([str(line) + "\n" for line in self])
 
    
    @OBIWrapper.checkIsActive
    def __repr__(self) :
        '''
        Return the informations about the column as formatted by
        obi_column_formatted_infos() (short format).

            @raise RuntimeError: if the C function returns NULL
        '''
        cdef str   s
        cdef char* sc = obi_column_formatted_infos(self.pointer(), False)
        if sc == NULL:
            raise RuntimeError("Error formatting the informations of a column")
        try:
            s = bytes2str(sc)
        finally:
            # The C string must be freed here, even if the conversion fails
            free(sc)
        return s


    @OBIWrapper.checkIsActive
    def repr_longformat(self) :
        '''
        Return the informations about the column as formatted by
        obi_column_formatted_infos() (detailed format).

            @raise RuntimeError: if the C function returns NULL
        '''
        cdef str   s
        cdef char* sc = obi_column_formatted_infos(self.pointer(), True)
        if sc == NULL:
            raise RuntimeError("Error formatting the informations of a column")
        try:
            s = bytes2str(sc)
        finally:
            # The C string must be freed here, even if the conversion fails
            free(sc)
        return s
    
         
    def close(self):  # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
        '''
        Deactivate the wrapper: unregister it from its view and close the
        OBIWrapper part. Does nothing if the instance is not active.

        The C column itself is not closed here (the call to
        obi_close_column() is commented out, see the TODO above).
        '''
    
        cdef OBIDMS_column_p pointer

        if self.active() :
            # Only used by the commented out obi_close_column() call below
            pointer = self.pointer()
            self._view.unregister(self)
            OBIWrapper.close(self)      
            #if obi_close_column(pointer) < 0 :
            #    raise Exception("Problem closing column %s" % bytes2str(self.name))
 
 
    cdef read_elements_names(self):
        '''
        Return the list of the elements names (bytes) of the column, obtained
        by splitting on ';' the string given by obi_get_elements_names().

        Raises OBIDeactivatedInstanceError if the instance is not active and
        Exception if the C function returns NULL.
        '''
        cdef char*  names_c
        cdef bytes  names_joined
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        names_c = obi_get_elements_names(self.pointer())
        if names_c == NULL:
            raise Exception("Error reading the elements names of a column")
        # Copy the C string into a Python bytes object, then release it
        names_joined = names_c
        free(names_c)
        return names_joined.split(b';')
    
    cpdef list keys(self):
        # Returns the elements names list stored in _elements_names
        # (the list itself, not a copy)
        return self._elements_names


    # Column alias property getter and setter
    @property
    def name(self):
        '''
        Alias of the column (the value stored in _alias).
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self._alias
    @name.setter
    def name(self, object new_alias):  # @DuplicatedSignature
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        # The renaming is delegated to the view; _alias is not modified here
        # NOTE(review): presumably rename_column() updates _alias - to confirm
        self._view.rename_column(self._alias, new_alias)

    # elements_names property getter
    @property
    def elements_names(self):
        '''
        List of the elements names stored in _elements_names.
        '''
        return self._elements_names
       
    # nb_elements_per_line property getter
    @property
    def nb_elements_per_line(self):
        '''
        Value of the nb_elements_per_line field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.nb_elements_per_line
 
    # dict_column property getter
    @property
    def dict_column(self):
        '''
        Value of the dict_column field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.dict_column
 
    # data_type property getter
    @property
    def data_type(self):
        '''
        Result of name_data_type() for the integer code given by the
        data_type_int property.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return name_data_type(self.data_type_int)
 
    # data_type integer code property getter
    @property
    def data_type_int(self):
        '''
        Value of the returned_data_type field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.returned_data_type

    # original_name property getter
    @property
    def original_name(self):
        '''
        Value of the name field of the column header (as opposed to the
        alias returned by the name property).
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.name
 
    # version property getter
    @property
    def version(self):
        '''
        Value of the version field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.version
 
    # lines_used property getter
    @property
    def lines_used(self):
        '''
        Value of the lines_used field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.lines_used
 
    # tuples property getter
    @property
    def tuples(self):
        '''
        Value of the tuples field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.tuples

    # to_eval property getter
    @property
    def to_eval(self):
        '''
        Value of the to_eval field of the column header.
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.to_eval

    # creation_date property getter
    @property
    def creation_date(self):
        '''
        creation_date field of the column header, as formatted by
        obi_format_date().
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return obi_format_date(self.pointer().header.creation_date)


    # associated_column name property getter and setter
    @property
    def associated_column_name(self):
        '''
        Name of the associated column, as stored in the column header
        (associated_column.column_name).
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.associated_column.column_name

    @associated_column_name.setter
    def associated_column_name(self, object new_name):
        cdef bytes new_name_b
        # Same guard as the getter: the header of a deactivated instance
        # must not be accessed
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        # Keep a reference on the bytes object while its buffer is copied
        new_name_b = tobytes(new_name)
        # NOTE(review): strcpy() does not bound the copy; the size of the
        # destination field is not visible from here - to be checked
        strcpy(self.pointer().header.associated_column.column_name, new_name_b)


    # associated_column version property getter and setter
    @property
    def associated_column_version(self):
        '''
        Version of the associated column, as stored in the column header
        (associated_column.version).
        '''
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        return self.pointer().header.associated_column.version

    @associated_column_version.setter
    def associated_column_version(self, int new_version):
        # Same guard as the getter: the header of a deactivated instance
        # must not be accessed
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        self.pointer().header.associated_column.version = new_version


    # comments property getter
    @property
    def comments(self):
        '''
        A new Column_comments object built on this column.
        '''
        return Column_comments(self)
    @comments.setter
    def comments(self, object value):
        # Building a Column_comments with a value writes every item of
        # `value` into the comments of the column (Column_comments.update())
        Column_comments(self, value)

          
cdef class Column_comments(dict):   # Not thread safe
    '''
    Dictionary view on the comments of a column.

    The comments are read from the `comments` field of the column header,
    parsed as JSON and converted with str2bytes_object(). Every item
    assignment rewrites the whole comments of the column with
    obi_column_write_comments().
    '''

    def __init__(self, Column column, value=None) :
        '''
            @param column: the (active) column owning the comments
            @param value: optional mapping whose items are written in the
                          comments of the column

            @raise OBIDeactivatedInstanceError: if the column is not active
        '''
        if not column.active() :
            raise OBIDeactivatedInstanceError()
        self._column = column
        if value is not None:
            self.update(value)     # TODO test and discuss not overwriting (could use replace bool)
        self._update_from_file()

    def _update_from_file(self):
        '''
        Reload in the dictionary the comments stored in the column header.
        '''
        cdef bytes comments_json
        cdef str comments_json_str
        cdef OBIDMS_column_p column_p
        cdef Column column
        if not self._column.active() :
            raise OBIDeactivatedInstanceError()
        column = self._column
        column_p = <OBIDMS_column_p>(column.pointer())
        comments_json = <bytes> column_p.header.comments
        comments_json_str = bytes2str(comments_json)
        comments_dict = json.loads(comments_json_str)
        # Use the converted object returned by str2bytes_object() rather
        # than relying on the conversion being done in place
        comments_dict = str2bytes_object(comments_dict)
        super(Column_comments, self).update(comments_dict)

    def __getitem__(self, object key):
        '''
        Return the comment associated to `key` (str keys are converted to
        bytes), after reloading the comments from the column header.
        '''
        if not self._column.active() :
            raise OBIDeactivatedInstanceError()
        if type(key) == str:
            key = str2bytes(key)
        self._update_from_file()
        return super(Column_comments, self).__getitem__(key)

    def __setitem__(self, object key, object value):
        '''
        Add or replace a comment and write all the comments in the column.

        Nothing is done when the value is None or of length 0 once cleaned
        by clean_empty_values_from_object().

            @raise OBIDeactivatedInstanceError: if the column is not active
            @raise Exception: if the comments can not be written
        '''
        cdef OBIDMS_column_p column_p
        cdef Column column

        if not self._column.active() :
            raise OBIDeactivatedInstanceError()

        column = self._column
        column_p = <OBIDMS_column_p>(column.pointer())

        # Remove virtually empty values from the object    # TODO discuss
        clean_empty_values_from_object(value)

        # If value is virtually empty, don't add it    # TODO discuss
        if value is None or len(value) == 0:
            return

        # Convert to bytes
        if type(key) == str:
            key = str2bytes(key)
        value_bytes = str2bytes_object(value)

        # Update dict with comments already written in file
        self._update_from_file()

        # Add new element  # TODO don't overwrite?
        super(Column_comments, self).__setitem__(key, value_bytes)

        # Convert a copy to str because json library doesn't like bytes
        dict_str = {k: item for k, item in self.items()}
        dict_str = bytes2str_object(dict_str)

        # Convert to json string
        comments_json = json.dumps(dict_str)

        # Write new comments
        if obi_column_write_comments(column_p, tobytes(comments_json)) < 0:
            # '%' (and not ',') so that the comments are formatted in the message
            raise Exception("Could not write column comments: %s" % comments_json)

    def update(self, value):
        '''
        Write every item of the mapping `value` with __setitem__().
        '''
        for k,v in value.items():
            self[k] = v

    def __contains__(self, key):
        # Keys are stored as bytes
        return super(Column_comments, self).__contains__(tobytes(key))

    def __str__(self):
        '''
        Return the comments exactly as stored in the column header.
        '''
        # Same guard as the other methods: the header of a deactivated
        # column must not be accessed
        if not self._column.active() :
            raise OBIDeactivatedInstanceError()
        return bytes2str(self._column.pointer().header.comments)


######################################################################################################
 

cdef class Column_multi_elts(Column) :
    '''
    Column whose lines hold several elements: a line is accessed through a
    Column_line object and its elements through get_item() / set_item().
    '''
    
    @OBIWrapper.checkIsActive
    def __getitem__(self, index_t line_nb):
        # The line is not read here: Column_line reads the elements on demand
        return Column_line(self, line_nb)
    
    cpdef set_line(self, index_t line_nb, object values):
        '''
        Set the elements of a line.

            @param line_nb: index of the line
            @param values: None, an empty container or a NA Column_line to set
                           every element of the line to None; a list or a
                           tuple to set the elements by position; otherwise an
                           object iterated on its keys and indexed by them
                           (dict, Column_line)

            @raise OBIDeactivatedInstanceError: if the instance is not active
        '''
        cdef object value_ref
        cdef object values_iter
        if not self.active() :
            raise OBIDeactivatedInstanceError()
        if values is None or len(values) == 0 or (isinstance(values, Column_line) and values.is_NA()):
            for element_name in self._elements_names :
                self.set_item(line_nb, element_name, None)
            return
        # Sequences are walked by position: iterating them directly would
        # yield their values, which must not be used as element references
        if isinstance(values, (list, tuple)):
            values_iter = range(len(values))
        else:
            values_iter = values
        for value_ref in values_iter :
            self.set_item(line_nb, value_ref, values[value_ref])


######################################################################################################


cdef class Column_line :
    '''
    Dictionary-like access to one line of a column: every operation is
    forwarded to the column with the index of the line.
    '''
 
    def __init__(self, Column column, index_t line_nb) :
        self._index = line_nb
        self._column = column
         

    def __getitem__(self, object elt_id) :
        return self._column.get_item(self._index, elt_id)


    def __setitem__(self, object elt_id, object value):
        self._column.set_item(self._index, elt_id, value)


    def get(self, object elt_id, object default=None):      # TODO returns default if None???
        '''
        Return the element `elt_id` of the line, or `default` when `elt_id`
        is not an element of the column (see __contains__).
        '''
        if elt_id in self:
            return self._column.get_item(self._index, elt_id)
        else:
            return default
    
    
    def __len__(self):
        # Number of elements per line of the column, not the number of
        # elements actually set on this line
        return self._column.nb_elements_per_line
         
        
    def __contains__(self, object elt_id):
        '''
        An int is contained if lower than the number of elements per line;
        any other value if its bytes form is an element name of the column.
        '''
        if type(elt_id) == int:
            return elt_id < self._column.nb_elements_per_line
        else:
            return (tobytes(elt_id) in self._column._elements_names)
 
 
    def __repr__(self) :
        return str(self._column.get_line(self._index))
 
 
    def is_NA(self):
        '''
        True when get_line() returns None for this line.
        '''
        return self._column.get_line(self._index) is None
    
    
    def __iter__(self) :
        '''
        Yield the element names returned by get_line() for this line;
        nothing is yielded for a NA line.
        '''
        cdef bytes element_name
        line = self._column.get_line(self._index)
        # get_line() returns None for a NA line (see is_NA()), and None
        # can not be iterated
        if line is None:
            return
        for element_name in line :
            yield element_name
    
    
    cpdef list keys(self):    
    # TODO just returning all element names of the column would be faster but 
    # potentially lots of keys associated with None values on that given line.
    # get_line() methods don't return None values
        '''
        Return the list of the element names returned by get_line() for this
        line; the list is empty for a NA line.
        '''
        line = self._column.get_line(self._index)
        # get_line() returns None for a NA line (see is_NA())
        if line is None:
            return []
        return list(line)
    

    cpdef list items(self):
        '''
        Return the list of the (element name, value) pairs of the line.
        '''
        return [(k, self[k]) for k in self.keys()]
    
    
    cpdef dict dict(self):
        '''
        Return the items of the line as a new dictionary.
        '''
        return {k: v for k, v in self.items()}
    
    
    def __str__(self):
        '''
        Return the str() of the dictionary of the line with bytes keys and
        bytes values converted to str; an empty string for a NA line.
        '''
        cdef dict d
        if self.is_NA():
            return ""
        d = {}
        for k,v in self.items():
            d[bytes2str(k)] = bytes2str(v) if type(v) == bytes else v
        return str(d)


    cpdef bytes bytes(self):
        '''
        Return the bytes form of __str__().
        '''
        return str2bytes(self.__str__())
        
        
    # column property getter
    @property
    def column(self):
        return self._column

    # index property getter
    @property
    def index(self):
        return self._index
        
    
    cpdef update(self, object data):
        '''
        Set the elements of the line from a dict or from an iterable of
        (key, value) pairs; keys that are not elements of the column (see
        __contains__) are ignored.
        '''
        if isinstance(data, dict):
            data=data.items()
        for key,value in data:
            if key in self:
                self[key]=value

 
######################################################################################################


cdef register_column_class(obitype_t obitype,
                           bint multi_elts,
                           bint tuples,
                           type obiclass,
                           type python_type):
    '''
    Record `obiclass` and `python_type` as the classes associated with the
    (obitype, multi_elts, tuples) combination.

    Each sub class of `Column` needs to be registered this way after its
    declaration; `obiclass` must be a subclass of `Column`.
    '''
    assert issubclass(obiclass, Column)

    # The registry is only modified in place, never rebound
    registered = (obiclass, python_type)
    __OBIDMS_COLUMN_CLASS__[(obitype, multi_elts, tuples)] = registered


cdef register_all_column_classes() :
    '''
    Import every module found by pkgutil.walk_packages() under the
    `typed_column` package, then call the `register_class()` function of
    each of them.
    '''
    prefix = "obitools3.dms.column.typed_column."
    found = list(pkgutil.walk_packages(typed_column.__path__, prefix=prefix))
    # Second field of each entry: the full name of the module.
    # All the modules are imported before the first registration.
    modules = [importlib.import_module(entry[1]) for entry in found]
    for module in modules :
        module.register_class()


# Executed when this module is imported: registers the typed column classes
register_all_column_classes()