File: ebi.py

package info (click to toggle)
python-cogent 1.4.1-1.2
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze
  • size: 13,260 kB
  • ctags: 20,087
  • sloc: python: 116,163; ansic: 732; makefile: 74; sh: 9
file content (1526 lines) | stat: -rw-r--r-- 58,303 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
#!/usr/bin/env python
"""Provide a parser for SwissProt EBI format files.
"""
import sys

from string import maketrans, strip, rstrip
from pprint import pprint, pformat
from cogent.parse.record_finder import DelimitedRecordFinder,\
    LabeledRecordFinder, is_empty, TailedRecordFinder
from cogent.parse.record import RecordError, FieldError
from cogent.util.misc import identity, curry,\
        NestedSplitter, list_flatten
from cogent.core.sequence import Sequence

__author__ = "Zongzhi Liu and Sandra Smit"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Zongzhi Liu", "Sandra Smit", "Rob Knight", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Zongzhi Liu"
__email__ = "zongzhi.liu@gmail.com"
__status__ = "Development"

#identity translation table over all 256 chars (py2 string.maketrans)
all_chars = maketrans('','')

def rstrip_(chars=None):
    """Return a one-argument rstrip function with chars pre-bound."""
    stripper = curry(rstrip, chars=chars)
    return stripper

#EBI records are separated by '//' lines; trailing whitespace is stripped
EbiFinder = DelimitedRecordFinder('//', 
        constructor=rstrip)

#a hanging paragraph starts at any line that is not indented
no_indent = lambda s: not s.startswith(' ')
hanging_paragraph_finder = LabeledRecordFinder(no_indent, constructor=None) 

#a record ends at the first line that ends with a period
endswith_period = lambda x: x.endswith('.')
period_tail_finder = TailedRecordFinder(endswith_period)


#################################
# pairs_to_dict 
def pairs_to_dict(key_values, dict_mode=None,
        all_keys=None, handlers=None, default_handler=None):
    """return a dict built from a sequence of key_value pairs.

    key_values: (key, value) pairs, from any sequence type. Example:
    [('a', 1), ('b', 2), ('b', 3)]

    dict_mode: one of four modes to build a dict from pairs.
    'overwrite_value': default, same as dict(key_values);
    the key_values example gets {'a': 1, 'b': 3}
    'no_duplicated_key': raise ValueError when there is a duplicated key.
    'allow_multi_value': a duplicated key gets a list of its values;
    the example gets {'a': 1, 'b': [2, 3]}
    'always_multi_value': always group value(s) into a list for each key;
    the example gets {'a': [1], 'b': [2, 3]}

    all_keys: if a key is not found in all_keys, raise ValueError; recommend
    to use a dict for all_keys for efficiency.

    handlers, default_handler: each value will be converted, if a valid
    handler can be found: handler = handlers.get(key, default_handler).
    When handlers are provided but no valid handler is found for a key:
    raise ValueError.
    Always use the original value if no handlers are provided; for example,
    pairs_to_dict(adict.items()) will return adict.

    Note: default_handler=identity is often useful if you want to keep the
    original value when no specific handler is found.
    """
    if not dict_mode:
        dict_mode = 'overwrite_value'
    if handlers is None:
        #fresh dict per call -- a {} default would be shared between calls
        handlers = {}

    #generate add_item for the requested dict_mode.
    if dict_mode == 'always_multi_value':
        def add_item(dictionary, key, value):
            """add key, value to dictionary in place"""
            dictionary.setdefault(key, []).append(value)

    elif dict_mode == 'allow_multi_value':
        multiples = {} #auxiliary dict recording the keys with multi_values
        def add_item(dictionary, key, value):
            """add key, value to dictionary in place

            Warning: uses the outer auxiliary dictionary: multiples"""
            if key in dictionary:
                if key not in multiples:
                    #first duplicate: box the existing value into a list
                    multiples[key] = True
                    dictionary[key] = [dictionary[key]]
                dictionary[key].append(value)
            else:
                dictionary[key] = value

    elif dict_mode == 'no_duplicated_key':
        def add_item(dictionary, key, value):
            """add key, value to dictionary in place"""
            if key in dictionary:
                raise ValueError('Duplicated Key')
            dictionary[key] = value

    elif dict_mode == 'overwrite_value':
        def add_item(dictionary, key, value):
            """add key, value to dictionary in place"""
            dictionary[key] = value
    else: # unknown dict_mode
        raise ValueError('Unknown dict_mode:%s. \ndict_mode must be one of '
                'overwrite_value, no_duplicated_key, allow_multi_value and '
                'always_multi_value.' % dict_mode)

    #generate the handle_value function.
    if not handlers and not default_handler:
        handle_value = lambda x, y: (x, y)

    else:  #handlers not empty,
        def handle_value(key, raw_value):
            handler = handlers.get(key, default_handler)
            if handler:
                value = handler(raw_value)
            else: #no handler found for key
                raise ValueError('No handler found for %s' % key)
            return key, value

    #build the result dict.
    result = {}
    for key, raw_value in key_values:
        if all_keys and key not in all_keys:
            raise ValueError('key: %s not in all_keys: %s'
                    % (repr(key), all_keys))
        key, value = handle_value(key, raw_value)
        add_item(result, key, value)
    return result

#################################
# generic parsers

def linecode_maker(line):
    """Return (linecode, line) for one entry line.

    The two-character line code that begins each line is always followed
    by three blanks, so the actual information begins with the sixth
    character."""
    code = line.partition('   ')[0]
    return code, line

def labeloff(lines, splice_from=5):
    """Return lines with the first splice_from characters cut off each.

    Warning: no validation of the line contents is performed!"""
    cut = slice(splice_from, None)
    return [line[cut] for line in lines]

def join_parser(lines, join_str=' ', chars_to_strip=' ;.'):
    """Return a single str joined from a list of lines, with the requested
    chars stripped from both ends of the joined result."""
    #a bare string is used as-is instead of being joined char by char
    joined = lines if isinstance(lines, basestring) else join_str.join(lines)
    return joined.strip(chars_to_strip)
    
def join_split_parser(lines, delimiters=';', item_modifier=strip,
        same_level=False, **kwargs):
    """return a nested list from lines: join the lines first, then split
    with NestedSplitter.

    delimiters: delimiters used by NestedSplitter
    item_modifier: passed to NestedSplitter, modifies each splitted item.
    kwargs: passed to join_parser

    Examples:
    join_split_parser(['aa; bb;', 'cc.']) -> ['aa', 'bb', 'cc']
    join_split_parser(['aa; bb, bbb;', 'cc.'], delimiters=';,')
    -> ['aa', ['bb','bbb'], 'cc']
    join_split_parser('aa (bb) (cc).', delimiters='(',
    item_modifier=rstrip_(')')) -> ['aa','bb','cc']
    """
    joined = join_parser(lines, **kwargs)
    splitter = NestedSplitter(delimiters,
            constructor=item_modifier, same_level=same_level)
    return splitter(joined)

def join_split_dict_parser(lines, delimiters=[';', ('=',1), ','], 
        dict_mode=None, strict=True, **kwargs):
    """return a dict from lines, using the splited pairs from
    join_split_parser and pairs_to_dict.

    delimiters, kwargs: pass to join_split_sparser
    strict: when dict() fails -- (a pair not got from the second delimiter).
    return unconstructed list when False or raise error when True (default).

    dict_mode: pass to pairs_to_dict.  if leave as None, will be
    'overwrite_value', which is same as dict(pairs).

    Examples:
    join_split_dict_parser(['aa=1; bb=2,3; cc=4 (if aa=1);'])
    -> {'aa':'1', 'bb': ['2','3'], 'cc': '4 (if aa=1)'}
    """
    primary_delimiters, value_delimiters = delimiters[:2], delimiters[2:]
    pairs = join_split_parser(lines, delimiters=primary_delimiters, 
            same_level=True, **kwargs)

    try:
        dict(pairs) #catch error for any not splitted pair. 
    except ValueError, e: #dictionary update sequence element #1 has length 1;
        if strict:
            raise ValueError('e\nFailed to get a dict from pairs: %s' % pairs)
        else:
            #return the splitted list without constucting
            return pairs  

    if value_delimiters:
        split_value = NestedSplitter(value_delimiters, same_level=False) 
        #should raise ValueError here if a pair donot have two elems.
        for i,(k, v) in enumerate(pairs):
            v = split_value(v)
            #modify v only if splitted by the first dilimiter
            if len(v) > 1:
                pairs[i][1] = v

    result = pairs_to_dict(pairs, dict_mode)
    return result


def mapping_parser(line, fields, delimiters=[';', None],
        flatten=list_flatten):
    """return a dict of zip(fields, splitted line); a None key will be
    deleted from the result dict.

    line: a str to be splitted.
    fields: field names, each optionally paired with a type constructor for
    the mapping, e.g. ['EntryName', ('Length', int), 'MolType']
    delimiters: separators used to split the line.
    flatten: a function used to flatten the nested splitting result.
    """
    values = flatten(NestedSplitter(delimiters=delimiters)(line))
    result = {}
    for field, value in zip(fields, values):
        if isinstance(field, (tuple, list)):
            #field carries its own constructor, e.g. ('Length', int)
            name, cast = field
            result[name] = cast(value)
        else:
            result[field] = value

    #None fields are mere placeholders for uninteresting columns
    result.pop(None, None)
    return result


#################################
# individual parsers
#################################

#################################
# mapping parsers: id, sq
#
def id_parser(lines):
    """return a mapping dict from ID lines (only one line).

    The ID (IDentification) line is always the first line of an entry.
    Its general form is:

    ID   ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH.
    Example:
    ID   CYC_BOVIN      STANDARD;      PRT;   104 AA.
    """
    first = labeloff(lines)[0]
    return mapping_parser(first, delimiters=[';', None],
            fields=('EntryName', 'DataClass', 'MolType', ('Length', int)))

def sq_parser(lines):
    """return a mapping dict from SQ lines (only one line).

    The SQ (SeQuence header) line marks the beginning of the sequence data
    and gives a quick summary of its content:

    SQ   SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;

    It holds the length of the sequence in amino acids ('AA'), the
    molecular weight ('MW') rounded to the nearest mass unit (Dalton), and
    the sequence 64-bit CRC (Cyclic Redundancy Check) value ('CRC64').
    """
    first = labeloff(lines)[0]
    return mapping_parser(first, delimiters=[';', None],
            fields=(None, ('Length', int), None, ('MolWeight', int), None,
                    'Crc64'))

def kw_parser(lines):
    """return a list of keywords from KW lines.

    The format of the KW line is:
    KW   Keyword[; Keyword...].
    """
    return join_split_parser(labeloff(lines))

def ac_parser(lines):
    """return a list of accession numbers from AC lines.

    The AC (ACcession number) line lists the accession number(s)
    associated with an entry:
    AC   AC_number_1;[ AC_number_2;]...[ AC_number_N;]

    The first accession number is commonly referred to as the 'primary
    accession number'; 'secondary accession numbers' are sorted
    alphanumerically.
    """
    return join_split_parser(labeloff(lines))

def dt_parser(lines):
    """return the original DT lines, with only the linecode stripped.

    Note: not complete parsing.

    The DT (DaTe) lines show the dates of creation and last modification
    of the database entry.  Formats:
        DT   DD-MMM-YYYY (Rel. XX, Comment)         (Swiss-Prot)
        DT   DD-MMM-YYYY (TrEMBLrel. XX, Comment)   (TrEMBL)

    There are always three DT lines per entry, with specific comments:
    * first: when the entry first appeared -- 'Created';
    * second: when the sequence data was last modified -- 'Last sequence
      update';
    * third: when data other than the sequence was last modified -- 'Last
      annotation update'.

    Example of a block of Swiss-Prot DT lines:
        DT   01-AUG-1988 (Rel. 08, Created)
        DT   30-MAY-2000 (Rel. 39, Last sequence update)
        DT   10-MAY-2005 (Rel. 47, Last annotation update)
    """
    return labeloff(lines)

#################################
# gn_parser
def gn_parser(lines):
    """return a list of dicts from GN lines, one dict per gene.

    The GN (Gene Name) line indicates the name(s) of the gene(s) that code
    for the stored protein sequence, using three types of information:
    Gene names, Ordered locus names, ORF names.  Format:

    GN   Name=<name>; Synonyms=<name1>[, <name2>...];
    OrderedLocusNames=<name1>[, <name2>...];
    GN   ORFNames=<name1>[, <name2>...];

    None of the above four tokens are mandatory, but a 'Synonyms' token
    can only be present if there is a 'Name' token.

    If there is more than one gene, the GN blocks for the different genes
    are separated by the line:
    GN   and
    Example:
    GN   Name=Jon99Cii; Synonyms=SER1, SER5, Ser99Da; ORFNames=CG7877;
    GN   and
    GN   Name=Jon99Ciii; Synonyms=SER2, SER5, Ser99Db; ORFNames=CG15519;"""
    stripped = labeloff(lines)
    return [gn_itemparser(item) for item in gn_itemfinder(stripped)]

#each gene's GN block is a 'key=value;' style block
gn_itemparser = join_split_dict_parser 

#GN blocks for different genes are separated by a 'GN   and' line
gn_itemfinder = DelimitedRecordFinder('and', constructor=None, strict=False,
        keep_delimiter=False)

def oc_parser(lines):
    """return a list of taxonomy nodes from OC lines.

    The OC (Organism Classification) lines contain the taxonomic
    classification of the source organism, listed top-down as nodes in a
    taxonomic tree, most general grouping first.  Format:

    OC   Node[; Node...].
    """
    return join_split_parser(labeloff(lines))

def os_parser(lines):
    """return a list from OS lines.

    The OS (Organism Species) line specifies the organism which was the
    source of the stored sequence; the last OS line ends with a period.

    The species designation consists, in most cases, of the Latin genus
    and species designation followed by the English name (in parentheses).
    For viruses, only the common English name is given.  Examples:
        OS   Escherichia coli.
        OS   Solanum melongena (Eggplant) (Aubergine).
        OS   Rous sarcoma virus (strain SchRuppin A) (RSV-SRA) (Avian leukosis
        OS   virus-RSA).
    """
    return join_split_parser(labeloff(lines),
            delimiters='(', item_modifier=rstrip_(') '))
        
def ox_parser(lines):
    """return a dict from OX lines.

    The OX (Organism taxonomy cross-reference) line indicates the
    identifier of a specific organism in a taxonomic database:
    OX   Taxonomy_database_Qualifier=Taxonomic code;

    Currently the cross-references are made to the taxonomy database of
    NCBI, associated with the qualifier 'TaxID' and a one- to six-digit
    taxonomic code."""
    return join_split_dict_parser(labeloff(lines))

def og_parser(lines):
    """return a list from OG lines.

    The OG (OrGanelle) line indicates if the gene coding for a protein
    originates from the mitochondria, the chloroplast, the cyanelle, the
    nucleomorph or a plasmid, e.g.:

    OG   Hydrogenosome.
    OG   Mitochondrion.
    OG   Nucleomorph.
    OG   Plasmid name.
    OG   Plastid.
    OG   Plastid; <Apicoplast|Chloroplast|Cyanelle|Non-photosynthetic plastid>.

    Where 'name' is the name of the plasmid.  Several plasmids may be
    listed comma-separated with a final 'and', e.g.:

    OG   Mitochondrion.
    OG   Plasmid R6-5, Plasmid IncFII R100 (NR1), and
    OG   Plasmid IncFII R1-19 (R1 drd-19)."""
    result = []
    for group in period_tail_finder(labeloff(lines)):
        entry = ' '.join(group).rstrip('. ')
        if entry.startswith('Plasmid'):
            #'name1, name2, and name3' -> list of plasmid names
            entry = [name.strip() for name in
                    entry.replace(' and', '').split(',')]
        result.append(entry)
    return result


#################################
# dr_parser
def dr_parser(lines):
    """return a dict of items from DR lines.

    The DR (Database cross-Reference) lines are used as pointers to
    information related to entries and found in data collections other
    than Swiss-Prot.  The format of one of many DR lines is:

    DR   DATABASE_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER[;
    TERTIARY_IDENTIFIER][; QUATERNARY_IDENTIFIER].
    """
    items = period_tail_finder(labeloff(lines))
    pairs = [dr_itemparser(item) for item in items]
    return pairs_to_dict(pairs, 'always_multi_value')

def dr_itemparser(lines):
    """return a (database, identifiers) pair from the lines of one DR item.
    """
    split_fields = join_split_parser(lines)
    database, identifiers = split_fields[0], split_fields[1:]
    return database, identifiers

#################################
# de_parser
def de_parser(lines):
    """return a dict of {OfficalName: str, Synonyms: str, Fragment: bool,
    Contains: [itemdict,], Includes: [itemdict,]} from DE lines.

    The DE (DEscription) lines contain general descriptive information
    about the sequence stored, generally sufficient to identify the
    protein precisely.

    The description always starts with the proposed official name of the
    protein; synonyms are indicated between brackets.

    A protein cleaved into multiple functional components is described by
    the precursor name followed by a '[Contains: ...]' section listing the
    components, separated by semi-colons.  A protein with multiple named
    functional domains uses a '[Includes: ...]' section in the same way.
    Both sections may occur together, '[Includes:...]' first.  Synonyms
    are allowed at every level.

    If the complete sequence is not determined, the description ends with
    '(Fragment)' or '(Fragments)'.  Examples:

    DE   Dihydrodipicolinate reductase (EC 1.3.1.26) (DHPR) (Fragment).

    DE   Arginine biosynthesis bifunctional protein argJ [Includes: Glutamate
    DE   N-acetyltransferase (EC 2.3.1.35) (Ornithine acetyltransferase)
    DE   (Ornithine transacetylase) (OATase); Amino-acid acetyltransferase
    DE   (EC 2.3.1.1) (N-acetylglutamate synthase) (AGS)] [Contains: Arginine
    DE   biosynthesis bifunctional protein argJ alpha chain; Arginine
    DE   biosynthesis bifunctional protein argJ beta chain] (Fragment).

    Trouble maker:
    DE Amiloride-sensitive amine oxidase [copper-containing] precursor(EC
    DE 1.4.3.6) (Diamine oxidase) (DAO).
    """
    joined = join_parser(labeloff(lines), chars_to_strip='). ')

    #after stripping ').' the Fragment marker survives as a '(Fragment' tail
    is_fragment = joined.endswith('(Fragment')
    if is_fragment:
        joined = joined.rsplit('(', 1)[0]

    def _take_section(text, label):
        """split off a '<label> ...]' section; -> (rest, [itemdict,])"""
        if label not in text:
            return text, []
        text, section = text.split(label)
        items = [de_itemparser(i) for i in section.strip(' ]').split('; ')]
        return text, items

    joined, contains = _take_section(joined, '[Contains:')
    joined, includes = _take_section(joined, '[Includes:')

    result = {'Includes': includes, 'Contains': contains,
            'Fragment': is_fragment}
    #the remainder is the primary (precursor/overall) name and synonyms
    result.update(de_itemparser(joined))
    return result

def de_itemparser(line):
    """return a dict of {OfficalName: str, Synonyms: [str,]} from a de_item.

    The description item is a str that always starts with the proposed
    official name of the protein; synonyms are indicated between
    brackets, e.g.

    'Annexin A5 (Annexin V) (Lipocortin V) (Endonexin II)'
    """
    pieces = [piece.strip(') ') for piece in line.split('(')]
    #without any '(' there are no synonyms and pieces[1:] == []
    return {'OfficalName': pieces[0], 'Synonyms': pieces[1:]}
    

#################################
# ft_parser
def ft_parser(lines):
    """return a dict of {keyname: [itemdict,]} from FT lines.

    The FT (Feature Table) lines list posttranslational modifications,
    binding sites, enzyme active sites, local secondary structure and
    other characteristics reported in the cited references, plus sequence
    conflicts between references.

    The FT lines have a fixed format; column numbers not listed below are
    always blank:

        Columns     Data item
        1-2     FT
        6-13    Key name
        15-20   'From' endpoint
        22-27   'To' endpoint
        35-75   Description

    The key name and the endpoints are always on a single line; the
    description may continue on additional lines, from column 35 onwards.
    For individual ft keys see http://us.expasy.org/sprot/userman.html#FT_keys

    Endpoints are numbered from 1.  A feature extending beyond the stated
    position is marked '<' (N-terminal) or '>' (C-terminal); unknown
    endpoints are '?', uncertain ones '?42'.

    Some features (CARBOHYD, CHAIN, PEPTIDE, PROPEP, VARIANT, VARSPLIC)
    carry a unique stable feature identifier (FTId) as the last component
    of the description, e.g. '/FTId=VAR_009122.'.

    Secondary structure features (HELIX, STRAND, TURN), extracted from PDB
    coordinate data, are grouped into result['SecondaryStructure'] as
    (keyname, start, end) tuples; residues not covered are in a 'loop' or
    'random-coil' structure.
    """
    lines = labeloff(lines)
    fieldnames = ['Start', 'End', 'Description']
    ss_keynames = ('HELIX', 'STRAND', 'TURN')
    result = {}
    for item in hanging_paragraph_finder(lines):
        keyname, start, end, description = ft_basic_itemparser(item)

        #secondary structure items are pooled into one list of tuples
        if keyname in ss_keynames:
            result.setdefault('SecondaryStructure', []).append(
                    (keyname, start, end))
            continue

        #certain keynames get their description parsed further
        if keyname in ft_description_parsers:
            description = ft_description_parsers[keyname](description)

        #group the current item (as a dict) under result[keyname]
        item_dict = dict(zip(fieldnames, (start, end, description)))
        result.setdefault(keyname, []).append(item_dict)
    return result

def ft_basic_itemparser(item_lines):
    """-> (key, start, end, description) from the lines of one feature item.

    item_lines: the lines of a single feature item (generated by
    itemfinder; all lines share one keyname), label already stripped.

    WARNING: not complete, location fields need further work?
    """
    #column boundaries that slice a label-stripped line into its fields;
    #the full-line FT columns are 15, 22 and 35 (see doc of ft_parser),
    #shifted left by 6 to account for the stripped label
    bounds = [9, 16, 29]

    #slice the first line into keyname / from / to / description
    head = item_lines[0]
    fields = [head[i:j].strip()
            for i, j in zip([0] + bounds, bounds + [None])]
    keyname, from_point, to_point, description = fields

    #continuation lines carry only description text: append each in turn
    desc_col = bounds[-1]
    for line in item_lines[1:]:
        description = ' '.join((description, line[desc_col:].strip()))

    #numeric location end points become ints; others stay strings
    from_point = try_int(from_point)
    to_point = try_int(to_point)
    return keyname, from_point, to_point, description.strip(' .')

def try_int(obj):
    """Coerce obj to int; hand back obj untouched when coercion fails."""
    try:
        result = int(obj)
    except ValueError:  #e.g. a non-numeric string such as '>102'
        result = obj
    return result


### ft description_parsers below

def ft_id_parser(description):
    """Return {'Description': ..., 'Id': ...} from a raw FT description str.

    description: description text of a feature item, which may carry a
    trailing '/FTId=...' tag.  When the tag is absent (or appears more than
    once), 'Id' is '' and the whole text stays in 'Description'.

    Examples.
    FT   PROPEP       25     48
    FT                                /FTId=PRO_0000021449.
    FT   VARIANT     214    214       V -> I.
    FT                                /FTId=VAR_009122.
    """
    id_sep = '/FTId='
    try:
        #exactly one tag expected; strip surrounding ' .' noise from both parts
        desc, ftid = [i.strip(' .') for i in description.split(id_sep)]
    except ValueError:  #bug fix: was a bare except; only catch a bad unpack
        desc, ftid = description, ''

    #note: 'ftid' avoids shadowing the builtin 'id'
    return dict(zip(['Description', 'Id'], [desc, ftid]))

def ft_mutation_parser(description, mutation_comment_delimiter='('):
    """return a dict of {'MutateFrom':, 'MutateTo':, 'Comment':} from a
    description str.

    description: text like 'V -> I (in a lung cancer)'.
    mutation_comment_delimiter: the char separating the mutation from its
        comment ('(' for VARIANT/CONFLICT style, ':' for MUTAGEN style).

    Warning: if both id and mutation should be parsed, always parse id first.

    Examples.
    FT   VARIANT     214    214       V -> I (in a lung cancer).
    FT   CONFLICT    484    484       Missing (in Ref. 2).
    FT   CONFLICT    802    802       K -> Q (in Ref. 4, 5 and 10).
    """
    fieldnames = 'MutateFrom MutateTo Comment'.split()

    #split desc into mutation and comment parts
    desc = description.rstrip(' )')
    try:
        mutation, comment = desc.split(mutation_comment_delimiter, 1)
    except ValueError:  #no delimiter: the whole text is the mutation
        mutation, comment = desc, ''

    #split the mutation into mut_from/mut_to; when the '->' arrow is absent
    #(e.g. 'Missing') the whole message stays in mut_from
    mutation_delimiter = '->'
    try:
        mut_from, mut_to = [s.strip()
                for s in mutation.split(mutation_delimiter, 1)]
    except ValueError:
        #bug fix: strip here too, so 'Missing ' no longer keeps its
        #trailing space in the no-arrow case
        mut_from, mut_to = mutation.strip(), ''

    #comment is stripped for consistency with the mutation fields
    return dict(zip(fieldnames, [mut_from, mut_to, comment.strip()]))

def ft_mutagen_parser(description):
    """Parse a MUTAGEN description into a mutation dict.

    MUTAGEN - Site which has been experimentally altered.  Same grammar as
    the other mutation descriptions, except the comment is introduced by
    ':' rather than '('.  Examples:

    FT   MUTAGEN     119    119       C->R,E,A: Loss of cADPr hydrolase and
    FT                                ADP-ribosyl cyclase activity.
    FT   MUTAGEN     169    177       Missing: Abolishes ATP-binding.
    """
    return ft_mutation_parser(description, mutation_comment_delimiter=':')

def ft_id_mutation_parser(description):
    """return a dict holding both the mutation fields and the /FTId.

    Examples.
    FT   VARIANT     214    214       V -> I.
    FT                                /FTId=VAR_009122.
    FT   VARSPLIC     33     83       TVGRFRRRATP -> PLTSFHPFTSQMPP (in
    FT                                isoform 2).
    FT                                /FTId=VSP_004370.
    """
    #peel off the /FTId first, then parse what remains as a mutation
    with_id = ft_id_parser(description)
    remainder = with_id.pop('Description')
    merged = ft_mutation_parser(remainder)
    merged.update(with_id)
    return merged
    
#dispatch table: FT keyname -> parser for its description field.
#Keynames absent here keep their description as plain text (see ft_parser).
ft_description_parsers = {
    'VARIANT': ft_id_mutation_parser,
    'VARSPLIC': ft_id_mutation_parser,
    'CARBOHYD':ft_id_parser,
    'CHAIN': ft_id_parser,
    'PEPTIDE':ft_id_parser,
    'PROPEP': ft_id_parser,
    'CONFLICT': ft_mutation_parser,
    'MUTAGEN': ft_mutagen_parser,
    #'NON_TER': ft_choplast_parser, 
    #'NON_CONS': ft_choplast_parser,
    
}


#################################
# cc_parser
#valid CC topic names; used by cc_parser(strict=True) for validation.
#Stored as a dict (fromkeys) purely for fast membership tests.
all_cc_topics = dict.fromkeys([
    'ALLERGEN', 'ALTERNATIVE PRODUCTS', 'BIOPHYSICOCHEMICAL PROPERTIES',
    'BIOTECHNOLOGY', 'CATALYTIC ACTIVITY', 'CAUTION', 'COFACTOR', 'DATABASE',
    'DEVELOPMENTAL STAGE', 'DISEASE', 'DOMAIN', 'ENZYME REGULATION',
    'FUNCTION', 'INDUCTION', 'INTERACTION', 'MASS SPECTROMETRY',
    'MISCELLANEOUS', 'PATHWAY', 'PHARMACEUTICAL', 'POLYMORPHISM', 'PTM',
    'RNA EDITING', 'SIMILARITY', 'SUBCELLULAR LOCATION', 'SUBUNIT',
    'TISSUE SPECIFICITY', 'TOXIC DOSE'])
    
def cc_parser(lines, strict=False):
    """return a dict of {topic: a list of values} from CC lines.

    lines: the CC lines of one record, label included.
    strict: if True (default False), raise FieldError for any topic name
        not listed in all_cc_topics.

    Topics with a special format use the specific parsers registered in
    cc_content_parsers; all other topics are simply joined into a single
    string (join_parser).

    The CC lines are free text comments on the entry, grouped in comment
    blocks; a block is made up of 1 or more comment lines, the first of
    which starts with '-!-'.  Format:

    CC   -!- TOPIC: First line of a comment block;
    CC       second and subsequent lines of a comment block.

    Examples:
    CC   -!- DISEASE: Defects in PHKA1 are linked to X-linked muscle
    CC       glycogenosis [MIM:311870]. ...
    CC   -!- INTERACTION:
    CC       Self; NbExp=1; IntAct=EBI-123485, EBI-123485;

    A copyright/license block fenced by long dashed lines may follow the
    topics; cc_itemfinder normalizes it into an ordinary topic block.
    """
    lines = labeloff(lines)
    #cc_itemfinder yields each topic block;
    #cc_basic_itemparser splits a block into (topic_name, content_as_list)
    topic_contents = map(cc_basic_itemparser, cc_itemfinder(lines))
    #each topic's content is parsed by the handler registered for its name
    #(default: join the lines into one string).  The old debugging wrapper
    #(pprint the lines, then 'raise e', which loses the original traceback)
    #has been removed: errors now propagate unchanged.
    result = pairs_to_dict(topic_contents, 'always_multi_value',
            handlers=cc_content_parsers, default_handler=join_parser)

    if strict:
        for topic in result:
            if topic not in all_cc_topics:
                raise FieldError('Invalid topic: %s' % topic)

    return result

def cc_basic_itemparser(topic):
    """return (topic_name, topic_content as a list) from a cc topic block.

    topic: list of lines of one topic block, e.g. [
    '-!- TOPIC: First line of a comment block;',
    '    second and subsequent lines of a comment block.']

    Raises FieldError if the first line carries no 'TOPIC:' head.
    """
    num_format_leading_spaces = 4  #for topic lines except the first

    #get the keyname and content_head from the first line
    topic_head = topic[0].lstrip(' -!')
    try:
        keyname, content_head = [s.strip() for s in topic_head.split(':', 1)]
    except ValueError: # need more than 1 value to unpack
        #bug fix: interpolate the offending line into the message (it was
        #previously passed as a second argument, leaving '%s' literal)
        raise FieldError('Not a valid topic line: %s' % topic[0])

    content = [content_head] if content_head else []

    #the following lines get stripped of the format leading spaces
    if len(topic) > 1:
        content += labeloff(topic[1:], num_format_leading_spaces)

    return keyname, content

def cc_itemfinder(lines):
    """yield each topic/license block as a list from label-stripped CC lines.

    Warning: hardcoded LICENSE handling; mutates 'lines' in place when a
    license block is present."""
    #a 74-dash ruler fences the license text above and below, as observed
    license_border = '-' * 74
    license_headstr = '-!- LICENSE:'
    content_start = 4  #the column where topic content starts

    #rewrite the license block so it looks like an ordinary topic; then the
    #generic paragraph finder below handles everything uniformly
    if license_border in lines:
        if lines[-1] != license_border:
            raise FieldError('No bottom line for license: %s' % lines)
        lines.pop()  #discard the bottom border
        top_idx = lines.index(license_border)
        lines[top_idx] = license_headstr
        for i in range(top_idx + 1, len(lines)):
            lines[i] = ' ' * content_start + lines[i]

    return hanging_paragraph_finder(lines)


## cc_content_parsers here below

def cc_interaction_parser(content_list):
    """return a list of (interactor, {params}) pairs from interaction lines.

    Format:
    -!- INTERACTION:
        {{SP_Ac:identifier[ (xeno)]}|Self}; NbExp=n; IntAct=IntAct_Protein_Ac,
        IntAct_Protein_Ac;
    """
    pairs = []
    for line in content_list:
        #the interactor name precedes the first ';'; the remainder holds
        #the key=value parameters
        interactor, raw_params = line.split(';', 1)
        params = join_split_dict_parser([raw_params])
        pairs.append((interactor.strip(), params))
    return pairs
        
#paragraph finders for ALTERNATIVE PRODUCTS: a block is a series of
#'Event=' paragraphs, each optionally followed by 'Name=' isoform paragraphs
cc_alternative_products_event_finder = LabeledRecordFinder(
        lambda x: x.startswith('Event='))
cc_alternative_products_name_finder = LabeledRecordFinder(
        lambda x: x.startswith('Name='))
def cc_alternative_products_parser(content_list):
    """return a list of event dicts from ALTERNATIVE PRODUCTS lines.

    Each 'Event=' paragraph yields one dict; its trailing 'Name=' isoform
    paragraphs, if any, are parsed into dicts listed under the 'Names' key.

    Note: not complete parsing, consider to merge Names to the last Event??
    and make event or name to be the dict key?

    Format:
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=n;
    CC         Comment=Optional free text;
    CC       Name=Isoform_1; Synonyms=Synonym_1[, Synonym_n];
    CC         IsoId=Isoform_identifier_1[, ...]; Sequence=Displayed;
    CC         Note=Free text;
    """
    events = []
    for event_lines in cc_alternative_products_event_finder(content_list):
        chunks = list(cc_alternative_products_name_finder(event_lines))
        #first chunk is the event head; the rest are Name= paragraphs
        event_dict = join_split_dict_parser(chunks[0])
        name_chunks = chunks[1:]
        if name_chunks:
            event_dict['Names'] = [join_split_dict_parser(c)
                    for c in name_chunks]
        events.append(event_dict)
    return events

def cc_biophysicochemical_properties_parser(content):
    """return a dict keyed by sub-topic from a BIOPHYSICOCHEMICAL block.

    Each indented sub-topic ('Absorption', 'Kinetic parameters',
    'pH dependence', 'Redox potential', 'Temperature dependence') becomes
    one key; 'Kinetic parameters' and 'Absorption' hold 'key=value;' lines
    and parse to dicts (repeated keys allowed, e.g. several KM entries),
    the others are free text and are joined into one string.

    Example of a ~ topic block:
    CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
    CC       Kinetic parameters:
    CC         KM=98 uM for ATP;
    CC         KM=688 uM for pyridoxal;
    CC         Vmax=1.604 mmol/min/mg enzyme;
    CC       pH dependence:
    CC         Optimum pH is 6.0. Active from pH 4.5 to 10.5;
    """
    def parse_sub_topic(sub_lines):
        """(sub_key, parsed content) from the lines of one sub-topic"""
        sub_key = sub_lines[0].rstrip(': ')
        body = [line.strip() for line in sub_lines[1:]]
        if sub_key in ('Kinetic parameters', 'Absorption'):
            #'key=value;' lines -> dict that may hold repeated keys
            key_values = join_split_parser(body, delimiters=[';', ('=', 1)])
            return sub_key, pairs_to_dict(key_values, 'allow_multi_value')
        #free-text sub-topics are simply joined
        return sub_key, join_parser(body, chars_to_strip='; ')

    parsed = [parse_sub_topic(t) for t in hanging_paragraph_finder(content)]
    return pairs_to_dict(parsed, 'no_duplicated_key')
    
#dispatch table: CC topic name -> parser for its content lines.
#Topics absent here fall back to join_parser (see cc_parser).
cc_content_parsers = {
    #? not complete: further group alternative splicing?
    'ALTERNATIVE PRODUCTS': cc_alternative_products_parser, 
    'BIOPHYSICOCHEMICAL PROPERTIES': cc_biophysicochemical_properties_parser,
    'INTERACTION': cc_interaction_parser, 
    'DATABASE': join_split_dict_parser, 
    'MASS SPECTROMETRY':join_split_dict_parser, 
}


#################################
# REFs parser
def refs_parser(lines):
    """return a dict of {RN: single_ref_dict} from all reference lines.

    These lines comprise the literature citations; when several references
    are given, each has its own block (started by an RN line) and yields
    one entry in the result dict.
    """
    rn_ref_pairs = [single_ref_parser(block) for block in ref_finder(lines)]
    return pairs_to_dict(rn_ref_pairs)

#an RN line opens each reference block; ref_finder yields one block per RN
is_ref_line = lambda x: x.startswith('RN')
ref_finder = LabeledRecordFinder(is_ref_line)

#labels each reference block must carry ('RA/RG' means at least one of
#RA or RG).  NOTE(review): 'RL' is listed twice -- redundant but harmless.
required_ref_labels = 'RN RP RL RA/RG RL'.split()
def single_ref_parser(lines, strict=False):
    """return (rn, ref_dict) from the lines of a single reference block.

    strict: if True (default False), raise RecordError when a required
    label (see the global required_ref_labels) is absent.

    The reference lines for a citation occur in a block, always ordered
    RN, RP, RC, RX, RG, RA, RT, RL.  RN occurs once; RC, RX and RT occur
    zero or more times; RP, RG/RA and RL occur one or more times.
    """
    #group the lines by their two-letter linecode
    label_lines = [linecode_maker(line) for line in lines]
    raw_dict = pairs_to_dict(label_lines, 'always_multi_value')

    if strict:
        present = dict.fromkeys(raw_dict)
        if 'RA' in present or 'RG' in present:
            present['RA/RG'] = True
        missing = [r for r in required_ref_labels if r not in present]
        if missing:
            raise RecordError('The reference block lacks required label: '
                '%s' % missing[0])

    #run each linecode's lines through its dedicated parser
    parsed_dict = pairs_to_dict(raw_dict.items(), handlers=ref_parsers)
    return parsed_dict.pop('RN'), parsed_dict


## ref_parsers here below

def rx_parser(lines):
    """return a dict of {bibliographic_db: identifier} from RX lines.

    The RX (Reference cross-reference) line is optional and records the
    identifier of the reference in a bibliographic database.  Format:
    RX   Bibliographic_db=IDENTIFIER[; Bibliographic_db=IDENTIFIER...];

    Valid database names: MEDLINE (eight-digit UI), PubMed (PMID) and
    DOI, e.g. DOI=10.2345/S1384107697000225
    """
    content = labeloff(lines)
    return join_split_dict_parser(content, delimiters=['; ', '='])

def rc_parser(lines):
    """return a dict of {token: text} from RC lines.

    The RC (Reference Comment) lines are optional and store comments
    relevant to the cited reference.  Format:
    RC   TOKEN1=Text; TOKEN2=Text; ...

    Defined tokens, in their order on the line:
    STRAIN TISSUE TRANSPOSON PLASMID
    """
    return join_split_dict_parser(labeloff(lines))

def rg_parser(lines):
    """return a list of group-name strings from RG lines.

    The Reference Group (RG) line lists the consortium name for a
    citation; mainly used in submission references, but also in paper
    references when the working group is cited as an author.  RG and RA
    may coexist; at least one of them is mandatory per reference block.
    Example:
    RG   The mouse genome sequencing consortium;
    """
    return join_split_parser(labeloff(lines))

def ra_parser(lines):
    """return a list of author names from RA lines.

    The RA (Reference Author) lines list the paper's authors in order,
    surname first, then blank, then period-separated initials; names are
    comma-separated and the list ends with ';'.  RA may be missing when a
    reference group (RG) is cited instead; at least one of RG/RA is
    mandatory per reference block.  Examples:
    RA   Galinier A., Bleicher F., Negre D., Perriere G., Duclos B.;
    RA   Nasoff M.S., Baker H.V. II, Wolf R.E. Jr.;
    """
    content = labeloff(lines)
    return join_split_parser(content, chars_to_strip=';', delimiters=',')

def rp_parser(lines):
    """return the RP comment joined into one str, stripped of '. '.

    The RP (Reference Position) lines describe the extent of the work
    relevant to the entry carried out by the authors.  Format:
    RP   COMMENT.
    """
    return ' '.join(labeloff(lines)).strip('. ')

def rl_parser(lines):
    """return the RL citation joined into one str, stripped of '. '.

    Note: not complete parsing -- the citation text is joined verbatim,
    not decomposed into fields.

    The RL (Reference Location) lines hold the conventional citation; in
    general they alone suffice to find the paper.  An RL entry is one of:
    a) journal citation: RL   Journal_abbrev Volume:First-Last(YYYY).
       ('in press' papers appear as e.g. Int. J. Parasitol. 0:0-0(2005).)
    b) electronic publication: RL   (er) Free text.
    c) book citation:
       RL   (In) Editor_1 I.[, Editor_X I.] (eds.);
       RL   Book_name, pp.[Volume:]First-Last, Publisher, City (YYYY).
    d) unpublished results: RL   Unpublished results, cited by: ...
    e) unpublished observations: RL   Unpublished observations (MMM-YYYY).
    f) thesis: RL   Thesis (Year), Institution_name, Country.
    g) patent: RL   Patent number Pat_num, DD-MMM-YYYY.
    h) submission: RL   Submitted (MMM-YYYY) to Database_name.
       where Database_name is EMBL/GenBank/DDBJ, Swiss-Prot, PDB or PIR.
    """
    return ' '.join(labeloff(lines)).strip('. ')

def rt_parser(lines):
    """return the title joined into one str, stripped of '."' and ';'.

    The RT (Reference Title) lines give the title of the cited paper (or
    other work) as exactly as the character set allows.  Format:
    RT   "Title.";
    """
    return ' '.join(labeloff(lines)).strip('.";')


def rn_parser(lines):
    """return the reference number as an int from the (single) RN line.

    The RN (Reference Number) line numbers each citation sequentially
    within an entry; comments and feature-table notes refer to it.
    Format:
    RN   [##]
    """
    content = labeloff(lines)
    return int(content[0].strip(' []'))

#dispatch table: reference linecode -> its parser; also used by
#linecode_merging_maker to recognize which linecodes belong to a reference
ref_parsers = {
    'RN': rn_parser,
    'RP': rp_parser,
    'RC': rc_parser,
    'RX': rx_parser,
    'RG': rg_parser,
    'RA': ra_parser,
    'RT': rt_parser,
    'RL': rl_parser,
}

#labels every entry must carry; '' is the linecode of sequence lines and
#'REF' collects all reference linecodes (see linecode_merging_maker)
required_labels = 'ID AC DT DE OS OC OX SQ REF'.split() + ['']
#################################
# Minimal Ebi parser
def MinimalEbiParser(lines, strict=True, selected_labels=[]):
    """yield each (sequence as a str, a dict of header) from ebi record lines

    lines: iterable of lines holding one or more EBI-format records.
    strict: if True (default), raise RecordError if a record does not begin
        with an ID line or lacks one of the required labels.
    selected_labels: if non-empty, only these linecodes are kept in the
        header dict.  NOTE(review): mutable default argument -- harmless
        here since it is only read, never mutated.

    Warning: using the global required_labels.

    Line code   Content     Occurrence in an entry
    ID  Identification  Once; starts the entry
    AC  Accession number(s) Once or more
    DT  Date    Three times
    DE  Description Once or more
    GN  Gene name(s)    Optional
    OS  Organism species    Once
    OG  Organelle   Optional
    OC  Organism classification Once or more
    OX  Taxonomy cross-reference    Once
    RN  Reference number    Once or more
    RP  Reference position  Once or more
    RC  Reference comment(s)    Optional
    RX  Reference cross-reference(s)    Optional
    RG  Reference group Once or more (Optional if RA line)
    RA  Reference authors   Once or more (Optional if RG line)
    RT  Reference title Optional
    RL  Reference location  Once or more
    CC  Comments or notes   Optional
    DR  Database cross-references   Optional
    KW  Keywords    Optional
    FT  Feature table data  Optional
    SQ  Sequence header Once
    (blanks)    Sequence data   Once or more
    //  Termination line    Once; ends the entry

    The two-character line-type code that begins each line is always followed
    by three blanks, so that the actual information begins with the sixth
    character. Information is not extended beyond character position 75 except
    for one exception: CC lines that contain the 'DATABASE' topic"""
    for record in EbiFinder(lines):
        if strict and not record[0].startswith('ID'): 
            raise RecordError('Record must begin with ID line')
        del record[-1] #which must be //, ensured by Finder

        #pair each line with its (reference-merged) linecode, then group
        #the lines by linecode
        keyvalues = map(linecode_merging_maker, record)
        raw_dict = pairs_to_dict(keyvalues, 'always_multi_value',
                all_keys=_parsers)

        if strict:
            for rlabel in required_labels:
                if rlabel not in raw_dict:
                    raise RecordError('The record lacks required label: '\
                        '%s' % rlabel)

        sequence = raw_dict.pop('')  #which is the linecode for sequence
        #Python 2 two-argument translate: identity mapping (all_chars) plus
        #deletion of tabs, newlines and spaces from the joined sequence
        sequence = ''.join(sequence).translate(all_chars,'\t\n ')
        
        if selected_labels:
            #keys() is a list in Python 2, so deleting entries while
            #iterating over it is safe here
            for key in raw_dict.keys():
                if key not in selected_labels:
                    del raw_dict[key]
        
        header_dict = raw_dict 
        yield sequence, header_dict

def linecode_merging_maker(line):
    """return (merged_linecode, line) for one record line.

    Any linecode handled by ref_parsers (RN, RP, RC, ...) is reported as
    'REF', so a whole reference block groups under a single key.

    Warning: using global ref_parsers"""
    code = linecode_maker(line)[0]
    merged = 'REF' if code in ref_parsers else code
    return merged, line

#################################
# EbiParser
def parse_header(header_dict, strict=True):
    """Parse a dict of {label: lines} into a dict of parsed header fields.

    strict: accepted for interface compatibility but currently unused.
    """
    return pairs_to_dict(header_dict.items(), 'no_duplicated_key',
            handlers=_parsers)
 
#dispatch table: linecode -> field parser, used by parse_header (and as
#the set of known linecodes in MinimalEbiParser).  '' (sequence lines) and
#'//' (terminator) map to None: MinimalEbiParser pops/deletes them before
#header parsing, so they never reach a handler.
_parsers = {
    'ID': id_parser,
    'AC': ac_parser,
    'DE': de_parser,
    'DT': dt_parser,
    'GN': gn_parser,
    'OC': oc_parser, 
    'OS': os_parser, 
    'OX': ox_parser, 
    'OG': og_parser, 
    'REF': refs_parser,
    'CC': cc_parser,
    'DR': dr_parser,
    'KW': kw_parser,
    'FT': ft_parser,
    'SQ': sq_parser,
    '': None,
    '//': None,
}

def EbiParser(lines, seq_constructor=Sequence,
        header_constructor=parse_header, strict=True, selected_labels=[]):
    """Parser for the EBI data format; yield (sequence, header) pairs.

    lines: input data (list of lines or file stream)
    seq_constructor: constructor function to construct sequence, 'Sequence'
        by default.
    header_constructor: function to process the header information. Default
        is 'parse_header'
    strict: whether an exception should be raised in case of a problem
        (strict=True) or whether the bad record should be skipped
        (strict=False).
    selected_labels: Labels from the original data format that you want
        returned. All the original header labels are used, except for
        REFERENCES, which is 'REF'.
    """
    for sequence, header_dict in MinimalEbiParser(lines, strict=strict,
            selected_labels=selected_labels):
        if seq_constructor:
            sequence = seq_constructor(sequence)
        try:
            header = header_constructor(header_dict, strict=strict)
        #fix: dropped the old ', e' binding -- it was unused and is
        #Python-2-only syntax; a bare raise keeps the original traceback
        except (RecordError, FieldError, ValueError):
            if strict:
                raise
            continue  #skip the bad record in non-strict mode

        yield sequence, header


#ad-hoc command-line demo (Python 2: print statements, file()): parse the
#file named on the command line, or the embedded sample records below
if __name__ == "__main__":
    from getopt import getopt, GetoptError
    usage = """ Usage: python __.py [options] [source]

Options:
  -h, --help              show this help
  -d                      show debugging information while parsing

Examples:
"""
    try:
        opts, args = getopt(sys.argv[1:], "hd", ["help"])
    except GetoptError:
        print usage; sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print usage; sys.exit()
    if args:
        #parse a real data file, reporting each entry name as we go
        lines = file(args[0])
        print 'Parsing the file'
        for i, rec in enumerate(EbiParser(lines, strict=True)):
            print '\r %s: %s' % (i,rec[1]['ID']['EntryName']) ,
    else: 
        #no source given: run on this embedded two-record sample (the
        #second record deliberately lacks labels, exercising strict=False)
        lines="""\
ID   Q9U9C5_CAEEL   PRELIMINARY;      PRT;   218 AA. 
AC   Q9U9C5;hdfksfsdfs;sdfsfsfs;
DT   01-MAY-2000 (TrEMBLrel. 13, Created)
DT   01-MAY-2000 (TrEMBLrel. 13, Last sequence update)
DT   13-SEP-2005 (TrEMBLrel. 31, Last annotation update)
DE   Basic salivary proline-rich protein 4 allele L (Salivary proline-rich 
DE   protein Po) (Parotid o protein) [Contains: Peptide P-D (aa); BB (bb) 
DE   (bbb)] (Fragment).
GN   Name=nob-1; ORFNames=Y75B8A.2, Y75B8A.2B;
GN   and
GN   Name=Jon99Ciii; Synonyms=SER2, SER5, Ser99Db; ORFNames=CG15519;
OS   Caenorhabditis elegans (aa) (bb).
OC   Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
OC   Rhabditidae; Peloderinae; Caenorhabditis.
OG   Plastid; Apicoplast.
OG   Plasmid R6-5, Plasmid IncFII R100 (NR1), and
OG   Plasmid IncFII R1-19 (R1 drd-19).
OX   NCBI_TaxID=6239;
RN   [1]
RP   NUCLEOTIDE SEQUENCE.
RC   STRAIN=N2;
RX   MEDLINE=20243724; PubMed=10781051; DOI=10.1073/pnas.97.9.4499;
RA   Van Auken K., Weaver D.C., Edgar L.G., Wood W.B.;
RT   "Caenorhabditis elegans embryonic axial patterning requires two
RT   recently discovered posterior-group Hox genes.";
RL   Proc. Natl. Acad. Sci. U.S.A. 97:4499-4503(2000).
RN   [2]
RP   NUCLEOTIDE SEQUENCE.
RC   STRAIN=N2;
RG   The mouse genome sequencing consortium;
RL   Submitted (JUL-1999) to the EMBL/GenBank/DDBJ databases.
CC   -!- SUBCELLULAR LOCATION: Nuclear (By similarity).
CC   -!- DATABASE: NAME=slkdfjAtlas Genet. Cytogenet. Oncol. Haematol.;
CC       WWW="http://www.infobiogen.fr/services/chromcancer/Genes/
CC   -!- DATABASE: NAME=Atlas Genet. Cytogenet. Oncol. Haematol.;
CC       WWW="http://www.infobiogen.fr/services/chromcancer/Genes/
CC       P53ID88.html".
CC   -!- INTERACTION:
CC       P51617:IRAK1; NbExp=1; IntAct=EBI-448466, EBI-358664;
CC       P51617:IRAK1; NbExp=1; IntAct=EBI-448472, EBI-358664;
CC   -!- ALTERNATIVE PRODUCTS:
CC       Event=Alternative splicing; Named isoforms=3;
CC         Comment=Additional isoforms seem to exist. Experimental
CC         confirmation may be lacking for some isoforms;
CC       Name=1; Synonyms=AIRE-1;
CC         IsoId=O43918-1; Sequence=Displayed;
CC       Name=2; Synonyms=AIRE-2;
CC         IsoId=O43918-2; Sequence=VSP_004089;
CC       Name=3; Synonyms=AIRE-3;
CC         IsoId=O43918-3; Sequence=VSP_004089, VSP_004090;
CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
CC       Kinetic parameters:
CC         KM=98 uM for ATP;
CC         KM=688 uM for pyridoxal;
CC         Vmax=1.604 mmol/min/mg enzyme;
CC       pH dependence:
CC         Optimum pH is 6.0. Active from pH 4.5 to 10.5;
CC   -!- MASS SPECTROMETRY: MW=13822; METHOD=MALDI; RANGE=19-140 (P15522-
CC       2); NOTE=Ref.1.
CC   --------------------------------------------------------------------------
CC   This SWISS-PROT entry is copyright. It is produced through a collaboration
CC   removed.
CC   --------------------------------------------------------------------------
DR   EMBL; AF172090; AAD48874.1; -; mRNA.
DR   EMBL; AL033514; CAC70124.1; -; Genomic_DNA.
DR   HSSP; P02833; 9ANT.
KW   Complete proteome; DNA-binding; Developmental protein; Homeobox;
KW   Hypothetical protein; Nuclear protein.
FT   DNA_BIND    >102    292
FT   REGION        1     44       Transcription activation (acidic).
FT   CHAIN        23    611       Halfway protein.
FT                                /FTId=PRO_0000021413.
FT   VARIANT       1      7       unknown  (in a skin tumor).
FT                                /FTId=VAR_005851.
FT   VARIANT       7      7       D -> H (in a skin tumor).
FT                                /FTId=VAR_005851.
FT   CONFLICT    282    282       R -> Q (in Ref. 18).
FT   STRAND      103    103
FT   NON_TER     80     80        non_ter.
SQ   SEQUENCE   218 AA;  24367 MW;  F24AE5E8A102FAC6 CRC64;
     MISVMQQMIN NDSPEDSKES ITSVQQTPFF WPSAAAAIPS IQGESRSERE SETGSSPQLA
     PSSTGMVMPG TAGMYGFGPS RMPTANEFGM MMNPVYTDFY QNPLASTDIT IPTTAGSSAA
     TTPNAAMHLP WAISHDGKKK RQPYKKDQIS RLEYEYSVNQ YLTNKRRSEL SAQLMLDEKQ
     VKVWFQNRRM KDKKLRQRHS GPFPHGAPVT PCIERLIN
//
ID   Q9U9C5_TEST   PRELIMINARY;      PRT;   218 AA. 
DT   ddd.
AC   Q9U9C5;hdfksfsdfs;sdfsfsfs;
SQ   SEQUENCE   218 AA;  24367 MW;  F24AE5E8A102FAC6 CRC64;
     MISVMQQMIN NDSPEDSKES ITSVQQTPFF WPSAAAAIPS IQGESRSERE
//
""".split('\n')
        pprint(list(EbiParser(lines,strict=False, selected_labels=[])))


    #from time import time
    ##sys.exit()
    #if len(sys.argv) > 1:
    #    #f = file('/home/zongzhi/Projects/SNP/working/data/uniprot_sprot_human.dat')
    #    f = file('/home/zongzhi/Projects/SNP/working/data/uniprot_sprot_fungi.dat')
    #    #f = file('/home/zongzhi/Projects/SNP/snp_tests/ebi_test.txt')

    #    i = 0
    #    for sequence, head in MinimalEbiParser(f):
    #        i += 1
    #        if i>10000: sys.exit()
    #        print '%s \r' % i, 
    #        try:
    #            de = ' '.join(head['OG'])
    #        except KeyError, e:
    #            pass
    #            #print e 
    #        else:
    #            if 'Plasmid' in de:
    #                print de, '\n'