File: obitab.py

package info (click to toggle)
obitools 1.2.13%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,652 kB
  • sloc: python: 18,199; ansic: 1,542; makefile: 98
file content (178 lines) | stat: -rw-r--r-- 5,521 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/python3
'''
:py:mod:`obitab`: converts a sequence file to a tabular file
============================================================

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

The :py:mod:`obitab` command converts a sequence file to a tabular file that
can be opened by a spreadsheet program or R.

'''

from obitools.options import getOptionManager
from obitools.format.options import addInOutputOption

def addTableOptions(optionManager):
    '''
    Register the command line options specific to the tabular output.

    The options are declared on `optionManager` through its ``add_option``
    method, in this order: the string used for missing values
    (``-n``/``--na-string``), the field separator
    (``--output-field-separator``), the flag adding a sequence column
    (``-o``/``--output-seq``), the flag removing the definition column
    (``-d``/``--no-definition``) and the repeatable list of attribute
    names to leave out (``-a``/``--omit-attribute``).

    :param optionManager: object exposing an ``add_option`` method with
                          optparse-like semantics
    '''
    # Each entry pairs the option flags with the keyword settings handed
    # to add_option; the registration order is the declaration order.
    declarations = (
        (('-n', '--na-string'),
         dict(action="store", dest="NA",
              metavar="<NOT AVAILABLE STRING>",
              type="string",
              default="NA",
              help="String write in the table for not available value")),
        (('', '--output-field-separator'),
         dict(action="store", dest="ofs",
              metavar="STRING",
              type="string",
              default="\t",
              help="Field separator for CSV file")),
        (('-o', '--output-seq'),
         dict(action="store_true", dest="sequence",
              default=False,
              help="Add an extra column for sequence")),
        (('-d', '--no-definition'),
         dict(action="store_false", dest="definition",
              default=True,
              help="Remove column for sequence definition")),
        (('-a', '--omit-attribute'),
         dict(action="append", dest="omit",
              metavar="<KEY>",
              default=[],
              help="Add attribute name to omit in the output tab")),
    )

    for flags, settings in declarations:
        optionManager.add_option(*flags, **settings)


def headerCmp(h1,h2):
    '''
    Three-way comparison of two column headers.

    A header is either a plain string (a simple column name) or a tuple
    ``(name, subkey)`` (one sub-column of a dictionary attribute).
    Headers are ordered on their name first; the sub-key is only used to
    break ties between two tuple headers. A string header and a tuple
    header sharing the same name compare equal.

    :param h1: first header, a `str` or a `tuple`
    :param h2: second header, a `str` or a `tuple`

    :return: a negative integer, zero or a positive integer when `h1` is
             respectively lower than, equal to or greater than `h2`
    :raise AssertionError: if one of the headers is neither a `str`
                           nor a `tuple`
    '''
    def _cmp(a, b):
        # Python 3 has no builtin cmp(): rebuild the three-way result
        # from the rich comparison operators.
        try:
            return (a > b) - (a < b)
        except TypeError:
            # Values of unrelated types (e.g. a str and a tuple sub-key)
            # cannot be ordered in Python 3. Fall back on a deterministic
            # order: type name first, then textual representation.
            ka = (type(a).__name__, str(a))
            kb = (type(b).__name__, str(b))
            return (ka > kb) - (ka < kb)

    if not isinstance(h1, (str, tuple)) or not isinstance(h2, (str, tuple)):
        raise AssertionError

    name1 = h1 if isinstance(h1, str) else h1[0]
    name2 = h2 if isinstance(h2, str) else h2[0]

    c = _cmp(name1, name2)

    # The sub-key only matters when both headers are sub-columns.
    if c==0 and isinstance(h1, tuple) and isinstance(h2, tuple):
        c = _cmp(h1[1], h2[1])

    return c
            
    

    

if __name__=='__main__':

    # Local import: cmp_to_key adapts the three-way comparator headerCmp
    # to the key based sorting API, the only one available in Python 3
    # (list.sort no longer accepts a comparison function).
    from functools import cmp_to_key

    optionParser = getOptionManager([addTableOptions,addInOutputOption])

    (options, entries) = optionParser()

    # First pass: keep every entry in memory and record, for each
    # attribute, the set of value types met (column) and, for dictionary
    # values, the union of their keys (subcol).
    column = {}
    subcol = {}
    db = []
    for seq in entries:
        db.append(seq)
        for k in list(seq.keys()):
            value = seq[k]
            t = type(value)
            column.setdefault(k, set()).add(t)
            if t is dict:
                subcol.setdefault(k, set()).update(value.keys())

    # Resolve one type per attribute: attributes with several types or
    # with a type outside the supported ones are handled as strings.
    # Dictionary attributes give one header per sub-key.
    headers = set()
    for c in column:
        types = column[c]
        ctype = types.pop() if len(types)==1 else str
        if ctype not in (str,int,float,dict,bool):
            ctype = str
        column[c] = ctype

        if ctype is not dict:
            headers.add(c)
        else:
            for sc in subcol[c]:
                headers.add((c,sc))

    omit = set(options.omit)
    headers = sorted(headers, key=cmp_to_key(headerCmp))

    OFS = options.ofs
    NA = options.NA

    # Header line
    fields = ["id"]
    if options.definition:
        fields.append("definition")

    for k in headers:
        if type(k) is str:
            if k not in omit:
                fields.append(k)
        elif k[0] not in omit:
            if type(k[1]) is tuple:
                sk = ":".join([str(x) for x in k[1]])
            else:
                sk = str(k[1])
            # The 'merged_' prefix is dropped from the column title
            name = k[0][7:] if k[0][0:7]=='merged_' else k[0]
            fields.append('%s:%s' % (name,sk))

    if options.sequence:
        fields.append("sequence")
    print(OFS.join(fields))

    # Second pass: one line per entry, cells in the header order
    for seq in db:
        fields = [str(seq.id)]

        if options.definition:
            fields.append(str(seq.definition))

        for k in headers:
            if type(k) is str:
                if k in omit:
                    continue
                v = seq[k] if k in seq else None
                if v is None:
                    v = NA
                fields.append(str(v))
            else:
                if k[0] in omit:
                    continue
                sk = seq[k[0]] if k[0] in seq else {}
                if k[1] in sk:
                    v = sk[k[1]]
                    if v is None:
                        v = NA
                    fields.append(str(v))
                elif k[0][0:7]=='merged_':
                    # a missing sub-key of a 'merged_' attribute is
                    # written as 0 instead of the NA string
                    fields.append('0')
                else:
                    fields.append(str(NA))

        if options.sequence:
            fields.append(str(seq))

        print(OFS.join(fields))