File: obidistribute.py

package info (click to toggle)
obitools 1.2.13%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,652 kB
  • sloc: python: 18,199; ansic: 1,542; makefile: 98
file content (141 lines) | stat: -rw-r--r-- 4,694 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python3
'''
:py:mod:`obidistribute`: Distributes sequence records over several sequence records files 
=========================================================================================

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

:py:mod:`obidistribute` distributes equitably a set of sequence records over several files 
(No sequence records are printed on standard output).

The number of files is set using the ``-n`` option (required). File names are built with a prefix if
provided (``-p`` option) and the file number (1 to ``n``).

*Example:*

    .. code-block:: bash
        
        > obidistribute -n 10 -p 'part' seq.fastq

    Reads the sequence records contained in the ``seq.fastq``
    file and distributes them over files ``part_1.fastq`` to ``part_10.fastq``.
'''

from obitools.options import getOptionManager
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
import math
from obitools.fasta import formatFasta
from obitools.fastq import formatFastq
from functools import reduce


def addDistributeOptions(optionManager):
    """Register the obidistribute command-line options on *optionManager*.

    A dedicated option group is created and two options are added to it:

    * ``-n`` / ``--number``: integer stored in ``number`` (default ``None``);
    * ``-p`` / ``--prefix``: string stored in ``prefix`` (default ``""``).
    """
    section = optionManager.add_option_group('obidistribute specific options')

    # One (flags, keyword settings) entry per option, registered in order.
    option_specs = (
        (('-n', '--number'),
         {"action": "store",
          "dest": "number",
          "metavar": "###",
          "type": "int",
          "default": None,
          "help": "Number of files to distribute over"}),
        (('-p', '--prefix'),
         {"action": "store",
          "dest": "prefix",
          "metavar": "<PREFIX FILENAME>",
          "type": "string",
          "default": "",
          "help": "prefix added at each file name"}),
    )

    for flags, settings in option_specs:
        section.add_option(*flags, **settings)
    
    
class OutFiles:
    """Route sequence records to per-key output files with a bounded number open.

    Calling the instance with a record builds a key from the values of the
    tags listed in ``options.tagname`` and appends the record to the file
    associated with that key.  Records lacking one of the tags go to the
    ``options.undefined`` file when one was given, and are skipped otherwise.

    At most ``MAX_OPEN`` key files are kept open at once; when the limit is
    exceeded the least recently used one is closed.  Files are opened in
    append mode so that a key whose file was closed can be reopened later
    without losing what was already written.
    """

    # Upper bound on the number of simultaneously open per-key files.
    MAX_OPEN = 100

    def __init__(self,options):
        """Store the routing configuration read from *options*.

        Reads ``tagname``, ``undefined``, ``prefix``, ``outputFormat`` and
        ``number`` from *options*; opens the ``undefined`` file for writing
        when its name is not ``None``.
        """
        # Set first so that close()/__del__ are safe even if a later step fails.
        self._files = {}
        self._undefined = None
        self._tags = options.tagname
        if options.undefined is not None:
            self._undefined=open(options.undefined,'w')
        self._prefix=options.prefix
        self._extension=options.outputFormat
        # Zero-padding width applied to integer keys in file names.
        self._digit = math.ceil(math.log10(options.number))

    def _filename(self,key):
        """Return the output file name associated with *key*.

        Integer keys are zero-padded to ``self._digit`` digits; any other key
        is inserted as its string form (formatting such a key with ``%d``
        would raise ``TypeError``).
        """
        if isinstance(key,int):
            label = "%0*d" % (self._digit,key)
        else:
            label = str(key)
        if self._prefix is not None:
            return "%s_%s.%s" % (self._prefix,label,self._extension)
        return "%s.%s" % (label,self._extension)

    def __getitem__(self,key):
        """Return the open file for *key*, opening it in append mode if needed."""
        handle = self._files.pop(key,None)
        if handle is None:
            handle = open(self._filename(key),'a')
        # Dicts preserve insertion order: re-inserting moves the key to the
        # most-recently-used end, so the first key is always the oldest one.
        self._files[key]=handle
        if len(self._files)>self.MAX_OPEN:
            oldest = next(iter(self._files))
            self._files.pop(oldest).close()
        return handle

    def __call__(self,seq):
        """Write *seq* to the file selected by its tag values.

        Nothing is written when *seq* lacks one of the tags and no
        ``undefined`` file was configured.
        """
        if all(tag in seq for tag in self._tags):
            key = "_".join(str(seq[tag]) for tag in self._tags)
            target = self[key]
        else:
            target = self._undefined
        if target is None:
            # No destination: skip instead of letting print() fall back to stdout.
            return
        if self._extension=="fasta":
            print(formatFasta(seq), file=target)
        else:
            print(formatFastq(seq), file=target)

    def close(self):
        """Close every open output file, including the ``undefined`` one."""
        # getattr guards against a partially initialised instance.
        files = getattr(self,'_files',None)
        while files:
            files.popitem()[1].close()
        undefined = getattr(self,'_undefined',None)
        if undefined is not None:
            undefined.close()
            self._undefined = None

    def __del__(self):
        self.close()

if __name__=='__main__':
    # Script entry point: read the records and deal them out, one at a time,
    # to ``options.number`` output files in round-robin order.

    optionParser = getOptionManager([addDistributeOptions,addInOutputOption], progdoc=__doc__)

    (options, entries) = optionParser()

    # Explicit checks rather than `assert`, which is stripped under `python -O`.
    if options.number is None:
        raise SystemExit("You must specify the number of parts")
    if options.number < 1:
        # log10() and the modulo below are both undefined for such values.
        raise SystemExit("The number of parts must be a positive integer")

    # Zero-padding width of the file number in the output file names.
    digit = math.ceil(math.log10(options.number))
    out=[]

    for rank, seq in enumerate(entries):
        if not out:
            # Writers are created only once a first record has been read,
            # so an empty input creates no output file.
            template = "%s_%%0%dd.%s" % (options.prefix,digit,options.outputFormat)
            out=[sequenceWriterGenerator(options,
                                         open(template % (part+1),"w"))
                 for part in range(options.number)
                ]

        # Round-robin: record number `rank` goes to writer `rank mod n`.
        out[rank % options.number](seq)

    del out