File: split_bins.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (47 lines) | stat: -rwxr-xr-x 1,282 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/python3

from operator import itemgetter
import os
from os import path
import sys
from Bio import SeqIO
import common
import subprocess

def print_usage():
    """Print the one-line command-line synopsis of this script to stdout."""
    usage = "Usage: split_bins.py <contigs> <binning info> <output directory> [-g]"
    print(usage)

# Positional arguments: <contigs> <binning info> <output directory> [-g].
# Fail with the usage line instead of an IndexError traceback when one of the
# three mandatory arguments is missing (sys.argv[2] is read further below).
if len(sys.argv) < 4:
    print_usage()
    # Same exit status an uncaught exception would have produced.
    sys.exit(1)

# Path to the FASTA file with the contigs to distribute over bins.
contigs = sys.argv[1]
# Base name of the contigs file without its extension.
sample, _ = path.splitext(path.basename(contigs))
# Directory that receives one "<bin>.fasta" file per bin.
out_dir = sys.argv[3]
# Any non-empty fourth argument (documented as "-g") turns gluing on.
glue = len(sys.argv) > 4 and bool(sys.argv[4])

# Binning annotation read from the file given as the second argument. It is
# used below as a mapping from a sequence ID to an iterable of bin names.
# NOTE(review): the meaning of the second argument (False) is defined in
# common.load_annotation and is not visible here.
binning = common.load_annotation(sys.argv[2], False)

if glue:
    # Group the annotation by common.extract_id(<sequence ID>) (presumably the
    # ID of the contig a split originates from - confirm in common) and count,
    # per group, how many times each bin was assigned.
    glue_binning = {}
    for split_id, split_bins in binning.items():
        votes = glue_binning.setdefault(common.extract_id(split_id), {})
        for bin_name in split_bins:
            votes[bin_name] = votes.get(bin_name, 0) + 1

# Bins are written in append mode, so FASTA files left over from a previous
# run must be removed before anything is written.
if path.isdir(out_dir):
    # Delete the stale files directly instead of running "rm" through a shell:
    # a directory name with spaces or shell metacharacters would otherwise
    # break (or be injected into) the command line.
    for entry in os.listdir(out_dir):
        # Same selection as the shell glob "*.fasta", which skips dot-files.
        if entry.startswith(".") or not entry.endswith(".fasta"):
            continue
        stale = path.join(out_dir, entry)
        try:
            os.remove(stale)
        except OSError as err:
            # Best effort, as before: report the failure on stderr and go on.
            print("Cannot remove {}: {}".format(stale, err), file=sys.stderr)
else:
    os.mkdir(out_dir)

# Append every sequence of the contigs file to the FASTA file of each bin it
# belongs to.
for record in SeqIO.parse(contigs, "fasta"):
    if not glue:
        # Plain mode: use the bins annotated for this exact sequence ID;
        # sequences without annotation are skipped.
        targets = binning.get(record.id, [])
    else:
        # Glue mode: pick the single bin with the highest count for the
        # group this sequence belongs to (first one wins on ties).
        votes = glue_binning.get(common.extract_id(record.id))
        targets = [max(votes, key=votes.get)] if votes else []
    for bin_name in targets:
        fasta_path = path.join(out_dir, bin_name + ".fasta")
        # Append mode: several sequences end up in the same bin file.
        with open(fasta_path, "a") as handle:
            SeqIO.write(record, handle, "fasta")