File: report.py

package info (click to toggle)
python-seqcluster 1.2.9%2Bds-3
  • links: PTS, VCS
  • area: contrib
  • in suites: bookworm
  • size: 113,624 kB
  • sloc: python: 5,308; makefile: 184; sh: 122; javascript: 55
file content (153 lines) | stat: -rw-r--r-- 4,778 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from __future__ import print_function
import logging
import string
import os

from math import log as mlog2
from collections import Counter, defaultdict

from seqcluster.libs.read import map_to_precursors, precursor_sequence, map_to_precursor_biopython
from seqcluster.libs.utils import safe_dirs
from progressbar import ProgressBar

from seqcluster.libs.utils import file_exists
from seqcluster.function.rnafold import run_rnafold
from seqcluster.html import HTML
from seqcluster import templates

logger = logging.getLogger('html')


def _get_ann(dbs, features):
    """
    Gives format to annotation for html table output
    """
    value = ""
    for db, feature in zip(dbs, features):
        value += db + ":" + feature
    return value


def _parse(profile, size):
    total = Counter()
    for sample in profile:
        for pos in range(len(size)):
            total[pos] += profile[sample][pos]
    return total.values()


def make_profile(data, out_dir, args):
    """
    Make data report for each cluster
    """
    safe_dirs(out_dir)
    main_table = []
    header = ['id', 'ann']
    n = len(data[0])
    bar = ProgressBar(maxval=n).start()
    bar.update(0)
    for itern, c in enumerate(data[0]):
        bar.update(itern)
        logger.debug("creating cluser: {}".format(c))
        safe_dirs(os.path.join(out_dir, c))
        valid, ann, pos_structure = _single_cluster(c, data, os.path.join(out_dir, c, "maps.tsv"), args)
        data[0][c].update({'profile': pos_structure})
        loci = data[0][c]['loci']
        logger.debug("precursor_sequence")
        data[0][c]['precursor'] = {"seq": precursor_sequence(loci[0][0:5], args.ref)}
        logger.debug("parse alignments")
        data[0][c]['precursor']["colors"] = list(_parse(data[0][c]['profile'], data[0][c]['precursor']["seq"]))
        logger.debug("update rnafold")
        data[0][c]['precursor'].update(run_rnafold(data[0][c]['precursor']['seq']))

    return data


def _expand(dat, counts, start, end):
    """
    expand the same counts from start to end
    """
    for pos in range(start, end):
        for s in counts:
            dat[s][pos] += counts[s]
    return dat


def _convert_to_df(in_file, freq, raw_file):
    """
    convert data frame into table with pandas
    """
    dat = defaultdict(Counter)
    if isinstance(in_file, (str, bytes)):
        with open(in_file) as in_handle:
            for line in in_handle:
                cols = line.strip().split("\t")
                counts = freq[cols[3]]
                dat = _expand(dat, counts, int(cols[1]), int(cols[2]))
    else:
        if raw_file:
            out_handle = open(raw_file, "w")
        for name in in_file:
            counts = freq[name]
            if raw_file:
                print("%s\t%s\t%s\t%s\t%s\t%s" % ("chr", in_file[name][0], in_file[name][1], name, sum(counts.values()), "+"), file=out_handle, end="")
            dat = _expand(dat, counts, in_file[name][0], in_file[name][1])

    for s in dat:
        for p in dat[s]:
            dat[s][p] = mlog2(dat[s][p] + 1)
    return dat


def _make(c):
    """
    create html from template, adding figure,
    annotation and sequences counts
    """
    ann = defaultdict(list)

    for pos in c['ann']:
        for db in pos:
            ann[db] += list(pos[db])
    logger.debug(ann)

    valid = [l for l in c['valid']]
    ann_list = [", ".join(list(set(ann[feature]))) for feature in ann if feature in valid]

    return valid, ann_list


def _single_cluster(c, data, out_file, args):
    """
    Map sequences on precursors and create
    expression profile
    """
    valid, ann = 0, 0
    raw_file = None
    freq = defaultdict()
    [freq.update({list(s.keys())[0]: list(s.values())[0]}) for s in data[0][c]['freq']]
    names = [list(s.keys())[0] for s in data[0][c]['seqs']]
    seqs = [list(s.values())[0] for s in data[0][c]['seqs']]
    loci = data[0][c]['loci']

    if loci[0][3] - loci[0][2] > 500:
        logger.info("locus bigger > 500 nt, skipping: %s" % loci)
        return valid, ann, {}
    if not file_exists(out_file):
        if args.razer:
            logger.debug("map with razer all sequences to all loci %s " % loci)
            map_to_precursors(seqs, names, {loci[0][0]: [loci[0][0:5]]}, out_file, args)
        else:
            logger.debug("map with biopython fn all sequences to all loci %s " % loci)
            if args.debug:
                raw_file = out_file
            out_file = map_to_precursor_biopython(seqs, names, loci[0][0:5], args)

    logger.debug("plot sequences on loci")
    df = _convert_to_df(out_file, freq, raw_file)
    
    if df:
        logger.debug("create html")
        valid, ann = _make(data[0][c])
    logger.debug("done single cluster")
    return valid, ann, df