File: tool.py

package info (click to toggle)
python-seqcluster 1.2.9%2Bds-3
  • links: PTS, VCS
  • area: contrib
  • in suites: bookworm
  • size: 113,624 kB
  • sloc: python: 5,308; makefile: 184; sh: 122; javascript: 55
file content (138 lines) | stat: -rw-r--r-- 4,514 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import operator
import os

import math

import seqcluster.libs.logger as mylog

logger = mylog.getLogger(__name__)


REMOVED = 0


def get_distance(p1, p2):
        if p1.strand == "+":
                d = p1.start-p2.start
        else:
                d = (p1.end-(p1.end-p1.start)) - (p2.end-(p2.end-p2.start))
        return d


def get_ini_str(n):
        # get space values
        return "".join(" " for i in range(n))


def init_numlocidb(beds):
    dict = {}
    for filebed in beds:
        db =  os.path.basename(filebed)
        dict[db] = 0;
    return dict


def calc_complexity(nl):
    ns = len(nl)
    total = 0.0
    for l in nl:
        total += 1.0/l
        #print(total)
    total /= ns
    return (total)


def calculate_size(vector):
    maxfreq = 0
    zeros = 0
    counts = 0
    total = len(vector.keys())
    for s in vector.keys():
        maxfreq = max(vector[s], maxfreq)
        counts += int(vector[s])
        if vector[s] == 0:
            zeros += 1
    return (counts*((total-zeros)/total))


def show_seq(clus_obj, index):
    """Get the precursor and map sequences to it.
       this way we create a positional map."""
    current = clus_obj.clus
    clus_seqt = clus_obj.seq
    clus_locit = clus_obj.loci

    itern = 0
    for idc in current.keys():
        itern += 1
        timestamp = str(idc)
        seqListTemp = ()
        f = open("/tmp/"+timestamp+".fa","w")
        for idl in current[idc].loci2seq.keys():
            seqListTemp = list(set(seqListTemp).union(current[idc].loci2seq[idl]))
        maxscore = 0
        for s in seqListTemp:
            score = calculate_size(clus_seqt[s].freq)
            maxscore = max(maxscore,score)
            clus_seqt[s].score = score
            seq = clus_seqt[s]
            f.write(">"+s+"\n"+seq.seq+"\n")
        f.close()

        locilen_sorted = sorted(current[idc].locilen.iteritems(), key = operator.itemgetter(1),reverse = True)
        lmax = clus_locit[locilen_sorted[0][0]]
        f = open("/tmp/"+timestamp+".bed","w")
        f.write("%s\t%s\t%s\t.\t.\t%s\n" % (lmax.chr,lmax.start,lmax.end,lmax.strand))
        f.close()
        os.system("bedtools  getfasta -s -fi "+index+" -bed /tmp/"+timestamp+".bed -fo /tmp/"+timestamp+".pre.fa")
        os.system("bowtie2-build /tmp/"+timestamp+".pre.fa  /tmp/"+timestamp+".pre.ind >/dev/null 2>&1")
        os.system("bowtie2 --rdg 7,3 --mp 4 --end-to-end --no-head --no-sq  -D 20 -R 3 -N 0 -i S,1,0.8 -L 3 -f /tmp/"+timestamp+".pre.ind /tmp/"+timestamp+".fa -S /tmp/"+timestamp+".map  >>bowtie.log 2>&1")
        f = open("/tmp/"+timestamp+".map","r")
        seqpos = {}
        minv = 10000000
        for line in f:
            line = line.strip()
            cols = line.split("\t")
            seqpos[cols[0]] = int(cols[3])
            if minv>int(cols[3]):
                minv = int(cols[3])
        f.close()
        seqpos_sorted = sorted(seqpos.iteritems(), key = operator.itemgetter(1),reverse = False)
        showseq = ""
        showseq_plain = ""
        for (s,pos) in seqpos_sorted:
            ratio = (clus_seqt[s].score*1.0/maxscore*100.0)
            realScore = (math.log(ratio,2)*2)
            if realScore<0:
                realScore = 0
            # "score %s max %s  ratio %s real %.0f" % (clus_seqt[s].score,maxscore,ratio,realScore)
            ##calculate the mean expression of the sequence and change size letter
            showseq_plain += "<br>%s<a style = \"font-size:%.0fpx;\"href = javascript:loadSeq(\"%s\")>%s</a>" % ("".join("." for i in range(pos-1)),realScore+10,s,clus_seqt[s].seq)
            #showseq+ = seqviz.addseq(pos-1,clus_seqt[s].len,clus_seqt[s].seq)
        #current[idc].showseq = showseq
        current[idc].showseq_plain = showseq_plain
        os.system("rm /tmp/"+timestamp+"*")
    clus_obj.clus = current
    clus_obj.seq = clus_seqt
    return clus_obj


def generate_position_bed(clus_obj):
    ##generate file with positions in bed format
    bedaligned = ""
    clus_id = clus_obj.clus
    for idc in clus_id.keys():
        clus = clus_id[idc]
        for idl in clus.loci2seq.keys():
            pos = clus_obj.loci[idl]
            bedaligned +=  "%s\t%s\t%s\t%s\t%s\t%s\n" % (pos.chr,pos.start,pos.end,idc,idl,pos.strand)
    return bedaligned


def _normalize_seqs(s, t):
    """Normalize to DESeq2"""
    for ids in s:
        obj = s[ids]
        [obj.norm_freq.update({sample: 1.0 * obj.freq[sample] / t[sample]}) for sample in obj.norm_freq]
        s[ids] = obj
    return s