File: inputs.py

package info (click to toggle)
python-seqcluster 1.2.9+ds-3
  • links: PTS, VCS
  • area: contrib
  • in suites: bookworm
  • size: 113,624 kB
  • sloc: python: 5,308; makefile: 184; sh: 122; javascript: 55
file content (98 lines) | stat: -rw-r--r-- 3,080 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from collections import defaultdict
import pybedtools
import numpy as np
import pandas as pd

import seqcluster.libs.logger as mylog
from seqcluster.libs.classes import sequence
from seqcluster.libs.tool import _normalize_seqs


# Module-level logger, obtained from the project's logger helper and named
# after this module.
logger = mylog.getLogger(__name__)


def parse_align_file(file_in):
    """
    Convert an alignment file into BED-formatted text.

    The input is wrapped in a ``pybedtools.BedTool`` and converted with
    ``bam_to_bed``. Every resulting record is rewritten as a six-column,
    tab-separated line in which the score column is replaced by a running
    locus id.

    :param file_in: alignment file handed to ``pybedtools.BedTool``
        (presumably a BAM path, since ``bam_to_bed`` is applied to it).
    :returns: one string holding a
        ``chrom<TAB>start<TAB>end<TAB>name<TAB>loc_id<TAB>strand`` line per
        record, each terminated by a newline; an empty string when there
        are no records.
    """
    bamfile = pybedtools.BedTool(file_in)
    bed = pybedtools.BedTool.bam_to_bed(bamfile)
    lines = []
    # Ids start at 2: the counter used to be initialised to 1 and bumped
    # before its first use, and that numbering is kept unchanged here.
    for loc_id, record in enumerate(bed, start=2):
        c, start, end, name, _score, strand = record
        lines.append("%s\t%s\t%s\t%s\t%s\t%s\n" %
                     (c, start, end, name, loc_id, strand))
    # Join once instead of growing a string with ``+=`` on every record.
    return "".join(lines)


def parse_ma_file(seq_obj, in_file):
    """
    Read a seqs.ma count matrix and update already known sequence objects.

    The file is tab separated. The first line is a header whose columns
    from the third one onwards are the sample names. Every following line
    holds a sequence id (``seq_<number>``), the sequence itself and one
    integer count per sample. Blank lines are ignored.

    :param seq_obj: mapping from integer sequence id to sequence object.
        Only ids already present are updated (through ``set_freq`` and
        ``set_seq``); rows with other ids still contribute to the totals
        and to the size factors.
    :param in_file: path to the seqs.ma file.
    :returns: tuple ``(seq_obj, total, index)`` where ``seq_obj`` is the
        value returned by ``_normalize_seqs``, ``total`` maps every sample
        to the sum of its counts and ``index`` is the number of data rows
        read plus one.
    :raises ValueError: if an id or a count cannot be parsed as an integer.
    :raises IndexError: if a row has fewer columns than the header.
    """
    index = 1
    total = defaultdict(int)
    ratio = []
    with open(in_file) as handle_in:
        samples = handle_in.readline().strip().split("\t")[2:]
        for line in handle_in:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            cols = line.split("\t")
            name = int(cols[0].replace("seq_", ""))
            seq = cols[1]
            exp = {}
            for i, sample in enumerate(samples):
                count = int(cols[i + 2])
                exp[sample] = count
                total[sample] += count
            counts = np.array(list(exp.values()))
            # Counts of this row relative to the row mean.
            ratio.append(counts / np.mean(counts))
            index += 1
            if name in seq_obj:
                seq_obj[name].set_freq(exp)
                seq_obj[name].set_seq(seq)
    df = pd.DataFrame(ratio)
    # Drop every row that contains a zero ratio before taking the medians.
    df = df[(df.T != 0).all()]
    # One factor per sample: the column-wise median of the remaining ratios.
    size_factor = dict(zip(samples, df.median(axis=0)))
    seq_obj = _normalize_seqs(seq_obj, size_factor)
    return seq_obj, total, index


def parse_ma_file_raw(in_file):
    """
    Read a seqs.ma count matrix and build a sequence object for every row.

    The file is tab separated. The first line is a header whose columns
    from the third one onwards are the sample names. Every following line
    holds a sequence id (``seq_<number>``), the sequence itself and one
    integer count per sample. Blank lines are ignored.

    :param in_file: path to the seqs.ma file.
    :returns: tuple ``(seq_obj, total, index)`` where ``seq_obj`` is the
        value returned by ``_normalize_seqs`` for the mapping from integer
        sequence id to the sequence object created here, ``total`` maps
        every sample to the sum of its counts and ``index`` is the number
        of data rows read plus one.
    :raises ValueError: if an id or a count cannot be parsed as an integer.
    :raises IndexError: if a row has fewer columns than the header.
    """
    index = 1
    total = defaultdict(int)
    seq_obj = defaultdict(sequence)
    ratio = []
    with open(in_file) as handle_in:
        samples = handle_in.readline().strip().split("\t")[2:]
        for line in handle_in:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            cols = line.split("\t")
            name = int(cols[0].replace("seq_", ""))
            seq = cols[1]
            exp = {}
            for i, sample in enumerate(samples):
                count = int(cols[i + 2])
                exp[sample] = count
                total[sample] += count
            counts = np.array(list(exp.values()))
            # Counts of this row relative to the row mean.
            ratio.append(counts / np.mean(counts))
            index += 1
            # Create the object explicitly so it is built with its id.
            if name not in seq_obj:
                seq_obj[name] = sequence(name)
            seq_obj[name].set_freq(exp)
            seq_obj[name].set_seq(seq)
    df = pd.DataFrame(ratio)
    # Drop every row that contains a zero ratio before taking the medians.
    df = df[(df.T != 0).all()]
    # One factor per sample: the column-wise median of the remaining ratios.
    size_factor = dict(zip(samples, df.median(axis=0)))
    seq_obj = _normalize_seqs(seq_obj, size_factor)
    return seq_obj, total, index