File: simulator.py

package info (click to toggle)
python-seqcluster 1.2.9%2Bds-5
  • links: PTS, VCS
  • area: contrib
  • in suites: forky, sid, trixie
  • size: 113,636 kB
  • sloc: python: 5,308; makefile: 184; sh: 122; javascript: 55
file content (115 lines) | stat: -rw-r--r-- 3,457 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""simulate cluster over the genome"""
from __future__ import print_function
import random
from seqcluster.libs.read import get_fasta


def simulate(args):
    """Main function that manage simulatin of small RNAs"""
    if args.fasta:
        name = None
        seq = ""
        reads = dict()
        with open(args.fasta) as in_handle:
            for line in in_handle:
                if line.startswith(">"):
                    if name:
                        reads.update(_generate_reads(seq, name))
                    seq = ""
                    name = line[1:-1]
                else:
                    seq += line.strip()

        reads.update(_generate_reads(seq, name))
    _write_reads(reads, args.out)


def _generate_reads(seq, name):
    """Main function that create reads from precursors"""
    reads = dict()
    if len(seq) < 130 and len(seq) > 70:
        reads.update(_mature(seq[:40], 0, name))
        reads.update(_mature(seq[-40:], len(seq) - 40, name))
        reads.update(_noise(seq, name))
        reads.update(_noise(seq, name, 25))
    return reads


def _mature(subseq, absolute, c,  size=33, total=5000):
    """Create mature sequences around start/end"""
    reads = dict()
    probs = [0.1, 0.2, 0.4, 0.2, 0.1]
    end = 5 + size
    error = [-2, -1, 0, 1, 2]
    for error5 in error:
        for error3 in error:
            s = 5 - error5
            e = end - error3
            seen = subseq[s:e]
            counts = int(probs[error5 + 2] * probs[error3 + 2] * total) + 1
            name = "seq_%s_%s_%s_x%s" % (c, s + absolute, e + absolute, counts)
            reads[name] = (seen, counts)
    return reads


def _noise(seq, c, size=33, total=1000):
    """Create mature sequences around start/end"""
    reads = dict()
    seen = 0
    while seen < total:
        s = random.randint(0, len(seq) - size)
        e = s + size + random.randint(-5,5)
        p = random.uniform(0, 0.1)
        counts = int(p * total) + 1
        seen += counts
        name = "seq_%s_%s_%s_x%s" % (c, s, e, counts)
        reads[name] = (seq[s:e], counts)
    return reads


def _write_reads(reads, prefix):
    """
    Write fasta file, ma file and real position
    """
    out_ma = prefix + ".ma"
    out_fasta = prefix + ".fasta"
    out_real = prefix + ".txt"
    with open(out_ma, 'w') as ma_handle:
        print("id\tseq\tsample", file=ma_handle, end="")
        with open(out_fasta, 'w') as fa_handle:
            with open(out_real, 'w') as read_handle:
                for idx, r in enumerate(reads):
                    info = r.split("_")
                    print("seq_%s\t%s\t%s" % (idx, reads[r][0], reads[r][1]), file=ma_handle, end="")
                    print(">seq_%s\n%s" % (idx, reads[r][0]), file=fa_handle, end="")
                    print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (idx, r, reads[r][0], reads[r][1], info[1], info[2], info[3]), file=read_handle, end="")


def _get_precursor(bed_file, reference, out_fa):
    """
    get sequence precursor from position
    """
    get_fasta(bed_file, reference, out_fa)
    return 0


def _get_spot(precursor):
    """
    get spot that will be enriched
    """
    return 0


def _get_type(pob):
    """
    randomly decide if is small rna or degradation
    """
    return 0


def _random_sequences(precursor, start= None, end=None):
    """
    randomly get sequences around some nucleotides.
    It could be enriched in some positions
    """
    return 0