File: coral.py

package info (click to toggle)
python-seqcluster 1.2.7%2Bds-1
  • links: PTS, VCS
  • area: contrib
  • in suites: bullseye
  • size: 113,592 kB
  • sloc: python: 5,327; makefile: 184; sh: 122; javascript: 55
file content (175 lines) | stat: -rw-r--r-- 6,288 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""prepare data for CoRaL"""
from __future__ import print_function
import os.path as op
from collections import Counter
import pybedtools

from seqcluster.libs import utils
from seqcluster.libs.do import run, find_cmd
from seqcluster.function.rnafold import calculate_structure

min_trimmed_read_len = 14
max_trimmed_read_len = 50
seg_threshold = 2
seg_maxgap = 44
seg_minrun = min_trimmed_read_len
antisense_min_reads = 0

def prepare_bam(bam_in, precursors):
    """
    Clean BAM file to keep only position inside the bigger cluster
    """
    # use pybedtools to keep valid positions
    # intersect option with -b bigger_cluster_loci
    a = pybedtools.BedTool(bam_in)
    b = pybedtools.BedTool(precursors)
    c = a.intersect(b, u=True)
    out_file = utils.splitext_plus(op.basename(bam_in))[0] + "_clean.bam"
    c.saveas(out_file)
    return op.abspath(out_file)


def _select_anno(annotation):
    """
    Select annotation from multiple elements
    """
    options = annotation.split(",")
    return options[0]


def _reorder_columns(bed_file):
    """
    Reorder columns to be compatible with CoRaL
    """
    new_bed = utils.splitext_plus(bed_file)[0] + '_order.bed'
    with open(bed_file) as in_handle:
        with open(new_bed, 'w') as out_handle:
            for line in in_handle:
                cols = line.strip().split("\t")
                cols[3] = _select_anno(cols[3]) + "_" + cols[4]
                cols[4] = "0"
                print("\t".join(cols), file=out_handle, end="")
    return new_bed


def _fix_score_column(cov_file):
    """
    Move counts to score columns in bed file
    """
    new_cov = utils.splitext_plus(cov_file)[0] + '_fix.cov'
    with open(cov_file) as in_handle:
        with open(new_cov, 'w') as out_handle:
            for line in in_handle:
                cols = line.strip().split("\t")
                cols[4] = cols[6]
                print("\t".join(cols[0:6]), file=out_handle, end="")
    return new_cov


def detect_regions(bam_in, bed_file, out_dir, prefix):
    """
    Detect regions using first CoRaL module
    """
    bed_file = _reorder_columns(bed_file)
    counts_reads_cmd = ("coverageBed -s -counts -b {bam_in} "
                        "-a {bed_file} | sort -k4,4 "
                        "> {out_dir}/loci.cov")
    # with tx_tmpdir() as temp_dir:
    with utils.chdir(out_dir):
        run(counts_reads_cmd.format(min_trimmed_read_len=min_trimmed_read_len, max_trimmed_read_len=max_trimmed_read_len, **locals()), "Run counts_reads")
        loci_file = _fix_score_column(op.join(out_dir, "loci.cov"))
        return loci_file


def _order_antisense_column(cov_file, min_reads):
    """
    Move counts to score columns in bed file
    """
    new_cov = op.join(op.dirname(cov_file), 'feat_antisense.txt')
    with open(cov_file) as in_handle:
        with open(new_cov, 'w') as out_handle:
            print("name\tantisense", file=out_handle, end="")
            for line in in_handle:
                cols = line.strip().split("\t")
                cols[6] = 0 if cols[6] < min_reads else cols[6]
                print("%s\t%s" % (cols[3], cols[6]), file=out_handle, end="")
    return new_cov


def _reads_per_position(bam_in, loci_file, out_dir):
    """
    Create input for compute entropy
    """
    data = Counter()
    a = pybedtools.BedTool(bam_in)
    b = pybedtools.BedTool(loci_file)
    c = a.intersect(b, s=True, bed=True, wo=True)
    for line in c:
        end = int(line[1]) + 1 + int(line[2]) if line[5] == "+" else int(line[1]) + 1
        start = int(line[1]) + 1 if line[5] == "+" else int(line[1]) + 1 + int(line[2])
        side5 = "%s\t5p\t%s" % (line[15], start)
        side3 = "%s\t3p\t%s" % (line[15], end)
        data[side5] += 1
        data[side3] += 1

    counts_reads = op.join(out_dir, 'locus_readpos.counts')
    with open(counts_reads, 'w') as out_handle:
        for k in data:
            print(k, file=out_handle, end="")

    return counts_reads


def create_features(bam_in, loci_file, reference, out_dir):
    """
    Use feature extraction module from CoRaL
    """
    lenvec_plus = op.join(out_dir, 'genomic_lenvec.plus')
    lenvec_minus = op.join(out_dir, 'genomic_lenvec.minus')
    compute_genomic_cmd = ("compute_genomic_lenvectors "
                           "{bam_in} {lenvec_plus} "
                           "{lenvec_minus} "
                           "{min_len} "
                           "{max_len} ")
    index_genomic_cmd = ("index_genomic_lenvectors "
                         "{lenvec} ")
    genomic_lenvec = op.join(out_dir, 'genomic_lenvec')
    feat_len_file = op.join(out_dir, 'feat_lengths.txt')
    compute_locus_cmd = ("compute_locus_lenvectors "
                         "{loci_file} "
                         "{genomic_lenvec} "
                         "{min_len} "
                         "{max_len} "
                         "> {feat_len_file}")
    cov_S_file = op.join(out_dir, 'loci.cov_anti')
    coverage_anti_cmd = ("coverageBed -S -counts -b "
                         "{bam_in} -a {loci_file} "
                         "> {cov_S_file}")
    feat_posentropy = op.join(out_dir, 'feat_posentropy.txt')
    entropy_cmd = ("compute_locus_entropy.rb "
                   "{counts_reads} "
                   "> {feat_posentropy}")
    with utils.chdir(out_dir):
        run(compute_genomic_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute_genomic")
        run(index_genomic_cmd.format(lenvec=lenvec_plus), "Run index in plus")
        run(index_genomic_cmd.format(lenvec=lenvec_minus), "Run index in minus")
        run(compute_locus_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute locus")
        run(coverage_anti_cmd.format(**locals()), "Run coverage antisense")
        feat_antisense = _order_antisense_column(cov_S_file, min_trimmed_read_len)

        counts_reads = _reads_per_position(bam_in, loci_file, out_dir)
        run(entropy_cmd.format(**locals()), "Run entropy")

        rnafold = calculate_structure(loci_file, reference)


def prepare_ann_file(args):
    """
    Create custom ann_file for Coral
    """


def download_hsa_file(args):
    """
    In case of human, download from server
    """