1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
|
"""prepare data for CoRaL"""
from __future__ import print_function
import os.path as op
from collections import Counter
import pybedtools
from seqcluster.libs import utils
from seqcluster.libs.do import run, find_cmd
from seqcluster.function.rnafold import calculate_structure
min_trimmed_read_len = 14
max_trimmed_read_len = 50
seg_threshold = 2
seg_maxgap = 44
seg_minrun = min_trimmed_read_len
antisense_min_reads = 0
def prepare_bam(bam_in, precursors):
"""
Clean BAM file to keep only position inside the bigger cluster
"""
# use pybedtools to keep valid positions
# intersect option with -b bigger_cluster_loci
a = pybedtools.BedTool(bam_in)
b = pybedtools.BedTool(precursors)
c = a.intersect(b, u=True)
out_file = utils.splitext_plus(op.basename(bam_in))[0] + "_clean.bam"
c.saveas(out_file)
return op.abspath(out_file)
def _select_anno(annotation):
"""
Select annotation from multiple elements
"""
options = annotation.split(",")
return options[0]
def _reorder_columns(bed_file):
"""
Reorder columns to be compatible with CoRaL
"""
new_bed = utils.splitext_plus(bed_file)[0] + '_order.bed'
with open(bed_file) as in_handle:
with open(new_bed, 'w') as out_handle:
for line in in_handle:
cols = line.strip().split("\t")
cols[3] = _select_anno(cols[3]) + "_" + cols[4]
cols[4] = "0"
print("\t".join(cols), file=out_handle, end="")
return new_bed
def _fix_score_column(cov_file):
"""
Move counts to score columns in bed file
"""
new_cov = utils.splitext_plus(cov_file)[0] + '_fix.cov'
with open(cov_file) as in_handle:
with open(new_cov, 'w') as out_handle:
for line in in_handle:
cols = line.strip().split("\t")
cols[4] = cols[6]
print("\t".join(cols[0:6]), file=out_handle, end="")
return new_cov
def detect_regions(bam_in, bed_file, out_dir, prefix):
"""
Detect regions using first CoRaL module
"""
bed_file = _reorder_columns(bed_file)
counts_reads_cmd = ("coverageBed -s -counts -b {bam_in} "
"-a {bed_file} | sort -k4,4 "
"> {out_dir}/loci.cov")
# with tx_tmpdir() as temp_dir:
with utils.chdir(out_dir):
run(counts_reads_cmd.format(min_trimmed_read_len=min_trimmed_read_len, max_trimmed_read_len=max_trimmed_read_len, **locals()), "Run counts_reads")
loci_file = _fix_score_column(op.join(out_dir, "loci.cov"))
return loci_file
def _order_antisense_column(cov_file, min_reads):
"""
Move counts to score columns in bed file
"""
new_cov = op.join(op.dirname(cov_file), 'feat_antisense.txt')
with open(cov_file) as in_handle:
with open(new_cov, 'w') as out_handle:
print("name\tantisense", file=out_handle, end="")
for line in in_handle:
cols = line.strip().split("\t")
cols[6] = 0 if cols[6] < min_reads else cols[6]
print("%s\t%s" % (cols[3], cols[6]), file=out_handle, end="")
return new_cov
def _reads_per_position(bam_in, loci_file, out_dir):
"""
Create input for compute entropy
"""
data = Counter()
a = pybedtools.BedTool(bam_in)
b = pybedtools.BedTool(loci_file)
c = a.intersect(b, s=True, bed=True, wo=True)
for line in c:
end = int(line[1]) + 1 + int(line[2]) if line[5] == "+" else int(line[1]) + 1
start = int(line[1]) + 1 if line[5] == "+" else int(line[1]) + 1 + int(line[2])
side5 = "%s\t5p\t%s" % (line[15], start)
side3 = "%s\t3p\t%s" % (line[15], end)
data[side5] += 1
data[side3] += 1
counts_reads = op.join(out_dir, 'locus_readpos.counts')
with open(counts_reads, 'w') as out_handle:
for k in data:
print(k, file=out_handle, end="")
return counts_reads
def create_features(bam_in, loci_file, reference, out_dir):
"""
Use feature extraction module from CoRaL
"""
lenvec_plus = op.join(out_dir, 'genomic_lenvec.plus')
lenvec_minus = op.join(out_dir, 'genomic_lenvec.minus')
compute_genomic_cmd = ("compute_genomic_lenvectors "
"{bam_in} {lenvec_plus} "
"{lenvec_minus} "
"{min_len} "
"{max_len} ")
index_genomic_cmd = ("index_genomic_lenvectors "
"{lenvec} ")
genomic_lenvec = op.join(out_dir, 'genomic_lenvec')
feat_len_file = op.join(out_dir, 'feat_lengths.txt')
compute_locus_cmd = ("compute_locus_lenvectors "
"{loci_file} "
"{genomic_lenvec} "
"{min_len} "
"{max_len} "
"> {feat_len_file}")
cov_S_file = op.join(out_dir, 'loci.cov_anti')
coverage_anti_cmd = ("coverageBed -S -counts -b "
"{bam_in} -a {loci_file} "
"> {cov_S_file}")
feat_posentropy = op.join(out_dir, 'feat_posentropy.txt')
entropy_cmd = ("compute_locus_entropy.rb "
"{counts_reads} "
"> {feat_posentropy}")
with utils.chdir(out_dir):
run(compute_genomic_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute_genomic")
run(index_genomic_cmd.format(lenvec=lenvec_plus), "Run index in plus")
run(index_genomic_cmd.format(lenvec=lenvec_minus), "Run index in minus")
run(compute_locus_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute locus")
run(coverage_anti_cmd.format(**locals()), "Run coverage antisense")
feat_antisense = _order_antisense_column(cov_S_file, min_trimmed_read_len)
counts_reads = _reads_per_position(bam_in, loci_file, out_dir)
run(entropy_cmd.format(**locals()), "Run entropy")
rnafold = calculate_structure(loci_file, reference)
def prepare_ann_file(args):
"""
Create custom ann_file for Coral
"""
def download_hsa_file(args):
"""
In case of human, download from server
"""
|