File: exhaust.py

package info (click to toggle)
dnapi 1.1-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 42,284 kB
  • sloc: python: 831; sh: 62; makefile: 13
file content (176 lines) | stat: -rw-r--r-- 5,501 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""Functions for exhaustive adapter search
   incorporating with read mapping process.

"""

import re
import os.path
import subprocess
import fileinput

from dnapilib.io_utils import get_file_obj
from dnapilib.io_utils import fastq_sequence
from dnapilib.io_utils import fastq_record


def rm_temp_dir(temp_dir):
    """Remove temporary directory.

    """
    if temp_dir:
        if os.path.exists(temp_dir):
            subprocess.call("rm -r {}".format(temp_dir).split())


def clip_adapter(fp, aseed, tm5, tm3, min_len, max_len):
    """Return adapter-clipped clean reads.

    """
    seed_len = len(aseed)
    pp = re.compile("(.*)"+aseed, re.IGNORECASE)
    for seq in fastq_sequence(fp):
        if len(seq) < tm5 or len(seq) < tm3:
            raise Exception("trimming length is too large")
        match = pp.search(seq)
        if not match:
            continue
        end = match.end() - seed_len
        clipped_seq = seq[tm5 : end-tm3]
        L = len(clipped_seq)
        if min_len <= L and L <= max_len:
            yield clipped_seq


def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return
       the number of the reads.

    """
    fq_obj = get_file_obj(fastq)
    if "RAW_INPUT".startswith(aseed):
        iterator = fastq_sequence(fq_obj)
    else:
        iterator = clip_adapter(fq_obj, aseed, tm5, tm3, min_len, max_len)
    fas = {}
    clean_read_count = 0
    for seq in iterator:
        fas[seq] = fas.get(seq, 0) + 1
    fa_obj = open(fasta, "w")
    for seq, cnt in fas.items():
        clean_read_count += cnt
        fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    fa_obj.close()
    fq_obj.close()
    return clean_read_count


def fastq_input_prep(fastq, ratio, temp_dir):
    """Write FASTQ in the temporary directory, and retrun
       (subsampled) FASTQ name, the total read count,
       standard deviation of read lengths.

    """
    num = int(1/ratio)
    read_count = 0.0
    stats = {}
    fq_out = "{}/input.fq".format(temp_dir)
    fq_obj = get_file_obj(fastq)
    fout = open(fq_out, "w")
    for i, rec in enumerate(fastq_record(fq_obj)):
        if i % num == 0:
            fout.write(rec)
            read_count += 1
            L = len(rec.split("\n")[1])
            stats[L] = stats.get(L,0) + 1
    fout.close()
    fq_obj.close()
    mean = sum([L*c for L,c in stats.items()]) / read_count
    sum_square = sum([(L-mean)**2 * c for L,c in stats.items()])
    sd = (sum_square / read_count)**0.5
    return fq_out, read_count, sd


def count_mapped_read_sam(samout):
    """Return the number of mapped reads to the genome.

    """
    if not os.path.exists(samout):
        raise Exception("can't open SAM")
    mapped = set()
    for x in fileinput.input(samout):
        if not x or x.startswith("@"):
            continue
        x = x.rstrip().split("\t")
        if x[2] != '*':
            mapped.add(x[0])
    cnt = sum([int(n.split('_')[1]) for n in mapped])
    return cnt


def map_clean_reads(fastq, adapter, tm5, tm3,
                    min_len, max_len, map_command, temp_dir):
    """Execute mapping command, and return the numbers
       of clean and mapped reads.

    """
    fasta = "{0}/insert_{1}.fa".format(temp_dir, adapter)
    samout = "{}/output.sam".format(temp_dir)
    clipped = to_fasta(fastq, fasta, adapter, tm5, tm3, min_len, max_len)
    map_command = map_command.replace("@in",fasta).replace("@out",samout)
    map_command += " 2> /dev/null"
    if subprocess.call(map_command, shell=True) != 0:
        raise Exception("mapping failed, check command line")
    mapped = count_mapped_read_sam(samout)
    return clipped, mapped


def make_stats_report(table, sampled_read, subsample_rate, prefix_match,
                      sd, fastq, output_dir, temp_dir, no_output_files):
    """Report read statistics with predicted adapters.

    """
    out = ["# sampled_reads={} (total_reads * {:.2f})".format(
              int(sampled_read), subsample_rate)]
    out.append("\t".join([
          "# 3'adapter",
          "reads_extracted",
          "(reads_extracted/sampled_reads)%",
          "reads_mapped",
          "(reads_mapped/sampled_reads)%",
          "params_k:r"]))
    max_mapped_read = -1
    max_index = -1
    for i, x in enumerate(table):
        if x[3] > max_mapped_read:
            max_mapped_read = x[3]
            max_index = i
        out.append("{}\t{}\t{:.2f}\t{}\t{:.2f}\t{}".format(*x))
    optimal = [table[max_index][0]]

    fq_prefix = os.path.basename(fastq).split(".")[0]
    if table[max_index][4] < 20:
        optimal.append("/POOR_QUALITY")
    if optimal[0] == "RAW_INPUT":
        if sd:
            out.append("# input reads look already clean!")
        else:
            optimal.append("?")
    else:
        if no_output_files:
            pass
        else:
            if not os.path.exists(output_dir):
                subprocess.call("mkdir {}".format(output_dir).split())
            aseq = optimal[0][:prefix_match]
            fa_tmp = "{}/insert_{}.fa".format(temp_dir, aseq)
            fa_out = "{}/{}_{}.fa".format(output_dir, fq_prefix, aseq)
            subprocess.call(("mv {} {}".format(fa_tmp,fa_out)).split())

    out.insert(0, "optimal_3'adapter={}\n".format(''.join(optimal)))
    report = "\n".join(out)
    print(report)

    if not no_output_files:
        f = open("{}/{}_report.txt".format(output_dir, fq_prefix), "w")
        f.write(report + "\n")
        f.close()