1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
|
# Copyright 2009-2016 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Optimised sequence conversion code (PRIVATE).
You are not expected to access this module, or any of its code, directly. This
is all handled internally by the Bio.SeqIO.convert(...) function which is the
public interface for this.
The idea here is that while doing this will work::
from Bio import SeqIO
records = SeqIO.parse(in_handle, in_format)
count = SeqIO.write(records, out_handle, out_format)
it is shorter to write::
from Bio import SeqIO
count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
Also, the convert function can take a number of special case optimisations. This
means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
All these file format specific optimisations are handled by this (private) module.
"""
from Bio import BiopythonWarning
from Bio import SeqIO
# NOTE - Lots of lazy imports further on...
def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
"""Fast GenBank to FASTA (PRIVATE)."""
# We don't need to parse the features...
from Bio.GenBank.Scanner import GenBankScanner
records = GenBankScanner().parse_records(in_handle, do_features=False)
# For FASTA output we can ignore the alphabet too
return SeqIO.write(records, out_handle, "fasta")
def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
"""Fast EMBL to FASTA (PRIVATE)."""
# We don't need to parse the features...
from Bio.GenBank.Scanner import EmblScanner
records = EmblScanner().parse_records(in_handle, do_features=False)
# For FASTA output we can ignore the alphabet too
return SeqIO.write(records, out_handle, "fasta")
def _fastq_generic(in_handle, out_handle, mapping):
"""FASTQ helper function where can't have data loss by truncation (PRIVATE)."""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# For real speed, don't even make SeqRecord and Seq objects!
count = 0
null = chr(0)
for title, seq, old_qual in FastqGeneralIterator(in_handle):
count += 1
# map the qual...
qual = old_qual.translate(mapping)
if null in qual:
raise ValueError("Invalid character in quality string")
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
return count
def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
"""FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# For real speed, don't even make SeqRecord and Seq objects!
count = 0
null = chr(0)
for title, seq, old_qual in FastqGeneralIterator(in_handle):
count += 1
# map the qual...
qual = old_qual.translate(mapping)
if null in qual:
raise ValueError("Invalid character in quality string")
if truncate_char in qual:
qual = qual.replace(truncate_char, chr(126))
import warnings
warnings.warn(truncate_msg, BiopythonWarning)
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
return count
def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
Useful for removing line wrapping and the redundant second identifier
on the plus lines. Will check also check the quality string is valid.
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
mapping = "".join([chr(0) for ascii in range(0, 33)] +
[chr(ascii) for ascii in range(33, 127)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
Useful for removing line wrapping and the redundant second identifier
on the plus lines. Will check also check the quality string is valid.
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
mapping = "".join([chr(0) for ascii in range(0, 59)] +
[chr(ascii) for ascii in range(59, 127)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
Useful for removing line wrapping and the redundant second identifier
on the plus lines. Will check also check the quality string is valid.
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
mapping = "".join([chr(0) for ascii in range(0, 64)] +
[chr(ascii) for ascii in range(64, 127)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
mapping = "".join([chr(0) for ascii in range(0, 64)] +
[chr(33 + q) for q in range(0, 62 + 1)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion. Will issue a warning if the scores had to be truncated at 62
(maximum possible in the Illumina 1.3+ FASTQ format)
"""
# Map unexpected chars to null
trunc_char = chr(1)
mapping = "".join([chr(0) for ascii in range(0, 33)] +
[chr(64 + q) for q in range(0, 62 + 1)] +
[trunc_char for ascii in range(96, 127)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
"Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
from Bio.SeqIO.QualityIO import phred_quality_from_solexa
mapping = "".join([chr(0) for ascii in range(0, 59)] +
[chr(33 + int(round(phred_quality_from_solexa(q))))
for q in range(-5, 62 + 1)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion. Will issue a warning if the scores had to be truncated at 62
(maximum possible in the Solexa FASTQ format)
"""
# Map unexpected chars to null
from Bio.SeqIO.QualityIO import solexa_quality_from_phred
trunc_char = chr(1)
mapping = "".join([chr(0) for ascii in range(0, 33)] +
[chr(64 + int(round(solexa_quality_from_phred(q))))
for q in range(0, 62 + 1)] +
[trunc_char for ascii in range(96, 127)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
"Data loss - max Solexa quality 62 in Solexa FASTQ")
def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
from Bio.SeqIO.QualityIO import phred_quality_from_solexa
mapping = "".join([chr(0) for ascii in range(0, 59)] +
[chr(64 + int(round(phred_quality_from_solexa(q))))
for q in range(-5, 62 + 1)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
Avoids creating SeqRecord and Seq objects in order to speed up this
conversion.
"""
# Map unexpected chars to null
from Bio.SeqIO.QualityIO import solexa_quality_from_phred
mapping = "".join([chr(0) for ascii in range(0, 64)] +
[chr(64 + int(round(solexa_quality_from_phred(q))))
for q in range(0, 62 + 1)] +
[chr(0) for ascii in range(127, 256)])
assert len(mapping) == 256
return _fastq_generic(in_handle, out_handle, mapping)
def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
"""Fast FASTQ to FASTA conversion (PRIVATE).
Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
Seq objects in order to speed up this conversion.
NOTE - This does NOT check the characters used in the FASTQ quality string
are valid!
"""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# For real speed, don't even make SeqRecord and Seq objects!
count = 0
for title, seq, qual in FastqGeneralIterator(in_handle):
count += 1
out_handle.write(">%s\n" % title)
# Do line wrapping
for i in range(0, len(seq), 60):
out_handle.write(seq[i:i + 60] + "\n")
return count
def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
"""Fast FASTQ to simple tabbed conversion (PRIVATE).
Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
Seq objects in order to speed up this conversion.
NOTE - This does NOT check the characters used in the FASTQ quality string
are valid!
"""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# For real speed, don't even make SeqRecord and Seq objects!
count = 0
for title, seq, qual in FastqGeneralIterator(in_handle):
count += 1
out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
return count
def _fastq_convert_qual(in_handle, out_handle, mapping):
"""FASTQ helper function for QUAL output (PRIVATE).
Mapping should be a dictionary mapping expected ASCII characters from the
FASTQ quality string to PHRED quality scores (as strings).
"""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# For real speed, don't even make SeqRecord and Seq objects!
count = 0
for title, seq, qual in FastqGeneralIterator(in_handle):
count += 1
out_handle.write(">%s\n" % title)
# map the qual... note even with Sanger encoding max 2 digits
try:
qualities_strs = [mapping[ascii] for ascii in qual]
except KeyError:
raise ValueError("Invalid character in quality string")
data = " ".join(qualities_strs)
while len(data) > 60:
# Know quality scores are either 1 or 2 digits, so there
# must be a space in any three consecutive characters.
if data[60] == " ":
out_handle.write(data[:60] + "\n")
data = data[61:]
elif data[59] == " ":
out_handle.write(data[:59] + "\n")
data = data[60:]
else:
assert data[58] == " ", "Internal logic failure in wrapping"
out_handle.write(data[:58] + "\n")
data = data[59:]
out_handle.write(data + "\n")
return count
def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1))
return _fastq_convert_qual(in_handle, out_handle, mapping)
def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to QUAL conversion (PRIVATE)."""
from Bio.SeqIO.QualityIO import phred_quality_from_solexa
mapping = dict((chr(q + 64), str(int(round(phred_quality_from_solexa(q)))))
for q in range(-5, 62 + 1))
return _fastq_convert_qual(in_handle, out_handle, mapping)
def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1))
return _fastq_convert_qual(in_handle, out_handle, mapping)
# TODO? - Handling aliases explicitly would let us shorten this list:
_converter = {
("genbank", "fasta"): _genbank_convert_fasta,
("gb", "fasta"): _genbank_convert_fasta,
("embl", "fasta"): _embl_convert_fasta,
("fastq", "fasta"): _fastq_convert_fasta,
("fastq-sanger", "fasta"): _fastq_convert_fasta,
("fastq-solexa", "fasta"): _fastq_convert_fasta,
("fastq-illumina", "fasta"): _fastq_convert_fasta,
("fastq", "tab"): _fastq_convert_tab,
("fastq-sanger", "tab"): _fastq_convert_tab,
("fastq-solexa", "tab"): _fastq_convert_tab,
("fastq-illumina", "tab"): _fastq_convert_tab,
("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger,
("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger,
("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger,
("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger,
("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger,
("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger,
("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger,
("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger,
("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa,
("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa,
("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa,
("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa,
("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina,
("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina,
("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina,
("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina,
("fastq", "qual"): _fastq_sanger_convert_qual,
("fastq-sanger", "qual"): _fastq_sanger_convert_qual,
("fastq-solexa", "qual"): _fastq_solexa_convert_qual,
("fastq-illumina", "qual"): _fastq_illumina_convert_qual,
}
def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
"""SeqIO conversion function (PRIVATE)."""
try:
f = _converter[(in_format, out_format)]
except KeyError:
f = None
if f:
return f(in_handle, out_handle, alphabet)
else:
records = SeqIO.parse(in_handle, in_format, alphabet)
return SeqIO.write(records, out_handle, out_format)
|