1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
|
import re
import pandas as pd
import numpy as np
from .specs import _get_default_colnames
__all__ = [
"parse_region",
"parse_region_string",
"is_complete_ucsc_string",
"to_ucsc_string",
]
### functions for manipulating UCSC strings ###
def to_ucsc_string(triplet):
"""
Convert a triplet to a UCSC string.
Parameters
----------
triplet : (chrom, start, end)
Returns
-------
ucsc_string : str
UCSC-style string, 'chrom:start-end'
"""
return "{0}:{1}-{2}".format(*triplet)
def _parse_humanized(s):
_NUMERIC_RE = re.compile("([0-9,.]+)")
_, value, unit = _NUMERIC_RE.split(s.replace(",", ""))
if not len(unit):
return int(value)
value = float(value)
unit = unit.upper().strip()
if unit in ("K", "KB"):
value *= 1000
elif unit in ("M", "MB"):
value *= 1000000
elif unit in ("G", "GB"):
value *= 1000000000
else:
raise ValueError("Unknown unit '{}'".format(unit))
return int(value)
def parse_region_string(s):
"""
Parse a UCSC-style genomic region string into a triple.
Parameters
----------
s : str
UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA
style sequence names are allowed. Start coordinate must >0 and end coordinate
must be greater than or equal to start.
Returns
-------
triple : (str, int or None, int or None)
"""
def _tokenize(s):
token_spec = [
("HYPHEN", r"-"),
("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"),
("OTHER", r".+"),
]
tok_regex = r"\s*" + r"|\s*".join(r"(?P<%s>%s)" % pair for pair in token_spec)
tok_regex = re.compile(tok_regex, re.IGNORECASE)
for match in tok_regex.finditer(s):
typ = match.lastgroup
yield typ, match.group(typ)
def _check_token(typ, token, expected):
if typ is None:
raise ValueError("Expected {} token missing".format(" or ".join(expected)))
else:
if typ not in expected:
raise ValueError('Unexpected token "{}"'.format(token))
def _expect(tokens):
typ, token = next(tokens, (None, None))
_check_token(typ, token, ["COORD"])
start = _parse_humanized(token)
typ, token = next(tokens, (None, None))
_check_token(typ, token, ["HYPHEN"])
typ, token = next(tokens, (None, None))
if typ is None:
return start, None
_check_token(typ, token, ["COORD"])
end = _parse_humanized(token)
if end < start:
raise ValueError("End coordinate less than start")
return start, end
parts = s.split(":")
chrom = parts[0].strip()
if not len(chrom):
raise ValueError("Chromosome name cannot be empty")
if len(parts) < 2:
return (chrom, None, None)
start, end = _expect(_tokenize(parts[1]))
return (chrom, start, end)
def is_complete_ucsc_string(mystring):
"""
Check if a string can be parsed into (`chrom`, `start`, `end`) format.
Parameters
----------
mystring : str
Returns
-------
is_complete : bool
True if able to be parsed into (`chrom`, `start`, `end`) format.
"""
if type(mystring) is not str:
return False
if len(parse_region_string(mystring)) != 3:
return False
if parse_region_string(mystring)[2] is None:
return False
return True
def parse_region(reg, chromsizes=None):
"""
Coerce a genomic region string or sequence into a triple (chrom, start, end).
Genomic regions are represented as half-open intervals (0-based starts,
1-based ends) along the length coordinate of a contig/scaffold/chromosome.
Start must be >= 0, and end coordinate must be >= start.
Parameters
----------
reg : str or tuple
UCSC-style genomic region string, or
Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``.
Quadriple (chrom, start, end, name) (name is ignored).
chromsizes : mapping, optional
Lookup table of scaffold lengths to check against ``chrom`` and the
``end`` coordinate. Required if ``end`` is not supplied.
Returns
-------
triple : (str, int, int)
A well-formed genomic region triple (str, int, int)
"""
if isinstance(reg, str):
chrom, start, end = parse_region_string(reg)
else:
if len(reg) not in [3, 4]:
raise ValueError("length of a region should be 3 or 4")
chrom, start, end = reg[:3]
start = int(start) if start is not None else start
end = int(end) if end is not None else end
try:
clen = chromsizes[chrom] if chromsizes is not None else None
except KeyError:
raise ValueError("Unknown sequence label: {}".format(chrom))
start = 0 if start is None else start
if end is None:
end = clen # if clen is None, end is None too!
if (end is not None) and (end < start):
raise ValueError("End cannot be less than start")
if start < 0 or (clen is not None and end > clen):
raise ValueError("Genomic region out of bounds: [{}, {})".format(start, end))
return chrom, start, end
|