File: resources.py

package info (click to toggle)
python-bioframe 0.8.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,280 kB
  • sloc: python: 7,459; makefile: 14; sh: 13
file content (296 lines) | stat: -rw-r--r-- 8,644 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import urllib
from functools import partial
from typing import Union
from urllib.parse import urljoin

import numpy as np
import pandas as pd

from .assembly import assembly_info
from .fileops import read_chromsizes, read_table
from .schemas import SCHEMAS

__all__ = [
    "fetch_chromsizes",
    "fetch_centromeres",
    "UCSCClient",
]


def fetch_chromsizes(
    db: str,
    *,
    provider: str = "local",
    as_bed: bool = False,
    filter_chroms: bool = True,
    chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
    natsort: bool = True,
    **kwargs,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Fetch chromsizes from local storage or the UCSC database.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of chromsizes. Either "local" for local storage or "ucsc".
    as_bed : bool, optional
        If True, return chromsizes as an interval DataFrame (chrom, start, end)
        instead of a Series.

    The remaining options only apply to provider="ucsc".

    filter_chroms : bool, optional
        Filter for chromosome names given in ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Sequence of regular expressions to capture desired sequence names.
    natsort : bool, optional
        Sort each captured group of names in natural order. Default is True.
    **kwargs :
        Passed to :func:`pandas.read_csv`

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.

    Notes
    -----
    For more fine-grained control over the chromsizes from local storage,
    use :func:`bioframe.assembly_info`.

    Examples
    --------
    >>> fetch_chromsizes("hg38")
    name
    chr1     248956422
    chr2     242193529
    chr3     198295559
    ...      ...
    chrX     156040895
    chrY      57227415
    chrM         16569
    Name: length, dtype: int64

    >>> fetch_chromsizes("hg38", as_bed=True)
            chrom      start        end
    0        chr1          0  248956422
    1        chr2          0  242193529
    2        chr3          0  198295559
    ...      ...
    21       chrX          0  156040895
    22       chrY          0   57227415
    23       chrM          0      16569

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "local":
        assembly = assembly_info(db)
        if as_bed:
            return assembly.viewframe[["chrom", "start", "end"]].copy()
        else:
            return assembly.chromsizes
    elif provider == "ucsc":
        return UCSCClient(db).fetch_chromsizes(
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )
    else:
        raise ValueError(f"Unknown provider '{provider}'")


def _origins_from_cytoband(
    cyb: pd.DataFrame, band_col: str = "gieStain"
) -> pd.DataFrame:
    """
    Extract chromosomal origin positions separating chromosome arms from
    cytological band data. Takes the cytological origin, i.e. the boundary
    between the two bands labeled 'acen'.

    Parameters
    ----------
    cyb : pandas.DataFrame
        DataFrame with cytoband data.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid'.
    """
    cyb = cyb[cyb[band_col] == "acen"]
    grouped = cyb.groupby("chrom", sort=False)
    cens = []
    for chrom, group in grouped:
        if not len(group) == 2:
            raise ValueError(f"Expected 2 'acen' bands for {chrom}, found {len(group)}")
        acens = group.sort_values("start")
        cens.append(
            {
                "chrom": chrom,
                "start": acens.iloc[0]["start"],
                "end": acens.iloc[1]["end"],
                "mid": acens.iloc[0]["end"],
            }
        )
    return pd.DataFrame.from_records(cens)


def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
    """
    Extract chromosomal origin positions from UCSC centromeres.txt table
    describing centromere model sequences. Takes the midpoint of all
    modeled centromere sequences.

    Parameters
    ----------
    cens : pandas.DataFrame
        DataFrame with centromeres.txt data.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid'.
    """
    cens = cens.groupby("chrom").agg({"start": np.min, "end": np.max}).reset_index()
    cens["mid"] = (cens["start"] + cens["end"]) // 2
    cens = (
        cens[["chrom", "start", "end", "mid"]]
        .sort_values("chrom")
        .reset_index(drop=True)
    )
    return cens


def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
    """
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables
    in local storage.

    Whe provider="ucsc", the fallback priority goes as follows:
    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "local":
        assembly = assembly_info(db)
        cyb = assembly.cytobands
        if cyb is None:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )
        return _origins_from_cytoband(cyb, band_col="stain")

    elif provider == "ucsc":
        client = UCSCClient(db)
        fetchers = [
            ("cytoband", client.fetch_cytoband),
            ("cytoband", partial(client.fetch_cytoband, ideo=True)),
            ("centromeres", client.fetch_centromeres),
        ]

        for schema, fetcher in fetchers:  # noqa: B007
            try:
                df = fetcher()
                break
            except urllib.error.HTTPError:
                pass
        else:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )

        if schema == "centromeres":
            return _origins_from_ucsccentromeres(df)
        else:
            return _origins_from_cytoband(df)

    else:
        raise ValueError(f"Unknown provider '{provider}'")


class UCSCClient:
    BASE_URL = "https://hgdownload.soe.ucsc.edu/"

    def __init__(self, db: str):
        self._db = db
        self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

    def fetch_chromsizes(
        self,
        filter_chroms: bool = True,
        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
        natsort: bool = True,
        as_bed: bool = False,
        **kwargs,
    ) -> Union[pd.Series, pd.DataFrame]:
        url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
        return read_chromsizes(
            url,
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
        url = urljoin(self._db_url, "database/centromeres.txt.gz")
        return read_table(url, schema="centromeres", **kwargs)

    def fetch_gaps(self, **kwargs):
        url = urljoin(self._db_url, "database/gap.txt.gz")
        return read_table(
            url,
            schema="gap",
            usecols=["chrom", "start", "end", "length", "type", "bridge"],
            **kwargs,
        )

    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
        if ideo:
            url = urljoin(self._db_url, "database/cytoBandIdeo.txt.gz")
        else:
            url = urljoin(self._db_url, "database/cytoBand.txt.gz")
        return read_table(url, schema="cytoband")

    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
        url = urljoin(self._db_url, "database/all_mrna.txt.gz")
        return read_table(
            url,
            schema=SCHEMAS["all_mrna"],
            **kwargs,
        )