File: test_transcript_ids.py

package info (click to toggle)
pyensembl 2.3.13-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 612 kB
  • sloc: python: 4,199; makefile: 208; sh: 74
file content (62 lines) | stat: -rw-r--r-- 2,015 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Tests for methods which return collections of transcript IDs that aren't
converting from some type of name or ID.
"""
from __future__ import absolute_import

from pyensembl import genome_for_reference_name
from .common import eq_
from .common import run_multiple_genomes

# Genome object for the GRCh38 reference, created once at import time and
# used by the tests in this module.
grch38 = genome_for_reference_name("GRCh38")

# subset of transcript IDs for HLA-A (written without a version suffix);
# test_transcript_ids_ensembl_grch38_hla_a expects each of them to be
# returned for the HLA-A locus on chromosome 6
HLA_A_TRANSCRIPT_IDS = [
    "ENST00000396634",
    "ENST00000376809",
    "ENST00000376806",
    "ENST00000376802",
    "ENST00000496081",
    "ENST00000495183",
    "ENST00000461903",
    "ENST00000479320",
]


def test_transcript_ids_ensembl_grch38_hla_a():
    # Every ID in HLA_A_TRANSCRIPT_IDS must be reported for the interval
    # 6:29941260-29945884 (chr6:29,945,884 is a position for HLA-A).
    # Interval based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    contig, start, end = 6, 29941260, 29945884
    ids_at_locus = grch38.transcript_ids_at_locus(contig, start, end)

    # formatted only when an assertion actually fails
    not_found = "Transcript %s of HLA-A not found overlapping locus"
    for expected_id in HLA_A_TRANSCRIPT_IDS:
        assert expected_id in ids_at_locus, not_found % expected_id


# Transcript IDs that test_all_transcript_ids expects to find in every genome
# it checks: the HLA-A subset above plus one transcript each from three
# other genes.
KNOWN_TRANSCRIPT_IDS = HLA_A_TRANSCRIPT_IDS + [
    "ENST00000398417",  # transcript ID of SMAD4-001
    "ENST00000334701",  # transcript ID of HSP90AA1-001
    "ENST00000599837",  # transcript ID of CTAG1A-002
]


# TODO: add release 54 after transcript IDs for older GTFs are filled in
# See https://github.com/hammerlab/pyensembl/issues/20
@run_multiple_genomes(75, grch38.release)
def test_all_transcript_ids(genome):
    # Each ID in KNOWN_TRANSCRIPT_IDS must appear among the transcript IDs
    # of the genome passed in (presumably one call per genome supplied by
    # run_multiple_genomes -- confirm against tests/common.py).
    available_ids = set(genome.transcript_ids())

    # formatted only when an assertion actually fails
    failure_template = "Missing transcript ID %s from %s"
    for known_id in KNOWN_TRANSCRIPT_IDS:
        assert known_id in available_ids, failure_template % (known_id, genome)


def test_transcript_id_of_protein_id_CCR2():
    # Mapping looked up on Oct 9 2021 in Ensembl release 104, GRCh38.p13:
    # protein ENSP00000399285.2 <- CCR2-203 ENST00000445132.3
    # Both IDs are used here without their version suffix.
    protein_id = "ENSP00000399285"
    expected_transcript_id = "ENST00000445132"
    eq_(expected_transcript_id, grch38.transcript_id_of_protein_id(protein_id))