File: thread.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (79 lines) | stat: -rw-r--r-- 3,203 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Tools for "threading" out specific species from alignments (removing other
species and fixing alignment text).
"""

from copy import deepcopy


def thread(mafs, species):
    """
    Restrict an list of alignments to a given list of species by:

        1) Removing components for any other species
        2) Remove any columns containing all gaps

    Example:

    >>> import bx.align.maf

    >>> block1 = bx.align.maf.from_string( '''
    ... a score=4964.0
    ... s hg18.chr10                  52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
    ... s rheMac2.chr20            58163346 43 -  88221753 ATATTATCTTAACATTAAAGA-AGAACAGTAATTCTGGTCATAA
    ... s panTro1.chrUn_random    208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
    ... s oryCun1.scaffold_175207     85970 22 +    212797 ----------------------AAAATATTAGTTATCACCATAT
    ... s bosTau2.chr23            23894492 43 +  41602928 AAACTACCTTAATGTCACAGG-AAACAATGTATgctgctgctgc
    ... ''' )

    >>> block2 = bx.align.maf.from_string( '''
    ... a score=9151.0
    ... s hg18.chr10                  52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAG-GAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTT-TGAC
    ... s oryCun1.scaffold_175207     85992 71 +    212797 TCTAGTGCTCTCCAATAATATAATAGATTATAACTTCATATAATTATGTGAAATATAAGATTATTTATCAG
    ... s panTro1.chrUn_random    208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAG-GGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTT-TGAT
    ... s rheMac2.chr20            58163389 69 -  88221753 ACACATATTATTTCTTAACATGGAGGATTATATCTT-AAACATGTGTGCaaaatataaatatatat-tcaa
    ... ''' )

    >>> mafs = [ block1, block2 ]

    >>> threaded = [ t for t in thread( mafs, [ "hg18", "panTro1" ] ) ]

    >>> len( threaded )
    2

    >>> print(threaded[0])
    a score=0.0
    s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
    s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
    <BLANKLINE>

    >>> print(threaded[1])
    a score=0.0
    s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC
    s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT
    <BLANKLINE>

    """
    for m in mafs:
        new_maf = deepcopy(m)
        new_components = get_components_for_species(new_maf, species)
        if new_components:
            new_maf.components = new_components
            new_maf.score = 0.0
            new_maf.text_size = len(new_components[0].text)
            new_maf.remove_all_gap_columns()
            yield new_maf


def get_components_for_species(alignment, species):
    """Return the component for each species in the list `species` or None"""
    # If the number of components in the alignment is less that the requested number
    # of species we can immediately fail
    if len(alignment.components) < len(species):
        return None
    # Otherwise, build an index of components by species, then lookup
    index = {c.src.split(".")[0]: c for c in alignment.components}
    try:
        return [index[s] for s in species]
    except Exception:
        return None