File: _blast.py

package info (click to toggle)
python-skbio 0.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 16,556 kB
  • ctags: 7,222
  • sloc: python: 42,085; ansic: 670; makefile: 180; sh: 10
file content (60 lines) | stat: -rw-r--r-- 2,442 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import functools
import contextlib

import pandas as pd

_possible_columns = {'qseqid': str, 'qgi': float, 'qacc': str, 'qaccver': str,
                     'qlen': float, 'sseqid': str, 'sallseqid': str,
                     'sgi': float, 'sallgi': float, 'sacc': str,
                     'saccver': str, 'sallacc': str, 'slen': float,
                     'qstart': float, 'qend': float, 'sstart': float,
                     'send': float, 'qseq': str, 'sseq': str,
                     'evalue': float, 'bitscore': float, 'score': float,
                     'length': float, 'pident': float, 'nident': float,
                     'mismatch': float, 'positive': float, 'gapopen': float,
                     'gaps': float, 'ppos': float, 'frames': str,
                     'qframe': float, 'sframe': float, 'btop': float,
                     'staxids': str, 'sscinames': str, 'scomnames': str,
                     'sblastnames': str, 'sskingdoms': str, 'stitle': str,
                     'salltitles': str, 'sstrand': str, 'qcovs': float,
                     'qcovhsp': float}


def _parse_blast_data(fh, columns, error, error_message, comment=None,
                      skiprows=None):
    read_csv = functools.partial(pd.read_csv, na_values='N/A', sep='\t',
                                 header=None, keep_default_na=False,
                                 comment=comment, skiprows=skiprows)

    # HACK for https://github.com/pandas-dev/pandas/issues/14418
    # this avoids closing the `fh`, whose lifetime isn't the responsibility
    # of this parser
    with _noop_close(fh) as fh:

        lineone = read_csv(fh, nrows=1)

        if len(lineone.columns) != len(columns):
            raise error(error_message % (len(columns), len(lineone.columns)))

        fh.seek(0)

        return read_csv(fh, names=columns, dtype=_possible_columns)


# HACK for https://github.com/pandas-dev/pandas/issues/14418
@contextlib.contextmanager
def _noop_close(fh):
    backup = fh.close
    fh.close = lambda: None
    try:
        yield fh
    finally:
        fh.close = backup