# Copyright 2007 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Code to work with the sprotXX.dat file from SwissProt.

https://web.expasy.org/docs/userman.html

Classes:
 - Record             Holds SwissProt data.
 - Reference          Holds reference data from a SwissProt record.

Functions:
 - read               Read one SwissProt record
 - parse              Read multiple SwissProt records

"""

from __future__ import print_function

from Bio._py3k import _as_string


class Record(object):
    """Holds information from a SwissProt record.

    Attributes:
     - entry_name        Name of this entry, e.g. RL1_ECOLI.
     - data_class        Either 'STANDARD' or 'PRELIMINARY'.
     - molecule_type     Type of molecule, 'PRT',
     - sequence_length   Number of residues.
     - accessions        List of the accession numbers, e.g. ['P00321']
     - created           A tuple of (date, release).
     - sequence_update   A tuple of (date, release).
     - annotation_update A tuple of (date, release).
     - description       Free-format description.
     - gene_name         Gene name.  See userman.txt for description.
     - organism          The source of the sequence.
     - organelle         The origin of the sequence.
     - organism_classification  The taxonomy classification.  List of strings.
       (http://www.ncbi.nlm.nih.gov/Taxonomy/)
     - taxonomy_id       A list of NCBI taxonomy id's.
     - host_organism     A list of names of the hosts of a virus, if any.
     - host_taxonomy_id  A list of NCBI taxonomy id's of the hosts, if any.
     - references        List of Reference objects.
     - comments          List of strings.
     - cross_references  List of tuples (db, id1[, id2][, id3]).  See the docs.
     - keywords          List of the keywords.
     - features          List of tuples (key name, from, to, description).
       from and to can be either integers for the residue
       numbers, '<', '>', or '?'
     - protein_existence Numerical value describing the evidence for the existence of the protein.
     - seqinfo           tuple of (length, molecular weight, CRC32 value)
     - sequence          The sequence.

    Examples
    --------
    >>> import Bio.SwissProt as sp
    >>> example_filename = "SwissProt/sp008"
    >>> with open(example_filename) as handle:
    ...     records = sp.parse(handle)
    ...     for record in records:
    ...         print(record.entry_name)
    ...         print(",".join(record.accessions))
    ...         print(record.keywords)
    ...         print(repr(record.organism))
    ...         print(record.sequence[:20] + "...")
    ...
    1A02_HUMAN
    P01892,P06338,P30514,P30444,P30445,P30446,Q29680,Q29899,Q95352,Q29837,Q95380
    ['MHC I', 'Transmembrane', 'Glycoprotein', 'Signal', 'Polymorphism', '3D-structure']
    'Homo sapiens (Human).'
    MAVMAPRTLVLLLSGALALT...

    """

    def __init__(self):
        """Initialize the class."""
        self.entry_name = None
        self.data_class = None
        self.molecule_type = None
        self.sequence_length = None

        self.accessions = []
        self.created = None
        self.sequence_update = None
        self.annotation_update = None

        self.description = []
        self.gene_name = ''
        self.organism = []
        self.organelle = ''
        self.organism_classification = []
        self.taxonomy_id = []
        self.host_organism = []
        self.host_taxonomy_id = []
        self.references = []
        self.comments = []
        self.cross_references = []
        self.keywords = []
        self.features = []
        self.protein_existence = ''

        self.seqinfo = None
        self.sequence = ''


class Reference(object):
    """Holds information from one reference in a SwissProt entry.

    Attributes:
     - number      Number of reference in an entry.
     - evidence    Evidence code.  List of strings.
     - positions   Describes extent of work.  List of strings.
     - comments    Comments.  List of (token, text).
     - references  References.  List of (dbname, identifier).
     - authors     The authors of the work.
     - title       Title of the work.
     - location    A citation for the work.

    """

    def __init__(self):
        """Initialize the class."""
        self.number = None
        self.positions = []
        self.comments = []
        self.references = []
        self.authors = []
        self.title = []
        self.location = []


def parse(handle):
    while True:
        record = _read(handle)
        if not record:
            return
        yield record


def read(handle):
    record = _read(handle)
    if not record:
        raise ValueError("No SwissProt record found")
    # We should have reached the end of the record by now
    # Used to check with handle.read() but that breaks on Python 3.5
    # due to http://bugs.python.org/issue26499 and could download
    # lot of data needlessly if there were more records.
    remainder = handle.readline()
    if remainder:
        raise ValueError("More than one SwissProt record found")
    return record


# Everything below is considered private


def _read(handle):
    record = None
    unread = ""
    for line in handle:
        # This is for Python 3 to cope with a binary handle (byte strings),
        # or a text handle (unicode strings):
        line = _as_string(line)
        key, value = line[:2], line[5:].rstrip()
        if unread:
            value = unread + " " + value
            unread = ""
        if key == '**':
            # See Bug 2353, some files from the EBI have extra lines
            # starting "**" (two asterisks/stars).  They appear
            # to be unofficial automated annotations. e.g.
            # **
            # **   #################    INTERNAL SECTION    ##################
            # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            pass
        elif key == 'ID':
            record = Record()
            _read_id(record, line)
            _sequence_lines = []
        elif key == 'AC':
            accessions = [word for word in value.rstrip(";").split("; ")]
            record.accessions.extend(accessions)
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            if record.gene_name:
                record.gene_name += " "
            record.gene_name += value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            record.organelle += line[5:]
        elif key == 'OC':
            cols = [col for col in value.rstrip(";.").split("; ")]
            record.organism_classification.extend(cols)
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            reference = Reference()
            _read_rn(reference, value)
            record.references.append(reference)
        elif key == 'RP':
            assert record.references, "RP: missing RN"
            record.references[-1].positions.append(value)
        elif key == 'RC':
            assert record.references, "RC: missing RN"
            reference = record.references[-1]
            unread = _read_rc(reference, value)
        elif key == 'RX':
            assert record.references, "RX: missing RN"
            reference = record.references[-1]
            _read_rx(reference, value)
        elif key == 'RL':
            assert record.references, "RL: missing RN"
            reference = record.references[-1]
            reference.location.append(value)
        # In UniProt release 1.12 of 6/21/04, there is a new RG
        # (Reference Group) line, which references a group instead of
        # an author.  Each block must have at least 1 RA or RG line.
        elif key == 'RA':
            assert record.references, "RA: missing RN"
            reference = record.references[-1]
            reference.authors.append(value)
        elif key == 'RG':
            assert record.references, "RG: missing RN"
            reference = record.references[-1]
            reference.authors.append(value)
        elif key == "RT":
            assert record.references, "RT: missing RN"
            reference = record.references[-1]
            reference.title.append(value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            _read_pe(record, value)
        elif key == 'KW':
            _read_kw(record, value)
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            cols = value.split()
            assert len(cols) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
        elif key == '  ':
            _sequence_lines.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # Join multiline data into one string
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for reference in record.references:
                reference.authors = " ".join(reference.authors).rstrip(";")
                reference.title = " ".join(reference.title).rstrip(";")
                if reference.title.startswith('"') and reference.title.endswith('"'):
                    reference.title = reference.title[1:-1]  # remove quotes
                reference.location = " ".join(reference.location)
            record.sequence = "".join(_sequence_lines)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")


def _read_id(record, line):
    cols = line[5:].split()
    # Prior to release 51, included with MoleculeType:
    # ID   EntryName DataClass; MoleculeType; SequenceLength AA.
    #
    # Newer files lack the MoleculeType:
    # ID   EntryName DataClass; SequenceLength AA.
    if len(cols) == 5:
        record.entry_name = cols[0]
        record.data_class = cols[1].rstrip(";")
        record.molecule_type = cols[2].rstrip(";")
        record.sequence_length = int(cols[3])
    elif len(cols) == 4:
        record.entry_name = cols[0]
        record.data_class = cols[1].rstrip(";")
        record.molecule_type = None
        record.sequence_length = int(cols[2])
    else:
        raise ValueError("ID line has unrecognised format:\n" + line)
    # check if the data class is one of the allowed values
    allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
    if record.data_class not in allowed:
        raise ValueError("Unrecognized data class %s in line\n%s" %
                         (record.data_class, line))
    # molecule_type should be 'PRT' for PRoTein
    # Note that has been removed in recent releases (set to None)
    if record.molecule_type not in (None, 'PRT'):
        raise ValueError("Unrecognized molecule type %s in line\n%s" %
                         (record.molecule_type, line))


def _read_dt(record, line):
    value = line[5:]
    uprline = value.upper()
    cols = value.rstrip().split()
    if 'CREATED' in uprline \
    or 'LAST SEQUENCE UPDATE' in uprline \
    or 'LAST ANNOTATION UPDATE' in uprline:
        # Old style DT line
        # =================
        # e.g.
        # DT   01-FEB-1995 (Rel. 31, Created)
        # DT   01-FEB-1995 (Rel. 31, Last sequence update)
        # DT   01-OCT-2000 (Rel. 40, Last annotation update)
        #
        # or:
        # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
        # ...

        # find where the version information will be located
        # This is needed for when you have cases like IPI where
        # the release version is in a different spot:
        # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
        uprcols = uprline.split()
        rel_index = -1
        for index in range(len(uprcols)):
            if 'REL.' in uprcols[index]:
                rel_index = index
        assert rel_index >= 0, "Could not find Rel. in DT line: %s" % line
        version_index = rel_index + 1
        # get the version information
        str_version = cols[version_index].rstrip(",")
        # no version number
        if str_version == '':
            version = 0
        # dot versioned
        elif '.' in str_version:
            version = str_version
        # integer versioned
        else:
            version = int(str_version)
        date = cols[0]

        if 'CREATED' in uprline:
            record.created = date, version
        elif 'LAST SEQUENCE UPDATE' in uprline:
            record.sequence_update = date, version
        elif 'LAST ANNOTATION UPDATE' in uprline:
            record.annotation_update = date, version
        else:
            assert False, "Shouldn't reach this line!"
    elif 'INTEGRATED INTO' in uprline \
    or 'SEQUENCE VERSION' in uprline \
    or 'ENTRY VERSION' in uprline:
        # New style DT line
        # =================
        # As of UniProt Knowledgebase release 7.0 (including
        # Swiss-Prot release 49.0 and TrEMBL release 32.0) the
        # format of the DT lines and the version information
        # in them was changed - the release number was dropped.
        #
        # For more information see bug 1948 and
        # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
        #
        # e.g.
        # DT   01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
        # DT   15-OCT-2001, sequence version 3.
        # DT   01-APR-2004, entry version 14.
        #
        # This is a new style DT line...

        # The date should be in string cols[1]
        # Get the version number if there is one.
        # For the three DT lines above: 0, 3, 14
        try:
            version = 0
            for s in cols[-1].split('.'):
                if s.isdigit():
                    version = int(s)
        except ValueError:
            version = 0
        date = cols[0].rstrip(",")

        # Re-use the historical property names, even though
        # the meaning has changed slighty:
        if "INTEGRATED" in uprline:
            record.created = date, version
        elif 'SEQUENCE VERSION' in uprline:
            record.sequence_update = date, version
        elif 'ENTRY VERSION' in uprline:
            record.annotation_update = date, version
        else:
            assert False, "Shouldn't reach this line!"
    else:
        raise ValueError("I don't understand the date line %s" % line)


def _read_ox(record, line):
    # The OX line used to be in the simple format:
    # OX   DESCRIPTION=ID[, ID]...;
    # If there are too many id's to fit onto a line, then the ID's
    # continue directly onto the next line, e.g.
    # OX   DESCRIPTION=ID[, ID]...
    # OX   ID[, ID]...;
    # Currently, the description is always "NCBI_TaxID".
    # To parse this, I need to check to see whether I'm at the
    # first line.  If I am, grab the description and make sure
    # it's an NCBI ID.  Then, grab all the id's.
    #
    # As of the 2014-10-01 release, there may be an evidence code, e.g.
    # OX   NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1};
    # In the short term, we will ignore any evidence codes:
    line = line.split('{')[0]
    if record.taxonomy_id:
        ids = line[5:].rstrip().rstrip(";")
    else:
        descr, ids = line[5:].rstrip().rstrip(";").split("=")
        assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
    record.taxonomy_id.extend(ids.split(', '))


def _read_oh(record, line):
    # Line type OH (Organism Host) for viral hosts
    assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line
    line = line[16:].rstrip()
    assert line[-1] == "." and line.count(";") == 1, line
    taxid, name = line[:-1].split(";")
    record.host_taxonomy_id.append(taxid.strip())
    record.host_organism.append(name.strip())


def _read_rn(reference, rn):
    # This used to be a very simple line with a reference number, e.g.
    # RN   [1]
    # As of the 2014-10-01 release, there may be an evidence code, e.g.
    # RN   [1] {ECO:0000313|EMBL:AEX14553.1}
    words = rn.split(None, 1)
    number = words[0]
    assert number.startswith('[') and number.endswith(']'), "Missing brackets %s" % number
    reference.number = int(number[1:-1])
    if len(words) > 1:
        evidence = words[1]
        assert evidence.startswith('{') and evidence.endswith('}'), "Missing braces %s" % evidence
        reference.evidence = evidence[1:-1].split('|')


def _read_rc(reference, value):
    cols = value.split(';')
    if value[-1] == ';':
        unread = ""
    else:
        cols, unread = cols[:-1], cols[-1]
    for col in cols:
        if not col:  # last column will be the empty string
            return
        # The token is everything before the first '=' character.
        i = col.find("=")
        if i >= 0:
            token, text = col[:i], col[i + 1:]
            comment = token.lstrip(), text
            reference.comments.append(comment)
        else:
            comment = reference.comments[-1]
            comment = "%s %s" % (comment, col)
            reference.comments[-1] = comment
    return unread


def _read_rx(reference, value):
    # The basic (older?) RX line is of the form:
    # RX   MEDLINE; 85132727.
    # but there are variants of this that need to be dealt with (see below)

    # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
    # have extraneous information in the RX line.  Check for
    # this and chop it out of the line.
    # (noticed by katel@worldpath.net)
    value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '')

    # RX lines can also be used of the form
    # RX   PubMed=9603189;
    # reported by edvard@farmasi.uit.no
    # and these can be more complicated like:
    # RX   MEDLINE=95385798; PubMed=7656980;
    # RX   PubMed=15060122; DOI=10.1136/jmg 2003.012781;
    # We look for these cases first and deal with them
    warn = False
    if "=" in value:
        cols = value.split("; ")
        cols = [x.strip() for x in cols]
        cols = [x for x in cols if x]
        for col in cols:
            x = col.split("=")
            if len(x) != 2 or x == ("DOI", "DOI"):
                warn = True
                break
            assert len(x) == 2, "I don't understand RX line %s" % value
            reference.references.append((x[0], x[1].rstrip(";")))
    # otherwise we assume we have the type 'RX   MEDLINE; 85132727.'
    else:
        cols = value.split("; ")
        # normally we split into the three parts
        if len(cols) != 2:
            warn = True
        else:
            reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
    if warn:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Possibly corrupt RX line %r" % value,
                      BiopythonParserWarning)


def _read_cc(record, line):
    key, value = line[5:8], line[9:].rstrip()
    if key == '-!-':   # Make a new comment
        record.comments.append(value)
    elif key == '   ':  # add to the previous comment
        if not record.comments:
            # TCMO_STRGA in Release 37 has comment with no topic
            record.comments.append(value)
        else:
            record.comments[-1] += " " + value


def _read_dr(record, value):
    cols = value.rstrip(".").split('; ')
    record.cross_references.append(tuple(cols))


def _read_pe(record, value):
    pe = value.split(":")
    record.protein_existence = int(pe[0])


def _read_kw(record, value):
    # Old style - semi-colon separated, multi-line. e.g. Q13639.txt
    # KW   Alternative splicing; Cell membrane; Complete proteome;
    # KW   Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein;
    # KW   Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer;
    # KW   Transmembrane.
    #
    # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt
    # KW   Monooxygenase {ECO:0000313|EMBL:AEX14553.1};
    # KW   Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}.
    # For now to match the XML parser, drop the evidence codes.
    for value in value.rstrip(";.").split('; '):
        if value.endswith("}"):
            # Discard the evidence code
            value = value.rsplit("{", 1)[0]
        record.keywords.append(value.strip())


def _read_ft(record, line):
    line = line[5:]    # get rid of junk in front
    name = line[0:8].rstrip()
    try:
        from_res = int(line[9:15])
    except ValueError:
        from_res = line[9:15].lstrip()
    try:
        to_res = int(line[16:22])
    except ValueError:
        to_res = line[16:22].lstrip()
    # if there is a feature_id (FTId), store it away
    if line[29:35] == r"/FTId=":
        ft_id = line[35:70].rstrip()[:-1]
        description = ""
    else:
        ft_id = ""
        description = line[29:70].rstrip()
    if not name:  # is continuation of last one
        assert not from_res and not to_res
        name, from_res, to_res, old_description, old_ft_id = record.features[-1]
        del record.features[-1]
        description = ("%s %s" % (old_description, description)).strip()

        # special case -- VARSPLIC, reported by edvard@farmasi.uit.no
        if name == "VARSPLIC":
            # Remove unwanted spaces in sequences.
            # During line carryover, the sequences in VARSPLIC can get mangled
            # with unwanted spaces like:
            # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
            # We want to check for this case and correct it as it happens.
            descr_cols = description.split(" -> ")
            if len(descr_cols) == 2:
                first_seq, second_seq = descr_cols
                extra_info = ''
                # we might have more information at the end of the
                # second sequence, which should be in parenthesis
                extra_info_pos = second_seq.find(" (")
                if extra_info_pos != -1:
                    extra_info = second_seq[extra_info_pos:]
                    second_seq = second_seq[:extra_info_pos]
                # now clean spaces out of the first and second string
                first_seq = first_seq.replace(" ", "")
                second_seq = second_seq.replace(" ", "")
                # reassemble the description
                description = first_seq + " -> " + second_seq + extra_info
    record.features.append((name, from_res, to_res, description, ft_id))


if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest(verbose=0)
