File: _afdb_modelling.py

package info (click to toggle)
promod3 3.4.2%2Bds-1
links: PTS, VCS
area: main
in suites: trixie
size: 966,596 kB
sloc: cpp: 55,820; python: 18,058; makefile: 85; sh: 51
file content (730 lines) | stat: -rw-r--r-- 31,256 bytes
parent folder | download | duplicates (2)
# Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
#                          Biozentrum - University of Basel
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import gzip
import mmap
import pickle
import numpy as np
import time
import tarfile
import glob
import re

import ost
from ost import io
from ost import seq
from ._raw_model import BuildRawModel
from ._pipeline import BuildFromRawModel
from ._modelling import *

class _AFDBLogSink(ost.LogSink):
    """ To capture all log messages of ProMod3 when modelling
    """
    def __init__(self):
        ost.LogSink.__init__(self)
        self.messages = list()
        self.severities = list()

    def LogMessage(self, message, severity):
        self.messages.append(message)
        self.severities.append(severity)

class FSStructureServer:
    """ FSStructureServer - A filesystem based AFDB structure server

    Stores OMF data entries in huge binary files and extracts the respective
    binary data blobs with an indexing mechanism. Efficient reading of these
    huge files is delegated to the OS using the Python mmap module.

    Data creation happens with static Create function: :func:`FromDataChunks`.
    The required preprocessing must be done with external scripts.

    :param db_dir: Directory containing required files. 1) pos.dat (see
                   :attr:`pos`) 2) length.dat (see :attr:`length`)
                   3) chunk.dat (see :attr:`chunk`) 4) indexer.dat (see
                   :attr:`indexer`) 4) one or several files with pattern
                   fs_data_[x].dat (see :attr:`data`)
    :type db_dir: :class:`str`
    """
    def __init__(self, db_dir):
        super().__init__()

        self._pos = None
        self._length = None
        self._chunk = None
        self._indexer = None
        self._search_keys = None
        self._data = None
        self._data_fh = None
        self._n_entries = None

        # only check if required files are there, they're lazy loaded
        self.pos_file = os.path.join(db_dir, "pos.dat")
        self.length_file = os.path.join(db_dir, "length.dat")
        self.chunk_file = os.path.join(db_dir, "chunk.dat")
        self.search_key_file = os.path.join(db_dir, "search_keys.dat")
        self.indexer_file = os.path.join(db_dir, "indexer.dat")
        self.data_files = glob.glob(os.path.join(db_dir, "fs_data_*.dat"))
        if not os.path.exists(self.pos_file):
            raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.length_file):
            raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.chunk_file):
            raise RuntimeError(f"Exp \"chunk.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.indexer_file):
            raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
        # there can be one or more data files, just check whether there is at
        # least one
        if len(self.data_files) == 0:
            raise RuntimeError(f"Exp at least one data file in form "
                               f"\"fs_data_[x].dat\" in db_dir ({db_dir})")

    def __del__(self):
        if self._data:
            for fh in self._data:
                fh.close()
            for fh in self._data_fh:
                fh.close()

    @property
    def pos(self):
        """ Start positions of data in internal linear memory layout

        :type: :class:`np.ndarray` with dtype np.int64
        """
        if self._pos is None:
            with open(self.pos_file, 'rb') as fh:
                self._pos = np.load(fh)
        return self._pos

    @property
    def length(self):
        """ Lengths of data in internal linear memory layout

        :type: :class:`np.ndarray` with dtype np.int32
        """
        if self._length is None:
            with open(self.length_file, 'rb') as fh:
                self._length = np.load(fh)
        return self._length

    @property
    def chunk(self):
        """ Chunk, in which entry is stored

        :type: :class:`np.ndarray` with dtype np.int16
        """
        if self._chunk is None:
            with open(self.chunk_file, 'rb') as fh:
                self._chunk = np.load(fh)
        return self._chunk

    @property
    def indexer(self):
        """ Internal data structure - Relates entries with data indices

        :type: :class:`np.ndarray` with dtype np.int32
        """
        if self._indexer is None:
            with open(self.indexer_file, 'rb') as fh:
                self._indexer = np.load(fh)
        return self._indexer

    @property
    def search_keys(self):
        """ Internal data structure - Relates entries with data indices
        """
        if self._search_keys is None:
            with open(self.search_key_file, 'rb') as fh:
                self._search_keys = np.load(fh)
        return self._search_keys               

    @property
    def data(self):
        """ Internal binary data in memory mapped files

        :type: :class:`list` of :class:`mmap.mmap`
        """
        if self._data is None:
            self._data = list()
            self._data_fh = list()
            tmp = list()
            for f in self.data_files:
                idx = int(f.split('_')[-1].replace(".dat", ""))
                tmp.append((idx, f))
            tmp.sort()
            sorted_data_files = [x[1] for x in tmp]
            for f in sorted_data_files:
                self._data_fh.append(open(f, 'rb'))
                self._data.append(mmap.mmap(self._data_fh[-1].fileno(), 0,
                                            prot=mmap.PROT_READ))
        return self._data

    @property
    def n_entries(self):
        """ Number of entries

        :type: :class:`int`
        """
        if self._n_entries is None:
            self._n_entries = self.pos.shape[0]
        return self._n_entries

    def GetIdx(self, uniprot_ac, fragment="F1", version="v4"):
        """ Get internal idx of stored data

        Can be used for data retrieval with :func:`GetOMFByIdx`

        :param uniprot_ac: Uniprot AC 
        :type uniprot_ac: :class:`str`
        :param fragment: AFDB entries are potentially split to fragments
                         99.999% of all entries only have one fragment: F1
        :type fragment: class:`str`
        :param version: Version of entry
        :type version: :class:`str`
        :returns: Internal idx of that entry
        :raises: :class:`RuntimeError` if no such entry exists
        """
        if not fragment.startswith('F'):
            raise RuntimeError("expect fragment to start with F")
        if not version.startswith('v'):
            raise RuntimeError("expect version to start with v")
        search_key = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
                                   int(version[1:]))
        idx = np.searchsorted(self.search_keys, np.uint64(search_key))
        if idx != len(self.indexer) and self.search_keys[idx] == search_key:
            return self.indexer[idx]
        raise RuntimeError(f"No entry for {uniprot_ac} {fragment} "
                           f"{version}")

    def GetOMFByIdx(self, idx):
        """ Get stored OMF data structure

        :param idx: Internal index which can be derived from :func:`GetIdx`  
        :type idx: :class:`int`
        :returns: OMF data structure of type :class:`ost.io.OMF`
        :raises: :class:`RuntimeError` if *idx* is out of range
        """
        if idx < 0 or idx >= self.n_entries:
            raise RuntimeError(f"Invalid idx, must be in [0, {self.n_entries-1}]")
        pos = self.pos[idx]
        length = self.length[idx]
        chunk = self.chunk[idx]
        omf_data = self.data[chunk][pos:pos+length]
        return io.OMF.FromBytes(gzip.decompress(omf_data))

    def GetOMFByPLC(self, pos, length, chunk):
        """ Get stored OMF data structure

        Get data by explicitely specifying PLC (pos, length, chunk). For expert
        use only, no range checks performed.
        Instead of providing a uniprot AC or an index, this function takes
        directly the internal pos, length and chunk parameters that are stored
        for that particular index. Use case: avoid loading the respective data
        files and only open the memory mapped files. 

        :param pos: Byte pos in specified chunk
        :type pos: :class:`int`
        :param length: Num bytes of entry
        :type length: :class:`int`
        :param chunk: Chunk in which entry resides
        :type chunk: :class:`int` 
        :returns: OMF data structure of type :class:`ost.io.OMF`
        """
        omf_data = self.data[chunk][pos:pos+length]
        return io.OMF.FromBytes(gzip.decompress(omf_data))

    def GetOMF(self, uniprot_ac, fragment="F1", version="v4"):
        """ Get stored OMF data structure

        :param uniprot_ac: Uniprot AC 
        :type uniprot_ac: :class:`str`
        :param fragment: AFDB entries are potentially split to fragments
                         99.999% of all entries only have one fragment: F1
        :type fragment: class:`str`
        :param version: Version of entry
        :type version: :class:`str`
        :returns: OMF data structure of type :class:`ost.io.OMF`
        :raises: :class:`RuntimeError` if no such entry exists
        """
        idx = self.GetIdx(uniprot_ac, fragment = fragment, version = version)
        return self.GetOMFByIdx(idx)

    @staticmethod
    def FromDataChunks(chunk_dir, db_dir, chunk_bytes=None):
        """ Static method to create new database from preprocessed data

        Data preprocessing consists of creating several data chunks that are
        pickled to disk.
        In detail: each chunk is a pickled file containing a list, where each
        entry is a tuple with 4 elements: 1) uniprot_ac (:class:`str`)
        2) fragment (:class:`str`) 3) version (:class:`str`) 4) structure data
        (:class:`ost.io.OMF` object which has been written to a bytes object and
        compressed with gzip)

        The data itself is again stored in chunks of binary data which are
        indexed.

        :param chunk_dir: Path to directory containing described data chunks
        :type chunk_dir: :class:`str`
        :param db_dir: Output directory - Creates all files that are needed for
                       :class:`FSStructureServer`
        :type db_dir: :class:`str`
        :param chunk_bytes: Size in bytes of binary data chunks in final
                            database - default None: Everything ends up in one
                            single chunk
        :type chunk_bytes: :class:`int`
        :returns: :class:`FSStructureServer` with all data from *chunk_dir*
        """
        if not os.path.exists(db_dir):
            raise RuntimeError(f"{db_dir} does not exist")
        positions = list()
        lengths = list()
        chunks = list()
        search_keys = list()
        indexer = list()
        chunk_files = os.listdir(chunk_dir)
        chunk_files = [f for f in chunk_files if f.endswith(".pkl")]
        current_pos = 0
        current_chunk = 0
        current_entry = 0
        data_file = open(os.path.join(db_dir, "fs_data_0.dat"), 'wb')
        t0 = time.time()
        for cf_idx, cf in enumerate(chunk_files):
            print(f"processing chunk {cf_idx}, {cf}")
            with open(os.path.join(chunk_dir, cf), 'rb') as fh:
                data = pickle.load(fh)
            for entry in data:
                uniprot_ac = entry[0]
                fragment = entry[1]
                version = entry[2]
                data_bytes = entry[3]
                length = len(data_bytes)
                data_file.write(data_bytes)
                positions.append(current_pos)
                lengths.append(length)
                chunks.append(current_chunk)
                k = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
                                  int(version[1:]))
                search_keys.append(k)
                indexer.append(current_entry)
                current_pos += length
                current_entry += 1
                if chunk_bytes and current_pos > chunk_bytes:
                    data_file.close()
                    current_chunk += 1
                    f = os.path.join(db_dir, f"fs_data_{current_chunk}.dat")
                    data_file = open(f, 'wb')
                    current_pos = 0
        data_file.close()
        print(f"done processing chunks ({round(time.time() - t0, 3)}s)")
        print("sort indexer matrix")
        t0 = time.time()
        # make search keys searchable by bisect search => sort
        search_keys = np.asarray(search_keys, dtype=np.uint64)
        indexer = np.asarray(indexer, dtype=np.int32)
        sort_indices = np.argsort(search_keys)
        search_keys = search_keys[sort_indices]
        indexer = indexer[sort_indices]

        with open(os.path.join(db_dir, "search_keys.dat"), 'wb') as fh:
            np.save(fh, search_keys)
        with open(os.path.join(db_dir, "indexer.dat"), 'wb') as fh:
            np.save(fh, indexer)
        with open(os.path.join(db_dir, "pos.dat"), 'wb') as fh:
            np.save(fh, np.array(positions, dtype=np.int64))
        with open(os.path.join(db_dir, "length.dat"), 'wb') as fh:
            np.save(fh, np.array(lengths, dtype=np.int32))
        with open(os.path.join(db_dir, "chunk.dat"), 'wb') as fh:
            np.save(fh, np.array(chunks, dtype=np.int16))
        fs_server = FSStructureServer(db_dir)
        return fs_server


class PentaMatch:
    """ Pentamer matching for fast sequence searches

    :class:`PentaMatch` has fast sequence searches with low sensitivity as
    use case. Specifically searching the full AFDB. Stores all unique pentamers
    for each search sequence. Given a query sequence, it computes the number of
    matching pentamers with respect to each search sequence and returns the top
    hits.

    :param db_dir: Directory containing all required files (indexer.dat,
                   pos.dat, length.dat, meta.dat). New :class:`PentaMatch`
                   objects can be derived using the static :func:`FromSeqList`
                   creator.
    :type db_dir: :class:`str`
    :raises: :class:`RuntimeError` if any required file is missing in *db_dir*
    """
    def __init__(self, db_dir):
        self._indexer = None
        self._pos = None
        self._length = None
        self._N = None
        self.db_dir = db_dir

        # only check if required files are there, they're lazy loaded
        self.indexer_file = os.path.join(db_dir, "indexer.dat")
        self.pos_file = os.path.join(db_dir, "pos.dat")
        self.length_file = os.path.join(db_dir, "length.dat")
        self.meta_file = os.path.join(db_dir, "meta.dat")

        if not os.path.exists(self.indexer_file):
            raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.pos_file):
            raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.length_file):
            raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.meta_file):
            raise RuntimeError(f"Exp \"meta.dat\" file in db_dir ({db_dir})")

    @property
    def indexer(self):
        """ Entry indices for pentamers

        Entry data for one pentamer can be extracted with the respective values in
        :attr:`pos` and :attr:`length`

        :type: memory mapped :class:`np.ndarray` of dtype np.int32
        """
        if self._indexer is None:
            self._indexer = np.memmap(self.indexer_file, dtype=np.int32,
                                      mode='r')
        return self._indexer

    @property
    def pos(self):
        """ Start position for each pentamer in :attr:`indexer`

        :type: :class:`np.ndarray` of dtype np.int64 with 3.2E6 entries
        """
        if self._pos is None:
            self._pos = np.fromfile(self.pos_file, dtype=np.int64)
        return self._pos

    @property
    def length(self):
        """ Length for each pentamer in :attr:`indexer`

        :type: :class:`np.ndarray` of dtype np.int32 with 3.2E6 entries
        """
        if self._length is None:
            self._length = np.fromfile(self.length_file, dtype=np.int32)
        return self._length

    @property
    def N(self):
        """ Number of entries in underlying :class:`FSStructureServer`

        :type: :class:`int`
        """
        if self._N is None:
            with open(self.meta_file, 'r') as fh:
                self._N = int(fh.read()) + 1 # the files stores max idx => +1
        return self._N

    def TopN(self, N, sequence, return_counts=False, unique_pentamers=True):
        """ Find top-N matches given *sequence*

        Identifies unique pentamers in *sequence* and counts number of
        occurences in each entry. Returns the top-N entries with respect
        to counts.

        :param N: Number of results to return
        :type N: :class:`int`
        :param sequence: Sequence to search
        :type sequence: :class:`str`
        :param return_counts: Additionally return underlying pentamer counts
        :type return_counts: :class:`bool`
        :param unique_pentamers: Set to True if :attr:`indexer` contains only
                                 unique pentamers for each entry. This way we
                                 can use faster methods to accumulate counts.
                                 In detail: accumulator[indices] += 1 is much
                                 faster than np.add.at(accumulator, indices, 1).
                                 But the latter is required if there are
                                 duplicates.
        :type unique_pentamers: :class:`bool`
        :returns:  :class:`list` of :class:`int` with length *N* specifying
                   entry indices. If *return_counts* is true, the
                   :class:`list` contains :class:`tuple` with two elements:
                   1) count 2) index.
        :raises: :class:`RuntimeError` if N is invalid or sequence is shorter
                 than 5 characters
        """
        if N <=0:
            raise RuntimeError(f"N ({N}) must be larger than 0")
        if N > self.N:
            raise RuntimeError(f"N ({N}) larger than actual entries ({self.N})")
        if len(sequence) < 5:
            raise RuntimeError(f"sequence must have length >=5, got: {sequence}")
        pentamers = list()
        SeqToPentamerIndices(sequence, True, pentamers)
        # uint16 allows for up to 65535 pentamer matches
        accumulator = np.zeros(self.N, dtype=np.int32)
        if unique_pentamers:
            for p in pentamers:
                pos = self.pos[p]
                length = self.length[p]
                accumulator[self.indexer[pos:pos+length]] += 1
        else:
            for p in pentamers:
                pos = self.pos[p]
                length = self.length[p]
                np.add.at(accumulator, self.indexer[pos:pos+length], 1)

        top_n_indices = np.argpartition(accumulator, -N)[-N:]
        top_n_counts = accumulator[top_n_indices]
        # top_n_indices is not sorted by counts => sort by counts and return
        tmp = [(a,b) for a,b in zip(top_n_counts, top_n_indices)]
        tmp.sort(reverse=True)
        if return_counts:
            return tmp
        else:
            return [x[1] for x in tmp]

    @staticmethod
    def FromSeqList(fasta_file, db_dir, entries_from_seqnames=False):
        """ Creates PentaMatch object from Fasta file

        :param fasta_file: Path to Fasta file with sequences
        :type fasta_file: :class:`str`
        :param db_dir: Files required for :class:`PentaMatch` will be dumped
                       here, will be created if non-existent.
        :type db_dir: :class:`str`
        :param entries_from_seqnames: If set to False, indices returned by
                                      :func:`TopN` refer to position in
                                      *fasta_file*. If set to True, integer
                                      indices are parsed from sequence name.
        :type entries_from_seqnames: :class:`bool`
        :returns: class:`PentaMatch`
        :raises: :class:`ost.Error` if *entries_from_seqnames* is True but
                 sequence name cannot be casted to int.
        """
        slist = io.LoadSequenceList(fasta_file)
        if not os.path.exists(db_dir):
            os.makedirs(db_dir)
        CreatePentaMatch(slist, db_dir, entries_from_seqnames)
        return PentaMatch(db_dir)

def AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
                  seqid_thresh = 70, tpl_n = 5):
    """ Searches *tpl_n* templates in *fs_server*/*pentamatch*

    Step 1: Identifies *pentamatch_n* sequences in *pentamatch* with largest
    number of matching pentamers with respect to *trg_seq*.
    Step 2: Generate pairwise alignments with :func:`ost.seq.alg.LocalAlign`
    and only retain the ones with seqid >= *seqid_thresh*.
    Step 3: Extract respective templates from *fs_server* and score them by
    the sum of plDDT of aligned residues divided by *trg_seq* length.
    Step 4: Return top *tpl_n* (or less)

    :param fs_server: Structure database - The AFDB
    :type fs_server: :class:`FSStructureServer`
    :param pentamatch: Pentamatch object specific for *fs_server*
    :type pentamatch: :class:`PentaMatch`
    :param trg_seq: Target sequence
    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
    :pentamatch_n: Number of sequences that are initially searched in
                   *pentamatch*
    :type pentamatch_n: :class:`int`
    :param seqid_thresh: Sequence Identity threshold [0-100] that alignment is
                         considered further
    :type seqid_thresh: :class:`int`
    :param tpl_n: Number of templates that are finally returned based on
                  described scoring
    :type tpl_n: :class:`int`
    :returns: :class:`list` of pairs with first element being the tpl score,
              the second element being a :class:`ost.seq.AlignmentHandle` with
              first sequence being *trg_seq* and second sequence the hit found
              in *fs_server* with structure attached. If *fs_server* has been
              generated with the default procedure described in the docs,
              additional info is available in the name of the attached
              structure. That's accessible with
              aln.GetSequence(1).GetAttachedView().GetName(). That is
              structured as "<UniprotAC> <Fragment> <AFDB version> <Idx>" where
              idx refers to the raw idx of the template in *fs_server*.
    """
    top_n = pentamatch.TopN(pentamatch_n, str(trg_seq))
    if isinstance(trg_seq, str):
        trg_seq = seq.CreateSequence("A", trg_seq)
    tmp = list()
    for idx in top_n:
        omf = fs_server.GetOMFByIdx(idx)
        assert(omf.GetChainNames() == ["A"])
        omf_s = omf.GetSequence("A")
        aln = seq.alg.LocalAlign(trg_seq, seq.CreateSequence("A", omf_s),
                                 seq.alg.BLOSUM62)[0]
        if seq.alg.SequenceIdentity(aln) >= seqid_thresh:
            bfactors = omf.GetAvgBFactors("A")
            summed_bfac = 0.0
            current_pos = aln.GetSequence(1).offset
            for col in aln:
                if col[0] != '-' and col[1] != '-':
                    summed_bfac += bfactors[current_pos]
                if col[1] != '-':
                    current_pos += 1
            score = summed_bfac / len(trg_seq)
            tmp.append((score, aln, omf, idx))
    tmp.sort(reverse=True, key=lambda x: x[0])
    return_list = list()
    for item in tmp[:tpl_n]:
        # the alignments are local, expand the first sequence to
        # *trg_seq*, i.e. make sure that the offset is 0
        aln = item[1]
        if aln.GetSequence(0).offset > 0:
            s1 = aln.GetSequence(0)
            s2 = aln.GetSequence(1)
            s1_prefix = trg_seq[:s1.offset]
            s2_prefix = '-' * s1.offset
            new_s1 = seq.CreateSequence(s1.name, s1_prefix + str(s1))
            new_s2 = seq.CreateSequence(s2.name, s2_prefix + str(s2))
            new_s2.SetOffset(s2.offset)
            aln = seq.CreateAlignment(new_s1, new_s2)
        ent = item[2].GetAUChain("A").CreateFullView()
        ent.SetName(ent.GetName() + ' ' + str(item[3]))
        aln.AttachView(1, ent)
        return_list.append((item[0], aln))
    return return_list

def _TransferBFactors(messages, aln, model):
    """ Simple heuristic to transfer plDDT from AFDB model

    Assumes *model* to be monomer. In case of an aligned residue, bfactor
    (i.e. plDDT) gets simply transferred. Gaps are treated with a heuristic.
    This operates on the full stretch of remodelled amino acids and not solely
    on the gaps indicated in the alignment. We first derive a minimum plDDT
    which is 0.5*min(n_stem_plddt, c_stem_plddt). The plDDT values of the
    processed amino acids then linearly decreases from the stem towards that
    minimum with a slope of 0.25 (i.e. reach the minimum value when they're 4
    residues away).

    :param messages: List of log messages that you derived during logging
                     of :func:`modelling.BuildFromRawModel`.
    :type messages: :class:`list` of :class:`str`
    :param aln: Alignment
    :type aln: :class:`ost.seq.AlignmentHandle`
    :param model: Model
    :type model: :class:`ost.mol.EntityHandle`
    """
    assert(model.chain_count == 1)
    bfactors = [0.0] * len(aln)
    for col_idx, col in enumerate(aln):
        if col[0] != '-' and col[1] != '-':
            r = col.GetResidue(1)
            bfactors[col_idx] = np.mean([a.GetBFactor() for a in r.atoms])

    # regular expression that finds stuff like: filling A.PRO59-(ESRQG)-A.ILE65
    # and directly makes stem residue numbers (59 and 65) available as groups
    cname = model.chains[0].GetName()
    pat = f"ling {cname}\.[A-Z]+([0-9]*)-\([A-Z]+\)-{cname}\.[A-Z]+([0-9]+)"
    for m in messages:
        if m.startswith("Resolved"):
            match = re.search(pat, m)
            assert(match is not None)
            groups = match.groups()
            assert(len(groups) == 2)
            n_stem = int(groups[0]) - 1 # rnum to idx
            c_stem = int(groups[1]) - 1 # rnum to idx
            # we have no guarantee that these stems were aligned from the 
            # very beginning. Lets move towards the termini and find the first
            # non-zero bfactors
            while n_stem > 0:
                if bfactors[n_stem] != 0.0:
                    break
                n_stem -= 1
            while c_stem < len(bfactors):
                if bfactors[c_stem] != 0.0:
                    break
                c_stem += 1
            n_stem_bfac = bfactors[n_stem]
            c_stem_bfac = bfactors[c_stem]
            min_bfac = 0.5*(min(n_stem_bfac, c_stem_bfac))
            for idx in range(n_stem+1, c_stem):
                n_stem_d = idx - n_stem
                c_stem_d = c_stem - idx
                if n_stem_d < c_stem_d:
                    # closer to n stem
                    n_stem_d = min(4, n_stem_d)
                    weight = 0.25 * n_stem_d
                    bfactors[idx] = weight * min_bfac + (1-weight)*n_stem_bfac
                else:
                    # closer to c stem (or same d...)
                    c_stem_d = min(4, c_stem_d)
                    weight = 0.25*c_stem_d
                    bfactors[idx] = weight * min_bfac + (1-weight)*c_stem_bfac

    for r in model.residues:
        rnum = r.GetNumber().GetNum()
        bfac = bfactors[rnum-1]
        for a in r.atoms:
            a.SetBFactor(bfac)

def AFDBModel(fs_server, pentamatch, trg_seq, transfer_bfactors=False):
    """ Build model with AFDB as template library

    :param fs_server: Structure database - The AFDB
    :type fs_server: :class:`FSStructureServer`
    :param pentamatch: Pentamatch object specific for *fs_server*
    :type pentamatch: :class:`PentaMatch`
    :param trg_seq: Target sequence
    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
    :param transfer_bfactors: Simple heuristic to transfer bfactors (plDDT) to
                              model. In case of an aligned residue, bfactor
                              (i.e. plDDT) gets simply transferred.
                              Gaps are treated with a heuristic. This operates
                              on the full stretch of remodelled amino acids and
                              not solely on the gap indicated by the alignment.
                              We first derive a minimum plDDT which is
                              0.5*min(n_stem_plddt, c_stem_plddt). The plDDT
                              values of the processed amino acids then linearly
                              decreases from the stem towards that minimum with
                              a slope of 0.25 (i.e. reach the minimum value when
                              they're 4 residues away).
    :returns: :class:`tuple` with 4 elements. 1: The model 2: The model score
              based on plDDT 3: Pairwise alignment (first seq: *trg_seq*,
              second seq: tpl seq) 4: Template name (formatted as
              "<uniprot AC> <AFDB_fragment> <AFDB_version> <chain name>").
              If no appropriate template can be found, all 4 elements are None.
    """
    tpl_list = AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
                             seqid_thresh=70, tpl_n = 1)
    if len(tpl_list):
        score = tpl_list[0][0]
        aln = tpl_list[0][1]
        mhandle = BuildRawModel(aln)
        if transfer_bfactors:
            # setup custom logger to fish out logging messages
            log_sink = _AFDBLogSink()
            ost.PushLogSink(log_sink)
            ost.PushVerbosityLevel(3)
            model = BuildFromRawModel(mhandle)
            _TransferBFactors(log_sink.messages, aln, model)
            ost.PopLogSink()
            ost.PopVerbosityLevel()
            # let the world know in original log sink
            orig_log_sink = ost.GetCurrentLogSink()
            if orig_log_sink:
                for m,s in zip(log_sink.messages, log_sink.severities):
                    orig_log_sink.LogMessage(m, s) 
        else:
            model = BuildFromRawModel(mhandle)

        name = aln.GetSequence(1).GetAttachedView().GetName()
        return (model, score, name, aln)
    return (None, None, None, None)

__all__ = ('FSStructureServer', 'PentaMatch', 'AFDBTPLSearch', 'AFDBModel')