File: _afdb_modelling.py

package info (click to toggle)
promod3 3.4.2%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 966,596 kB
  • sloc: cpp: 55,820; python: 18,058; makefile: 85; sh: 51
file content (730 lines) | stat: -rw-r--r-- 31,256 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
# Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
#                          Biozentrum - University of Basel
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import gzip
import mmap
import pickle
import numpy as np
import time
import tarfile
import glob
import re

import ost
from ost import io
from ost import seq
from ._raw_model import BuildRawModel
from ._pipeline import BuildFromRawModel
from ._modelling import *

class _AFDBLogSink(ost.LogSink):
    """ Captures all ProMod3 log messages emitted during modelling

    Messages and their severities are stored in parallel lists in the
    order they arrive.
    """
    def __init__(self):
        ost.LogSink.__init__(self)
        self.messages = []
        self.severities = []

    def LogMessage(self, message, severity):
        # called by the ost logging system for each emitted message
        self.messages.append(message)
        self.severities.append(severity)

class FSStructureServer:
    """ FSStructureServer - A filesystem based AFDB structure server

    Stores OMF data entries in huge binary files and extracts the respective
    binary data blobs with an indexing mechanism. Efficient reading of these
    huge files is delegated to the OS using the Python mmap module.

    Data creation happens with static Create function: :func:`FromDataChunks`.
    The required preprocessing must be done with external scripts.

    :param db_dir: Directory containing required files. 1) pos.dat (see
                   :attr:`pos`) 2) length.dat (see :attr:`length`)
                   3) chunk.dat (see :attr:`chunk`) 4) indexer.dat (see
                   :attr:`indexer`) 5) one or several files with pattern
                   fs_data_[x].dat (see :attr:`data`)
    :type db_dir: :class:`str`
    """
    def __init__(self, db_dir):
        super().__init__()

        # all of these are lazy loaded on first access of the respective
        # property
        self._pos = None
        self._length = None
        self._chunk = None
        self._indexer = None
        self._search_keys = None
        self._data = None
        self._data_fh = None
        self._n_entries = None

        # only check if required files are there, they're lazy loaded
        self.pos_file = os.path.join(db_dir, "pos.dat")
        self.length_file = os.path.join(db_dir, "length.dat")
        self.chunk_file = os.path.join(db_dir, "chunk.dat")
        self.search_key_file = os.path.join(db_dir, "search_keys.dat")
        self.indexer_file = os.path.join(db_dir, "indexer.dat")
        self.data_files = glob.glob(os.path.join(db_dir, "fs_data_*.dat"))
        if not os.path.exists(self.pos_file):
            raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.length_file):
            raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.chunk_file):
            raise RuntimeError(f"Exp \"chunk.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.indexer_file):
            raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
        # there can be one or more data files, just check whether there is at
        # least one
        if len(self.data_files) == 0:
            raise RuntimeError(f"Exp at least one data file in form "
                               f"\"fs_data_[x].dat\" in db_dir ({db_dir})")

    def __del__(self):
        # close memory maps and their underlying file handles; both lists are
        # populated together in the :attr:`data` property, so self._data being
        # set implies self._data_fh is set too
        if self._data:
            for fh in self._data:
                fh.close()
            for fh in self._data_fh:
                fh.close()

    @property
    def pos(self):
        """ Start positions of data in internal linear memory layout

        :type: :class:`np.ndarray` with dtype np.int64
        """
        if self._pos is None:
            with open(self.pos_file, 'rb') as fh:
                self._pos = np.load(fh)
        return self._pos

    @property
    def length(self):
        """ Lengths of data in internal linear memory layout

        :type: :class:`np.ndarray` with dtype np.int32
        """
        if self._length is None:
            with open(self.length_file, 'rb') as fh:
                self._length = np.load(fh)
        return self._length

    @property
    def chunk(self):
        """ Chunk, in which entry is stored

        :type: :class:`np.ndarray` with dtype np.int16
        """
        if self._chunk is None:
            with open(self.chunk_file, 'rb') as fh:
                self._chunk = np.load(fh)
        return self._chunk

    @property
    def indexer(self):
        """ Internal data structure - Relates entries with data indices

        :type: :class:`np.ndarray` with dtype np.int32
        """
        if self._indexer is None:
            with open(self.indexer_file, 'rb') as fh:
                self._indexer = np.load(fh)
        return self._indexer

    @property
    def search_keys(self):
        """ Internal data structure - Sorted search keys for binary search

        Each key encodes uniprot AC, fragment and version (see
        :func:`GetIdx`); sorted in :func:`FromDataChunks` so that
        np.searchsorted can be used for lookup.
        """
        if self._search_keys is None:
            with open(self.search_key_file, 'rb') as fh:
                self._search_keys = np.load(fh)
        return self._search_keys

    @property
    def data(self):
        """ Internal binary data in memory mapped files

        :type: :class:`list` of :class:`mmap.mmap`
        """
        if self._data is None:
            self._data = list()
            self._data_fh = list()
            # sort data files by their numeric suffix so that list index
            # equals chunk index
            tmp = list()
            for f in self.data_files:
                idx = int(f.split('_')[-1].replace(".dat", ""))
                tmp.append((idx, f))
            tmp.sort()
            sorted_data_files = [x[1] for x in tmp]
            for f in sorted_data_files:
                self._data_fh.append(open(f, 'rb'))
                self._data.append(mmap.mmap(self._data_fh[-1].fileno(), 0,
                                            prot=mmap.PROT_READ))
        return self._data

    @property
    def n_entries(self):
        """ Number of entries

        :type: :class:`int`
        """
        if self._n_entries is None:
            self._n_entries = self.pos.shape[0]
        return self._n_entries

    def GetIdx(self, uniprot_ac, fragment="F1", version="v4"):
        """ Get internal idx of stored data

        Can be used for data retrieval with :func:`GetOMFByIdx`

        :param uniprot_ac: Uniprot AC 
        :type uniprot_ac: :class:`str`
        :param fragment: AFDB entries are potentially split to fragments
                         99.999% of all entries only have one fragment: F1
        :type fragment: :class:`str`
        :param version: Version of entry
        :type version: :class:`str`
        :returns: Internal idx of that entry
        :raises: :class:`RuntimeError` if no such entry exists
        """
        if not fragment.startswith('F'):
            raise RuntimeError("expect fragment to start with F")
        if not version.startswith('v'):
            raise RuntimeError("expect version to start with v")
        search_key = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
                                   int(version[1:]))
        # binary search in the sorted key array; a hit must both be in range
        # and match exactly
        idx = np.searchsorted(self.search_keys, np.uint64(search_key))
        if idx != len(self.indexer) and self.search_keys[idx] == search_key:
            return self.indexer[idx]
        raise RuntimeError(f"No entry for {uniprot_ac} {fragment} "
                           f"{version}")

    def GetOMFByIdx(self, idx):
        """ Get stored OMF data structure

        :param idx: Internal index which can be derived from :func:`GetIdx`  
        :type idx: :class:`int`
        :returns: OMF data structure of type :class:`ost.io.OMF`
        :raises: :class:`RuntimeError` if *idx* is out of range
        """
        if idx < 0 or idx >= self.n_entries:
            raise RuntimeError(f"Invalid idx, must be in [0, {self.n_entries-1}]")
        pos = self.pos[idx]
        length = self.length[idx]
        chunk = self.chunk[idx]
        # slicing the mmap gives the gzipped OMF bytes of that entry
        omf_data = self.data[chunk][pos:pos+length]
        return io.OMF.FromBytes(gzip.decompress(omf_data))

    def GetOMFByPLC(self, pos, length, chunk):
        """ Get stored OMF data structure

        Get data by explicitely specifying PLC (pos, length, chunk). For expert
        use only, no range checks performed.
        Instead of providing a uniprot AC or an index, this function takes
        directly the internal pos, length and chunk parameters that are stored
        for that particular index. Use case: avoid loading the respective data
        files and only open the memory mapped files. 

        :param pos: Byte pos in specified chunk
        :type pos: :class:`int`
        :param length: Num bytes of entry
        :type length: :class:`int`
        :param chunk: Chunk in which entry resides
        :type chunk: :class:`int` 
        :returns: OMF data structure of type :class:`ost.io.OMF`
        """
        omf_data = self.data[chunk][pos:pos+length]
        return io.OMF.FromBytes(gzip.decompress(omf_data))

    def GetOMF(self, uniprot_ac, fragment="F1", version="v4"):
        """ Get stored OMF data structure

        :param uniprot_ac: Uniprot AC 
        :type uniprot_ac: :class:`str`
        :param fragment: AFDB entries are potentially split to fragments
                         99.999% of all entries only have one fragment: F1
        :type fragment: :class:`str`
        :param version: Version of entry
        :type version: :class:`str`
        :returns: OMF data structure of type :class:`ost.io.OMF`
        :raises: :class:`RuntimeError` if no such entry exists
        """
        idx = self.GetIdx(uniprot_ac, fragment = fragment, version = version)
        return self.GetOMFByIdx(idx)

    @staticmethod
    def FromDataChunks(chunk_dir, db_dir, chunk_bytes=None):
        """ Static method to create new database from preprocessed data

        Data preprocessing consists of creating several data chunks that are
        pickled to disk.
        In detail: each chunk is a pickled file containing a list, where each
        entry is a tuple with 4 elements: 1) uniprot_ac (:class:`str`)
        2) fragment (:class:`str`) 3) version (:class:`str`) 4) structure data
        (:class:`ost.io.OMF` object which has been written to a bytes object and
        compressed with gzip)

        The data itself is again stored in chunks of binary data which are
        indexed.

        :param chunk_dir: Path to directory containing described data chunks
        :type chunk_dir: :class:`str`
        :param db_dir: Output directory - Creates all files that are needed for
                       :class:`FSStructureServer`
        :type db_dir: :class:`str`
        :param chunk_bytes: Size in bytes of binary data chunks in final
                            database - default None: Everything ends up in one
                            single chunk
        :type chunk_bytes: :class:`int`
        :returns: :class:`FSStructureServer` with all data from *chunk_dir*
        """
        if not os.path.exists(db_dir):
            raise RuntimeError(f"{db_dir} does not exist")
        positions = list()
        lengths = list()
        chunks = list()
        search_keys = list()
        indexer = list()
        chunk_files = os.listdir(chunk_dir)
        chunk_files = [f for f in chunk_files if f.endswith(".pkl")]
        current_pos = 0
        current_chunk = 0
        current_entry = 0
        data_file = open(os.path.join(db_dir, "fs_data_0.dat"), 'wb')
        t0 = time.time()
        for cf_idx, cf in enumerate(chunk_files):
            print(f"processing chunk {cf_idx}, {cf}")
            with open(os.path.join(chunk_dir, cf), 'rb') as fh:
                data = pickle.load(fh)
            for entry in data:
                uniprot_ac = entry[0]
                fragment = entry[1]
                version = entry[2]
                data_bytes = entry[3]
                length = len(data_bytes)
                data_file.write(data_bytes)
                positions.append(current_pos)
                lengths.append(length)
                chunks.append(current_chunk)
                k = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
                                  int(version[1:]))
                search_keys.append(k)
                indexer.append(current_entry)
                current_pos += length
                current_entry += 1
                # roll over to a new data file once chunk_bytes is exceeded
                if chunk_bytes and current_pos > chunk_bytes:
                    data_file.close()
                    current_chunk += 1
                    f = os.path.join(db_dir, f"fs_data_{current_chunk}.dat")
                    data_file = open(f, 'wb')
                    current_pos = 0
        data_file.close()
        print(f"done processing chunks ({round(time.time() - t0, 3)}s)")
        print("sort indexer matrix")
        t0 = time.time()
        # make search keys searchable by bisect search => sort
        search_keys = np.asarray(search_keys, dtype=np.uint64)
        indexer = np.asarray(indexer, dtype=np.int32)
        sort_indices = np.argsort(search_keys)
        search_keys = search_keys[sort_indices]
        indexer = indexer[sort_indices]

        with open(os.path.join(db_dir, "search_keys.dat"), 'wb') as fh:
            np.save(fh, search_keys)
        with open(os.path.join(db_dir, "indexer.dat"), 'wb') as fh:
            np.save(fh, indexer)
        with open(os.path.join(db_dir, "pos.dat"), 'wb') as fh:
            np.save(fh, np.array(positions, dtype=np.int64))
        with open(os.path.join(db_dir, "length.dat"), 'wb') as fh:
            np.save(fh, np.array(lengths, dtype=np.int32))
        with open(os.path.join(db_dir, "chunk.dat"), 'wb') as fh:
            np.save(fh, np.array(chunks, dtype=np.int16))
        fs_server = FSStructureServer(db_dir)
        return fs_server


class PentaMatch:
    """ Pentamer matching for fast sequence searches

    :class:`PentaMatch` has fast sequence searches with low sensitivity as
    use case. Specifically searching the full AFDB. Stores all unique pentamers
    for each search sequence. Given a query sequence, it computes the number of
    matching pentamers with respect to each search sequence and returns the top
    hits.

    :param db_dir: Directory containing all required files (indexer.dat,
                   pos.dat, length.dat, meta.dat). New :class:`PentaMatch`
                   objects can be derived using the static :func:`FromSeqList`
                   creator.
    :type db_dir: :class:`str`
    :raises: :class:`RuntimeError` if any required file is missing in *db_dir*
    """
    def __init__(self, db_dir):
        # lazy loaded on first access of the respective property
        self._indexer = None
        self._pos = None
        self._length = None
        self._N = None
        self.db_dir = db_dir

        # only check if required files are there, they're lazy loaded
        self.indexer_file = os.path.join(db_dir, "indexer.dat")
        self.pos_file = os.path.join(db_dir, "pos.dat")
        self.length_file = os.path.join(db_dir, "length.dat")
        self.meta_file = os.path.join(db_dir, "meta.dat")

        if not os.path.exists(self.indexer_file):
            raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.pos_file):
            raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.length_file):
            raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
        if not os.path.exists(self.meta_file):
            raise RuntimeError(f"Exp \"meta.dat\" file in db_dir ({db_dir})")

    @property
    def indexer(self):
        """ Entry indices for pentamers

        Entry data for one pentamer can be extracted with the respective values in
        :attr:`pos` and :attr:`length`

        :type: memory mapped :class:`np.ndarray` of dtype np.int32
        """
        if self._indexer is None:
            self._indexer = np.memmap(self.indexer_file, dtype=np.int32,
                                      mode='r')
        return self._indexer

    @property
    def pos(self):
        """ Start position for each pentamer in :attr:`indexer`

        :type: :class:`np.ndarray` of dtype np.int64 with 3.2E6 entries
        """
        if self._pos is None:
            self._pos = np.fromfile(self.pos_file, dtype=np.int64)
        return self._pos

    @property
    def length(self):
        """ Length for each pentamer in :attr:`indexer`

        :type: :class:`np.ndarray` of dtype np.int32 with 3.2E6 entries
        """
        if self._length is None:
            self._length = np.fromfile(self.length_file, dtype=np.int32)
        return self._length

    @property
    def N(self):
        """ Number of entries in underlying :class:`FSStructureServer`

        :type: :class:`int`
        """
        if self._N is None:
            with open(self.meta_file, 'r') as fh:
                self._N = int(fh.read()) + 1 # the file stores max idx => +1
        return self._N

    def TopN(self, N, sequence, return_counts=False, unique_pentamers=True):
        """ Find top-N matches given *sequence*

        Identifies unique pentamers in *sequence* and counts number of
        occurences in each entry. Returns the top-N entries with respect
        to counts.

        :param N: Number of results to return
        :type N: :class:`int`
        :param sequence: Sequence to search
        :type sequence: :class:`str`
        :param return_counts: Additionally return underlying pentamer counts
        :type return_counts: :class:`bool`
        :param unique_pentamers: Set to True if :attr:`indexer` contains only
                                 unique pentamers for each entry. This way we
                                 can use faster methods to accumulate counts.
                                 In detail: accumulator[indices] += 1 is much
                                 faster than np.add.at(accumulator, indices, 1).
                                 But the latter is required if there are
                                 duplicates.
        :type unique_pentamers: :class:`bool`
        :returns:  :class:`list` of :class:`int` with length *N* specifying
                   entry indices. If *return_counts* is true, the
                   :class:`list` contains :class:`tuple` with two elements:
                   1) count 2) index.
        :raises: :class:`RuntimeError` if N is invalid or sequence is shorter
                 than 5 characters
        """
        if N <=0:
            raise RuntimeError(f"N ({N}) must be larger than 0")
        if N > self.N:
            raise RuntimeError(f"N ({N}) larger than actual entries ({self.N})")
        if len(sequence) < 5:
            raise RuntimeError(f"sequence must have length >=5, got: {sequence}")
        pentamers = list()
        SeqToPentamerIndices(sequence, True, pentamers)
        # one count slot per entry; int32 comfortably holds any realistic
        # number of pentamer matches per entry
        accumulator = np.zeros(self.N, dtype=np.int32)
        if unique_pentamers:
            for p in pentamers:
                pos = self.pos[p]
                length = self.length[p]
                accumulator[self.indexer[pos:pos+length]] += 1
        else:
            for p in pentamers:
                pos = self.pos[p]
                length = self.length[p]
                np.add.at(accumulator, self.indexer[pos:pos+length], 1)

        top_n_indices = np.argpartition(accumulator, -N)[-N:]
        top_n_counts = accumulator[top_n_indices]
        # top_n_indices is not sorted by counts => sort by counts and return
        tmp = [(a,b) for a,b in zip(top_n_counts, top_n_indices)]
        tmp.sort(reverse=True)
        if return_counts:
            return tmp
        else:
            return [x[1] for x in tmp]

    @staticmethod
    def FromSeqList(fasta_file, db_dir, entries_from_seqnames=False):
        """ Creates PentaMatch object from Fasta file

        :param fasta_file: Path to Fasta file with sequences
        :type fasta_file: :class:`str`
        :param db_dir: Files required for :class:`PentaMatch` will be dumped
                       here, will be created if non-existent.
        :type db_dir: :class:`str`
        :param entries_from_seqnames: If set to False, indices returned by
                                      :func:`TopN` refer to position in
                                      *fasta_file*. If set to True, integer
                                      indices are parsed from sequence name.
        :type entries_from_seqnames: :class:`bool`
        :returns: :class:`PentaMatch`
        :raises: :class:`ost.Error` if *entries_from_seqnames* is True but
                 sequence name cannot be casted to int.
        """
        slist = io.LoadSequenceList(fasta_file)
        if not os.path.exists(db_dir):
            os.makedirs(db_dir)
        CreatePentaMatch(slist, db_dir, entries_from_seqnames)
        return PentaMatch(db_dir)

def AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
                  seqid_thresh = 70, tpl_n = 5):
    """ Searches *tpl_n* templates in *fs_server*/*pentamatch*

    Step 1: Identifies *pentamatch_n* sequences in *pentamatch* with largest
    number of matching pentamers with respect to *trg_seq*.
    Step 2: Generate pairwise alignments with :func:`ost.seq.alg.LocalAlign`
    and only retain the ones with seqid >= *seqid_thresh*.
    Step 3: Extract respective templates from *fs_server* and score them by
    the sum of plDDT of aligned residues divided by *trg_seq* length.
    Step 4: Return top *tpl_n* (or less)

    :param fs_server: Structure database - The AFDB
    :type fs_server: :class:`FSStructureServer`
    :param pentamatch: Pentamatch object specific for *fs_server*
    :type pentamatch: :class:`PentaMatch`
    :param trg_seq: Target sequence
    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
    :param pentamatch_n: Number of sequences that are initially searched in
                         *pentamatch*
    :type pentamatch_n: :class:`int`
    :param seqid_thresh: Sequence Identity threshold [0-100] that alignment is
                         considered further
    :type seqid_thresh: :class:`int`
    :param tpl_n: Number of templates that are finally returned based on
                  described scoring
    :type tpl_n: :class:`int`
    :returns: :class:`list` of pairs with first element being the tpl score,
              the second element being a :class:`ost.seq.AlignmentHandle` with
              first sequence being *trg_seq* and second sequence the hit found
              in *fs_server* with structure attached. If *fs_server* has been
              generated with the default procedure described in the docs,
              additional info is available in the name of the attached
              structure. That's accessible with
              aln.GetSequence(1).GetAttachedView().GetName(). That is
              structured as "<UniprotAC> <Fragment> <AFDB version> <Idx>" where
              idx refers to the raw idx of the template in *fs_server*.
    """
    top_n = pentamatch.TopN(pentamatch_n, str(trg_seq))
    if isinstance(trg_seq, str):
        trg_seq = seq.CreateSequence("A", trg_seq)
    tmp = list()
    for idx in top_n:
        omf = fs_server.GetOMFByIdx(idx)
        # entries are expected to be monomers with chain name "A"
        assert(omf.GetChainNames() == ["A"])
        omf_s = omf.GetSequence("A")
        aln = seq.alg.LocalAlign(trg_seq, seq.CreateSequence("A", omf_s),
                                 seq.alg.BLOSUM62)[0]
        if seq.alg.SequenceIdentity(aln) >= seqid_thresh:
            # sum up per-residue bfactors (plDDT in AFDB) over aligned
            # columns, tracking the template sequence position manually
            bfactors = omf.GetAvgBFactors("A")
            summed_bfac = 0.0
            current_pos = aln.GetSequence(1).offset
            for col in aln:
                if col[0] != '-' and col[1] != '-':
                    summed_bfac += bfactors[current_pos]
                if col[1] != '-':
                    current_pos += 1
            score = summed_bfac / len(trg_seq)
            tmp.append((score, aln, omf, idx))
    tmp.sort(reverse=True, key=lambda x: x[0])
    return_list = list()
    for item in tmp[:tpl_n]:
        # the alignments are local, expand the first sequence to
        # *trg_seq*, i.e. make sure that the offset is 0
        aln = item[1]
        if aln.GetSequence(0).offset > 0:
            s1 = aln.GetSequence(0)
            s2 = aln.GetSequence(1)
            s1_prefix = trg_seq[:s1.offset]
            s2_prefix = '-' * s1.offset
            new_s1 = seq.CreateSequence(s1.name, s1_prefix + str(s1))
            new_s2 = seq.CreateSequence(s2.name, s2_prefix + str(s2))
            new_s2.SetOffset(s2.offset)
            aln = seq.CreateAlignment(new_s1, new_s2)
        # append the raw fs_server idx to the structure name for traceability
        ent = item[2].GetAUChain("A").CreateFullView()
        ent.SetName(ent.GetName() + ' ' + str(item[3]))
        aln.AttachView(1, ent)
        return_list.append((item[0], aln))
    return return_list

def _TransferBFactors(messages, aln, model):
    """ Simple heuristic to transfer plDDT from AFDB model

    Assumes *model* to be monomer. In case of an aligned residue, bfactor
    (i.e. plDDT) gets simply transferred. Gaps are treated with a heuristic.
    This operates on the full stretch of remodelled amino acids and not solely
    on the gaps indicated in the alignment. We first derive a minimum plDDT
    which is 0.5*min(n_stem_plddt, c_stem_plddt). The plDDT values of the
    processed amino acids then linearly decreases from the stem towards that
    minimum with a slope of 0.25 (i.e. reach the minimum value when they're 4
    residues away).

    :param messages: List of log messages that you derived during logging
                     of :func:`modelling.BuildFromRawModel`.
    :type messages: :class:`list` of :class:`str`
    :param aln: Alignment
    :type aln: :class:`ost.seq.AlignmentHandle`
    :param model: Model
    :type model: :class:`ost.mol.EntityHandle`
    """
    assert(model.chain_count == 1)
    # per-column plDDT; 0.0 marks columns without aligned template residue
    bfactors = [0.0] * len(aln)
    for col_idx, col in enumerate(aln):
        if col[0] != '-' and col[1] != '-':
            r = col.GetResidue(1)
            bfactors[col_idx] = np.mean([a.GetBFactor() for a in r.atoms])

    # regular expression that finds stuff like: filling A.PRO59-(ESRQG)-A.ILE65
    # and directly makes stem residue numbers (59 and 65) available as groups.
    # Raw string avoids invalid escape sequences ("\." etc. in a non-raw
    # string); both digit groups use "+" - "*" could match an empty string and
    # crash in int() below instead of failing the assert. The chain name is
    # escaped in case it contains regex metacharacters.
    cname = re.escape(model.chains[0].GetName())
    pat = rf"ling {cname}\.[A-Z]+([0-9]+)-\([A-Z]+\)-{cname}\.[A-Z]+([0-9]+)"
    for m in messages:
        if m.startswith("Resolved"):
            match = re.search(pat, m)
            assert(match is not None)
            groups = match.groups()
            assert(len(groups) == 2)
            n_stem = int(groups[0]) - 1 # rnum to idx
            c_stem = int(groups[1]) - 1 # rnum to idx
            # we have no guarantee that these stems were aligned from the
            # very beginning. Lets move towards the termini and find the first
            # non-zero bfactors
            while n_stem > 0:
                if bfactors[n_stem] != 0.0:
                    break
                n_stem -= 1
            # NOTE(review): if no non-zero bfactor exists towards the
            # C-terminus, c_stem reaches len(bfactors) and the lookup below
            # raises IndexError - assumed not to happen for valid log input
            while c_stem < len(bfactors):
                if bfactors[c_stem] != 0.0:
                    break
                c_stem += 1
            n_stem_bfac = bfactors[n_stem]
            c_stem_bfac = bfactors[c_stem]
            min_bfac = 0.5*(min(n_stem_bfac, c_stem_bfac))
            # linear decrease from each stem towards min_bfac, reaching the
            # minimum 4 residues away from the stem
            for idx in range(n_stem+1, c_stem):
                n_stem_d = idx - n_stem
                c_stem_d = c_stem - idx
                if n_stem_d < c_stem_d:
                    # closer to n stem
                    n_stem_d = min(4, n_stem_d)
                    weight = 0.25 * n_stem_d
                    bfactors[idx] = weight * min_bfac + (1-weight)*n_stem_bfac
                else:
                    # closer to c stem (or same d...)
                    c_stem_d = min(4, c_stem_d)
                    weight = 0.25*c_stem_d
                    bfactors[idx] = weight * min_bfac + (1-weight)*c_stem_bfac

    # residue number N maps to alignment column N-1
    for r in model.residues:
        rnum = r.GetNumber().GetNum()
        bfac = bfactors[rnum-1]
        for a in r.atoms:
            a.SetBFactor(bfac)

def AFDBModel(fs_server, pentamatch, trg_seq, transfer_bfactors=False):
    """ Build model with AFDB as template library

    :param fs_server: Structure database - The AFDB
    :type fs_server: :class:`FSStructureServer`
    :param pentamatch: Pentamatch object specific for *fs_server*
    :type pentamatch: :class:`PentaMatch`
    :param trg_seq: Target sequence
    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
    :param transfer_bfactors: Simple heuristic to transfer bfactors (plDDT) to
                              model. In case of an aligned residue, bfactor
                              (i.e. plDDT) gets simply transferred.
                              Gaps are treated with a heuristic. This operates
                              on the full stretch of remodelled amino acids and
                              not solely on the gap indicated by the alignment.
                              We first derive a minimum plDDT which is
                              0.5*min(n_stem_plddt, c_stem_plddt). The plDDT
                              values of the processed amino acids then linearly
                              decreases from the stem towards that minimum with
                              a slope of 0.25 (i.e. reach the minimum value when
                              they're 4 residues away).
    :returns: :class:`tuple` with 4 elements. 1: The model 2: The model score
              based on plDDT 3: Template name (formatted as
              "<UniprotAC> <Fragment> <AFDB version> <Idx>", see
              :func:`AFDBTPLSearch`) 4: Pairwise alignment (first seq:
              *trg_seq*, second seq: tpl seq).
              If no appropriate template can be found, all 4 elements are None.
    """
    # only the single best scoring template is used for modelling
    tpl_list = AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
                             seqid_thresh=70, tpl_n = 1)
    if len(tpl_list):
        score = tpl_list[0][0]
        aln = tpl_list[0][1]
        mhandle = BuildRawModel(aln)
        if transfer_bfactors:
            # setup custom logger to fish out logging messages
            log_sink = _AFDBLogSink()
            ost.PushLogSink(log_sink)
            ost.PushVerbosityLevel(3)
            model = BuildFromRawModel(mhandle)
            _TransferBFactors(log_sink.messages, aln, model)
            ost.PopLogSink()
            ost.PopVerbosityLevel()
            # let the world know in original log sink
            orig_log_sink = ost.GetCurrentLogSink()
            if orig_log_sink:
                for m,s in zip(log_sink.messages, log_sink.severities):
                    orig_log_sink.LogMessage(m, s)
        else:
            model = BuildFromRawModel(mhandle)

        name = aln.GetSequence(1).GetAttachedView().GetName()
        return (model, score, name, aln)
    return (None, None, None, None)

__all__ = ('FSStructureServer', 'PentaMatch', 'AFDBTPLSearch', 'AFDBModel')