File: affine.py

package info (click to toggle)
py-stringmatching 0.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,956 kB
  • sloc: python: 3,979; makefile: 174; sh: 7
file content (119 lines) | stat: -rw-r--r-- 4,239 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

from py_stringmatching import utils
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
                                                    SequenceSimilarityMeasure
from py_stringmatching.similarity_measure.cython.cython_affine import affine
from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident

class Affine(SequenceSimilarityMeasure):
    """Affine gap similarity measure for a pair of strings.

    The affine gap measure extends the Needleman-Wunsch measure so that longer
    gaps are handled more gracefully: opening a gap and continuing a gap carry
    separate costs. See the string matching chapter of the DI book
    ("Principles of Data Integration") for background.

    Args:
        gap_start (float): Cost charged for the gap at the start
                           (defaults to 1).
        gap_continuation (float): Cost charged for the gap continuation
                                  (defaults to 0.5).
        sim_func (function): Function giving the similarity score of two
                             characters, each passed as a string (defaults to
                             an identity function, which returns 1 if the two
                             characters are the same and 0 otherwise).

    Attributes:
        gap_start (float): The configured gap start cost.
        gap_continuation (float): The configured gap continuation cost.
        sim_func (function): The configured character similarity function.
    """

    def __init__(self, gap_start=1, gap_continuation=0.5,
                 sim_func=cython_sim_ident):
        # Store the configuration first, then run the base-class initializer.
        self.gap_start = gap_start
        self.gap_continuation = gap_continuation
        self.sim_func = sim_func
        super(Affine, self).__init__()

    def get_raw_score(self, string1, string2):
        """Compute the affine gap score of two strings.

        The result is a raw score, so it can be outside the range [0,1].

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float). When the
            empty-input check reports an empty string, 0 is returned without
            running the alignment.

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs
                        is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # Reject None before any conversion is attempted.
        utils.sim_check_for_none(string1, string2)

        # Normalize both inputs to unicode (first string, then second), and
        # only afterwards verify that they really are strings.
        string1, string2 = (utils.convert_to_unicode(text)
                            for text in (string1, string2))
        utils.tok_check_for_string_input(string1, string2)

        # Only run the alignment when neither string is empty; an empty input
        # short-circuits to a score of 0.
        has_empty_input = utils.sim_check_for_empty(string1, string2)
        if not has_empty_input:
            return affine(string1, string2, self.gap_start,
                          self.gap_continuation, self.sim_func)
        return 0

    def get_gap_start(self):
        """Return the current gap start cost.

        Returns:
            gap start cost (float).
        """
        return self.gap_start

    def get_gap_continuation(self):
        """Return the current gap continuation cost.

        Returns:
            gap continuation cost (float).
        """
        return self.gap_continuation

    def get_sim_func(self):
        """Return the current character similarity function.

        Returns:
            similarity function (function).
        """
        return self.sim_func

    def set_gap_start(self, gap_start):
        """Replace the gap start cost.

        Args:
            gap_start (float): Cost for the gap at the start.

        Returns:
            True, once the new value has been stored (bool).
        """
        self.gap_start = gap_start
        return True

    def set_gap_continuation(self, gap_continuation):
        """Replace the gap continuation cost.

        Args:
            gap_continuation (float): Cost for the gap continuation.

        Returns:
            True, once the new value has been stored (bool).
        """
        self.gap_continuation = gap_continuation
        return True

    def set_sim_func(self, sim_func):
        """Replace the character similarity function.

        Args:
            sim_func (function): Function computing similarity score between
                                 two characters, represented as strings.

        Returns:
            True, once the new function has been stored (bool).
        """
        self.sim_func = sim_func
        return True