File: caching.py

package info (click to toggle)
gladtex 2.3.1-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 440 kB
  • sloc: python: 4,531; sh: 7; makefile: 3
file content (233 lines) | stat: -rw-r--r-- 9,708 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
"""This module contains the ImageCache, which caches formulas which have already
been converted. Only with this mechanism, a formula which occurs multiple times
can be reused, even across several runs of GladTeX.

Cache format:

    { # dict of formulas
        'some formula': # formula as key into dictionary
            { # dict of display math / inline math variants
                True: # displaymath = True
                    { # dictionary of values describing formula
                        'path': 'some/path',
                        'pos': { # positioning within the HTML document
                            'height': ..., 'width': ..., 'depth': ...
                        }
                    }
            }
    }

Formulas are `normalized`, so spacing is unified to detect possibly equal
formulas more easily.
"""

import json
import os

# Version of the cache data format. ImageCache stores it inside the cache under
# ImageCache.VERSION_STR and rejects a cache file carrying a different version.
CACHE_VERSION = '2.0'

def normalize_formula(formula):
    """Return a normalized version of a formula, used as key into the cache.

    Normalization unifies spacing so that a recurring formula is recognised
    even if it was written with different spacing:

    -   empty braces (``{}``) are replaced by a space
    -   tabs are replaced by a space
    -   every run of spaces is squeezed into exactly one space
    -   leading and trailing white space is stripped

    :param formula: the formula source as string
    :returns: the normalized formula"""
    normalized = formula.replace('{}', ' ').replace('\t', ' ')
    # a single replace('  ', ' ') pass only halves a run of spaces (four
    # spaces become two), so repeat until no double space is left
    while '  ' in normalized:
        normalized = normalized.replace('  ', ' ')
    return normalized.strip()

def recover_bools(object):
    """Turn the string keys 'false' and 'true' back into booleans, in place.

    The json module serializes the dictionary keys False and True as 'false'
    and 'true', but does not convert them back when parsing. This function
    walks recursively through nested dictionaries and lists and replaces such
    string keys by their boolean counterpart. The given structure is modified
    directly, nothing is returned."""
    if isinstance(object, dict):
        for text, flag in (('false', False), ('true', True)):
            if text in object:
                # move the value from the string key to the boolean key
                object[flag] = object.pop(text)
        children = object.values()
    elif isinstance(object, list):
        children = object
    else:
        # scalars have nothing to convert
        return
    for child in children:
        recover_bools(child)

class JsonParserException(Exception):
    """Error raised when the JSON cache cannot be read, parsed or has an
    unexpected structure or version."""

class ImageCache:
    """
    This cache stores formulas which have been converted already and don't need
    to be converted again. This is both a disk usage and performance
    improvement. The cache can be written and read from disk.

    If a cache file exists at the given path, it is read on construction. If
    the argument keep_old_cache is True, the cache will raise a
    JsonParserException if that file could not be read (i.e. incompatible
    GladTeX version). If set to False, it'll discard the cache along with all
    eqn* files in the directory of the cache file and start with a clean cache.

    Example:

    cache = ImageCache()
    cache.add_formula('\\tau', # the formula
        {'height': 1, 'depth': 2, 'width': 3}, # the positioning information for the output document
        'eqn042.png', displaymath=True)
    assert len(cache) == 1 # one entry
    cache.write()
    assert os.path.exists('gladtex.cache')
    """
    # key under which the cache format version is stored next to the formulas
    VERSION_STR = 'GladTeX__cache__version'

    def __init__(self, path='gladtex.cache', keep_old_cache=True):
        """Set up the cache and load `path` if that file exists.
        :raises JsonParserException if the file could not be loaded and
            keep_old_cache is True"""
        self.__cache = {}
        self.__set_version(CACHE_VERSION)
        self.__path = path
        if os.path.exists(path):
            try:
                self._read()
            except JsonParserException:
                if keep_old_cache:
                    raise
                self._remove_old_cache_and_files()
                # the files are gone, so nothing of the discarded cache may
                # survive in memory either
                self.__cache = {}
                self.__set_version(CACHE_VERSION)

    def __len__(self):
        """Return number of formulas in the cache."""
        # the version entry is not a formula; only subtract it if present so
        # that the result can never become negative
        return len(self.__cache) - int(ImageCache.VERSION_STR in self.__cache)

    def __set_version(self, version):
        """Set version of cache (data structure format)."""
        self.__cache[ImageCache.VERSION_STR] = version

    def write(self):
        """Write cache to disk. The file name will be the one configured during
        initialisation of the cache."""
        if not self.__cache:
            return
        # serialize first: if this fails, an existing cache file has not been
        # truncated yet
        serialized = json.dumps(self.__cache)
        with open(self.__path, 'w', encoding='UTF-8') as file:
            file.write(serialized)

    def _read(self):
        """Read Json from disk into cache, if file exists. The in-memory cache
        is only replaced after the loaded data has been validated.
        :raises JsonParserException if json could not be parsed, is not a
            dictionary or has an unexpected cache version"""
        def raise_error(msg, cause=None):
            raise JsonParserException(msg + "\nPlease delete the cache (and" + \
                        " the images) and rerun the program.") from cause
        cache = self.__cache
        if os.path.exists(self.__path):
            #pylint: disable=broad-except
            try:
                with open(self.__path, 'r', encoding='utf-8') as file:
                    cache = json.load(file)
            except Exception as e:
                msg = "error while reading cache from %s: " % os.path.abspath(self.__path)
                # UnicodeDecodeError is a subclass of ValueError, hence it has
                # to be handled before any more general check
                if isinstance(e, UnicodeDecodeError):
                    msg += 'expected UTF-8 encoding, erroneous byte ' + \
                            '{0!r} at {1}:{2} ({3})'.format(
                                e.object[e.start:e.end], e.start, e.end,
                                e.reason)
                else:
                    # str(e) also works for exceptions without arguments and
                    # includes the error text of an OSError, not just errno
                    msg += str(e)
                raise_error(msg, e)
        if not isinstance(cache, dict):
            raise_error("Decoded Json is not a dictionary.")
        # a cache without version information is treated as current
        if not cache.get(ImageCache.VERSION_STR):
            cache[ImageCache.VERSION_STR] = CACHE_VERSION
        cur_version = cache.get(ImageCache.VERSION_STR)
        if cur_version != CACHE_VERSION:
            raise_error("Cache in %s has version %s, expected %s." % \
                    (self.__path, cur_version, CACHE_VERSION))
        # top-level keys are formulas (or the version key) and must stay
        # strings, even for a formula spelled 'true'; only the nested
        # displaymath keys were booleans before serialization
        for entry in cache.values():
            recover_bools(entry)
        self.__cache = cache

    def _remove_old_cache_and_files(self):
        """Delete the cache file and all regular files starting with 'eqn' from
        the directory of the cache file."""
        os.remove(self.__path)
        directory = os.path.split(self.__path)[0] or '.'
        # remove all files starting with eqn*
        for name in os.listdir(directory):
            if not name.startswith('eqn'):
                continue
            candidate = os.path.join(directory, name)
            if os.path.isfile(candidate):
                os.remove(candidate)

    def add_formula(self, formula, pos, file_path, displaymath=False):
        """Add formula to cache. The pos argument contains the positioning
        info for the output document and is a dict with 'height', 'width' and
        'depth'.
        Keep in mind that formulas set with displaymath are not the same as
        those set with inlinemath. An already cached variant is not
        overwritten.
        This method raises ValueError if formula, pos or file_path are empty or
        displaymath is not a boolean. It raises OSError if specified image
        doesn't exist or if it got an absolute file_path."""
        if not pos or not formula or not file_path:
            raise ValueError("the supplied arguments may not be empty/none")
        if not isinstance(displaymath, bool):
            raise ValueError("displaymath must be a boolean")
        if os.path.isabs(file_path):
            raise OSError("The file path to the image may NOT be an absolute path")
        if '\\' in file_path:
            file_path = file_path.replace('\\', '/')
        if not os.path.exists(file_path):
            # could be that the current working directory is different
            test_path = os.path.join(os.path.split(self.__path)[0],
                    os.path.split(file_path)[1])
            if not os.path.exists(test_path):
                raise OSError("cannot add %s to the cache: doesn't exist" %
                    file_path)
        formula = normalize_formula(formula)
        variants = self.__cache.setdefault(formula, {})
        if displaymath not in variants:
            variants[displaymath] = {'pos' : pos, 'path' : file_path}

    def remove_formula(self, formula, displaymath):
        """This method removes the given variant (display math or inline math)
        of the formula from the cache; the other variant, if cached, is kept.
        A KeyError is raised, if the formula did not exist. Internally,
        formulas are normalized to detect similarities."""
        formula = normalize_formula(formula)
        if formula not in self.__cache:
            raise KeyError("key %s not in cache" % formula)
        variants = self.__cache[formula]
        if displaymath not in variants:
            raise KeyError("key %s (%s) not in cache" % (formula, displaymath))
        del variants[displaymath]
        if not variants: # no variant left -> drop the whole entry
            del self.__cache[formula]

    def contains(self, formula, displaymath):
        """Check whether a formula was already cached and return True if
        found."""
        try:
            return bool(self.get_data_for(formula, displaymath))
        except KeyError:
            return False

    def get_data_for(self, formula, displaymath):
        """
        Retrieve meta data about a formula from the cache.

        The meta information is used to embed the formula in the HTML document.
        It is a dictionary with the keys 'pos' and 'path'. The positioning info
        is described in the documentation of this class.
        If the image of the requested variant doesn't exist on disk anymore,
        this variant is removed from the cache.
        This method raises a KeyError if the formula wasn't found."""
        formula = normalize_formula(formula)
        if formula not in self.__cache:
            raise KeyError((formula, displaymath))
        variants = self.__cache[formula]
        if displaymath not in variants:
            raise KeyError((formula, displaymath))
        data = variants[displaymath]
        # check whether file still exists
        if not os.path.exists(data['path']):
            # only the variant with the missing image is stale
            del variants[displaymath]
            if not variants:
                del self.__cache[formula]
            raise KeyError((formula, displaymath))
        return data