File: unicode_index_times.py

package info (click to toggle)
jython 2.7.3%2Brepack1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 62,820 kB
  • sloc: python: 641,384; java: 306,981; xml: 2,066; sh: 514; ansic: 126; makefile: 77
file content (405 lines) | stat: -rw-r--r-- 13,363 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# -*- coding: utf-8 -*-
# unicode speed tests for access operations and find
# This is part of an effort to supply the Jython implementation of
# unicode with efficient, correct translations between the visible index
# of a character and and the index in the implementation array of the
# UTF-16 code unit(s) that represent it. See also test.test_unicode_jy,
# and Jython issue #2100. The presence of characters with Unicode
# code points > 0xffff (supplementary characters), that it takes two
# code units to represent, makes this non-trivial.
#
# The program defines a variety of test strings of differing lengths
# and distribution of supplementary characters, then times some basic
# operations involving index translation: of retrieval, slicing and
# find, to provide an average time per operation. It runs several trials
# of each test case (a combination of an operation and the test material)
# and reports the shortest trial, using the same strategy as the timeit
# module).
#
# It was difficult to get repeatable times from the test under Jython,
# because the JIT compiler (?) is non-deterministic. It proved
# impossible using a strategy that would run the same test case multiple
# times in succession. The approach eventually taken was to run the all
# the test cases once, then repeat this sequence several times, and
# report the minimum time of each case in these widely separated trials.
# This strategy provides stable results with the default JIT behaviour.
#
# Two interesting options are to run with the # JIT compiler disabled:
#   $ jython -J-Xint tests/python/unicode_index_times.py
#
# or with it continuously enabled:
#   $ jython -J-Xcomp tests/python/unicode_index_times.py
#
from __future__ import print_function
import itertools
import random
import sys
import time

if sys.platform == "win32" or sys.platform.startswith("java"):
    # On Windows and Jython, the best timer is time.clock()
    timer = time.clock
else:
    # On most other platforms the best timer is time.time()
    timer = time.time

if sys.version_info[0] > 2:
    unichr = chr
else:
    def next(it) : return it.next()

# Proportion of characters that are supplementary (if not explicit)
DEFAULT_RATE = 0.2  # 20%

# We will test performance with these sizes
SHORT = 10
MEDIUM = 100
LONG = 1000
HUGE = 10000


class UnicodeMaterial(object):
    ''' Object holding a list of single characters and a unicode string
        that is their concatenation. The sequence is created from a
        background sequence of basic plane characters and random
        replacement with supplementary plane characters (those with
        point code>0xffff).
    '''

    base = tuple(u'abcdefghijklmnopqrstuvwxyz')
    if sys.maxunicode < 0x10000:
        print("Narrow build: all characters from BMP", file=sys.stderr)
        supp = tuple(map(unichr, range(0x20, 0x2c)))
    else:
        # Wide build: we have real supplementary characters
        supp = tuple(map(unichr, range(0x10000, 0x1000c)))
    used = sorted(set(base+supp))

    def __init__(self, size=20, pred=None, ran=None):
        ''' Create size chars choosing an SP char at i where
            pred(ran, i)==True where ran is an instance of
            random.Random supplied in the constructor or created
            locally (if ran==None).
        '''

        # Generators for the BMP and SP characters
        base = itertools.cycle(UnicodeMaterial.base)
        supp = itertools.cycle(UnicodeMaterial.supp)

        # Each instance gets a random generator
        if ran is None:
            ran = random.Random()
        self.random = ran

        if pred is None:
            pred = lambda ran, j : ran.random() < DEFAULT_RATE

        # Generate the list
        r = list()
        for i in range(size):
            if pred(self.random, i):
                c = next(supp)
            else:
                c = next(base)
            r.append(c)

        # The list and its concatenation are our material
        self.ref = r
        self.size = len(r)
        self.text = u''.join(r)
        self.target = u''

    def __len__(self):
        return self.size

    def insert(self, target, p=None):
        ''' Insert target string at position p (or middle), truncating if
            that would make the material any longer
        '''
        if p is None:
            p = max(0, (self.size-len(target)) // 2)

        n = 0
        for t in target:
            if p+n >= self.size:
                break;
            self.ref[p+n] = t
            n += 1

        self.target = target[:n]
        self.text = u''.join(self.ref)


class UnicodeActions(UnicodeMaterial):
    ''' Provides test material with loops for timing.'''

    def __init__(self, size=20, pred=None, ran=None):
        super(UnicodeActions, self).__init__(size, pred, ran)
        if self.size <= 0:
            raise ValueError("The timings don't work for zero length")
        self.used =  UnicodeMaterial.used
        self.trash = None
        # String to find (note 'abcde' in base: nice for false alarms)
        self.insert(u"abide")


    def repeat_getitem(self, mincount):
        ''' Access each code point in sequence repeatedly so that at
            least mincount operations have been peformed, and return the
            actual number of operations.
        '''
        n = self.size
        t = self.text
        opcount = 0
        dummy = None
        while opcount < mincount:
            # Access each point code
            i = 0
            while i < n:
                # Avoid optimising away the call
                dummy = t[i]
                i += 1
            opcount += n
        self.trash = dummy
        return opcount

    def repeat_slice(self, mincount):
        ''' Extract a slice at each feasible position and length,
            repeating enough times to do mincount operations, and
            return the actual number of operations.
        '''
        n = self.size
        t = self.text
        opcount = 0
        dummy = None

        while opcount < mincount:
            m = 1
            while m <= n:
                starts = n - m + 1
                for i in range(starts):
                    dummy = t[i:i+m]
                    #print(i, i+m, step, dummy)
                opcount += starts
                m *= 5

        return opcount

    def repeat_slice_step(self, mincount):
        ''' Extract a slice at each feasible position and length,
            and using different sizes for the step,
            repeating enough times to do mincount operations, and
            return the actual number of operations.
        '''
        n = self.size
        t = self.text
        opcount = 0
        dummy = None
        steps = [3, -1, 10]

        while opcount < mincount:
            for step in steps:
                if step > 0:
                    m = 1
                    while m <= n:
                        starts = n - m + 1
                        for i in range(starts):
                            dummy = t[i:i+m:step]
                            #print(i, i+m, step, dummy)
                        opcount += starts
                        m *= 5
                else:
                    m = 1
                    while m <= n:
                        starts = n - m + 1
                        for i in range(starts):
                            dummy = t[i+m:i:step]
                            #print(i+m, i, step, dummy)
                        opcount += starts
                        m *= 5
                    
        return opcount

    def repeat_find_char(self, mincount):
        ''' Do an incremental find of each code point, repeating
            enough times to do mincount finds, and return the actual
            number of operations.
        '''
        opcount = 0
        n = self.size
        findop = self.text.find
        dummy = 0

        while opcount < mincount:
            # The text is searched for every code c.
            for c in self.used:
                # ... and every occurrence is found.
                start = 0
                while start < n:
                    i = findop(c, start)
                    if i < 0: break
                    # Avoid optimising away the call
                    dummy += i
                    start = i + 1

            # Every character in the text was a hit exactly once.
            # And every character was also a miss, except for
            # the character that ends the text. So we did:
            opcount += n + len(self.used) - 1

        self.trash = dummy
        return opcount

    def repeat_find_str(self, mincount):
        ''' Find the target string within the material, repeating
            enough times to do mincount finds, and return the actual
            number of operations.
        '''
        opcount = 0
        s = self.target
        findop = self.text.find
        dummy = 0

        while opcount < mincount:
            dummy += findop(s)
            opcount += 1

        self.trash = dummy
        return opcount

    def repeat_rfind_char(self, mincount):
        ''' Do an incremental rfind of each code point, repeating
            enough times to do mincount finds, and return the actual
            number of operations.
        '''
        opcount = 0
        n = self.size
        findop = self.text.rfind

        while opcount < mincount:
            # The text is searched for every code c.
            for c in self.used:
                # ... and every occurrence is found.
                end = n
                while end >= 0:
                    end = findop(c, 0, end)

            # Every character in the text was a hit exactly once.
            # And every character was also a miss. So we did:
            opcount += n + len(self.used)

        return opcount

    def repeat_rfind_str(self, mincount):
        ''' Find the target string within the material, repeating
            enough times to do mincount finds, and return the actual
            number of operations.
        '''
        opcount = 0
        s = self.target
        findop = self.text.rfind
        dummy = 0

        while opcount < mincount:
            dummy += findop(s)
            opcount += 1

        self.trash = dummy
        return opcount


def time_per_op(op, mincount):
    ''' Repeat the given operation at least mincount times and
        return the time per operation in microseconds.
    '''
    t = timer()
    opcount = op(mincount)
    return (timer() - t) * 1e6 / opcount

# Functions defining particular distributions of SP codes
#
def evenly(rate=DEFAULT_RATE):
    'Evenly distributed at given rate'
    def f(ran, i):
        return ran.random() < rate
    return f

def evenly_before(k, rate=DEFAULT_RATE):
    'Evenly distributed on i<k at given rate'
    def f(ran, i):
        return i < k and ran.random() < rate
    return f

def evenly_from(k, rate=DEFAULT_RATE):
    'Evenly distributed on i>=k at given rate'
    def f(ran, i):
        return i >= k and ran.random() < rate
    return f

def time_all():

    setups = [
        #("empty", UnicodeActions(0)),
        ("single bmp", UnicodeActions(1, (lambda ran, i : False))),
        ("single", UnicodeActions(1, (lambda ran, i : True))),
        ("short bmp", UnicodeActions(SHORT, (lambda ran, i : False))),
        ("short 50%", UnicodeActions(SHORT, evenly(0.5))),
        ("medium bmp", UnicodeActions(MEDIUM, (lambda ran, i : False))),
        ("medium 10%", UnicodeActions(MEDIUM, evenly(0.1))),
        ("long bmp", UnicodeActions(LONG, (lambda ran, i : False))),
        ("long 1%", UnicodeActions(LONG, evenly(0.01))),
        ("long 10%", UnicodeActions(LONG, evenly(0.1))),
        ("long 10% L", UnicodeActions(LONG, evenly_before(LONG/4, 0.4))),
        ("long 10% H", UnicodeActions(LONG, evenly_from(LONG-(LONG/4), 0.4))),
        ("long 50%", UnicodeActions(LONG, evenly(0.5))),
        ("huge bmp", UnicodeActions(HUGE, (lambda ran, i : False))),
        ("huge 10%", UnicodeActions(HUGE, evenly(0.1))),
    ]

    ops = [
        ("[i]", "repeat_getitem"),
        ("[i:i+n]", "repeat_slice"),
        ("[i:i+n:k]", "repeat_slice_step"),
        ("find(c)", "repeat_find_char"),
        ("find(s)", "repeat_find_str"),
        ("rfind(c)", "repeat_rfind_char"),
        ("rfind(s)", "repeat_rfind_str"),
    ]


    print("{:12s}{:>6s}".format("time (us)", "len"), end='')
    for title, _ in ops:
        print("{:>10s}".format(title), end='')
    print()

    mintime = dict()
    def save_mintime(k, t):
        mintime[k] = min(t, mintime.get(k, 1e9))

    trashcan = []

    # Cycle through the cases repeatedly.
    for k in range(5):
        for name, material in setups:
            for _, opname in ops:
                # For each case, keep the minimum time
                op = material.__getattribute__(opname)
                t = time_per_op(op, 1000)
                save_mintime((name, opname), t)

            if k == 0:
                trashcan.append(material.trash)

    # Cycle through the cases again to print them.
    for name, material in setups:
        print("{:12s}{:6d}".format(name, material.size), end='')
        for _, opname in ops:
            t = mintime[(name, opname)]
            print("{:10.2f}".format(t), end='')
        print()

    print("y =", trashcan)

if __name__ == "__main__":

    time_all()