## Setup

In [1]:
import numpy as np
import numcodecs
numcodecs.__version__

'0.9.1'

## Unicode string benchmarks

In [2]:
# Zstandard compressors at levels 1, 5 and 9. benchmark_codec runs each
# codec's encoded output through all three to show how much further it
# compresses.
zstd1, zstd5, zstd9 = (numcodecs.Zstd(level) for level in (1, 5, 9))


def benchmark_codec(codec, a):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))

In [3]:
from numcodecs.tests.common import greetings
# Codecs under comparison; each one is passed to benchmark_codec in later cells.
# MsgPack, JSON and Pickle are constructed with their default settings.
msgpack_codec = numcodecs.MsgPack()
json_codec = numcodecs.JSON()
pickle_codec = numcodecs.Pickle()
# Categorize is given the `greetings` list as its labels and emits 'u1'
# (unsigned 8-bit) values, i.e. one byte per input item.
# NOTE(review): presumably it only round-trips values that appear in the label
# list - confirm against the numcodecs documentation.
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')
# Variable-length UTF-8 string codec; also timed on its own with %time below.
vlen_codec = numcodecs.VLenUTF8()

### Greetings benchmark

In [4]:
# Fix the legacy global NumPy seed so the sampled array is the same on every run.
np.random.seed(42)
# One million draws (with replacement) from `greetings`, stored as Python
# str objects in an object-dtype array.
data = np.random.choice(greetings, size=1_000_000).astype(object)
data

array(['Γεια σου κόσμε!', 'Hei maailma!', 'Zdravo svete!', ...,
       'Servus Woid!', 'เฮลโลเวิลด์', 'Zdravo svete!'], dtype=object)

In [5]:
# Single-shot timing of one VLenUTF8 encode of `data`. `enc` is kept because
# the next cell times decoding it.
%time enc = vlen_codec.encode(data)

CPU times: user 109 ms, sys: 30.9 ms, total: 140 ms
Wall time: 143 ms


In [6]:
# Single-shot timing of decoding `enc`, which the preceding %time cell assigned.
%time dec = vlen_codec.decode(enc)

CPU times: user 162 ms, sys: 25.1 ms, total: 187 ms
Wall time: 185 ms


In [7]:
benchmark_codec(msgpack_codec, data)

MsgPack(raw=False, use_bin_type=True, use_single_float=False)
encode
78.6 ms ± 9.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
265 ms ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,913,397
size (zstd 1): 1,529,314
size (zstd 5): 1,405,819
size (zstd 9): 1,178,324


In [8]:
benchmark_codec(json_codec, data)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
252 ms ± 5.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
396 ms ± 45.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,322,605
size (zstd 1): 1,840,791
size (zstd 5): 1,675,175
size (zstd 9): 1,360,789


In [9]:
benchmark_codec(pickle_codec, data)

Pickle(protocol=5)
encode
277 ms ± 37.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
258 ms ± 37.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,835,273
size (zstd 1): 1,565,100
size (zstd 5): 1,435,771
size (zstd 9): 1,204,419


In [10]:
benchmark_codec(cat_codec, data)

Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
274 ms ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
46.3 ms ± 4.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,196
size (zstd 5): 490,680
size (zstd 9): 490,487


In [11]:
benchmark_codec(vlen_codec, data)

VLenUTF8()
encode
107 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
184 ms ± 2.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,275
size (zstd 1): 1,762,783
size (zstd 5): 1,546,616
size (zstd 9): 1,216,314


### Lorem benchmark

In [12]:
from faker import Faker
# Seed Faker's shared random generator before creating the instance so the
# generated sentences - and therefore the word count, sizes and timings
# reported for this benchmark - are reproducible, in line with the
# np.random.seed(42) used for the other datasets.
Faker.seed(42)
fake = Faker()

In [13]:
# Generate 200,000 fake sentences and flatten them into one object array of
# whitespace-separated tokens (punctuation stays attached to its word).
data2 = np.array(
    [token
     for sentence in fake.sentences(nb=200000)
     for token in sentence.split()],
    dtype=object,
)
# Show the token count and a small sample instead of the whole array.
len(data2), data2[:10]

(1102008,
 array(['Ahead', 'everybody', 'important', 'indeed.', 'White', 'look',
        'than', 'environment', 'anyone.', 'Order'], dtype=object))

In [14]:
benchmark_codec(msgpack_codec, data2)

MsgPack(raw=False, use_bin_type=True, use_single_float=False)
encode
76.7 ms ± 996 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
173 ms ± 7.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 7,405,616
size (zstd 1): 3,312,068
size (zstd 5): 2,708,195
size (zstd 9): 2,700,345


In [15]:
benchmark_codec(json_codec, data2)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
183 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
189 ms ± 5.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 9,609,634
size (zstd 1): 2,897,941
size (zstd 5): 2,715,484
size (zstd 9): 2,682,781


In [16]:
benchmark_codec(pickle_codec, data2)

Pickle(protocol=5)
encode
230 ms ± 3.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
164 ms ± 8.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 9,615,186
size (zstd 1): 3,054,991
size (zstd 5): 2,756,213
size (zstd 9): 2,830,899


In [17]:
benchmark_codec(vlen_codec, data2)

VLenUTF8()
encode
111 ms ± 4.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
116 ms ± 3.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,711,630
size (zstd 1): 3,641,192
size (zstd 5): 3,461,415
size (zstd 9): 3,025,742


## Byte string benchmarks

In [18]:
# Variable-length bytes codec; compared with Pickle on the byte-string data
# (`data3`) in the cells below.
vlen_bytes_codec = numcodecs.VLenBytes()

In [19]:
# Re-seed with the same value used for the unicode greetings dataset.
np.random.seed(42)
# UTF-8 encoded counterpart of every greeting.
greetings_bytes = [text.encode('utf-8') for text in greetings]
# One million draws (with replacement), stored as Python bytes objects in an
# object-dtype array.
data3 = np.random.choice(greetings_bytes, size=1_000_000).astype(object)
data3

array([b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [20]:
vlen_bytes_codec.decode(vlen_bytes_codec.encode(data3))

array([b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [21]:
benchmark_codec(pickle_codec, data3)

Pickle(protocol=5)
encode
231 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
108 ms ± 6.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 20,835,273
size (zstd 1): 1,565,112
size (zstd 5): 1,435,770
size (zstd 9): 1,204,445


In [22]:
benchmark_codec(vlen_bytes_codec, data3)

VLenBytes()
encode
33.7 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
65.4 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,275
size (zstd 1): 1,762,783
size (zstd 5): 1,546,616
size (zstd 9): 1,216,314


## Array benchmarks

In [23]:
np.random.seed(42)
data4 = np.array([np.random.randint(0, 100, size=np.random.randint(0, 20)).astype('i4')
                  for i in range(100000)], dtype=object)
data4

array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)],
      dtype=object)

In [24]:
# Variable-length array codec configured for '<i4' (little-endian 32-bit int)
# items; the element arrays of data4 are created with .astype('i4').
vlen_arr_codec = numcodecs.VLenArray('<i4')

In [25]:
vlen_arr_codec.decode(vlen_arr_codec.encode(data4))

array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)],
      dtype=object)

In [26]:
benchmark_codec(vlen_arr_codec, data4)

VLenArray(dtype='<i4')
encode
24.8 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
53.5 ms ± 842 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 4,195,540
size (zstd 1): 1,299,769
size (zstd 5): 1,119,369
size (zstd 9): 1,196,642


In [27]:
benchmark_codec(pickle_codec, data4)

Pickle(protocol=5)
encode
280 ms ± 13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
130 ms ± 7.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 6,296,822
size (zstd 1): 1,619,421
size (zstd 5): 1,507,086
size (zstd 9): 1,493,343
