File: 0009-Fix-vlen-enc.patch

package info (click to toggle)
numcodecs 0.16.0%2Bds-1
links: PTS, VCS
area: main
in suites: trixie
size: 1,004 kB
sloc: python: 4,167; makefile: 41
file content (387 lines) | stat: -rw-r--r-- 13,718 bytes
From: Antonio Valentino <antonio.valentino@tiscali.it>
Date: Thu, 10 Apr 2025 05:42:56 +0000
Subject: Fix vlen encoding

Forwarded: https://github.com/zarr-developers/numcodecs/pull/736
---
 numcodecs/compat_ext.pxd           |   2 +
 numcodecs/fletcher32.pyx           |   4 +-
 numcodecs/tests/test_vlen_bytes.py |   4 --
 numcodecs/vlen.pyx                 | 140 ++++++++++++++++---------------------
 4 files changed, 66 insertions(+), 84 deletions(-)

diff --git a/numcodecs/compat_ext.pxd b/numcodecs/compat_ext.pxd
index 436c23f..129c1d8 100644
--- a/numcodecs/compat_ext.pxd
+++ b/numcodecs/compat_ext.pxd
@@ -3,7 +3,9 @@
 
 cdef extern from *:
     """
+    #ifndef PyBytes_RESIZE
     #define PyBytes_RESIZE(b, n) _PyBytes_Resize(&b, n)
+    #endif
     """
     int PyBytes_RESIZE(object b, Py_ssize_t n) except -1
 
diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx
index 387295b..56a5f79 100644
--- a/numcodecs/fletcher32.pyx
+++ b/numcodecs/fletcher32.pyx
@@ -76,7 +76,7 @@ class Fletcher32(Codec):
         """Return buffer plus a footer with the fletcher checksum (4-bytes)"""
         buf = ensure_contiguous_ndarray(buf).ravel().view('uint8')
         cdef const uint8_t[::1] b_mv = buf
-        cdef uint8_t* b_ptr = &b_mv[0]
+        cdef const uint8_t* b_ptr = &b_mv[0]
         cdef Py_ssize_t b_len = len(b_mv)
 
         cdef Py_ssize_t out_len = b_len + FOOTER_LENGTH
@@ -92,7 +92,7 @@ class Fletcher32(Codec):
         """Check fletcher checksum, and return buffer without it"""
         b = ensure_contiguous_ndarray(buf).view('uint8')
         cdef const uint8_t[::1] b_mv = b
-        cdef uint8_t* b_ptr = &b_mv[0]
+        cdef const uint8_t* b_ptr = &b_mv[0]
         cdef Py_ssize_t b_len = len(b_mv)
 
         val = _fletcher32(b_mv[:-FOOTER_LENGTH])
diff --git a/numcodecs/tests/test_vlen_bytes.py b/numcodecs/tests/test_vlen_bytes.py
index 467c9a8..3546dba 100644
--- a/numcodecs/tests/test_vlen_bytes.py
+++ b/numcodecs/tests/test_vlen_bytes.py
@@ -1,4 +1,3 @@
-import sys
 import unittest
 
 import numpy as np
@@ -85,9 +84,6 @@ def test_decode_errors():
         codec.decode(enc, out=np.zeros(10, dtype='i4'))
 
 
-# TODO: fix this test on GitHub actions somehow...
-# See https://github.com/zarr-developers/numcodecs/issues/683
-@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.")
 def test_encode_none():
     a = np.array([b'foo', None, b'bar'], dtype=object)
     codec = VLenBytes()
diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index f564a66..6c04260 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -7,26 +7,13 @@
 
 cimport cython
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport uint8_t, uint32_t
 from libc.string cimport memcpy
 
-from cpython.buffer cimport PyBuffer_IsContiguous
-from cpython.bytearray cimport (
-    PyByteArray_AS_STRING,
-    PyByteArray_FromStringAndSize,
-)
-from cpython.bytes cimport (
-    PyBytes_AS_STRING,
-    PyBytes_GET_SIZE,
-    PyBytes_Check,
-    PyBytes_FromStringAndSize,
-)
+from cpython.bytearray cimport PyByteArray_FromStringAndSize
+from cpython.bytes cimport PyBytes_FromStringAndSize
 from cpython.memoryview cimport PyMemoryView_GET_BUFFER
-from cpython.unicode cimport (
-    PyUnicode_AsUTF8String,
-    PyUnicode_Check,
-    PyUnicode_FromStringAndSize,
-)
+from cpython.unicode cimport PyUnicode_FromStringAndSize
 
 from numpy cimport ndarray
 
@@ -39,8 +26,12 @@ from .abc import Codec
 from .compat import ensure_contiguous_ndarray
 
 
-# 4 bytes to store number of items
-cdef Py_ssize_t HEADER_LENGTH = 4
+# Size in bytes of each 32-bit length field: the leading item count and every per-item length prefix.
+cdef extern from *:
+    """
+    const Py_ssize_t HEADER_LENGTH = sizeof(uint32_t);
+    """
+    const Py_ssize_t HEADER_LENGTH
 
 
 def check_out_param(out, n_items):
@@ -89,11 +80,11 @@ class VLenUTF8(Codec):
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
-            char* encv
             bytes b
             bytearray out
             char* data
-            object u
+            object o
+            unicode u
 
         # normalise input
         input_values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -108,15 +99,13 @@ class VLenUTF8(Codec):
         # first iteration to convert to bytes
         data_length = 0
         for i in range(n_items):
-            u = input_values[i]
-            if u is None or u == 0:  # treat these as missing value, normalize
-                u = ''
-            elif not PyUnicode_Check(u):
-                raise TypeError('expected unicode string, found %r' % u)
-            b = PyUnicode_AsUTF8String(u)
-            l = PyBytes_GET_SIZE(b)
+            o = input_values[i]
+            # replace missing value and coerce to typed data
+            u = "" if o is None or o == 0 else o
+            b = u.encode("utf-8")
+            l = len(b)
             encoded_values[i] = b
-            data_length += l + 4  # 4 bytes to store item length
+            data_length += l + HEADER_LENGTH
             encoded_lengths[i] = l
 
         # setup output
@@ -124,7 +113,7 @@ class VLenUTF8(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
@@ -132,9 +121,9 @@ class VLenUTF8(Codec):
         for i in range(n_items):
             l = encoded_lengths[i]
             store_le32(<uint8_t*>data, l)
-            data += 4
-            encv = PyBytes_AS_STRING(encoded_values[i])
-            memcpy(data, encv, l)
+            data += HEADER_LENGTH
+            b = encoded_values[i]
+            memcpy(data, <const char*>b, l)
             data += l
 
         return out
@@ -151,12 +140,10 @@ class VLenUTF8(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -178,10 +165,10 @@ class VLenUTF8(Codec):
         # https://github.com/cython/cython/issues/1608
         data += HEADER_LENGTH
         for i in range(n_items):
-            if data + 4 > data_end:
+            if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
             l = load_le32(<uint8_t*>data)
-            data += 4
+            data += HEADER_LENGTH
             if data + l > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
             out[i] = PyUnicode_FromStringAndSize(data, l)
@@ -223,9 +210,10 @@ class VLenBytes(Codec):
         cdef:
             Py_ssize_t i, l, n_items, data_length, total_length
             object[:] values
+            object[:] normed_values
             int[:] lengths
-            char* encv
-            object b
+            object o
+            bytes b
             bytearray out
             char* data
 
@@ -236,18 +224,18 @@ class VLenBytes(Codec):
         n_items = values.shape[0]
 
         # setup intermediates
+        normed_values = np.empty(n_items, dtype=object)
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to find lengths
         data_length = 0
         for i in range(n_items):
-            b = values[i]
-            if b is None or b == 0:  # treat these as missing value, normalize
-                b = b''
-            elif not PyBytes_Check(b):
-                raise TypeError('expected byte string, found %r' % b)
-            l = PyBytes_GET_SIZE(b)
-            data_length += l + 4  # 4 bytes to store item length
+            o = values[i]
+            # replace missing value and coerce to typed data
+            b = b"" if o is None or o == 0 else o
+            normed_values[i] = b
+            l = len(b)
+            data_length += l + HEADER_LENGTH
             lengths[i] = l
 
         # setup output
@@ -255,7 +243,7 @@ class VLenBytes(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
@@ -263,9 +251,9 @@ class VLenBytes(Codec):
         for i in range(n_items):
             l = lengths[i]
             store_le32(<uint8_t*>data, l)
-            data += 4
-            encv = PyBytes_AS_STRING(values[i])
-            memcpy(data, encv, l)
+            data += HEADER_LENGTH
+            b = normed_values[i]
+            memcpy(data, <const char*>b, l)
             data += l
 
         return out
@@ -282,12 +270,10 @@ class VLenBytes(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -309,10 +295,10 @@ class VLenBytes(Codec):
         # https://github.com/cython/cython/issues/1608
         data += HEADER_LENGTH
         for i in range(n_items):
-            if data + 4 > data_end:
+            if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
             l = load_le32(<uint8_t*>data)
-            data += 4
+            data += HEADER_LENGTH
             if data + l > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
             out[i] = PyBytes_FromStringAndSize(data, l)
@@ -369,13 +355,12 @@ class VLenArray(Codec):
             object[:] values
             object[:] normed_values
             int[:] lengths
-            const char* encv
             bytes b
             bytearray out
             char* data
             memoryview value_mv
             const Py_buffer* value_pb
-            object v
+            object o
 
         # normalise input
         values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -390,16 +375,18 @@ class VLenArray(Codec):
         # first iteration to convert to bytes
         data_length = 0
         for i in range(n_items):
-            v = values[i]
-            if v is None:
-                v = np.array([], dtype=self.dtype)
-            else:
-                v = np.ascontiguousarray(v, self.dtype)
-            if v.ndim != 1:
-                raise ValueError('only 1-dimensional arrays are supported')
-            l = v.nbytes
-            normed_values[i] = v
-            data_length += l + 4  # 4 bytes to store item length
+            o = values[i]
+            # replace missing value and coerce to typed data
+            value_mv = ensure_continguous_memoryview(
+                np.array([], dtype=self.dtype) if o is None
+                else np.ascontiguousarray(o, self.dtype)
+            )
+            value_pb = PyMemoryView_GET_BUFFER(value_mv)
+            if value_pb.ndim != 1:
+                raise ValueError("only 1-dimensional arrays are supported")
+            l = value_pb.len
+            normed_values[i] = value_mv
+            data_length += l + HEADER_LENGTH
             lengths[i] = l
 
         # setup output
@@ -407,7 +394,7 @@ class VLenArray(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
@@ -415,13 +402,12 @@ class VLenArray(Codec):
         for i in range(n_items):
             l = lengths[i]
             store_le32(<uint8_t*>data, l)
-            data += 4
+            data += HEADER_LENGTH
 
-            value_mv = ensure_continguous_memoryview(normed_values[i])
+            value_mv = normed_values[i]
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
-            encv = <const char*>value_pb.buf
 
-            memcpy(data, encv, l)
+            memcpy(data, value_pb.buf, l)
             data += l
 
         return out
@@ -441,12 +427,10 @@ class VLenArray(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -468,10 +452,10 @@ class VLenArray(Codec):
         # https://github.com/cython/cython/issues/1608
         data += HEADER_LENGTH
         for i in range(n_items):
-            if data + 4 > data_end:
+            if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
             l = load_le32(<uint8_t*>data)
-            data += 4
+            data += HEADER_LENGTH
             if data + l > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')