Import python venv for stability

2026-02-15 21:24:16 -08:00
parent 1343e93a59
commit 7d784705c9
4997 changed files with 1628270 additions and 0 deletions
@@ -0,0 +1,117 @@
+"""
+Native accelerators for Parquet encoding and decoding.
+"""
+# cython: profile=False
+# cython: linetrace=False
+# cython: binding=False
+# cython: language_level=3
+# cython: initializedcheck=False
+# cython: boundscheck=False
+# cython: wraparound=False
+# cython: overflowcheck=False
+# cython: initializedcheck=False
+# cython: cdivision=True
+# cython: always_allow_keywords=False
+
+from libc.string cimport memcpy
+
+from cpython cimport (PyUnicode_AsUTF8String, PyUnicode_DecodeUTF8,
+                      PyBytes_CheckExact, PyBytes_FromStringAndSize,
+                      PyBytes_GET_SIZE, PyBytes_AS_STRING)
+from cpython.unicode cimport PyUnicode_DecodeUTF8
+
+import numpy as np
+cimport numpy as np
+import cython
+
+
+# NumPy dtype describing arrays of arbitrary Python objects, built once at
+# import time. None of the functions visible in this section reference it.
+_obj_dtype = np.dtype('object')
+
+
+def array_encode_utf8(inp):
+    """
+    UTF-8 encode every element of a 1-d array of ``str`` objects.
+
+    Parameters
+    ----------
+    inp : array-like
+        One-dimensional input whose elements are ``str``.  It is viewed as
+        an "object" dtype ndarray; an ndarray that already has that dtype is
+        used as-is, without copying.
+
+    Returns
+    -------
+    numpy.ndarray
+        A new 1-d "object" ndarray holding one ``bytes`` object per input
+        element.  The input is not modified.
+
+    Raises
+    ------
+    TypeError
+        If an element is not a ``str``.
+    """
+    # TODO: combine with pack_byte_array as is done for unpack
+    cdef:
+        Py_ssize_t i, n
+        np.ndarray[object, ndim=1] arr
+        np.ndarray[object] result
+
+    # np.asarray copies only when it has to.  np.array(..., copy=False)
+    # raises ValueError under NumPy >= 2.0 whenever a copy is required, so
+    # it is not usable here.  Requesting dtype=object also lets lists and
+    # str-dtype arrays satisfy the typed object buffer declared above.
+    arr = np.asarray(inp, dtype=object)
+
+    n = arr.shape[0]
+    # TODO: why not inplace?
+    result = np.empty(n, dtype=object)
+    for i in range(n):
+        # Fast utf-8 encoding, avoiding method call and codec lookup indirection
+        result[i] = PyUnicode_AsUTF8String(arr[i])
+
+    return result
+
+
+def pack_byte_array(list items):
+    """
+    Pack a variable length byte array column.
+
+    Every item is written as a 4-byte little-endian length prefix followed
+    by the item's raw bytes; the records are concatenated in list order.
+
+    Parameters
+    ----------
+    items : list
+        List whose elements are all exact ``bytes`` objects.
+
+    Returns
+    -------
+    bytes
+        The packed buffer (empty when ``items`` is empty).
+
+    Raises
+    ------
+    TypeError
+        If any element is not exactly of type ``bytes``.
+    ValueError
+        If any element is too long for a signed 32-bit length prefix.
+    """
+    cdef:
+        Py_ssize_t i, n, itemlen, total_size
+        unsigned char *start
+        unsigned char *data
+        object val, out
+
+    # Strategy: compute the total output size and allocate it in one go.
+    n = len(items)
+    total_size = 0
+    for i in range(n):
+        val = items[i]
+        if not PyBytes_CheckExact(val):
+            raise TypeError("expected list of bytes")
+        itemlen = PyBytes_GET_SIZE(val)
+        # The prefix is a signed 32-bit field; refuse to truncate silently.
+        if itemlen > 0x7FFFFFFF:
+            raise ValueError("bytes item too large for 4-byte length prefix")
+        total_size += 4 + itemlen
+
+    out = PyBytes_FromStringAndSize(NULL, total_size)
+    start = data = <unsigned char *> PyBytes_AS_STRING(out)
+
+    # Copy data to output.
+    for i in range(n):
+        val = items[i]
+        itemlen = PyBytes_GET_SIZE(val)
+        # Emit the length byte by byte: this is little-endian on every host
+        # and avoids an unaligned 32-bit store through a cast pointer.
+        data[0] = <unsigned char> (itemlen & 0xFF)
+        data[1] = <unsigned char> ((itemlen >> 8) & 0xFF)
+        data[2] = <unsigned char> ((itemlen >> 16) & 0xFF)
+        data[3] = <unsigned char> ((itemlen >> 24) & 0xFF)
+        data += 4
+        memcpy(data, PyBytes_AS_STRING(val), itemlen)
+        data += itemlen
+
+    assert (data - start) == total_size
+    return out
+
+
+@cython.boundscheck(False)
+def unpack_byte_array(const unsigned char[::1] raw_bytes, Py_ssize_t n, const char utf=False):
+    """
+    Unpack a variable length byte array column.
+
+    The buffer is read as a sequence of records, each made of a 4-byte
+    little-endian length prefix followed by that many payload bytes.
+
+    Parameters
+    ----------
+    raw_bytes : contiguous buffer of unsigned char
+        The packed records.
+    n : Py_ssize_t
+        Maximum number of records to read; also the length of the result.
+    utf : bool-like, default False
+        When true, each payload is decoded as UTF-8 with the "ignore" error
+        handler and stored as ``str``; otherwise it is stored as ``bytes``.
+
+    Returns
+    -------
+    numpy.ndarray
+        1-d "object" array of length ``n``.  If the buffer runs out exactly
+        on a record boundary before ``n`` records were read, the remaining
+        entries are left as ``None``.
+
+    Raises
+    ------
+    ValueError
+        If a length prefix is incomplete, or if it declares more payload
+        bytes than remain in the buffer.
+    """
+    cdef:
+        Py_ssize_t i = 0
+        # Py_ssize_t so that buffers larger than 2 GiB are not truncated.
+        Py_ssize_t remaining = raw_bytes.shape[0]
+        # Wide enough to hold any unsigned 32-bit prefix without overflow.
+        long long itemlen
+        char* ptr = NULL
+        unsigned char* head
+        np.ndarray[object, ndim=1, mode="c"] out = np.empty(n, dtype="object")
+
+    assert out is not None
+    # Only take the element address when the buffer is not empty.
+    if remaining > 0:
+        ptr = <char*> &raw_bytes[0]
+
+    while i < n and remaining > 0:
+        # The prefix comes from the input data: validate before reading so a
+        # corrupt or truncated buffer cannot cause an out-of-bounds read.
+        if remaining < 4:
+            raise ValueError("truncated byte array: incomplete length prefix")
+        head = <unsigned char*> ptr
+        # Assemble the length byte by byte: little-endian on every host and
+        # free of unaligned 32-bit loads.
+        itemlen = ((<long long> head[0])
+                   | ((<long long> head[1]) << 8)
+                   | ((<long long> head[2]) << 16)
+                   | ((<long long> head[3]) << 24))
+        ptr += 4
+        remaining -= 4
+        if itemlen > remaining:
+            raise ValueError("corrupt byte array: item length exceeds buffer")
+        if utf:
+            out[i] = PyUnicode_DecodeUTF8(ptr, <Py_ssize_t> itemlen, "ignore")
+        else:
+            out[i] = PyBytes_FromStringAndSize(ptr, <Py_ssize_t> itemlen)
+        ptr += itemlen
+        remaining -= <Py_ssize_t> itemlen
+        i += 1
+
+    return out