AI-Stock-Trader/WebServer/AIPython/python/lib/python3.11/site-packages/fastparquet/speedups.pyx

"""
Native accelerators for Parquet encoding and decoding.
"""
# cython: profile=False
# cython: linetrace=False
# cython: binding=False
# cython: language_level=3
# cython: initializedcheck=False
# cython: boundscheck=False
# cython: wraparound=False
# cython: overflowcheck=False
# cython: initializedcheck=False
# cython: cdivision=True
# cython: always_allow_keywords=False

from libc.string cimport memcpy

from cpython cimport (PyUnicode_AsUTF8String, PyUnicode_DecodeUTF8,
                      PyBytes_CheckExact, PyBytes_FromStringAndSize,
                      PyBytes_GET_SIZE, PyBytes_AS_STRING)
from cpython.unicode cimport PyUnicode_DecodeUTF8

import numpy as np
cimport numpy as np
import cython


# NumPy dtype describing arrays of arbitrary Python objects, built once at
# import time.
# NOTE(review): not referenced by the functions visible in this part of the
# file -- confirm whether other code still relies on it.
_obj_dtype = np.dtype('object')


def array_encode_utf8(inp):
    """
    utf-8 encode all elements of a 1d ndarray of "object" dtype.

    Parameters
    ----------
    inp : ndarray (or array-like)
        Converted with ``np.asarray``. The converted array must be
        one-dimensional with "object" dtype, because it is bound to a typed
        ``np.ndarray[object, ndim=1]`` buffer. Every element is handed to
        ``PyUnicode_AsUTF8String`` and is therefore expected to be a ``str``.

    Returns
    -------
    ndarray
        A new 1d "object" ndarray holding one ``bytes`` object per input
        element. The input array itself is not modified.
    """
    # TODO: combine with pack_byte_array as is done for unpack
    cdef:
        Py_ssize_t i, n
        np.ndarray[object, ndim=1] arr
        np.ndarray[object] result

    # np.asarray copies only when it has to.  The previous spelling,
    # np.array(inp, copy=False), raises ValueError on NumPy >= 2 whenever a
    # copy would be required; np.asarray behaves the same on 1.x and 2.x.
    arr = np.asarray(inp)

    n = arr.shape[0]
    # TODO: why not inplace?
    result = np.empty(n, dtype=object)
    for i in range(n):
        # Fast utf-8 encoding, avoiding method call and codec lookup indirection
        result[i] = PyUnicode_AsUTF8String(arr[i])

    return result


def pack_byte_array(list items):
    """
    Pack a variable length byte array column.

    Every item is emitted as a 4-byte signed length prefix (written in the
    machine's native byte order) immediately followed by the item's bytes.

    Parameters
    ----------
    items : list
        List whose elements are all exact ``bytes`` objects.

    Returns
    -------
    bytes
        The concatenation of all length-prefixed items; ``b""`` for an
        empty list.

    Raises
    ------
    TypeError
        If any element is not an exact ``bytes`` instance.
    OverflowError
        If any element is longer than 2**31 - 1 bytes and so cannot be
        described by the 4-byte length prefix.
    """
    cdef:
        Py_ssize_t i, n, itemlen, total_size
        unsigned char *start
        unsigned char *data
        np.int32_t prefix
        object val, out

    # Strategy: validate everything and compute the total output size first,
    # then allocate the output in one go.
    n = len(items)
    total_size = 0
    for i in range(n):
        val = items[i]
        if not PyBytes_CheckExact(val):
            raise TypeError("expected list of bytes")
        itemlen = PyBytes_GET_SIZE(val)
        # The prefix is only 4 bytes wide; refuse lengths it cannot hold
        # instead of silently truncating them into a corrupt stream.
        if itemlen > 0x7FFFFFFF:
            raise OverflowError("bytes item too large for a 4-byte length prefix")
        total_size += 4 + itemlen

    out = PyBytes_FromStringAndSize(NULL, total_size)
    start = data = <unsigned char *> PyBytes_AS_STRING(out)

    # Copy data to output.
    for i in range(n):
        val = items[i]
        itemlen = PyBytes_GET_SIZE(val)
        # Checked above to fit in 31 bits, so the narrowing cast is lossless.
        prefix = <np.int32_t> itemlen
        # `data` sits at an arbitrary offset, so store the prefix with memcpy
        # rather than through a (possibly misaligned) int pointer.
        memcpy(data, &prefix, 4)
        data += 4
        memcpy(data, PyBytes_AS_STRING(val), itemlen)
        data += itemlen

    assert (data - start) == total_size
    return out


@cython.boundscheck(False)
def unpack_byte_array(const unsigned char[::1] raw_bytes, Py_ssize_t n, const char utf=False):
    """
    Unpack a variable length byte array column.

    The buffer is read as a sequence of items, each a 4-byte signed length
    prefix (native byte order) followed by that many bytes of payload.

    Parameters
    ----------
    raw_bytes : contiguous read-only buffer of unsigned char
        The packed data.
    n : Py_ssize_t
        Size of the output array, and the maximum number of items decoded.
    utf : char, default False
        When true, each item is decoded as UTF-8 with the "ignore" error
        handler and stored as ``str``; otherwise items are stored as
        ``bytes``.

    Returns
    -------
    ndarray
        1d "object" array of length ``n``. Decoding stops after ``n`` items
        or when the buffer is exhausted, whichever comes first; slots that
        were never filled keep the value ``np.empty`` gave them (``None``
        for object arrays).

    Raises
    ------
    ValueError
        If the buffer ends in the middle of a length prefix, or a length
        prefix is negative or larger than the number of bytes remaining.
    """
    cdef:
        Py_ssize_t i = 0
        # Py_ssize_t (not int) so buffers larger than 2 GiB are counted
        # correctly.
        Py_ssize_t bytecount = raw_bytes.shape[0]
        char* ptr
        np.int32_t itemlen
        np.ndarray[object, ndim=1, mode="c"] out = np.empty(n, dtype="object")

    if bytecount == 0:
        # Nothing to decode; avoid taking the address of element 0 of an
        # empty buffer.
        return out

    ptr = <char*>&raw_bytes[0]
    while i < n and bytecount > 0:
        # The input may come from an untrusted file: check every read
        # against the bytes that actually remain before touching memory.
        if bytecount < 4:
            raise ValueError("truncated byte array data: incomplete length prefix")
        # memcpy instead of an int-pointer load: `ptr` may be misaligned.
        memcpy(&itemlen, ptr, 4)
        ptr += 4
        bytecount -= 4
        if itemlen < 0 or itemlen > bytecount:
            raise ValueError("corrupt byte array data: item length out of range")

        if utf:
            out[i] = PyUnicode_DecodeUTF8(ptr, itemlen, "ignore")
        else:
            out[i] = PyBytes_FromStringAndSize(ptr, itemlen)
        ptr += itemlen
        bytecount -= itemlen
        i += 1

    return out