AI-Stock-Trader/WebServer/AIPython/python/lib/python3.11/site-packages/fastparquet/cencoding.pyx

# https://cython.readthedocs.io/en/latest/src/userguide/
#   source_files_and_compilation.html#compiler-directives
# cython: profile=False
# cython: linetrace=False
# cython: binding=False
# cython: language_level=3
# cython: initializedcheck=False
# cython: boundscheck=False
# cython: wraparound=False
# cython: overflowcheck=False
# cython: cdivision=True
# cython: always_allow_keywords=False

import cython
import numpy as np
from cpython cimport (
    PyBytes_FromStringAndSize, PyBytes_GET_SIZE, PyUnicode_DecodeUTF8,
)
from libc.string cimport memcpy
from libc.stdint cimport int8_t, uint8_t, uint32_t, int32_t, uint64_t, int64_t


cpdef void read_rle(NumpyIO file_obj, int32_t header, int32_t bit_width, NumpyIO o, int32_t itemsize=4) noexcept:
    """Decode one run-length-encoded run from ``file_obj`` into ``o``.

    The repeat count is ``header >> 1``.  The repeated value is assembled
    little-endian from the next ``(bit_width + 7) // 8`` input bytes and is
    stored ``count`` times in the output: as an int32 when ``itemsize`` is 4,
    otherwise as a single byte.  The count is clipped to the number of items
    that still fit in ``o``.  Both stream positions are advanced by the number
    of bytes consumed and produced respectively.
    """
    cdef:
        uint32_t repeats, value_bytes, k, room
        int32_t value = 0
        char * src = file_obj.get_pointer()
        char * dst = o.get_pointer()
    repeats = header >> 1
    value_bytes = (bit_width + 7) // 8
    # build the repeated value from its little-endian bytes
    for k in range(value_bytes):
        value |= (src[k] & 0xff) << (k * 8)
    # never emit more items than the output buffer can still hold
    room = (o.nbytes - o.loc) // itemsize
    if repeats > room:
        repeats = room
    if itemsize == 4:
        for k in range(repeats):
            (<int32_t*>dst)[k] = value
        o.loc += repeats * 4
    else:
        for k in range(repeats):
            dst[k] = value & 0xff
        o.loc += repeats
    file_obj.loc += value_bytes


cpdef int32_t width_from_max_int(int64_t value) noexcept:
    """Return the number of right-shifts needed to reduce ``value`` to zero.

    For a non-negative ``value`` this is the bit width required to store it
    (0 for 0, 1 for 1, 8 for 255, ...).  At most 64 shifts are tried.
    """
    cdef int32_t bits = 0
    while bits < 64:
        if value == 0:
            return bits
        value >>= 1
        bits += 1


cdef int32_t _mask_for_bits(int32_t i) noexcept:
    """Generate a mask to grab `i` bits from an int value.

    The result has the `i` least-significant bits set, e.g. ``i == 3`` gives
    ``0b111``.  The computation is done in (and returned as) a 32-bit signed
    integer, so it is only meaningful for widths that fit in that type.
    """
    return (1 << i) - 1


cpdef void read_bitpacked1(NumpyIO file_obj, int32_t count, NumpyIO o) noexcept:
    """Unpack `count` 1-bit values from `file_obj` into `o`, one output byte per bit.

    Bits are taken from each input byte least-significant bit first.  The
    number of values written is clipped to the space left in `o`, but the
    input position is always advanced by the bytes that the requested
    (unclipped) `count` occupies.
    """
    # implementation of np.unpackbits with output array. Output is int8 array
    cdef:
        char * inptr = file_obj.get_pointer()
        char * outptr = o.get_pointer()
        char * endptr  # declared but not used in this function
        unsigned char data
        int32_t counter, i, startcount=count
    # do not write past the end of the output buffer
    if count > o.nbytes - o.loc:
        count = o.nbytes - o.loc
    for counter in range(count // 8):
        # whole bytes
        data = inptr[0]
        inptr += 1
        for i in range(8):
            # emit the lowest bit, then shift the next one into place
            outptr[0] = data & 1
            outptr += 1
            data >>= 1
    if count % 8:
        # remaining values in the last byte
        data = <int32_t>inptr[0]
        inptr += 1
        for i in range(count % 8):
            outptr[0] = data & 1
            outptr += 1
            data >>= 1
    # input advances by the bytes needed for the originally requested count ...
    file_obj.loc += (startcount + 7) // 8
    # ... while output advances by the (possibly clipped) number of values written
    o.loc += count


cpdef void write_bitpacked1(NumpyIO file_obj, int32_t count, NumpyIO o) noexcept:
    """Pack `count` one-byte values read from `file_obj` into bits written to `o`.

    Whole groups of eight input bytes produce one output byte each; a final
    partial group (``count % 8`` values) produces one more byte.  No bounds
    checks are made on either buffer.
    """
    # implementation of np.packbits with output array. Input is int8 array
    cdef char * inptr
    cdef char * outptr
    cdef char data = 0
    cdef int32_t counter, i
    cdef int64_t indata
    outptr = o.get_pointer()
    inptr = file_obj.get_pointer()
    for counter in range(count // 8):
        # fetch a long in one op, instead of byte by byte
        indata = (<int64_t*>inptr)[0]
        inptr += 8
        for i in range(8):
            # take the low bit of the current input byte and shift it into
            # `data` from the right, so earlier values end up in higher bits;
            # `data` is 8 bits wide, so bits from a previous group are
            # shifted out after eight iterations
            data = data << 1 | (indata & 1)
            indata >>= 8
        outptr[0] = data
        outptr += 1
    if count % 8:
        # leftover partial byte
        data = 0
        for i in range(count % 8):
            # any non-zero input byte counts as a set bit here; the packed
            # bits occupy the low `count % 8` bits of the output byte
            data = data << 1 | (inptr[0] != 0)
            inptr += 1
        outptr[0] = data
        outptr += 1
    # NOTE(review): the loops above consume `count` input bytes in total, but
    # the input position is advanced by `count * 4` — confirm against callers
    # whether the input is really expected to be 4-byte items.
    # NOTE(review): bit order here (first value in the highest bit) differs
    # from read_bitpacked1 (lowest bit first) — confirm this is intended.
    file_obj.loc += count * 4
    o.loc += (count + 7) // 8


cpdef void read_bitpacked(NumpyIO file_obj, int32_t header, int32_t width, NumpyIO o, int32_t itemsize=4) noexcept:
    """
    Read values packed into width-bits each (which can be >8)

    The number of values is ``(header >> 1) * 8``.  Each value is written to
    `o` as an int32 when `itemsize` is 4, otherwise as a single byte.  Values
    that would not fit in the remaining space of `o` are decoded but not
    stored.  The special case ``width == 1 and itemsize == 1`` is delegated
    to `read_bitpacked1`.
    """
    cdef:
        uint32_t count, mask, data, vals_left
        # `left` is the number of bits currently loaded into `data`,
        # `right` is the number of those bits already consumed
        unsigned char left = 8, right = 0
        char * inptr = file_obj.get_pointer()
        char * outptr = o.get_pointer()
        char * endptr

    count = (header >> 1) * 8
    # TODO: special case for width=1, 2, 4, 8
    if width == 1 and itemsize == 1:
        read_bitpacked1(file_obj, count, o)
        return
    # last address at which a full item can still be written
    endptr = (o.nbytes - o.loc) + outptr - itemsize
    mask = _mask_for_bits(width)
    # prime the bit buffer with the first input byte
    data = 0xff & <int32_t>inptr[0]
    inptr += 1
    while count:
        if right > 8:
            # more than a whole byte has been consumed: drop it from the buffer
            data >>= 8
            left -= 8
            right -= 8
        elif left - right < width:
            # not enough unread bits for one value: append another input byte
            data |= (inptr[0] & 0xff) << left
            inptr += 1
            left += 8
        else:
            # enough bits available: extract one value
            if outptr <= endptr:
                if itemsize == 4:
                    (<int32_t*>outptr)[0] = <int32_t>(data >> right & mask)
                    outptr += 4
                else:
                    outptr[0] = data >> right & mask
                    outptr += 1
            # the value counts as consumed even if there was no room to store it
            count -= 1
            right += width
    # advance both positions by the number of bytes actually written / read
    o.loc = o.loc + outptr - o.get_pointer()
    file_obj.loc += inptr - file_obj.get_pointer()


cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj) noexcept:
    """Decode one unsigned variable-length integer from ``file_obj``.

    Each input byte contributes its low 7 bits, least-significant group
    first; a byte whose high bit is clear terminates the number.  The stream
    position is advanced past every byte consumed.
    """
    cdef uint64_t value = 0
    cdef int32_t nbits = 0
    cdef uint32_t consumed = 0
    cdef char current
    cdef char * src = file_obj.get_pointer()

    while True:
        current = src[consumed]
        consumed += 1
        value |= (<int64_t>(current & 0x7F) << nbits)
        nbits += 7
        if not (current & 0x80):
            # continuation bit clear: this was the final byte
            break
    file_obj.loc += consumed
    return value


cpdef void read_rle_bit_packed_hybrid(NumpyIO io_obj, int32_t width, uint32_t length, NumpyIO o,
                                      int32_t itemsize=4) noexcept:
    """Decode a sequence of RLE / bit-packed runs from `io_obj` into `o`.

    When the `length is False` test below succeeds, a 32-bit int is first
    read from `io_obj` and used as the byte length of the encoded data.

    Runs are decoded until either `length` input bytes have been consumed or
    the output buffer `o` is full.  Each run starts with an unsigned varint
    header whose lowest bit selects the kind of run: 0 for a run-length run
    (`read_rle`), 1 for a bit-packed run (`read_bitpacked`).

    io_obj is a NumpyIO of bytes; o is an output NumpyIO of int32 or int8/bool

    The caller can tell the number of elements in the output by looking
    at .tell().
    """
    cdef int32_t origin, run_header
    if length is False:
        length = <uint32_t>io_obj.read_int()
    origin = io_obj.loc
    while io_obj.loc - origin < length and o.loc < o.nbytes:
        run_header = <int32_t>read_unsigned_var_int(io_obj)
        if run_header & 1:
            read_bitpacked(io_obj, run_header, width, o, itemsize)
        else:
            read_rle(io_obj, run_header, width, o, itemsize)


cdef void delta_read_bitpacked(NumpyIO file_obj, uint8_t bitwidth,
                               NumpyIO o, uint64_t count, uint8_t longval=0) noexcept:
    """Unpack `count` values of `bitwidth` bits each from `file_obj` into `o`.

    Values are written with `o.write_long` when `longval` is set, otherwise
    with `o.write_int`.  Input is consumed one byte at a time through
    `file_obj.read_byte()`.
    """
    cdef:
        uint64_t data = 0
        # `left`: bits currently loaded into `data`; `right`: bits already consumed
        int8_t left = 0
        int8_t right = 0
        # keeps the low `bitwidth` bits; callers in this file only pass a
        # non-zero bitwidth
        uint64_t mask = 0XFFFFFFFFFFFFFFFF >> (64 - bitwidth)
    while count > 0:
        if (left - right) < bitwidth:
            # not enough unread bits buffered for one value: load another byte
            data = data | (<uint64_t>file_obj.read_byte() << left)
            left += 8
        elif right > 8:
            # more than one whole byte consumed: drop it from the buffer
            data >>= 8
            left -= 8
            right -= 8
        else:
            # emit one value
            if longval:
                o.write_long((data >> right) & mask)
            else:
                o.write_int((data >> right) & mask)
            right += bitwidth
            count -= 1


cpdef int delta_binary_unpack(NumpyIO file_obj, NumpyIO o, uint8_t longval=0):
    """Decode a delta-encoded integer stream from `file_obj` into `o`.

    The stream starts with four unsigned varints: block size, number of
    miniblocks per block, total value count, and the zigzag-encoded first
    value.  Each block then carries a zigzag-encoded minimum delta, one
    bit-width byte per miniblock, and the bit-packed deltas.  Output values
    are 64-bit when `longval` is set, otherwise 32-bit.  Returns 0 once
    `count` values have been produced.

    (The layout appears to match Parquet's DELTA_BINARY_PACKED encoding —
    confirm against the format specification.)
    """
    cdef:
        uint64_t block_size = read_unsigned_var_int(file_obj)
        uint64_t miniblock_per_block = read_unsigned_var_int(file_obj)
        int64_t count = read_unsigned_var_int(file_obj)
        int64_t value = zigzag_long(read_unsigned_var_int(file_obj))
        int64_t block, min_delta, i, j, values_per_miniblock, temp
        const uint8_t[:] bitwidths
        uint8_t bitwidth
    # NOTE(review): no guard against miniblock_per_block == 0 here
    values_per_miniblock = block_size // miniblock_per_block
    while True:
        # one block per iteration
        min_delta = zigzag_long(read_unsigned_var_int(file_obj))
        bitwidths = file_obj.read(miniblock_per_block)
        for i in range(miniblock_per_block):
            bitwidth = bitwidths[i]
            if bitwidth:
                # remember where this miniblock starts in the output
                temp = o.loc
                if count > 1:
                    # no more diffs if on last value
                    delta_read_bitpacked(file_obj, bitwidth, o, values_per_miniblock, longval)
                # rewind: the raw deltas just written are replaced in place
                # by the reconstructed values
                o.loc = temp
                for j in range(values_per_miniblock):
                    if longval:
                        # fetch the stored delta, step back, overwrite with the value
                        temp = o.read_long()
                        o.loc -= 8
                        o.write_long(value)
                    else:
                        temp = o.read_int()
                        o.loc -= 4
                        o.write_int(value)
                    value += min_delta + temp
                    count -= 1
                    if count <= 0:
                        return 0
            else:
                # zero bit width: every delta in this miniblock equals min_delta
                for j in range(values_per_miniblock):
                    if longval:
                        o.write_long(value)
                    else:
                        o.write_int(value)
                    value += min_delta
                    count -= 1
                    if count <= 0:
                        return 0


cpdef void encode_unsigned_varint(uint64_t x, NumpyIO o) noexcept:  # pragma: no cover
    """Write ``x`` to ``o`` as an unsigned variable-length integer.

    Seven bits are emitted per byte, least-significant group first; every
    byte except the last has its high bit set.
    """
    while x >= 0x80:
        o.write_byte(0x80 | (x & 0x7F))
        x = x >> 7
    o.write_byte(x)


cpdef void encode_bitpacked(int32_t[:] values, int32_t width, NumpyIO o) noexcept:
    """
    Write a bit-packed run holding `values`, using `width` bits per value.

    A varint run header is written first: the number of groups of eight
    values (rounded up), shifted left by one with the lowest bit set.  The
    values follow, packed least-significant bits first through a 32-bit
    accumulator; a trailing partially filled byte is written as-is.
    """
    cdef int32_t n_groups = (values.shape[0] + 7) // 8
    cdef int32_t pending_bits = 0, accumulator = 0, item, idx
    encode_unsigned_varint(n_groups << 1 | 1, o)  # write run header
    for idx in range(values.shape[0]):
        item = values[idx]
        accumulator |= item << pending_bits
        pending_bits += width
        # flush every completed byte
        while pending_bits >= 8:
            o.write_byte(accumulator & 0xff)
            accumulator >>= 8
            pending_bits -= 8
    if pending_bits:
        # bits left over that did not fill a whole byte
        o.write_byte(accumulator)


cpdef void encode_rle_bp(int32_t[:] data, int32_t width, NumpyIO o, int32_t withlength = 0) noexcept:
    """Write `data` to `o` as a single bit-packed run.

    When `withlength` is non-zero, four bytes are reserved before the run and
    afterwards filled with the byte length of the encoded run.
    """
    cdef uint32_t length_pos, finish
    if not withlength:
        encode_bitpacked(data, width, o)
        return
    # reserve room for the 32-bit length prefix
    length_pos = o.tell()
    o.seek(4, 1)
    encode_bitpacked(data, width, o)
    finish = o.tell()
    # go back, store the payload length, then return to the end of the run
    o.seek(length_pos)
    o.write_int(finish - length_pos - 4)
    o.seek(finish)


@cython.freelist(100)
@cython.final
cdef class NumpyIO:
    """
    Read or write from a numpy array like a file object

    The main purpose is to keep track of the current location in the memory.
    The wrapped buffer must be a contiguous array of uint8; the object keeps
    both a memoryview of it (for slicing) and a raw pointer (for fast typed
    access).  Fixed-width reads and writes that would not fit in the
    remaining space are silently skipped (reads return 0).
    """
    # memoryview over the wrapped buffer
    cdef const uint8_t[:] data
    # current offset into the buffer, and total size of the buffer in bytes
    cdef uint32_t loc, nbytes
    # raw pointer to the first byte of the buffer
    cdef char* ptr
    # declared but never assigned within this class
    cdef char writable

    def __cinit__(self, const uint8_t[::1] data):
        self.data = data
        self.loc = 0
        self.ptr = <char*>&data[0]
        self.nbytes = data.shape[0]

    cdef char* get_pointer(self) noexcept:
        """Return a raw pointer to the byte at the current location."""
        return self.ptr + self.loc

    @property
    def len(self):
        """Total size of the wrapped buffer in bytes."""
        return self.nbytes

    cpdef const uint8_t[:] read(self, int32_t x=-1):
        """Return a view of the next `x` bytes and advance past them.

        If `x` is less than 1, everything up to the end of the buffer is
        returned.
        """
        cdef const uint8_t[:] out
        if x < 1:
            x = self.nbytes - self.loc
        out = self.data[self.loc:self.loc + x]
        self.loc += x
        return out

    cpdef uint8_t read_byte(self) noexcept:
        """Return the byte at the current location and advance by one (no bounds check)."""
        cdef char out
        out = self.ptr[self.loc]
        self.loc += 1
        return out

    cpdef int32_t read_int(self) noexcept:
        """Read a native 32-bit int; returns 0 without advancing if fewer than 4 bytes remain."""
        cdef int32_t i
        if self.nbytes - self.loc < 4:
            return 0
        i = (<int32_t*> self.get_pointer())[0]
        self.loc += 4
        return i

    cpdef void write(self, const char[::1] d) noexcept:
        """Copy all bytes of `d` to the current location and advance past them.

        No bounds check is made: the caller must ensure `d` fits in the
        remaining space.
        """
        # The destination must be the *address* of the current byte; casting
        # the byte value itself to a pointer would make memcpy write to an
        # arbitrary location.
        memcpy(<void*>(self.ptr + self.loc), <void*>&d[0], d.shape[0])
        self.loc += d.shape[0]

    cpdef void write_byte(self, uint8_t b) noexcept:
        """Write one byte at the current location, if there is room."""
        if self.loc >= self.nbytes:
            # ignore attempt to write past end of buffer
            return
        self.ptr[self.loc] = b
        self.loc += 1

    cpdef void write_int(self, int32_t i) noexcept:
        """Write a native 32-bit int; ignored if fewer than 4 bytes remain."""
        if self.nbytes - self.loc < 4:
            return
        (<int32_t*> self.get_pointer())[0] = i
        self.loc += 4

    cdef void write_long(self, int64_t i) noexcept:
        """Write a native 64-bit int; ignored if fewer than 8 bytes remain."""
        if self.nbytes - self.loc < 8:
            return
        (<int64_t*> self.get_pointer())[0] = i
        self.loc += 8

    cdef int64_t read_long(self) noexcept:
        """Read a native 64-bit int; returns 0 without advancing if fewer than 8 bytes remain."""
        cdef int64_t i
        if self.nbytes - self.loc < 8:
            return 0
        i = (<int64_t*> self.get_pointer())[0]
        self.loc += 8
        return i

    cdef void write_many(self, char b, int32_t count) noexcept:
        """Write the byte `b` `count` times (each write is bounds-checked by write_byte)."""
        cdef int32_t i
        for i in range(count):
            self.write_byte(b)

    cpdef int32_t tell(self) noexcept:
        """Return the current location."""
        return self.loc

    cpdef uint32_t seek(self, int32_t loc, int32_t whence=0) noexcept:
        """Move the current location and return it.

        `whence` 0 is absolute, 1 is relative to the current location and 2 is
        relative to the end of the buffer.  The result is clamped to the
        buffer size.
        """
        if whence == 0:
            self.loc = loc
        elif whence == 1:
            self.loc += loc
        elif whence == 2:
            self.loc = self.nbytes + loc
        if self.loc > self.nbytes:
            self.loc = self.nbytes
        return self.loc

    @cython.wraparound(False)
    cpdef const uint8_t[:] so_far(self) noexcept:
        """ In write mode, the data we have gathered until now
        """
        return self.data[:self.loc]


def _assemble_objects(object[:] assign, const uint8_t[:] defi, const uint8_t[:] rep,
                      val, dic, d,
                      char null, null_val, int32_t max_defi, int32_t prev_i):
    """Dremel-assembly of arrays of values into lists

    Parameters
    ----------
    assign: array dtype O
        To insert lists into
    defi: int array
        Definition levels, max 3
    rep: int array
        Repetition levels, max 1
    val: array
        Values to distribute into the lists; indexed sequentially
    dic: array of labels or None
        Applied if d is True
    d: bool
        Whether to dereference dict values
    null: bool
        Can an entry be None?
    null_val: bool
        can list elements be None
    max_defi: int
        value of definition level that corresponds to non-null
    prev_i: int
        1 + index where the last row in the previous page was inserted (0 if first page)

    Returns
    -------
    int
        Index into ``assign`` of the last row written by this call, or
        ``prev_i`` unchanged if this call only extended the previous row.
    """
    cdef int32_t counter, i, re, de
    # index of the next unused entry of `val`
    cdef int32_t vali = 0
    cdef char started = False, have_null = False
    if d:
        # dereference dict values
        val = dic[val]
    i = prev_i
    # items collected for the row currently being built
    part = []
    for counter in range(rep.shape[0]):
        # without definition levels every entry is treated as defined
        de = defi[counter] if defi is not None else max_defi
        re = rep[counter]
        if not re:
            # new row - save what we have
            if started:
                assign[i] = None if have_null else part
                part = []
                i += 1
            else:
                # first time: no row to save yet, unless it's a row continued from previous page
                if vali > 0:
                    assign[i - 1].extend(part) # add the items to previous row
                    part = []
                    # don't increment i since we only filled i-1
                started = True
        if de == max_defi:
            # append real value to current item
            part.append(val[vali])
            vali += 1
        elif de > null:
            # append null to current item
            part.append(None)
        # next object is None as opposed to an object
        have_null = de == 0 and null
    if started: # normal case - add the leftovers to the next row
        assign[i] = None if have_null else part
    else: # can only happen if the only elements in this page are the continuation of the last row from previous page
        assign[i - 1].extend(part)
    return i


# Smallest representable int64 value; time_shift leaves entries equal to this
# untouched (presumably the integer representation of numpy's NaT — confirm).
cdef int64_t nat = -9223372036854775808


cpdef void time_shift(const int64_t[::1] data, int32_t factor=1000) noexcept:
    """Multiply every element of `data` by `factor` in place.

    Elements equal to the module-level `nat` sentinel are left unchanged.
    The buffer is modified through a raw pointer even though the memoryview
    is declared const.
    """
    cdef int32_t idx
    cdef int64_t * base = <int64_t*>&data[0]
    for idx in range(data.shape[0]):
        if base[idx] == nat:
            continue
        base[idx] *= factor


cdef int32_t zigzag_int(uint64_t n) noexcept:
    """Undo zigzag encoding of `n`, returning the result as a 32-bit int."""
    # all ones when the lowest bit of n is set, zero otherwise
    cdef uint64_t sign_mask = -(n & 1)
    cdef uint64_t magnitude = n >> 1
    return magnitude ^ sign_mask


cdef int64_t zigzag_long(uint64_t n) noexcept:
    """Undo zigzag encoding of `n`, returning the result as a 64-bit int."""
    # all ones when the lowest bit of n is set, zero otherwise
    cdef uint64_t sign_mask = -(n & 1)
    cdef uint64_t magnitude = n >> 1
    return magnitude ^ sign_mask


cdef uint64_t long_zigzag(int64_t n) noexcept:
    """Zigzag-encode the signed 64-bit `n` into an unsigned value."""
    cdef int64_t doubled = n << 1
    # n >> 63 on a signed 64-bit value spreads the sign bit
    cdef int64_t sign_fill = n >> 63
    return doubled ^ sign_fill


cpdef dict read_thrift(NumpyIO data):
    """Parse one thrift-encoded struct from `data` into a dict.

    Each field starts with a header byte: the high nibble is the field-id
    delta relative to the previous field, the low nibble is the type code.
    A zero byte ends the struct.  (The layout appears to follow the Thrift
    compact protocol.)

    Returns a dict mapping integer field id to the decoded value (bool, int,
    float, bytes, list or nested dict).  Two extra string keys record integer
    widths so the struct can be written back the same way: ``"i32"`` is set
    when 32-bit ints were seen and no 64-bit ones, ``"i32list"`` lists the
    ids of the 32-bit fields when both widths occur.
    """
    cdef char byte, id = 0, bit
    cdef int32_t size
    cdef dict out = {}
    cdef bint hasi64 = 0
    cdef bint hasi32 = 0
    cdef list i32 = None
    while True:
        byte = data.read_byte()
        if byte == 0:
            # stop marker: end of this struct
            break
        id += (byte & 0b11110000) >> 4
        bit = byte & 0b00001111
        if bit == 5:
            # I32
            out[id] = zigzag_long(read_unsigned_var_int(data))
            hasi32 = True
            if i32 is None:
                i32 = list()
            i32.append(id)
        elif bit == 6:
            # I64
            out[id] = zigzag_long(read_unsigned_var_int(data))
            hasi64 = True
        elif bit == 7:
            # DOUBLE: reinterpret the next 8 bytes as a native double, the
            # mirror image of how write_thrift stores floats.  (Casting the
            # dereferenced pointer instead would only convert the first byte
            # to a float.)
            out[id] = (<double*>data.get_pointer())[0]
            data.seek(8, 1)
        elif bit == 8:
            # BINARY: varint length followed by the raw bytes
            size = read_unsigned_var_int(data)
            out[id] = PyBytes_FromStringAndSize(data.get_pointer(), size)
            data.seek(size, 1)
        elif bit == 9:
            out[id] = read_list(data)
        elif bit == 12:
            # nested struct
            out[id] = read_thrift(data)
        elif bit == 1:
            out[id] = True
        elif bit == 2:
            out[id] = False
        elif bit == 4:
            # I16
            out[id] = zigzag_long(read_unsigned_var_int(data))
        elif bit == 3:
            # I8
            out[id] = data.read_byte()
        else:
            print("Corrupted thrift data at ", data.tell(), ": ", id, bit)
    if hasi32:
        if hasi64:
            out["i32list"] = i32
        else:
            out["i32"] = 1
    return out


cdef list read_list(NumpyIO data):
    """Decode a thrift list from `data` and return it as a Python list.

    The header byte carries the element type in its low nibble and the size
    in its high nibble; a header of 0xf0 or above means the size follows as
    an unsigned varint.  Element types 5 and 6 are decoded as zigzag
    integers, type 8 as UTF-8 strings (undecodable bytes are ignored), and
    every other type is parsed with `read_thrift`.
    """
    cdef unsigned char header, elem_type
    cdef int32_t n_items, n_bytes, k
    cdef list items = []
    header = data.read_byte()
    elem_type = header & 0x0f # 0b00001111
    if header >= 0xf0:  # 0b11110000
        n_items = read_unsigned_var_int(data)
    else:
        n_items = header >> 4
    if elem_type in (5, 6):
        for k in range(n_items):
            items.append(zigzag_long(read_unsigned_var_int(data)))
    elif elem_type == 8:
        for k in range(n_items):
            # all parquet list types contain str, not bytes
            n_bytes = read_unsigned_var_int(data)
            items.append(PyUnicode_DecodeUTF8(data.get_pointer(), n_bytes, "ignore"))
            data.seek(n_bytes, 1)
    else:
        for k in range(n_items):
            items.append(read_thrift(data))

    return items


cpdef int write_thrift(dict data, NumpyIO output):
    """Serialise the field dict `data` into `output` as a thrift struct.

    `data` maps integer field ids to values, in the form produced by
    `read_thrift`.  For every present, non-None field a header byte is
    written (field-id delta in the high nibble, type code in the low nibble)
    followed by the encoded value; a zero byte terminates the struct.  The
    string keys ``"i32"`` / ``"i32list"`` select which integer fields are
    written with the 32-bit type code.

    Floats, bytes and strings are copied straight into the buffer without a
    bounds check, so `output` must be large enough.  There is no explicit
    return statement.
    """
    cdef int i, l, prev = 0
    cdef int delt = 0
    cdef double d
    cdef bytes b
    cdef char * c
    # 0: ints are 64-bit; 1: all ints are 32-bit; 2: only ids in i32s are 32-bit
    cdef int i32 = "i32" in data
    cdef list i32s
    if "i32list" in data:
        i32 = 2
        i32s = data['i32list']
    # NOTE(review): range(1, 14) visits ids 1..13, although the comment below
    # names 14 as the maximum — confirm no spec uses field id 14 or higher.
    for i in range(1, 14):  # 14 is the max number of fields
        if i not in data:
            continue
        val = data.get(i)
        if val is None:
            # not defined - skip (None is default on load)
            continue
        delt = i - prev
        prev = i
        # bool must be tested before int, since bool is a subclass of int
        if isinstance(val, bool):
            if val is True:
                output.write_byte((delt << 4) | 1)
            else:
                output.write_byte((delt << 4) | 2)
        elif isinstance(val, int):
            if i32 == 1 or (i32 == 2 and i in i32s):
                output.write_byte((delt << 4) | 5)
            else:
                output.write_byte((delt << 4) | 6)
            encode_unsigned_varint(long_zigzag(<int64_t>val), output)
        elif isinstance(val, float):
            output.write_byte((delt << 4) | 7)
            d = val
            # store the native 8-byte representation directly
            (<double*>output.get_pointer())[0] = d
            output.loc += 8
        elif isinstance(val, bytes):
            output.write_byte((delt << 4) | 8)
            l = PyBytes_GET_SIZE(<bytes>val)
            encode_unsigned_varint(l, output)
            c = val
            memcpy(<void*>output.get_pointer(), <void*>c, l)
            output.loc += l
        elif isinstance(val, str):
            # strings are written as binary (type 8) after encoding
            output.write_byte((delt << 4) | 8)
            b = (<str>val).encode()
            l = PyBytes_GET_SIZE(b)
            encode_unsigned_varint(l, output)
            c = b
            memcpy(<void*>output.get_pointer(), <void*>c, l)
            output.loc += l
        elif isinstance(val, list):
            output.write_byte((delt << 4) | 9)
            write_list(<list>val, output)
        elif isinstance(val, ThriftObject):
            output.write_byte((delt << 4) | 12)
            write_thrift((<ThriftObject>val).data, output)
        else:
            # anything else is assumed to be a dict describing a nested struct
            output.write_byte((delt << 4) | 12)
            write_thrift(<dict>val, output)
    # struct terminator
    output.write_byte(0)


cdef void _write_list_header(int size, int elem_type, NumpyIO output) noexcept:
    """Write a list header: element type in the low nibble, size in the high nibble.

    Sizes above 14 do not fit in the nibble; it is then set to 0b1111 and the
    size follows as an unsigned varint.
    """
    if size > 14:
        output.write_byte(elem_type | 0b11110000)
        encode_unsigned_varint(size, output)
    else:
        output.write_byte(elem_type | (size << 4))


cdef int write_list(list data, NumpyIO output):
    """Serialise the Python list `data` into `output` as a thrift list.

    The element type is chosen from the first item: ints are written with
    type code 5 as zigzag varints, bytes and str with type code 8 as
    length-prefixed binary (str is UTF-8 encoded), and anything else with
    type code 12 as nested structs via `write_thrift`.  An empty list is
    written as a single zero varint.

    Payload bytes are copied straight into the buffer without a bounds
    check, so `output` must be large enough.
    """
    cdef int l = len(data)
    cdef int i
    # 64-bit loop variable: the values are zigzag-encoded as 64-bit integers,
    # so they must not be squeezed through a C int on the way
    cdef int64_t v
    cdef bytes b
    cdef str s
    cdef char * c
    if l:
        first = data[0]
        if isinstance(first, int):
            # all lists are i64
            _write_list_header(l, 5, output)
            for v in data:
                encode_unsigned_varint(long_zigzag(v), output)
        elif isinstance(first, bytes):
            _write_list_header(l, 8, output)
            for b in data:
                i = PyBytes_GET_SIZE(b)
                encode_unsigned_varint(i, output)
                c = b
                memcpy(<void*>output.get_pointer(), <void*>c, i)
                output.loc += i
        elif isinstance(first, str):
            _write_list_header(l, 8, output)
            for s in data:
                b = s.encode("utf8", "ignore")
                i = PyBytes_GET_SIZE(b)
                encode_unsigned_varint(i, output)
                c = b
                memcpy(<void*>output.get_pointer(), <void*>c, i)
                output.loc += i
        else: # STRUCT
            _write_list_header(l, 12, output)
            for d in data:
                if isinstance(d, ThriftObject):
                    write_thrift((<ThriftObject>d).data, output)
                else:
                    write_thrift(d, output)
    else:
        # Not sure if zero-length list is allowed
        encode_unsigned_varint(0, output)


def from_buffer(buffer, name=None):
    """Parse a thrift struct from `buffer`.

    `buffer` may be a NumpyIO (read from its current location) or anything
    NumpyIO can wrap.  Returns the raw field dict, or a ThriftObject of type
    `name` wrapping it when `name` is given.
    """
    cdef NumpyIO stream
    cdef dict fields
    if not isinstance(buffer, NumpyIO):
        stream = NumpyIO(buffer)
    else:
        stream = buffer
    fields = read_thrift(stream)
    if name is None:
        return fields
    return ThriftObject(name, fields)


@cython.freelist(1000)
@cython.final
cdef class ThriftObject:
    """Attribute-style wrapper around a thrift field dict.

    The underlying `data` dict maps integer field ids to values (as produced
    by `read_thrift`).  `spec` maps field names to field ids for the struct
    type `name`, and `children` maps field names to the struct type of nested
    values, so that attribute access can translate names to ids and wrap
    nested dicts on the fly.
    """

    # thrift struct type name, used to look up `specs` / `children`
    cdef str name
    # field name -> integer field id
    cdef dict spec
    # field name -> struct type name of nested value(s)
    cdef dict children
    # integer field id -> value
    cdef dict data

    def __init__(self, str name, dict indict):
        self.name = name
        self.spec = specs[name]
        self.children = children.get(name, {})
        self.data = indict

    def __getattr__(self, str item):
        """Look up a field by name, wrapping nested dicts as ThriftObject.

        Names not in the spec fall back to a direct key lookup in `data`;
        a miss there raises AttributeError.
        """
        cdef str ch
        if item in self.spec:
            out = self.get(self.spec[item], None)
            ch = self.children.get(item)
            if ch is not None and out is not None:
                if isinstance(out, list):
                    return [ThriftObject(ch, o) if isinstance(o, dict) else o for o in out]
                return ThriftObject(ch, out) if isinstance(out, dict) else out
            return out
        else:
            try:
                return self.data[item]
            except KeyError:
                # name the missing attribute so the error is actionable
                raise AttributeError(item)

    def __setitem__(self, key, value):
        self.data[key] = value

    def __getitem__(self, item):
        return self.data.get(item)

    def __delitem__(self, key):
        self.data.pop(key)

    def get(self, key, default=None):
        return self.data.get(key, default)

    def __setattr__(self, str item, value):
        """Store `value` under the field id of `item`, unwrapping ThriftObjects."""
        cdef int i = self.spec[item]
        if isinstance(value, ThriftObject):
            self.data[i] = value.data
        elif isinstance(value, list):
            # Only unwrap elements that really are ThriftObjects: lists of
            # plain values (str, int, bytes, dict) must be stored unchanged,
            # and an unchecked cast on them would read invalid memory.
            self.data[i] = [(<ThriftObject>v).data if isinstance(v, ThriftObject) else v
                            for v in value]
        else:
            self.data[i] = value

    def __delattr__(self, item):
        cdef int i = self.spec[item]
        del self.data[i]

    cpdef const uint8_t[:] to_bytes(self):
        """raw serialise of internal state"""
        cdef int size = 0
        # estimate the buffer size needed; write_thrift does not bounds-check
        # every write, so this must be generous
        if self.name == "RowGroup":
            size = 1000 * len(self[1])  # num-columns
        elif self.name == "FileMetaData":
            # num-cols * num-rgs + size of key-values
            size = 1000 * len(self[4]) * len(self[2]) + len(str(self[5]))
        if size < 500000:
            size = 500000
        cdef uint8_t[::1] ser_buf = np.empty(size, dtype='uint8')
        cdef NumpyIO o = NumpyIO(ser_buf)
        write_thrift(self.data, o)
        return o.so_far()

    def __reduce_ex__(self, _):
        # TODO: to_bytes returns a memoryview, so could sideband for pickle 5
        return from_buffer, (bytes(self.to_bytes()), self.name)

    @property
    def thrift_name(self):
        return self.name

    @property
    def contents(self):
        return self.data

    from_buffer = from_buffer

    def copy(self):
        """shallow copy"""
        return type(self)(self.name, self.data.copy())

    def __copy__(self):
        return self.copy()

    def __deepcopy__(self, memodict={}):
        import copy
        d = copy.deepcopy(self.data)
        return ThriftObject(self.name, d)

    cpdef _asdict(self):
        """Create dict version with field names instead of integers"""
        cdef str k
        cdef out = {}
        for k in self.spec:
            if k in self.children:
                # nested struct(s): recurse
                lower = getattr(self, k)
                if lower is None:
                    out[k] = None
                elif isinstance(lower, list):
                    out[k] = [l._asdict() for l in lower]
                else:
                    out[k] = lower._asdict()
            else:
                # plain value: bytes are converted with str() for display
                lower = getattr(self, k)
                if isinstance(lower, bytes):
                    lower = str(lower)
                elif isinstance(lower, list) and lower and isinstance(lower[0], bytes):
                    lower = [str(l) for l in lower]
                out[k] = lower
        return out

    def __dir__(self):
        """List the field names defined by the spec"""
        return list(self.spec)

    def __repr__(self):
        alt = self._asdict()
        try:
            import yaml
            return yaml.dump(alt)
        except ImportError:
            return str(alt)

    def __eq__(self, other):
        if isinstance(other, ThriftObject):
            return dict_eq(self.contents, other.contents)
        elif isinstance(other, dict):
            return dict_eq(self.contents, other)
        return False

    @staticmethod
    def from_fields(thrift_name,bint i32=0, list i32list=None, **kwargs):
        """Build a ThriftObject of type `thrift_name` from keyword field values.

        `i32` marks all integer fields as 32-bit; `i32list` gives the ids of
        the integer fields that are 32-bit.
        """
        cdef spec = specs[thrift_name]
        cdef int i
        cdef str k
        cdef dict out = {}
        for k, i in spec.items():  # ensure field index increases monotonically
            if k in kwargs:
                # missing fields are implicitly None
                v = kwargs[k]
                if isinstance(v, ThriftObject):
                    out[i] = (<ThriftObject>v).data
                elif isinstance(v, list) and v and isinstance(v[0], ThriftObject):
                    out[i] = [(<ThriftObject>it).data for it in v]
                else:
                    out[i] = v
        if i32:
            # integer fields are all 32-bit
            out['i32'] = 1
        if i32list:
            # given integer fields are 32-bit
            out['i32list'] = i32list
        return ThriftObject(thrift_name, out)


def _field_eq(v1, v2):
    """Compare two non-None field values using the rules of ``dict_eq``.

    - dict values are compared recursively with ``dict_eq`` (the other side
      may be a dict or a ThriftObject);
    - lists are compared by length and then element by element;
    - a str and a bytes value are equal when the bytes decode (default
      codec) to the str, regardless of which side holds which;
    - anything else falls back to ``!=``.

    A type mismatch yields False rather than raising.
    """
    if isinstance(v1, dict):
        if not isinstance(v2, (dict, ThriftObject)):
            return False
        return dict_eq(v1, v2)
    if isinstance(v1, list):
        if not isinstance(v2, (list, tuple)) or len(v1) != len(v2):
            return False
        # Recursive call as described in
        # https://github.com/dask/fastparquet/pull/723#issuecomment-995147362
        for a, b in zip(v1, v2):
            if a is None or b is None:
                # inside a list, None only matches None
                if a is not b:
                    return False
            elif not _field_eq(a, b):
                return False
        return True
    if isinstance(v1, bytes) and isinstance(v2, str):
        # normalise so that the text value is always on the left; this keeps
        # the comparison symmetric
        v1, v2 = v2, v1
    if isinstance(v1, str) and isinstance(v2, bytes):
        try:
            v2 = v2.decode()
        except UnicodeDecodeError:
            # bytes that are not valid text cannot equal a str
            return False
    return not (v1 != v2)


def dict_eq(d1, d2):
    """Return True if two field mappings are equal on their non-None keys.

    ``d1`` and ``d2`` may each be a dict or a ThriftObject (in which case its
    ``contents`` are used). Only integer keys are compared; other keys are
    dynamic fields and are ignored. A key that is missing is treated the same
    as a key whose value is None.
    """
    if isinstance(d1, ThriftObject):
        d1 = d1.contents
    if isinstance(d2, ThriftObject):
        d2 = d2.contents
    for k in set(d1).union(d2):
        if not isinstance(k, int):
            # dynamic fields are immaterial
            continue
        v1 = d1.get(k, None)
        v2 = d2.get(k, None)
        if v1 is None or v2 is None:
            if v1 is None and v2 is None:
                continue
            # set on one side only
            return False
        if not _field_eq(v1, v2):
            return False
    return True


# Thrift struct name -> {field name -> thrift field id}.
# ``ThriftObject.from_fields`` looks names up here with ``specs[thrift_name]``,
# so every struct name that can be instantiated (including every name used as
# a value in ``children``) needs an entry, even if it has no fields.
# Field ids follow the Apache Parquet ``parquet.thrift`` definition.
cdef dict specs = {
    'Statistics': {'max': 1,
                   'min': 2,
                   'null_count': 3,
                   'distinct_count': 4,
                   'max_value': 5,
                   'min_value': 6},
    'StringType': {},
    'UUIDType': {},
    'MapType': {},
    'ListType': {},
    'EnumType': {},
    'DateType': {},
    'NullType': {},
    'DecimalType': {'scale': 1, 'precision': 2},
    'MilliSeconds': {},
    'MicroSeconds': {},
    'NanoSeconds': {},
    'TimeUnit': {'MILLIS': 1, 'MICROS': 2, 'NANOS': 3},
    'TimestampType': {'isAdjustedToUTC': 1, 'unit': 2},
    'TimeType': {'isAdjustedToUTC': 1, 'unit': 2},
    'IntType': {'bitWidth': 1, 'isSigned': 2},
    'JsonType': {},
    'BsonType': {},
    'LogicalType': {'STRING': 1,
                    'MAP': 2,
                    'LIST': 3,
                    'ENUM': 4,
                    'DECIMAL': 5,
                    'DATE': 6,
                    'TIME': 7,
                    'TIMESTAMP': 8,
                    'INTEGER': 10,
                    'UNKNOWN': 11,
                    'JSON': 12,
                    'BSON': 13,
                    'UUID': 14},
    'SchemaElement': {'type': 1,
                      'type_length': 2,
                      'repetition_type': 3,
                      'name': 4,
                      'num_children': 5,
                      'converted_type': 6,
                      'scale': 7,
                      'precision': 8,
                      'field_id': 9,
                      'logicalType': 10},
    'DataPageHeader': {'num_values': 1,
                       'encoding': 2,
                       'definition_level_encoding': 3,
                       'repetition_level_encoding': 4,
                       'statistics': 5},
    'IndexPageHeader': {},
    'DictionaryPageHeader': {'num_values': 1, 'encoding': 2, 'is_sorted': 3},
    'DataPageHeaderV2': {'num_values': 1,
                         'num_nulls': 2,
                         'num_rows': 3,
                         'encoding': 4,
                         'definition_levels_byte_length': 5,
                         'repetition_levels_byte_length': 6,
                         'is_compressed': 7,
                         'statistics': 8},
    'SplitBlockAlgorithm': {},
    'BloomFilterAlgorithm': {'BLOCK': 1},
    'XxHash': {},
    'BloomFilterHash': {'XXHASH': 1},
    'Uncompressed': {},
    'PageHeader': {'type': 1,
                   'uncompressed_page_size': 2,
                   'compressed_page_size': 3,
                   'crc': 4,
                   'data_page_header': 5,
                   'index_page_header': 6,
                   'dictionary_page_header': 7,
                   'data_page_header_v2': 8},
    'KeyValue': {'key': 1, 'value': 2},
    'SortingColumn': {'column_idx': 1, 'descending': 2, 'nulls_first': 3},
    'PageEncodingStats': {'page_type': 1, 'encoding': 2, 'count': 3},
    'ColumnMetaData': {'type': 1,
                       'encodings': 2,
                       'path_in_schema': 3,
                       'codec': 4,
                       'num_values': 5,
                       'total_uncompressed_size': 6,
                       'total_compressed_size': 7,
                       'key_value_metadata': 8,
                       'data_page_offset': 9,
                       'index_page_offset': 10,
                       'dictionary_page_offset': 11,
                       'statistics': 12,
                       'encoding_stats': 13,
                       'bloom_filter_offset': 14},
    # Column encryption structs: referenced from ``children`` and therefore
    # required here.
    'EncryptionWithFooterKey': {},
    'EncryptionWithColumnKey': {'path_in_schema': 1, 'key_metadata': 2},
    'ColumnCryptoMetaData': {'ENCRYPTION_WITH_FOOTER_KEY': 1,
                             'ENCRYPTION_WITH_COLUMN_KEY': 2},
    'ColumnChunk': {'file_path': 1,
                    'file_offset': 2,
                    'meta_data': 3,
                    'offset_index_offset': 4,
                    'offset_index_length': 5,
                    'column_index_offset': 6,
                    'column_index_length': 7,
                    'crypto_metadata': 8,
                    'encrypted_column_metadata': 9},
    'RowGroup': {'columns': 1,
                 'total_byte_size': 2,
                 'num_rows': 3,
                 'sorting_columns': 4,
                 'file_offset': 5,
                 'total_compressed_size': 6,
                 'ordinal': 7},
    'TypeDefinedOrder': {},
    'ColumnOrder': {'TYPE_ORDER': 1},
    'PageLocation': {'offset': 1,
                     'compressed_page_size': 2,
                     'first_row_index': 3},
    'OffsetIndex': {'page_locations': 1},
    'ColumnIndex': {'null_pages': 1,
                    'min_values': 2,
                    'max_values': 3,
                    'boundary_order': 4,
                    'null_counts': 5},
    # File-level encryption union: referenced from ``children`` of
    # 'FileMetaData'.
    'EncryptionAlgorithm': {'AES_GCM_V1': 1, 'AES_GCM_CTR_V1': 2},
    'FileMetaData': {'version': 1,
                     'schema': 2,
                     'num_rows': 3,
                     'row_groups': 4,
                     'key_value_metadata': 5,
                     'created_by': 6,
                     'column_orders': 7,
                     'encryption_algorithm': 8,
                     'footer_signing_key_metadata': 9},
}

# Thrift struct name -> {field name -> name of the struct type held by that
# field}. Going by the generator snippet kept at the bottom of this file, a
# field is listed when its thrift type is a struct or a list of structs; the
# value is the struct's name in either case. Structs with no such fields have
# no entry.
cdef dict children = {
    'TimeUnit': {'MILLIS': 'MilliSeconds',
                 'MICROS': 'MicroSeconds',
                 'NANOS': 'NanoSeconds'},
    'TimestampType': {'unit': 'TimeUnit'},
    'TimeType': {'unit': 'TimeUnit'},
    'LogicalType': {'STRING': 'StringType',
                    'MAP': 'MapType',
                    'LIST': 'ListType',
                    'ENUM': 'EnumType',
                    'DECIMAL': 'DecimalType',
                    'DATE': 'DateType',
                    'TIME': 'TimeType',
                    'TIMESTAMP': 'TimestampType',
                    'INTEGER': 'IntType',
                    'UNKNOWN': 'NullType',
                    'JSON': 'JsonType',
                    'BSON': 'BsonType',
                    'UUID': 'UUIDType'},
    'SchemaElement': {'logicalType': 'LogicalType'},
    'DataPageHeader': {'statistics': 'Statistics'},
    'DataPageHeaderV2': {'statistics': 'Statistics'},
    'PageHeader': {'data_page_header': 'DataPageHeader',
                   'index_page_header': 'IndexPageHeader',
                   'dictionary_page_header': 'DictionaryPageHeader',
                   'data_page_header_v2': 'DataPageHeaderV2'},
    'ColumnMetaData': {'key_value_metadata': 'KeyValue',
                       'statistics': 'Statistics',
                       'encoding_stats': 'PageEncodingStats'},
    'ColumnCryptoMetaData': {'ENCRYPTION_WITH_FOOTER_KEY': 'EncryptionWithFooterKey',
                             'ENCRYPTION_WITH_COLUMN_KEY': 'EncryptionWithColumnKey'},
    'ColumnChunk': {'meta_data': 'ColumnMetaData',
                    'crypto_metadata': 'ColumnCryptoMetaData'},
    'RowGroup': {'columns': 'ColumnChunk', 'sorting_columns': 'SortingColumn'},
    'ColumnOrder': {'TYPE_ORDER': 'TypeDefinedOrder'},
    'OffsetIndex': {'page_locations': 'PageLocation'},
    'FileMetaData': {'schema': 'SchemaElement',
                     'row_groups': 'RowGroup',
                     'key_value_metadata': 'KeyValue',
                     'column_orders': 'ColumnOrder',
                     'encryption_algorithm': 'EncryptionAlgorithm'},
}

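# Reference only: the snippet below builds `specs` and `children` tables from
# the classes in `fastparquet.parquet_thrift` (presumably how the literal
# tables above were originally generated).
#
# specs = {}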
# for o in [o for o in fastparquet.parquet_thrift.__dict__.values() if isinstance(o, type)]:
#     if hasattr(o, "thrift_spec"):
#         specs[o.__name__] = {k[2]: k[0] for k in o.thrift_spec if k}
#
#
#
# children = {}
# for o in [o for o in fastparquet.parquet_thrift.__dict__.values() if isinstance(o, type)]:
#     if hasattr(o, "thrift_spec"):
#         bit = {}
#         for k in o.thrift_spec:
#             if k and k[1] == fastparquet.parquet_thrift.TType.STRUCT and hasattr(k[3][0], "thrift_spec"):
#                 bit[k[2]] =  k[3][0].__name__
#             elif k and k[1] == fastparquet.parquet_thrift.TType.LIST and k[3][0] == \
#                 fastparquet.parquet_thrift.TType.STRUCT:
#                 bit[k[2]] =  k[3][1][0].__name__
#         if bit:
#             children[o.__name__] = bit
#