1128 lines
37 KiB
Cython
1128 lines
37 KiB
Cython
# https://cython.readthedocs.io/en/latest/src/userguide/
|
|
# source_files_and_compilation.html#compiler-directives
|
|
# cython: profile=False
|
|
# cython: linetrace=False
|
|
# cython: binding=False
|
|
# cython: language_level=3
|
|
# cython: initializedcheck=False
|
|
# cython: boundscheck=False
|
|
# cython: wraparound=False
|
|
# cython: overflowcheck=False
|
|
# cython: cdivision=True
|
|
# cython: always_allow_keywords=False
|
|
|
|
import cython
|
|
import numpy as np
|
|
from cpython cimport (
|
|
PyBytes_FromStringAndSize, PyBytes_GET_SIZE, PyUnicode_DecodeUTF8,
|
|
)
|
|
from libc.string cimport memcpy
|
|
from libc.stdint cimport int8_t, uint8_t, uint32_t, int32_t, uint64_t, int64_t
|
|
|
|
|
|
cpdef void read_rle(NumpyIO file_obj, int32_t header, int32_t bit_width, NumpyIO o, int32_t itemsize=4) noexcept:
    """Expand one run-length encoded run from ``file_obj`` into ``o``.

    The repeat count is ``header >> 1``. The repeated value is read from
    ``ceil(bit_width / 8)`` input bytes, least-significant byte first. The
    value is stored as int32 when ``itemsize == 4`` and as a single byte
    otherwise. The run is truncated to the space left in ``o``. Both
    ``file_obj.loc`` and ``o.loc`` are advanced.
    """
    cdef:
        uint32_t repeats, value_bytes, k, capacity
        int32_t value = 0
        char * src = file_obj.get_pointer()
        char * dst = o.get_pointer()

    repeats = header >> 1
    value_bytes = (bit_width + 7) // 8

    # assemble the little-endian repeated value
    for k in range(value_bytes):
        value |= (src[k] & 0xff) << (k * 8)
    src += value_bytes

    # never write more items than the output buffer can still hold
    capacity = (o.nbytes - o.loc) // itemsize
    if repeats > capacity:
        repeats = capacity

    if itemsize == 4:
        for k in range(repeats):
            (<int32_t*>dst)[0] = value
            dst += 4
    else:
        for k in range(repeats):
            dst[0] = value & 0xff
            dst += 1

    o.loc += dst - o.get_pointer()
    file_obj.loc += src - file_obj.get_pointer()
|
|
|
|
|
|
cpdef int32_t width_from_max_int(int64_t value) noexcept:
    """Return the number of bits needed to hold ``value``.

    Counts how many right-shifts (at most 63) bring ``value`` to zero,
    so 0 -> 0, 1 -> 1, 255 -> 8.
    """
    cdef int32_t nbits = 0
    while nbits < 64:
        if value == 0:
            return nbits
        value >>= 1
        nbits += 1
|
|
|
|
|
|
cdef int32_t _mask_for_bits(int32_t i) noexcept:
    """Return an integer whose lowest `i` bits are set (e.g. 3 -> 0b111)."""
    cdef int32_t mask = (1 << i) - 1
    return mask
|
|
|
|
|
|
cpdef void read_bitpacked1(NumpyIO file_obj, int32_t count, NumpyIO o) noexcept:
    # Unpack `count` single-bit values, least-significant bit first, into one
    # output byte each (like np.unpackbits writing into an existing int8 array).
    # The number of values stored is capped by the room left in `o`, while the
    # input position always moves past ceil(count / 8) bytes.
    cdef:
        char * src = file_obj.get_pointer()
        char * dst = o.get_pointer()
        unsigned char bits
        int32_t nbyte, k, requested = count
        int32_t tail

    if count > o.nbytes - o.loc:
        count = o.nbytes - o.loc

    # complete input bytes: eight output values each
    for nbyte in range(count // 8):
        bits = src[0]
        src += 1
        for k in range(8):
            dst[0] = bits & 1
            dst += 1
            bits >>= 1

    # trailing values that only use part of the final input byte
    tail = count % 8
    if tail:
        bits = <int32_t>src[0]
        src += 1
        for k in range(tail):
            dst[0] = bits & 1
            dst += 1
            bits >>= 1

    file_obj.loc += (requested + 7) // 8
    o.loc += count
|
|
|
|
|
|
cpdef void write_bitpacked1(NumpyIO file_obj, int32_t count, NumpyIO o) noexcept:
    """Pack ``count`` one-byte flags from ``file_obj`` into bits written to ``o``.

    Implementation of np.packbits with an output array; the input is an int8
    array holding one value per byte. Within each group of eight inputs the
    first value ends up in the most significant bit of the output byte. A
    trailing group of fewer than eight values is packed the same way without
    padding, so its bits occupy the low end of the final byte.

    ``file_obj.loc`` is advanced by the number of input bytes actually
    consumed (``count``) and ``o.loc`` by ``ceil(count / 8)``.
    """
    cdef char * inptr
    cdef char * outptr
    cdef char data = 0
    cdef int32_t counter, i
    cdef int64_t indata
    outptr = o.get_pointer()
    inptr = file_obj.get_pointer()
    for counter in range(count // 8):
        # fetch a long in one op, instead of byte by byte
        indata = (<int64_t*>inptr)[0]
        inptr += 8
        for i in range(8):
            # lowest bit of each successive input byte
            data = data << 1 | (indata & 1)
            indata >>= 8
        outptr[0] = data
        outptr += 1
    if count % 8:
        # leftover partial byte
        data = 0
        for i in range(count % 8):
            data = data << 1 | (inptr[0] != 0)
            inptr += 1
        outptr[0] = data
        outptr += 1
    # The input holds one byte per value, so exactly the bytes read above
    # were consumed (previously this advanced by count * 4).
    file_obj.loc += inptr - file_obj.get_pointer()
    o.loc += (count + 7) // 8
|
|
|
|
|
|
cpdef void read_bitpacked(NumpyIO file_obj, int32_t header, int32_t width, NumpyIO o, int32_t itemsize=4) noexcept:
    """
    Read values packed into width-bits each (which can be >8)

    The number of values is ``(header >> 1) * 8``. Each value is stored in
    ``o`` as an int32 when ``itemsize == 4`` and as a single byte otherwise.
    Values that no longer fit in ``o`` are still consumed from ``file_obj``
    but are not stored. Both ``file_obj.loc`` and ``o.loc`` are advanced.
    """
    cdef:
        uint32_t count, mask, data, vals_left
        # `data` buffers input bits: bits [right, left) are unread
        unsigned char left = 8, right = 0
        char * inptr = file_obj.get_pointer()
        char * outptr = o.get_pointer()
        char * endptr

    count = (header >> 1) * 8
    # TODO: special case for width=1, 2, 4, 8
    if width == 1 and itemsize == 1:
        # dedicated fast path for single-bit values written as bytes
        read_bitpacked1(file_obj, count, o)
        return
    # last address at which a whole output item still fits in `o`
    endptr = (o.nbytes - o.loc) + outptr - itemsize
    mask = _mask_for_bits(width)
    # prime the bit buffer with the first input byte
    data = 0xff & <int32_t>inptr[0]
    inptr += 1
    while count:
        if right > 8:
            # more than a byte of the buffer is already consumed: drop one byte
            data >>= 8
            left -= 8
            right -= 8
        elif left - right < width:
            # not enough unread bits for one value: append the next input byte
            data |= (inptr[0] & 0xff) << left
            inptr += 1
            left += 8
        else:
            # enough bits buffered: emit one value if there is room for it
            if outptr <= endptr:
                if itemsize == 4:
                    (<int32_t*>outptr)[0] = <int32_t>(data >> right & mask)
                    outptr += 4
                else:
                    outptr[0] = data >> right & mask
                    outptr += 1
            # the value counts as consumed whether or not it was stored
            count -= 1
            right += width
    o.loc = o.loc + outptr - o.get_pointer()
    file_obj.loc += inptr - file_obj.get_pointer()
|
|
|
|
|
|
cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj) noexcept:
    """Decode one unsigned variable-length integer from ``file_obj``.

    Each input byte contributes its low seven bits, least-significant group
    first; a clear top bit marks the final byte. ``file_obj.loc`` is moved
    past the bytes that were read.
    """
    cdef uint64_t value = 0
    cdef int32_t nshift = 0
    cdef char current
    cdef char * cursor = file_obj.get_pointer()

    while True:
        current = cursor[0]
        cursor += 1
        value |= (<int64_t>(current & 0x7F) << nshift)
        if (current & 0x80) == 0:
            # continuation bit not set: this was the last byte
            break
        nshift += 7

    file_obj.loc += cursor - file_obj.get_pointer()
    return value
|
|
|
|
|
|
cpdef void read_rle_bit_packed_hybrid(NumpyIO io_obj, int32_t width, uint32_t length, NumpyIO o,
                                      int32_t itemsize=4) noexcept:
    """Decode RLE/bit-packed hybrid data from `io_obj` into `o`.

    When `length` is falsy, a 32-bit int is read from `io_obj` first and
    used as the byte length of the encoded data.

    `io_obj` is a NumpyIO of bytes; `o` is an output NumpyIO of int32
    (`itemsize=4`) or int8/bool (`itemsize=1`).

    Runs are decoded until `length` input bytes are consumed or `o` is
    full. The caller can tell how much output was produced by looking at
    `o.tell()`.
    """
    cdef int32_t origin, header
    if length is False:
        length = <uint32_t>io_obj.read_int()
    origin = io_obj.loc
    while io_obj.loc - origin < length and o.loc < o.nbytes:
        header = <int32_t>read_unsigned_var_int(io_obj)
        # the lowest header bit selects the run type
        if header & 1:
            read_bitpacked(io_obj, header, width, o, itemsize)
        else:
            read_rle(io_obj, header, width, o, itemsize)
|
|
|
|
|
|
cdef void delta_read_bitpacked(NumpyIO file_obj, uint8_t bitwidth,
                               NumpyIO o, uint64_t count, uint8_t longval=0) noexcept:
    # Unpack `count` values of `bitwidth` bits each from `file_obj` and append
    # them to `o`, as int64 when `longval` is set and as int32 otherwise.
    cdef:
        # bit buffer: bits [right, left) of `data` have been read but not used
        uint64_t data = 0
        int8_t left = 0
        int8_t right = 0
        # keeps the lowest `bitwidth` bits
        uint64_t mask = 0XFFFFFFFFFFFFFFFF >> (64 - bitwidth)
    while count > 0:
        if (left - right) < bitwidth:
            # too few unread bits: append one more input byte above them
            data = data | (<uint64_t>file_obj.read_byte() << left)
            left += 8
        elif right > 8:
            # discard a byte of already-consumed low bits
            data >>= 8
            left -= 8
            right -= 8
        else:
            # emit the next value
            if longval:
                o.write_long((data >> right) & mask)
            else:
                o.write_int((data >> right) & mask)
            right += bitwidth
            count -= 1
|
|
|
|
|
|
cpdef int delta_binary_unpack(NumpyIO file_obj, NumpyIO o, uint8_t longval=0):
    # Decode delta-encoded integers from `file_obj` into `o` (int64 items when
    # `longval` is set, int32 otherwise). Returns 0 once the number of values
    # announced in the header has been written.
    cdef:
        # header: four varints - block size, miniblocks per block, total
        # value count and the zigzag-encoded first value
        uint64_t block_size = read_unsigned_var_int(file_obj)
        uint64_t miniblock_per_block = read_unsigned_var_int(file_obj)
        int64_t count = read_unsigned_var_int(file_obj)
        int64_t value = zigzag_long(read_unsigned_var_int(file_obj))
        int64_t block, min_delta, i, j, values_per_miniblock, temp
        const uint8_t[:] bitwidths
        uint8_t bitwidth
    values_per_miniblock = block_size // miniblock_per_block
    while True:
        # each block starts with its minimum delta and one bit width per miniblock
        min_delta = zigzag_long(read_unsigned_var_int(file_obj))
        bitwidths = file_obj.read(miniblock_per_block)
        for i in range(miniblock_per_block):
            bitwidth = bitwidths[i]
            if bitwidth:
                # remember where this miniblock starts in the output
                temp = o.loc
                if count > 1:
                    # no more diffs if on last value
                    delta_read_bitpacked(file_obj, bitwidth, o, values_per_miniblock, longval)
                # rewind, then overwrite each unpacked delta in place with the
                # running value it leads to
                o.loc = temp
                for j in range(values_per_miniblock):
                    if longval:
                        temp = o.read_long()
                        o.loc -= 8
                        o.write_long(value)
                    else:
                        temp = o.read_int()
                        o.loc -= 4
                        o.write_int(value)
                    value += min_delta + temp
                    count -= 1
                    if count <= 0:
                        return 0
            else:
                # zero bit width: every delta in this miniblock equals min_delta
                for j in range(values_per_miniblock):
                    if longval:
                        o.write_long(value)
                    else:
                        o.write_int(value)
                    value += min_delta
                    count -= 1
                    if count <= 0:
                        return 0
|
|
|
|
|
|
cpdef void encode_unsigned_varint(uint64_t x, NumpyIO o) noexcept:  # pragma: no cover
    """Write ``x`` to ``o`` as an unsigned variable-length integer.

    Seven bits are emitted per byte, least-significant group first, with the
    top bit set on every byte except the last.
    """
    while x >= 0x80:
        o.write_byte(0x80 | (x & 0x7F))
        x >>= 7
    o.write_byte(x)
|
|
|
|
|
|
cpdef void encode_bitpacked(int32_t[:] values, int32_t width, NumpyIO o) noexcept:
    """
    Write a bit-packed run header followed by `values`, each packed into
    `width` bits (which can be >8), least-significant bits first.
    """
    cdef int32_t ngroups = (values.shape[0] + 7) // 8
    cdef int32_t pending = 0, acc = 0, item, idx

    # run header: number of 8-value groups, low bit set
    encode_unsigned_varint(ngroups << 1 | 1, o)

    for idx in range(values.shape[0]):
        item = values[idx]
        acc |= item << pending
        pending += width
        # flush every complete byte that has accumulated
        while pending >= 8:
            o.write_byte(acc & 0xff)
            pending -= 8
            acc >>= 8

    # a final, partially filled byte
    if pending:
        o.write_byte(acc)
|
|
|
|
|
|
cpdef void encode_rle_bp(int32_t[:] data, int32_t width, NumpyIO o, int32_t withlength = 0) noexcept:
    """Bit-pack `data` into `o`, optionally preceded by its byte length.

    With `withlength` set, four bytes are reserved ahead of the payload and
    afterwards filled with the payload size as an int32.
    """
    cdef uint32_t begin, finish

    if not withlength:
        encode_bitpacked(data, width, o)
        return

    begin = o.tell()
    o.seek(4, 1)  # leave room for the length prefix
    encode_bitpacked(data, width, o)
    finish = o.tell()

    # go back, write the payload size, then return to the end of the data
    o.seek(begin)
    o.write_int(finish - begin - 4)
    o.seek(finish)
|
|
|
|
|
|
@cython.freelist(100)
@cython.final
cdef class NumpyIO:
    """
    Read or write from a numpy array like a file object

    The main purpose is to keep track of the current location in the memory.
    Fixed-size writes that would run past the end of the buffer are ignored,
    and the matching fixed-size reads return 0 instead.
    """
    cdef const uint8_t[:] data
    # current position and total size, both in bytes
    cdef uint32_t loc, nbytes
    # raw pointer to the first byte of `data`
    cdef char* ptr
    cdef char writable

    def __cinit__(self, const uint8_t[::1] data):
        self.data = data
        self.loc = 0
        self.ptr = <char*>&data[0]
        self.nbytes = data.shape[0]

    cdef char* get_pointer(self) noexcept:
        """Raw pointer to the byte at the current location."""
        return self.ptr + self.loc

    @property
    def len(self):
        """Total size of the underlying buffer in bytes."""
        return self.nbytes

    cpdef const uint8_t[:] read(self, int32_t x=-1):
        """Return the next `x` bytes as a view and advance past them.

        For `x < 1`, everything up to the end of the buffer is returned.
        """
        cdef const uint8_t[:] out
        if x < 1:
            x = self.nbytes - self.loc
        out = self.data[self.loc:self.loc + x]
        self.loc += x
        return out

    cpdef uint8_t read_byte(self) noexcept:
        """Return the byte at the current location and advance by one."""
        cdef char out
        out = self.ptr[self.loc]
        self.loc += 1
        return out

    cpdef int32_t read_int(self) noexcept:
        """Read an int32 at the current location; 0 if fewer than 4 bytes remain."""
        cdef int32_t i
        if self.nbytes - self.loc < 4:
            return 0
        i = (<int32_t*> self.get_pointer())[0]
        self.loc += 4
        return i

    cpdef void write(self, const char[::1] d) noexcept:
        """Copy the bytes of `d` to the current location and advance past them.

        Like the other writers, a write that does not fit in the remaining
        space is ignored.
        """
        cdef Py_ssize_t n = d.shape[0]
        if n == 0:
            # nothing to copy; also avoids taking the address of d[0]
            return
        if n > <Py_ssize_t>(self.nbytes - self.loc):
            # ignore attempt to write past end of buffer
            return
        # the destination must be the *address* of the current byte; casting
        # the byte value itself to a pointer would write to an arbitrary address
        memcpy(<void*>self.get_pointer(), <void*>&d[0], n)
        self.loc += n

    cpdef void write_byte(self, uint8_t b) noexcept:
        """Store one byte at the current location and advance by one."""
        if self.loc >= self.nbytes:
            # ignore attempt to write past end of buffer
            return
        self.ptr[self.loc] = b
        self.loc += 1

    cpdef void write_int(self, int32_t i) noexcept:
        """Store an int32 at the current location; ignored if it does not fit."""
        if self.nbytes - self.loc < 4:
            return
        (<int32_t*> self.get_pointer())[0] = i
        self.loc += 4

    cdef void write_long(self, int64_t i) noexcept:
        """Store an int64 at the current location; ignored if it does not fit."""
        if self.nbytes - self.loc < 8:
            return
        (<int64_t*> self.get_pointer())[0] = i
        self.loc += 8

    cdef int64_t read_long(self) noexcept:
        """Read an int64 at the current location; 0 if fewer than 8 bytes remain."""
        cdef int64_t i
        if self.nbytes - self.loc < 8:
            return 0
        i = (<int64_t*> self.get_pointer())[0]
        self.loc += 8
        return i

    cdef void write_many(self, char b, int32_t count) noexcept:
        """Write the byte `b` `count` times."""
        cdef int32_t i
        for i in range(count):
            self.write_byte(b)

    cpdef int32_t tell(self) noexcept:
        """Return the current location."""
        return self.loc

    cpdef uint32_t seek(self, int32_t loc, int32_t whence=0) noexcept:
        """Move the current location and return it.

        `whence` is 0 for an absolute position, 1 for relative to the
        current location and 2 for relative to the end. The result is
        capped at the buffer size.
        """
        if whence == 0:
            self.loc = loc
        elif whence == 1:
            self.loc += loc
        elif whence == 2:
            self.loc = self.nbytes + loc
        if self.loc > self.nbytes:
            self.loc = self.nbytes
        return self.loc

    @cython.wraparound(False)
    cpdef const uint8_t[:] so_far(self) noexcept:
        """ In write mode, the data we have gathered until now
        """
        return self.data[:self.loc]
|
|
|
|
|
|
def _assemble_objects(object[:] assign, const uint8_t[:] defi, const uint8_t[:] rep,
                      val, dic, d,
                      char null, null_val, int32_t max_defi, int32_t prev_i):
    """Dremel-assembly of arrays of values into lists

    Parameters
    ----------
    assign: array dtype O
        To insert lists into
    defi: int array
        Definition levels, max 3
    rep: int array
        Repetition levels, max 1
    val: array
        Values to distribute into the lists; indexed into ``dic`` first
        when ``d`` is true
    dic: array of labels or None
        Applied if d is True
    d: bool
        Whether to dereference dict values
    null: bool
        Can an entry be None?
    null_val: bool
        can list elements be None
    max_defi: int
        value of definition level that corresponds to non-null
    prev_i: int
        1 + index where the last row in the previous page was inserted (0 if first page)

    Returns
    -------
    The index in ``assign`` of the last row touched by this call.
    """
    cdef int32_t counter, i, re, de
    # position of the next unused entry in `val`
    cdef int32_t vali = 0
    # started: a row boundary has been seen in this page
    # have_null: the row being built is None rather than a list
    cdef char started = False, have_null = False
    if d:
        # dereference dict values
        val = dic[val]
    i = prev_i
    # items collected for the row currently being built
    part = []
    for counter in range(rep.shape[0]):
        de = defi[counter] if defi is not None else max_defi
        re = rep[counter]
        if not re:
            # new row - save what we have
            if started:
                assign[i] = None if have_null else part
                part = []
                i += 1
            else:
                # first time: no row to save yet, unless it's a row continued from previous page
                if vali > 0:
                    assign[i - 1].extend(part) # add the items to previous row
                    part = []
                    # don't increment i since we only filled i-1
                started = True
        if de == max_defi:
            # append real value to current item
            part.append(val[vali])
            vali += 1
        elif de > null:
            # append null to current item
            part.append(None)
        # next object is None as opposed to an object
        have_null = de == 0 and null
    if started: # normal case - add the leftovers to the next row
        assign[i] = None if have_null else part
    else: # can only happen if the only elements in this page are the continuation of the last row from previous page
        assign[i - 1].extend(part)
    return i
|
|
|
|
|
|
# smallest int64; entries holding this value are left untouched by time_shift
cdef int64_t nat = -9223372036854775808


cpdef void time_shift(const int64_t[::1] data, int32_t factor=1000) noexcept:
    """Multiply every entry of ``data`` by ``factor`` in place.

    Entries equal to ``nat`` are skipped. The buffer is modified through a
    raw pointer even though the argument is declared const.
    """
    cdef int32_t idx
    cdef int64_t * cells = <int64_t*>&data[0]
    for idx in range(data.shape[0]):
        if cells[idx] == nat:
            continue
        cells[idx] *= factor
|
|
|
|
|
|
cdef int32_t zigzag_int(uint64_t n) noexcept:
    """Undo zigzag encoding: 0, 1, 2, 3 -> 0, -1, 1, -2 (result as int32)."""
    cdef uint64_t magnitude = n >> 1
    # all ones when the lowest bit of n is set, otherwise zero
    cdef uint64_t sign_mask = -(n & 1)
    return magnitude ^ sign_mask
|
|
|
|
|
|
cdef int64_t zigzag_long(uint64_t n) noexcept:
    """Undo zigzag encoding: 0, 1, 2, 3 -> 0, -1, 1, -2 (result as int64)."""
    cdef uint64_t magnitude = n >> 1
    # all ones when the lowest bit of n is set, otherwise zero
    cdef uint64_t sign_mask = -(n & 1)
    return magnitude ^ sign_mask
|
|
|
|
|
|
cdef uint64_t long_zigzag(int64_t n) noexcept:
    """Zigzag-encode a signed value: 0, -1, 1, -2 -> 0, 1, 2, 3."""
    cdef int64_t doubled = n << 1
    cdef int64_t sign_fill = n >> 63
    return doubled ^ sign_fill
|
|
|
|
|
|
cpdef dict read_thrift(NumpyIO data):
    """Decode one thrift struct from ``data`` into a dict keyed by field id.

    Every field starts with a header byte whose high nibble is the field-id
    increment and whose low nibble is the type code; a zero byte ends the
    struct. Nested structs and lists are decoded recursively.

    Two extra string keys record the integer widths that were seen:
    ``"i32"`` is set to 1 when the struct has type-5 integer fields but no
    type-6 ones, and ``"i32list"`` lists the ids of the type-5 fields when
    both kinds occur.
    """
    cdef char byte, field_id = 0, ftype
    cdef int32_t size
    cdef dict out = {}
    cdef bint hasi64 = 0
    cdef bint hasi32 = 0
    cdef list i32 = None
    while True:
        byte = data.read_byte()
        if byte == 0:
            # stop byte: end of this struct
            break
        field_id += (byte & 0b11110000) >> 4
        ftype = byte & 0b00001111
        if ftype == 5:
            out[field_id] = zigzag_long(read_unsigned_var_int(data))
            hasi32 = True
            if i32 is None:
                i32 = list()
            i32.append(field_id)
        elif ftype == 6:
            out[field_id] = zigzag_long(read_unsigned_var_int(data))
            hasi64 = True
        elif ftype == 7:
            # Reinterpret the next 8 bytes as a double. Casting the
            # dereferenced char to double would only convert the first byte.
            out[field_id] = (<double*>data.get_pointer())[0]
            data.seek(8, 1)
        elif ftype == 8:
            # length-prefixed binary, kept as bytes
            size = read_unsigned_var_int(data)
            out[field_id] = PyBytes_FromStringAndSize(data.get_pointer(), size)
            data.seek(size, 1)
        elif ftype == 9:
            out[field_id] = read_list(data)
        elif ftype == 12:
            out[field_id] = read_thrift(data)
        elif ftype == 1:
            out[field_id] = True
        elif ftype == 2:
            out[field_id] = False
        elif ftype == 4:
            # I16
            out[field_id] = zigzag_long(read_unsigned_var_int(data))
        elif ftype == 3:
            # I8
            out[field_id] = data.read_byte()
        else:
            print("Corrupted thrift data at ", data.tell(), ": ", field_id, ftype)
    if hasi32:
        if hasi64:
            out["i32list"] = i32
        else:
            out["i32"] = 1
    return out
|
|
|
|
|
|
cdef list read_list(NumpyIO data):
    """Decode one thrift list from ``data``.

    The header byte holds the element type in its low nibble and the length
    in its high nibble; a high nibble of 0xf means the length follows as a
    varint. Types 5 and 6 decode to ints, type 8 to str, and anything else
    is decoded as a nested struct.
    """
    cdef unsigned char header, elem_type
    cdef int32_t nitems, nbytes, _
    header = data.read_byte()
    elem_type = header & 0x0f  # 0b00001111
    if header >= 0xf0:  # 0b11110000
        nitems = read_unsigned_var_int(data)
    else:
        nitems = ((header & 0xf0) >> 4)
    items = []
    if elem_type == 5 or elem_type == 6:
        for _ in range(nitems):
            items.append(zigzag_long(read_unsigned_var_int(data)))
    elif elem_type == 8:
        for _ in range(nitems):
            # all parquet list types contain str, not bytes
            nbytes = read_unsigned_var_int(data)
            items.append(PyUnicode_DecodeUTF8(data.get_pointer(), nbytes, "ignore"))
            data.seek(nbytes, 1)
    else:
        for _ in range(nitems):
            items.append(read_thrift(data))

    return items
|
|
|
|
|
|
cpdef int write_thrift(dict data, NumpyIO output):
    """Serialise the field-id -> value dict ``data`` as a thrift struct.

    Fields are written in increasing id order for ids 1 to 14; entries that
    are missing or None are skipped. The value's Python type selects the
    wire type: bool, int, float, bytes/str, list, or nested struct (a dict
    or ThriftObject). Integers are written as type 5 when ``data["i32"]`` is
    present or the id is in ``data["i32list"]``, and as type 6 otherwise.
    A zero byte terminates the struct.

    No check is made here that float, bytes and str payloads fit in
    ``output``.
    """
    cdef int i, l, prev = 0
    cdef int delt = 0
    cdef double d
    cdef bytes b
    cdef char * c
    # 0: ints are 64-bit, 1: all ints are 32-bit, 2: only ids in i32s are
    cdef int i32 = "i32" in data
    cdef list i32s
    if "i32list" in data:
        i32 = 2
        i32s = data['i32list']
    # 14 is the highest field id in use (e.g. ColumnMetaData
    # bloom_filter_offset), so the range must include it
    for i in range(1, 15):
        if i not in data:
            continue
        val = data.get(i)
        if val is None:
            # not defined - skip (None is default on load)
            continue
        # field header: id increment in the high nibble, type in the low
        delt = i - prev
        prev = i
        if isinstance(val, bool):
            if val is True:
                output.write_byte((delt << 4) | 1)
            else:
                output.write_byte((delt << 4) | 2)
        elif isinstance(val, int):
            if i32 == 1 or (i32 == 2 and i in i32s):
                output.write_byte((delt << 4) | 5)
            else:
                output.write_byte((delt << 4) | 6)
            encode_unsigned_varint(long_zigzag(<int64_t>val), output)
        elif isinstance(val, float):
            output.write_byte((delt << 4) | 7)
            d = val
            (<double*>output.get_pointer())[0] = d
            output.loc += 8
        elif isinstance(val, bytes):
            output.write_byte((delt << 4) | 8)
            l = PyBytes_GET_SIZE(<bytes>val)
            encode_unsigned_varint(l, output)
            c = val
            memcpy(<void*>output.get_pointer(), <void*>c, l)
            output.loc += l
        elif isinstance(val, str):
            output.write_byte((delt << 4) | 8)
            b = (<str>val).encode()
            l = PyBytes_GET_SIZE(b)
            encode_unsigned_varint(l, output)
            c = b
            memcpy(<void*>output.get_pointer(), <void*>c, l)
            output.loc += l
        elif isinstance(val, list):
            output.write_byte((delt << 4) | 9)
            write_list(<list>val, output)
        elif isinstance(val, ThriftObject):
            output.write_byte((delt << 4) | 12)
            write_thrift((<ThriftObject>val).data, output)
        else:
            # anything else is assumed to be a dict describing a nested struct
            output.write_byte((delt << 4) | 12)
            write_thrift(<dict>val, output)
    output.write_byte(0)
|
|
|
|
|
|
cdef int write_list(list data, NumpyIO output):
    """Serialise ``data`` as a thrift list into ``output``.

    The element type is chosen from the first item: int (type 5),
    bytes or str (type 8), anything else is written as a struct (type 12).
    Lists of up to 14 items store their length in the high nibble of the
    header byte; longer lists use 0xf there, followed by a varint length.
    An empty list is written as a single zero byte.
    """
    cdef int l = len(data)
    cdef int i
    # integer elements may be 64-bit, so they need their own wide variable
    cdef int64_t ival
    cdef bytes b
    cdef str s
    cdef char * c
    if l:
        first = data[0]
        if isinstance(first, int):
            if l > 14:  # all lists are i64
                output.write_byte(5 | 0b11110000)
                encode_unsigned_varint(l, output)
            else:
                output.write_byte(5 | (l << 4))
            for ival in data:
                encode_unsigned_varint(long_zigzag(ival), output)
        elif isinstance(first, bytes):
            if l > 14:
                output.write_byte(8 | 0b11110000)
                encode_unsigned_varint(l, output)
            else:
                output.write_byte(8 | (l << 4))
            for b in data:
                i = PyBytes_GET_SIZE(b)
                encode_unsigned_varint(i, output)
                c = b
                memcpy(<void*>output.get_pointer(), <void*>c, i)
                output.loc += i
        elif isinstance(first, str):
            if l > 14:
                output.write_byte(8 | 0b11110000)
                encode_unsigned_varint(l, output)
            else:
                output.write_byte(8 | (l << 4))
            for s in data:
                b = s.encode("utf8", "ignore")
                i = PyBytes_GET_SIZE(b)
                encode_unsigned_varint(i, output)
                c = b
                memcpy(<void*>output.get_pointer(), <void*>c, i)
                output.loc += i
        else:  # STRUCT
            if l > 14:
                output.write_byte(12 | 0b11110000)
                encode_unsigned_varint(l, output)
            else:
                output.write_byte(12 | (l << 4))
            for d in data:
                if isinstance(d, ThriftObject):
                    write_thrift((<ThriftObject>d).data, output)
                else:
                    write_thrift(d, output)
    else:
        # Not sure if zero-length list is allowed
        encode_unsigned_varint(0, output)
|
|
|
|
|
|
def from_buffer(buffer, name=None):
    """Parse a thrift struct from ``buffer``.

    ``buffer`` is either a NumpyIO, which is read from its current location,
    or something NumpyIO can be built from. Returns the raw field dict, or a
    ThriftObject of type ``name`` wrapping it when ``name`` is given.
    """
    cdef NumpyIO stream = buffer if isinstance(buffer, NumpyIO) else NumpyIO(buffer)
    cdef dict fields = read_thrift(stream)
    if name is None:
        return fields
    return ThriftObject(name, fields)
|
|
|
|
|
|
@cython.freelist(1000)
@cython.final
cdef class ThriftObject:
    """Named view over a thrift struct stored as a dict keyed by field id.

    Attribute access translates field names to ids through ``specs`` and
    wraps nested structs (as listed in ``children``) in ThriftObjects on the
    way out. Item access (``obj[key]``) works directly on the underlying
    dict.
    """

    cdef str name
    # field name -> field id for this struct type
    cdef dict spec
    # field name -> struct type name, for fields that hold nested structs
    cdef dict children
    # field id -> value; may also hold the "i32"/"i32list" marker keys
    cdef dict data

    def __init__(self, str name, dict indict):
        self.name = name
        self.spec = specs[name]
        self.children = children.get(name, {})
        self.data = indict

    def __getattr__(self, str item):
        """Look up a field by name, wrapping nested dicts in ThriftObjects.

        Names that are not fields of this struct are looked up as keys of
        the underlying dict; AttributeError is raised if that fails too.
        """
        cdef str ch
        if item in self.spec:
            out = self.get(self.spec[item], None)
            ch = self.children.get(item)
            if ch is not None and out is not None:
                if isinstance(out, list):
                    return [ThriftObject(ch, o) if isinstance(o, dict) else o for o in out]
                return ThriftObject(ch, out) if isinstance(out, dict) else out
            return out
        else:
            try:
                return self.data[item]
            except KeyError:
                raise AttributeError(item)

    def __setitem__(self, key, value):
        self.data[key] = value

    def __getitem__(self, item):
        # missing keys give None rather than raising
        return self.data.get(item)

    def __delitem__(self, key):
        self.data.pop(key)

    def get(self, key, default=None):
        """Return the raw value stored under ``key``, or ``default``."""
        return self.data.get(key, default)

    def __setattr__(self, str item, value):
        """Set a field by name, storing ThriftObjects as their plain dicts."""
        cdef int i = self.spec[item]
        if isinstance(value, ThriftObject):
            # read the C-level attribute directly; a Python-level `.data`
            # lookup would be routed through __getattr__
            self.data[i] = (<ThriftObject>value).data
        elif isinstance(value, list):
            # lists may hold plain values (ints, strings) as well as structs,
            # so only unwrap the elements that really are ThriftObjects
            self.data[i] = [(<ThriftObject>v).data if isinstance(v, ThriftObject) else v
                            for v in value]
        else:
            self.data[i] = value

    def __delattr__(self, item):
        cdef int i = self.spec[item]
        del self.data[i]

    cpdef const uint8_t[:] to_bytes(self):
        """raw serialise of internal state"""
        # The output buffer is sized from an estimate, with a floor of
        # 500000 bytes.
        cdef int size = 0
        if self.name == "RowGroup":
            size = 1000 * len(self[1])  # num-columns
        elif self.name == "FileMetaData":
            # num-cols * num-rgs + size of key-values
            size = 1000 * len(self[4]) * len(self[2]) + len(str(self[5]))
        if size < 500000:
            size = 500000
        cdef uint8_t[::1] ser_buf = np.empty(size, dtype='uint8')
        cdef NumpyIO o = NumpyIO(ser_buf)
        write_thrift(self.data, o)
        return o.so_far()

    def __reduce_ex__(self, _):
        # pickle as the serialised bytes plus the struct type name
        # TODO: to_bytes returns a memoryview, so could sideband for pickle 5
        return from_buffer, (bytes(self.to_bytes()), self.name)

    @property
    def thrift_name(self):
        return self.name

    @property
    def contents(self):
        return self.data

    from_buffer = from_buffer

    def copy(self):
        """shallow copy"""
        return type(self)(self.name, self.data.copy())

    def __copy__(self):
        return self.copy()

    def __deepcopy__(self, memodict=None):
        import copy
        # hand the memo on so objects shared within one deepcopy stay shared
        d = copy.deepcopy(self.data, memodict)
        return ThriftObject(self.name, d)

    cpdef _asdict(self):
        """Create dict version with field names instead of integers"""
        cdef str k
        cdef out = {}
        for k in self.spec:
            if k in self.children:
                # nested struct(s): convert recursively
                lower = getattr(self, k)
                if lower is None:
                    out[k] = None
                elif isinstance(lower, list):
                    out[k] = [l._asdict() for l in lower]
                else:
                    out[k] = lower._asdict()
            else:
                lower = getattr(self, k)
                # bytes values are shown through str()
                if isinstance(lower, bytes):
                    lower = str(lower)
                elif isinstance(lower, list) and lower and isinstance(lower[0], bytes):
                    lower = [str(l) for l in lower]
                out[k] = lower
        return out

    def __dir__(self):
        """List the field names of this struct type"""
        return list(self.spec)

    def __repr__(self):
        # YAML when the yaml package is importable, otherwise the plain dict repr
        alt = self._asdict()
        try:
            import yaml
            return yaml.dump(alt)
        except ImportError:
            return str(alt)

    def __eq__(self, other):
        if isinstance(other, ThriftObject):
            return dict_eq(self.contents, other.contents)
        elif isinstance(other, dict):
            return dict_eq(self.contents, other)
        return False

    @staticmethod
    def from_fields(thrift_name,bint i32=0, list i32list=None, **kwargs):
        """Build a ThriftObject of type ``thrift_name`` from named fields.

        ThriftObject values (and lists whose first item is one) are stored
        as their plain dicts. ``i32`` marks all integer fields as 32-bit;
        ``i32list`` names the ids of the fields that are.
        """
        cdef spec = specs[thrift_name]
        cdef int i
        cdef str k
        cdef dict out = {}
        for k, i in spec.items():  # ensure field index increases monotonically
            if k in kwargs:
                # missing fields are implicitly None
                v = kwargs[k]
                if isinstance(v, ThriftObject):
                    out[i] = (<ThriftObject>v).data
                elif isinstance(v, list) and v and isinstance(v[0], ThriftObject):
                    out[i] = [(<ThriftObject>it).data for it in v]
                else:
                    out[i] = v
        if i32:
            # integer fields are all 32-bit
            out['i32'] = 1
        if i32list:
            # given integer fields are 32-bit
            out['i32list'] = i32list
        return ThriftObject(thrift_name, out)
|
|
|
|
|
|
def dict_eq(d1, d2):
    """ Compare two field dicts, looking only at integer keys.

    A key that is missing counts the same as one mapped to None. Nested
    dicts, and dicts inside lists, are compared recursively, and a str on
    the left matches the equivalent bytes on the right.
    """
    left = d1.contents if isinstance(d1, ThriftObject) else d1
    right = d2.contents if isinstance(d2, ThriftObject) else d2
    for key in set(left) | set(right):
        if not isinstance(key, int):
            # dynamic fields are immaterial
            continue
        a = left.get(key, None)
        b = right.get(key, None)
        if a is None or b is None:
            if a is None and b is None:
                continue
            return False
        if isinstance(a, dict):
            if not dict_eq(a, b):
                return False
        elif isinstance(a, list):
            if len(a) != len(b):
                return False
            # Recursive call as described in
            # https://github.com/dask/fastparquet/pull/723#issuecomment-995147362
            for x, y in zip(a, b):
                differs = (not dict_eq(x, y)) if isinstance(x, dict) else (x != y)
                if differs:
                    return False
        elif isinstance(a, str):
            other = b.decode() if isinstance(b, bytes) else b
            if a != other:
                return False
        elif a != b:
            return False
    return True
|
|
|
|
|
|
# Struct type name -> {field name: field id}. ThriftObject uses this to
# translate attribute names into the integer keys of its data dict; an empty
# dict marks a struct type without fields.
cdef dict specs = {
    'Statistics': {'max': 1,
                   'min': 2,
                   'null_count': 3,
                   'distinct_count': 4,
                   'max_value': 5,
                   'min_value': 6},
    'StringType': {},
    'UUIDType': {},
    'MapType': {},
    'ListType': {},
    'EnumType': {},
    'DateType': {},
    'NullType': {},
    'DecimalType': {'scale': 1, 'precision': 2},
    'MilliSeconds': {},
    'MicroSeconds': {},
    'NanoSeconds': {},
    'TimeUnit': {'MILLIS': 1, 'MICROS': 2, 'NANOS': 3},
    'TimestampType': {'isAdjustedToUTC': 1, 'unit': 2},
    'TimeType': {'isAdjustedToUTC': 1, 'unit': 2},
    'IntType': {'bitWidth': 1, 'isSigned': 2},
    'JsonType': {},
    'BsonType': {},
    # note: no field uses id 9 here
    'LogicalType': {'STRING': 1,
                    'MAP': 2,
                    'LIST': 3,
                    'ENUM': 4,
                    'DECIMAL': 5,
                    'DATE': 6,
                    'TIME': 7,
                    'TIMESTAMP': 8,
                    'INTEGER': 10,
                    'UNKNOWN': 11,
                    'JSON': 12,
                    'BSON': 13,
                    'UUID': 14},
    'SchemaElement': {'type': 1,
                      'type_length': 2,
                      'repetition_type': 3,
                      'name': 4,
                      'num_children': 5,
                      'converted_type': 6,
                      'scale': 7,
                      'precision': 8,
                      'field_id': 9,
                      'logicalType': 10},
    'DataPageHeader': {'num_values': 1,
                       'encoding': 2,
                       'definition_level_encoding': 3,
                       'repetition_level_encoding': 4,
                       'statistics': 5},
    'IndexPageHeader': {},
    'DictionaryPageHeader': {'num_values': 1, 'encoding': 2, 'is_sorted': 3},
    'DataPageHeaderV2': {'num_values': 1,
                         'num_nulls': 2,
                         'num_rows': 3,
                         'encoding': 4,
                         'definition_levels_byte_length': 5,
                         'repetition_levels_byte_length': 6,
                         'is_compressed': 7,
                         'statistics': 8},
    'SplitBlockAlgorithm': {},
    'BloomFilterAlgorithm': {'BLOCK': 1},
    'XxHash': {},
    'BloomFilterHash': {'XXHASH': 1},
    'Uncompressed': {},
    'PageHeader': {'type': 1,
                   'uncompressed_page_size': 2,
                   'compressed_page_size': 3,
                   'crc': 4,
                   'data_page_header': 5,
                   'index_page_header': 6,
                   'dictionary_page_header': 7,
                   'data_page_header_v2': 8},
    'KeyValue': {'key': 1, 'value': 2},
    'SortingColumn': {'column_idx': 1, 'descending': 2, 'nulls_first': 3},
    'PageEncodingStats': {'page_type': 1, 'encoding': 2, 'count': 3},
    # the highest field id in this table (14) occurs here and in LogicalType
    'ColumnMetaData': {'type': 1,
                       'encodings': 2,
                       'path_in_schema': 3,
                       'codec': 4,
                       'num_values': 5,
                       'total_uncompressed_size': 6,
                       'total_compressed_size': 7,
                       'key_value_metadata': 8,
                       'data_page_offset': 9,
                       'index_page_offset': 10,
                       'dictionary_page_offset': 11,
                       'statistics': 12,
                       'encoding_stats': 13,
                       'bloom_filter_offset': 14},
    'ColumnChunk': {'file_path': 1,
                    'file_offset': 2,
                    'meta_data': 3,
                    'offset_index_offset': 4,
                    'offset_index_length': 5,
                    'column_index_offset': 6,
                    'column_index_length': 7,
                    'crypto_metadata': 8,
                    'encrypted_column_metadata': 9},
    'RowGroup': {'columns': 1,
                 'total_byte_size': 2,
                 'num_rows': 3,
                 'sorting_columns': 4,
                 'file_offset': 5,
                 'total_compressed_size': 6,
                 'ordinal': 7},
    'TypeDefinedOrder': {},
    'ColumnOrder': {'TYPE_ORDER': 1},
    'PageLocation': {'offset': 1,
                     'compressed_page_size': 2,
                     'first_row_index': 3},
    'OffsetIndex': {'page_locations': 1},
    'ColumnIndex': {'null_pages': 1,
                    'min_values': 2,
                    'max_values': 3,
                    'boundary_order': 4,
                    'null_counts': 5},
    'FileMetaData': {'version': 1,
                     'schema': 2,
                     'num_rows': 3,
                     'row_groups': 4,
                     'key_value_metadata': 5,
                     'created_by': 6,
                     'column_orders': 7,
                     'encryption_algorithm': 8,
                     'footer_signing_key_metadata': 9},
}
|
|
|
|
# Struct type name -> {field name: struct type name of that field's value}.
# ThriftObject.__getattr__ uses this to wrap nested dicts (or lists of dicts)
# in a ThriftObject of the named type.
# NOTE(review): 'ColumnCryptoMetaData', 'EncryptionWithFooterKey',
# 'EncryptionWithColumnKey' and 'EncryptionAlgorithm' are referenced below but
# have no entry in `specs` as defined in this file, so building a ThriftObject
# of those types would fail on the `specs[name]` lookup - confirm whether that
# is intended.
cdef dict children = {
    'TimeUnit': {'MILLIS': 'MilliSeconds',
                 'MICROS': 'MicroSeconds',
                 'NANOS': 'NanoSeconds'},
    'TimestampType': {'unit': 'TimeUnit'},
    'TimeType': {'unit': 'TimeUnit'},
    'LogicalType': {'STRING': 'StringType',
                    'MAP': 'MapType',
                    'LIST': 'ListType',
                    'ENUM': 'EnumType',
                    'DECIMAL': 'DecimalType',
                    'DATE': 'DateType',
                    'TIME': 'TimeType',
                    'TIMESTAMP': 'TimestampType',
                    'INTEGER': 'IntType',
                    'UNKNOWN': 'NullType',
                    'JSON': 'JsonType',
                    'BSON': 'BsonType',
                    'UUID': 'UUIDType'},
    'SchemaElement': {'logicalType': 'LogicalType'},
    'DataPageHeader': {'statistics': 'Statistics'},
    'DataPageHeaderV2': {'statistics': 'Statistics'},
    'PageHeader': {'data_page_header': 'DataPageHeader',
                   'index_page_header': 'IndexPageHeader',
                   'dictionary_page_header': 'DictionaryPageHeader',
                   'data_page_header_v2': 'DataPageHeaderV2'},
    'ColumnMetaData': {'key_value_metadata': 'KeyValue',
                       'statistics': 'Statistics',
                       'encoding_stats': 'PageEncodingStats'},
    'ColumnCryptoMetaData': {'ENCRYPTION_WITH_FOOTER_KEY': 'EncryptionWithFooterKey',
                             'ENCRYPTION_WITH_COLUMN_KEY': 'EncryptionWithColumnKey'},
    'ColumnChunk': {'meta_data': 'ColumnMetaData',
                    'crypto_metadata': 'ColumnCryptoMetaData'},
    'RowGroup': {'columns': 'ColumnChunk', 'sorting_columns': 'SortingColumn'},
    'ColumnOrder': {'TYPE_ORDER': 'TypeDefinedOrder'},
    'OffsetIndex': {'page_locations': 'PageLocation'},
    'FileMetaData': {'schema': 'SchemaElement',
                     'row_groups': 'RowGroup',
                     'key_value_metadata': 'KeyValue',
                     'column_orders': 'ColumnOrder',
                     'encryption_algorithm': 'EncryptionAlgorithm'},
}
|
|
|
|
# specs = {}
|
|
# for o in [o for o in fastparquet.parquet_thrift.__dict__.values() if isinstance(o, type)]:
|
|
# if hasattr(o, "thrift_spec"):
|
|
# specs[o.__name__] = {k[2]: k[0] for k in o.thrift_spec if k}
|
|
#
|
|
#
|
|
#
|
|
# children = {}
|
|
# for o in [o for o in fastparquet.parquet_thrift.__dict__.values() if isinstance(o, type)]:
|
|
# if hasattr(o, "thrift_spec"):
|
|
# bit = {}
|
|
# for k in o.thrift_spec:
|
|
# if k and k[1] == fastparquet.parquet_thrift.TType.STRUCT and hasattr(k[3][0], "thrift_spec"):
|
|
# bit[k[2]] = k[3][0].__name__
|
|
# elif k and k[1] == fastparquet.parquet_thrift.TType.LIST and k[3][0] == \
|
|
# fastparquet.parquet_thrift.TType.STRUCT:
|
|
# bit[k[2]] = k[3][1][0].__name__
|
|
# if bit:
|
|
# children[o.__name__] = bit
|
|
#
|