115 lines
3.6 KiB
Python
115 lines
3.6 KiB
Python
|
|
import cramjam
|
|
import numpy as np
|
|
from fastparquet import parquet_thrift
|
|
|
|
# TODO: use stream/direct-to-buffer conversions instead of memcopy
|
|
|
|
compressions = {
|
|
'UNCOMPRESSED': lambda x: x
|
|
}
|
|
decompressions = {
|
|
'UNCOMPRESSED': lambda x, y: x
|
|
}
|
|
|
|
# Gzip is present regardless
|
|
COMPRESSION_LEVEL = 6
|
|
|
|
|
|
def gzip_compress_v3(data, compresslevel=COMPRESSION_LEVEL):
|
|
return cramjam.gzip.compress(data, level=compresslevel)
|
|
|
|
|
|
def gzip_decompress(data, uncompressed_size):
|
|
return cramjam.gzip.decompress(data, output_len=uncompressed_size)
|
|
|
|
|
|
compressions['GZIP'] = gzip_compress_v3
|
|
decompressions['GZIP'] = gzip_decompress
|
|
compressions['SNAPPY'] = cramjam.snappy.compress_raw
|
|
decompressions['SNAPPY'] = cramjam.snappy.decompress_raw
|
|
|
|
try:
|
|
import lzo
|
|
def lzo_decompress(data, uncompressed_size):
|
|
return lzo.decompress(data)
|
|
compressions['LZO'] = lzo.compress
|
|
decompressions['LZO'] = lzo_decompress
|
|
except ImportError:
|
|
pass
|
|
compressions['BROTLI'] = cramjam.brotli.compress
|
|
decompressions['BROTLI'] = cramjam.brotli.decompress
|
|
|
|
|
|
def lz4_compress(data, **kwargs):
|
|
kwargs['store_size'] = False
|
|
return cramjam.lz4.compress_block(data, **kwargs)
|
|
|
|
|
|
def lz4_decomp(data, size):
|
|
return cramjam.lz4.decompress_block(np.frombuffer(data, 'uint8'), size)
|
|
|
|
|
|
compressions['LZ4'] = lz4_compress
|
|
decompressions['LZ4'] = lz4_decomp
|
|
|
|
# LZ4 is actually LZ4 block, aka "raw", see
|
|
# https://github.com/apache/parquet-format/commit/7f06e838cbd1b7dbd722ff2580b9c2525e37fc46
|
|
compressions['LZ4_RAW'] = lz4_compress
|
|
decompressions['LZ4_RAW'] = lz4_decomp
|
|
compressions['ZSTD'] = cramjam.zstd.compress
|
|
decompressions['ZSTD'] = cramjam.zstd.decompress
|
|
decom_into = {
|
|
"GZIP": cramjam.gzip.decompress_into,
|
|
"SNAPPY": cramjam.snappy.decompress_raw_into,
|
|
"ZSTD": cramjam.zstd.decompress_into,
|
|
"BROTLI": cramjam.brotli.decompress_into
|
|
}
|
|
|
|
compressions = {k.upper(): v for k, v in compressions.items()}
|
|
decompressions = {k.upper(): v for k, v in decompressions.items()}
|
|
|
|
rev_map = {getattr(parquet_thrift.CompressionCodec, key): key for key in
|
|
dir(parquet_thrift.CompressionCodec) if key in
|
|
['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']}
|
|
|
|
|
|
def compress_data(data, compression='gzip'):
|
|
if isinstance(compression, dict):
|
|
algorithm = compression.get('type', 'gzip')
|
|
if isinstance(algorithm, int):
|
|
algorithm = rev_map[compression]
|
|
args = compression.get('args', None)
|
|
else:
|
|
algorithm = compression
|
|
args = None
|
|
|
|
if isinstance(algorithm, int):
|
|
algorithm = rev_map[compression]
|
|
|
|
if algorithm.upper() not in compressions:
|
|
raise RuntimeError("Compression '%s' not available. Options: %s" %
|
|
(algorithm, sorted(compressions)))
|
|
if args is None:
|
|
return compressions[algorithm.upper()](data)
|
|
else:
|
|
if not isinstance(args, dict):
|
|
raise ValueError("args dict entry is not a dict")
|
|
return compressions[algorithm.upper()](data, **args)
|
|
|
|
|
|
def decompress_data(data, uncompressed_size, algorithm='gzip'):
|
|
if isinstance(algorithm, int):
|
|
algorithm = rev_map[algorithm]
|
|
if algorithm.upper() not in decompressions:
|
|
raise RuntimeError(
|
|
"Decompression '%s' not available. Options: %s" %
|
|
(algorithm.upper(), sorted(decompressions))
|
|
)
|
|
if algorithm.upper() in decom_into:
|
|
# ensures writable buffer from cramjam
|
|
x = np.empty(uncompressed_size, dtype='uint8')
|
|
decom_into[algorithm.upper()](np.frombuffer(data, dtype=np.uint8), x)
|
|
return x
|
|
return decompressions[algorithm.upper()](data, uncompressed_size)
|