Import python venv for stability
This commit is contained in:
@@ -0,0 +1,114 @@
|
||||
|
||||
import cramjam
|
||||
import numpy as np
|
||||
from fastparquet import parquet_thrift
|
||||
|
||||
# TODO: use stream/direct-to-buffer conversions instead of memcopy
|
||||
|
||||
# Codec registries, keyed by codec name. Compressors are called as f(data);
# decompressors as f(data, uncompressed_size). The pass-through codec returns
# its input unchanged.
compressions = {'UNCOMPRESSED': lambda x: x}
decompressions = {'UNCOMPRESSED': lambda x, y: x}
|
||||
|
||||
# Gzip is present regardless
|
||||
# Default ``compresslevel`` used by gzip_compress_v3 when the caller gives none.
COMPRESSION_LEVEL = 6
|
||||
|
||||
|
||||
def gzip_compress_v3(data, compresslevel=COMPRESSION_LEVEL):
    """Gzip-compress ``data`` with cramjam.

    ``compresslevel`` is forwarded to cramjam as its ``level`` argument and
    defaults to the module-level ``COMPRESSION_LEVEL``.
    """
    compressed = cramjam.gzip.compress(data, level=compresslevel)
    return compressed
|
||||
|
||||
|
||||
def gzip_decompress(data, uncompressed_size):
    """Gzip-decompress ``data`` with cramjam.

    ``uncompressed_size`` is forwarded to cramjam as ``output_len``.
    """
    restored = cramjam.gzip.decompress(data, output_len=uncompressed_size)
    return restored
|
||||
|
||||
|
||||
# Register the gzip wrappers defined above and cramjam's raw snappy functions.
compressions.update({
    'GZIP': gzip_compress_v3,
    'SNAPPY': cramjam.snappy.compress_raw,
})
decompressions.update({
    'GZIP': gzip_decompress,
    'SNAPPY': cramjam.snappy.decompress_raw,
})
|
||||
|
||||
# LZO is optional: it is registered only when the ``lzo`` module is importable.
try:
    import lzo
except ImportError:
    pass
else:
    def lzo_decompress(data, uncompressed_size):
        """Decompress ``data`` with lzo.

        ``uncompressed_size`` is accepted so the signature matches the other
        decompressors; it is not used.
        """
        return lzo.decompress(data)

    compressions['LZO'] = lzo.compress
    decompressions['LZO'] = lzo_decompress
|
||||
# Brotli is served directly by cramjam's functions.
compressions.update(BROTLI=cramjam.brotli.compress)
decompressions.update(BROTLI=cramjam.brotli.decompress)
|
||||
|
||||
|
||||
def lz4_compress(data, **kwargs):
    """Compress ``data`` as an LZ4 block with cramjam.

    Extra keyword arguments are forwarded to ``cramjam.lz4.compress_block``,
    except that ``store_size`` is always forced to ``False``, overriding any
    value the caller supplied.
    """
    options = {**kwargs, 'store_size': False}
    return cramjam.lz4.compress_block(data, **options)
|
||||
|
||||
|
||||
def lz4_decomp(data, size):
    """Decompress an LZ4 block with cramjam.

    ``data`` is viewed as a uint8 numpy array (no copy is requested) and
    passed to ``cramjam.lz4.decompress_block`` together with ``size``.
    """
    raw = np.frombuffer(data, dtype='uint8')
    return cramjam.lz4.decompress_block(raw, size)
|
||||
|
||||
|
||||
# LZ4 is actually LZ4 block, aka "raw", see
# https://github.com/apache/parquet-format/commit/7f06e838cbd1b7dbd722ff2580b9c2525e37fc46
# so both names map to the same block-format helpers.
compressions.update({
    'LZ4': lz4_compress,
    'LZ4_RAW': lz4_compress,
    'ZSTD': cramjam.zstd.compress,
})
decompressions.update({
    'LZ4': lz4_decomp,
    'LZ4_RAW': lz4_decomp,
    'ZSTD': cramjam.zstd.decompress,
})
|
||||
# Codecs for which cramjam offers a decompress-into-buffer function.
# decompress_data prefers these over the entries in ``decompressions``.
decom_into = dict(
    GZIP=cramjam.gzip.decompress_into,
    SNAPPY=cramjam.snappy.decompress_raw_into,
    ZSTD=cramjam.zstd.decompress_into,
    BROTLI=cramjam.brotli.decompress_into,
)
|
||||
|
||||
# Rebuild both registries with upper-cased keys so lookups can always use
# ``name.upper()``.
compressions = {
    name.upper(): func for name, func in compressions.items()
}
decompressions = {
    name.upper(): func for name, func in decompressions.items()
}
|
||||
|
||||
# Map each thrift CompressionCodec value back to its codec name, restricted
# to the names listed here.
rev_map = {
    getattr(parquet_thrift.CompressionCodec, name): name
    for name in dir(parquet_thrift.CompressionCodec)
    if name in ['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
                'ZSTD', 'LZ4_RAW']
}
|
||||
|
||||
|
||||
def compress_data(data, compression='gzip'):
    """Compress ``data`` with the codec described by ``compression``.

    Parameters
    ----------
    data : bytes-like
        Payload handed unchanged to the selected compressor.
    compression : str, int or dict
        A codec name (matched case-insensitively), an integer key of
        ``rev_map``, or a dict with an optional ``'type'`` entry (name or
        integer code, default ``'gzip'``) and an optional ``'args'`` entry
        holding keyword arguments for the compressor.

    Returns
    -------
    Whatever the compressor registered in ``compressions`` returns.

    Raises
    ------
    RuntimeError
        If the resolved codec is not registered in ``compressions``.
    ValueError
        If ``'args'`` is supplied but is not a dict.
    """
    if isinstance(compression, dict):
        algorithm = compression.get('type', 'gzip')
        args = compression.get('args', None)
    else:
        algorithm = compression
        args = None

    if isinstance(algorithm, int):
        # Resolve via the extracted code, not ``compression``: in the dict
        # form ``compression`` is the dict itself, which is unhashable and
        # made the lookup raise TypeError.
        algorithm = rev_map[algorithm]

    codec = algorithm.upper()
    if codec not in compressions:
        raise RuntimeError("Compression '%s' not available. Options: %s" %
                           (algorithm, sorted(compressions)))
    if args is None:
        return compressions[codec](data)
    if not isinstance(args, dict):
        raise ValueError("args dict entry is not a dict")
    return compressions[codec](data, **args)
|
||||
|
||||
|
||||
def decompress_data(data, uncompressed_size, algorithm='gzip'):
    """Decompress ``data`` with the named codec.

    ``algorithm`` is a codec name (matched case-insensitively) or an integer
    key of ``rev_map``. When the codec has an entry in ``decom_into``, the
    output is written into a freshly allocated uint8 numpy array of
    ``uncompressed_size`` elements, and that array is returned. Otherwise the
    function registered in ``decompressions`` is called with
    ``(data, uncompressed_size)`` and its result is returned.

    Raises RuntimeError if the codec is not registered in ``decompressions``.
    """
    if isinstance(algorithm, int):
        algorithm = rev_map[algorithm]
    codec = algorithm.upper()

    if codec not in decompressions:
        raise RuntimeError(
            "Decompression '%s' not available. Options: %s" %
            (codec, sorted(decompressions))
        )

    write_into = decom_into.get(codec)
    if write_into is None:
        return decompressions[codec](data, uncompressed_size)

    # Allocate the output ourselves so cramjam gets a writable buffer.
    out = np.empty(uncompressed_size, dtype='uint8')
    write_into(np.frombuffer(data, dtype=np.uint8), out)
    return out
|
||||
Reference in New Issue
Block a user