558 lines
17 KiB
Python
558 lines
17 KiB
Python
from collections import defaultdict
|
|
import copy
|
|
from packaging.version import Version
|
|
from functools import lru_cache
|
|
import io
|
|
import struct
|
|
import os
|
|
import operator
|
|
import re
|
|
import numbers
|
|
import zoneinfo
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import fsspec
|
|
|
|
from fastparquet import parquet_thrift
|
|
from fastparquet.cencoding import ThriftObject
|
|
from fastparquet import __version__
|
|
|
|
# Parsed version of the installed pandas, comparable with other ``Version``s.
PANDAS_VERSION = Version(pd.__version__)
# Identification string embedding this package's version.
# NOTE(review): presumably stored as the parquet "created by" field -- confirm
# against the writer code.
created_by = f"fastparquet-python version {__version__} (build 0)"
|
|
|
|
|
|
class ParquetException(Exception):
    """Generic error for unexpected data formats met while reading a parquet file."""
|
|
|
|
|
|
def default_mkdirs(f):
    """Create directory ``f`` (and any missing parents) on the local filesystem.

    An already existing directory is not an error.
    """
    os.makedirs(f, exist_ok=True)
|
|
|
|
|
|
# strptime-style format; val_from_meta passes it to pd.to_datetime as the
# fallback when a 'datetime64[ns]' value cannot be converted by numpy directly.
PATH_DATE_FMT = '%Y%m%d_%H%M%S.%f'
|
|
|
|
|
|
def path_string(o):
    """Return the string form of ``o``.

    ``pd.Timestamp`` values are rendered with ``isoformat()``; everything
    else goes through ``str``.
    """
    return o.isoformat() if isinstance(o, pd.Timestamp) else str(o)
|
|
|
|
|
|
# Default file opener: the builtin ``open``. It is the default value of the
# ``open_with`` parameter of metadata_from_many.
default_open = open
|
|
|
|
|
|
def default_remove(paths):
    """Best-effort deletion of local files.

    Every path in ``paths`` is unlinked; a path that cannot be removed
    (I/O or OS error) is silently skipped.
    """
    for target in paths:
        try:
            os.unlink(target)
        except OSError:  # IOError is an alias of OSError on Python 3
            continue
|
|
|
|
|
|
def val_from_meta(x, meta):
    """Convert ``x`` according to a pandas column-metadata mapping.

    Parameters
    ----------
    x : value to convert
    meta : mapping with at least the keys ``'pandas_type'`` and
        ``'numpy_type'``

    Returns
    -------
    ``x`` unchanged for categoricals, a bool for boolean columns, otherwise
    ``x`` cast to the numpy scalar type named by ``meta['numpy_type']``.

    Raises
    ------
    ValueError
        If the cast fails and the column is not ``'datetime64[ns]'``.
    """
    truthy = [True, "true", "True", 't', "T", 1, "1"]
    try:
        if meta['pandas_type'] == 'categorical':
            return x
        target = np.dtype(meta['numpy_type'])
        if target == "bool":
            return x in truthy
        return target.type(x)
    except ValueError:
        # Only datetimes get a second chance, using the path date format.
        if meta['numpy_type'] != 'datetime64[ns]':
            raise
        return pd.to_datetime(x, format=PATH_DATE_FMT)
|
|
|
|
|
|
def val_to_num(x, meta=None):
    """Parse a string as a number, date or timedelta if possible.

    When a truthy ``meta`` is supplied the conversion is driven by it (see
    ``val_from_meta``); otherwise the type is guessed from the value itself.
    """
    return val_from_meta(x, meta) if meta else _val_to_num(x)
|
|
|
|
|
|
@lru_cache(1000)
|
|
def _val_to_num(x):
|
|
if isinstance(x, numbers.Real):
|
|
return x
|
|
if x in ['now', 'NOW', 'TODAY', '']:
|
|
return x
|
|
if type(x) == str and x.lower() == 'nan':
|
|
return x
|
|
if x == "True":
|
|
return True
|
|
if x == "False":
|
|
return False
|
|
try:
|
|
return int(x, base=10)
|
|
except:
|
|
pass
|
|
try:
|
|
return float(x)
|
|
except:
|
|
pass
|
|
try:
|
|
return pd.Timestamp(x)
|
|
except:
|
|
pass
|
|
try:
|
|
# TODO: determine the valid usecases for this, then try to limit the set
|
|
# ofstrings which may get inadvertently converted to timedeltas
|
|
return pd.Timedelta(x)
|
|
except:
|
|
return x
|
|
|
|
|
|
def ensure_bytes(s):
    """Return ``s`` UTF-8 encoded if it is a ``str``; otherwise return it untouched."""
    if isinstance(s, str):
        return s.encode('utf-8')
    return s
|
|
|
|
|
|
def ensure_str(b, *, ignore_error=False):
    """Return ``b`` as ``str``, decoding it as UTF-8 when necessary.

    Parameters
    ----------
    b : str, bytes-like or anything else
    ignore_error : bool (False)
        If True, an object that cannot be decoded (invalid UTF-8, or no
        ``decode`` method) is returned unchanged instead of raising.

    Raises
    ------
    UnicodeDecodeError, AttributeError
        When decoding fails and ``ignore_error`` is False.
    """
    if isinstance(b, str):
        return b
    try:
        decoded = b.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        if ignore_error:
            return b
        raise
    return decoded
|
|
|
|
|
|
def check_column_names(columns, *args):
    """Ensure that parameters listing column names have corresponding columns.

    Parameters
    ----------
    columns : iterable of available column names
    *args : candidates to validate; only tuples and lists are checked, any
        other kind of argument is ignored.

    Raises
    ------
    ValueError
        If a checked argument names a column that is not in ``columns``.
    """
    for requested in args:
        if not isinstance(requested, (tuple, list)):
            continue
        missing = set(requested) - set(columns)
        if not missing:
            continue
        raise ValueError("Following columns were requested but are "
                         "not available: %s.\n"
                         "All requested columns: %s\n"
                         "Available columns: %s"
                         "" % (missing, requested, columns))
|
|
|
|
|
|
def reset_row_idx(data: pd.DataFrame) -> pd.DataFrame:
    """Reset row (multi-)index as column(s) of the DataFrame.

    A multi-index is stored in columns, one categorical column per index
    level, and is then replaced by a default integer index. A plain index
    simply goes through ``DataFrame.reset_index()``.

    Parameters
    ----------
    data : dataframe

    Returns
    -------
    dataframe
        A new dataframe; the input is not modified.
    """
    if isinstance(data.index, pd.MultiIndex):
        row_index = data.index
        for name, cats, codes in zip(row_index.names, row_index.levels,
                                     row_index.codes):
            data = data.assign(**{name: pd.Categorical.from_codes(codes,
                                                                  cats)})
        # reset_index returns a new frame: the result has to be kept,
        # otherwise the multi-index would still be attached to the output.
        data = data.reset_index(drop=True)
    else:
        data = data.reset_index()
    return data
|
|
|
|
|
|
def metadata_from_many(file_list, verify_schema=False, open_with=default_open,
                       root=False, fs=None):
    """
    Given list of parquet files, make a FileMetaData that points to them

    Parameters
    ----------
    file_list: list of paths of parquet files
        Alternatively a list made only of ``ParquetFile`` instances; mixing
        both kinds is rejected.
    verify_schema: bool (False)
        Whether to assert that the schemas in each file are identical
    open_with: function
        Use this to open each path.
    root: str or False
        Top of the dataset's directory tree, for cases where it can't be
        automatically inferred. ``False`` (default) means infer it from the
        paths.
    fs: fsspec.AbstractFileSystem
        Used in preference to open_with, if given

    Returns
    -------
    basepath: the root path that other paths are relative to
    fmd: metadata thrift structure

    Raises
    ------
    ValueError
        If ``file_list`` mixes ``ParquetFile`` instances with other objects,
        or if ``verify_schema`` is set and a schema differs from the first
        file's.
    """
    from fastparquet import api

    # Selects which merge strategy runs below; only the concurrent
    # footer-fetch branch switches it off.
    legacy = True
    if all(isinstance(pf, api.ParquetFile) for pf in file_list):
        # Already-open files: keep them and recover their paths.
        pfs = file_list
        file_list = [pf.fn for pf in pfs]
    elif all(not isinstance(pf, api.ParquetFile) for pf in file_list):

        # Open every file one by one when schemas must be compared, when no
        # filesystem object is available, or when there are fewer than 3 files.
        if verify_schema or fs is None or len(file_list) < 3:
            pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
        else:
            # activate new code path here
            f0 = file_list[0]
            pf0 = api.ParquetFile(f0, open_with=open_with)
            if pf0.file_scheme not in ['empty', 'simple']:
                # set of directories, revert
                pfs = [pf0] + [api.ParquetFile(fn, open_with=open_with) for fn in file_list[1:]]
            else:
                # permits concurrent fetch of footers; needs fsspec >= 2021.6
                # First guess for how many trailing bytes to read per file:
                # 1.4 times the first file's ``_head_size``.
                # NOTE(review): ``_head_size`` is presumably the footer size of
                # the first file -- confirm in ParquetFile.
                size = int(1.4 * pf0._head_size)
                # ``pieces`` maps path -> bytes (it is consumed via .items()).
                pieces = fs.cat(file_list[1:], start=-size)
                # Bytes [-8:-4] of each piece are read as a little-endian
                # integer (the same field _get_fmd unpacks as the metadata
                # length); 8 is added for the trailing bytes after it.
                sizes = {path: int.from_bytes(piece[-8:-4], "little") + 8 for
                         path, piece in pieces.items()}
                # Files whose required size exceeds the guess are fetched again,
                # all with the largest required size.
                not_bigenough = [path for path, s in sizes.items() if s > size]
                if not_bigenough:
                    new_pieces = fs.cat(not_bigenough, start=-max(sizes.values()))
                    pieces.update(new_pieces)
                pieces = {k: _get_fmd(v) for k, v in pieces.items()}
                pieces = [(fn, pieces[fn]) for fn in file_list[1:]]  # recover ordering
                legacy = False
    else:
        raise ValueError("Merge requires all ParquetFile instances or none")
    # From here on ``file_list`` holds paths relative to ``basepath``.
    basepath, file_list = analyse_paths(file_list, root=root)

    if legacy:
        # legacy code path
        if verify_schema:
            for pf in pfs[1:]:
                if pf._schema != pfs[0]._schema:
                    raise ValueError('Incompatible schemas')

        fmd = copy.copy(pfs[0].fmd)  # we inherit "created by" field
        rgs = []

        for pf, fn in zip(pfs, file_list):
            if pf.file_scheme not in ['simple', 'empty']:
                # The input is itself a multi-file dataset: prefix each chunk's
                # existing file_path with the input's relative path.
                for rg in pf.row_groups:
                    # Shallow copies, so the source ParquetFile's row groups
                    # and column chunks are not the objects being modified.
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = '/'.join(
                            [fn, chunk.file_path if isinstance(chunk.file_path, str) else chunk.file_path.decode()]
                        )
                    rgs.append(rg)

            else:
                # Single data file: every chunk points at the file itself.
                for rg in pf.row_groups:
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = fn
                    rgs.append(rg)

        fmd.row_groups = rgs
        fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
        return basepath, fmd

    # Concurrent-fetch path: ``pf0.fmd`` is modified in place and returned.
    for rg in pf0.fmd.row_groups:
        # chunks of first file, which would have file_path=None
        # (only the first column chunk of each row group is given a path here)
        rg.columns[0].file_path = f0[len(basepath):].lstrip("/")

    rgs0 = pf0.fmd.row_groups
    for k, v in pieces:
        # Set file paths on other files
        if len(v.schema) > len(pf0.fmd.schema):
            # or was UPDATED with supercast
            # (a file with a longer schema replaces the merged schema)
            pf0.fmd.schema = v.schema
        rgs = v.row_groups or []
        for rg in rgs:
            rg.columns[0].file_path = k[len(basepath):].lstrip("/")
        rgs0.extend(rgs)
    pf0.fmd.row_groups = rgs0
    pf0.fmd.num_rows = sum(rg.num_rows for rg in pf0.fmd.row_groups)
    return basepath, pf0.fmd
|
|
|
|
|
|
def _get_fmd(inbytes):
    """Decode a ``FileMetaData`` thrift object from the tail bytes of a file.

    The 4 bytes located 8 bytes before the end of ``inbytes`` are read as a
    little-endian signed integer giving the metadata length; that many bytes,
    ending 8 bytes before the end, are handed to ``from_buffer``.
    """
    from .cencoding import from_buffer

    with io.BytesIO(inbytes) as stream:
        stream.seek(-8, io.SEEK_END)
        (metadata_len,) = struct.unpack('<i', stream.read(4))
        stream.seek(-(metadata_len + 8), io.SEEK_END)
        payload = stream.read(metadata_len)
    return from_buffer(payload, "FileMetaData")
|
|
|
|
|
|
def update_custom_metadata(obj, custom_metadata : dict):
    """Update custom metadata stored in thrift object or parquet file.

    Each entry of ``custom_metadata`` is merged into the existing key-value
    metadata as follows:

    - a key absent from the existing metadata is added;
    - a key already present has its value replaced;
    - a ``None`` value is never stored, and removes the key if it is
      already present.

    Parameters
    ----------
    obj : metadata ThriftObject or parquet file
        Thrift object or parquet file whose metadata is to be updated.
    custom_metadata : dict
        Key-value metadata to update in thrift object.
        The values must be strings or binary. To pass a dictionary,
        serialize it as a json string, then encode it in binary.

    Notes
    -----
    Key-value metadata are expected binary encoded. This function ensures it
    is, by passing keys and values through ``ensure_bytes``.
    """
    is_thrift = isinstance(obj, ThriftObject)
    entries = obj.key_value_metadata if is_thrift else obj.fmd.key_value_metadata
    if entries is None:
        entries = []

    # Keys in the same order as 'entries', used to locate existing items.
    known_keys = [entry.key for entry in entries]
    for raw_key, raw_value in custom_metadata.items():
        encoded_key = ensure_bytes(raw_key)
        if encoded_key not in known_keys:
            # Unknown key: store it, unless it is a removal request.
            if raw_value is not None:
                entries.append(parquet_thrift.KeyValue(
                    key=encoded_key, value=ensure_bytes(raw_value)))
            continue
        position = known_keys.index(encoded_key)
        if raw_value is None:
            # Removal: drop the item from both lists so that positions
            # keep matching.
            del entries[position]
            del known_keys[position]
            continue
        # Replacement of an existing item.
        entries[position] = parquet_thrift.KeyValue(
            key=encoded_key, value=ensure_bytes(raw_value))

    if is_thrift:
        obj.key_value_metadata = entries
        return
    obj.fmd.key_value_metadata = entries
    # Reset '_kvm' to refresh 'key_value_metadata' cached property.
    obj._kvm = None
|
|
|
|
|
|
# Simple cache of compiled patterns, keyed by separator, to avoid
# re-compiling on every call.
seps = {}


def ex_from_sep(sep):
    """Generate regex for category folder matching.

    The pattern finds ``key=value`` pairs where ``key`` is made of letters,
    digits and underscores, and ``value`` runs up to the next separator
    character.

    Parameters
    ----------
    sep : str
        Path separator. It is escaped before being placed in the pattern, so
        regex-special separators such as a backslash or ``]`` are matched
        literally.

    Returns
    -------
    Compiled pattern with two groups: key and value. Patterns are cached per
    separator in ``seps``.
    """
    if sep not in seps:
        # An empty separator leaves nothing to escape; keep the historical
        # pattern for that case, which only stops a value at a backslash.
        excluded = re.escape(sep) if sep else r"\\"
        seps[sep] = re.compile(r"([a-zA-Z_0-9]+)=([^%s]+)" % excluded)
    return seps[sep]
|
|
|
|
|
|
def analyse_paths(file_list, root=False):
    """Consolidate list of file-paths into parquet relative paths.

    Parameters
    ----------
    file_list : list of paths
    root : str or False
        Base directory of the paths. When ``False`` the base is the longest
        leading run of directory parts shared by every path.

    Returns
    -------
    basepath : str
        The common base, joined with '/'.
    out_list : list of str
        Each input path relative to ``basepath``, joined with '/'.
    """
    split_paths = [join_path(fn).split('/') for fn in file_list]
    if root is False:
        common = split_paths[0][:-1]
        for parts in split_paths:
            # The last part is excluded, so the base is never longer than the
            # directory portion of this path.
            cut = len(parts) - 1
            for pos, (ours, theirs) in enumerate(zip(common, parts)):
                if ours != theirs:
                    cut = pos
                    break
            common = common[:cut]
        depth = len(common)
    else:
        common = join_path(root).split('/')
        depth = len(common)
        assert all(parts[:depth] == common for parts in split_paths
                   ), "All paths must begin with the given root"

    # use '/'.join() instead of join_path to be consistent with split('/')
    relative = ['/'.join(parts[depth:]) for parts in split_paths]
    return '/'.join(common), relative
|
|
|
|
|
|
def infer_dtype(column):
    """Return pandas' inferred type label for ``column``.

    Uses ``pd.api.types.infer_dtype`` (missing values are not skipped) and
    falls back to ``pd.lib.infer_dtype`` if an AttributeError is raised.
    """
    try:
        label = pd.api.types.infer_dtype(column, skipna=False)
    except AttributeError:
        label = pd.lib.infer_dtype(column)
    return label
|
|
|
|
|
|
def groupby_types(iterable):
    """Group the items of ``iterable`` by their exact type.

    Returns a ``defaultdict(list)`` mapping each type to its items, in the
    order they were encountered.
    """
    by_type = defaultdict(list)
    for item in iterable:
        bucket = by_type[type(item)]
        bucket.append(item)
    return by_type
|
|
|
|
|
|
def get_column_metadata(column, name, object_dtype=None):
    """Produce pandas column metadata block

    Parameters
    ----------
    column : object exposing ``.dtype`` (and ``.cat`` for categoricals)
        NOTE(review): presumably a pandas Series -- confirm against callers.
    name : str or tuple
        Column name; a tuple is converted with ``str``.
    object_dtype : str, optional
        One of the keys of ``inferred_dtypes`` below; only taken into account
        when the column's dtype is ``object``.

    Returns
    -------
    dict with keys 'name', 'field_name', 'pandas_type', 'numpy_type' and
    'metadata'.

    Raises
    ------
    ValueError
        If the time zone of a tz-aware column cannot be turned into a string.
    TypeError
        If ``name`` is neither a string nor a tuple.
    """
    # Maps the caller-supplied ``object_dtype`` to a pandas type label.
    inferred_dtypes = {
        "utf8": "unicode",
        "bytes": "bytes",
        "bool": "bool",
        "int": "int",
        "json": "object",
        "bson": "object"
    }
    dtype = column.dtype
    if object_dtype in inferred_dtypes and dtype == "object":
        # The key is known to exist here, so the "mixed" default is not used.
        inferred_dtype = inferred_dtypes.get(object_dtype, "mixed")
    else:
        inferred_dtype = infer_dtype(column)
    if str(dtype) == "bool":
        # pandas accidentally calls this "boolean"
        inferred_dtype = "bool"

    if isinstance(dtype, pd.CategoricalDtype):
        extra_metadata = {
            'num_categories': len(column.cat.categories),
            'ordered': column.cat.ordered,
        }
        # From here on, describe the dtype of the integer codes.
        dtype = column.cat.codes.dtype
    elif isinstance(dtype, pd.DatetimeTZDtype):
        if isinstance(dtype.tz, zoneinfo.ZoneInfo):
            extra_metadata = {'timezone': dtype.tz.key}
        else:
            # Other tz implementations: derive a name from the tz's string
            # form. The order of the checks below matters.
            try:
                stz = str(dtype.tz)
                if "UTC" in stz and ":" in stz:
                    # strip() removes leading/trailing 'U', 'T', 'C'
                    # characters, leaving e.g. an offset such as "+01:00".
                    extra_metadata = {'timezone': stz.strip("UTC")}
                elif len(str(stz)) == 3:  # like "UTC", "CET", ...
                    extra_metadata = {'timezone': str(stz)}
                elif getattr(dtype.tz, "zone", False):
                    extra_metadata = {'timezone': dtype.tz.zone}
                elif "pytz" not in stz:
                    # Probe: any exception raised here is converted to the
                    # ValueError below.
                    # NOTE(review): tz_localize on an already tz-aware Series
                    # normally raises in pandas -- confirm that this probe
                    # can ever succeed.
                    pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz)
                    extra_metadata = {'timezone': stz}
                elif "Offset" in stz:
                    # Whole-hour offset built from the private ``_minutes``
                    # attribute; the minutes part is always written as "00".
                    extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"}
                else:
                    raise KeyError
            except Exception as e:
                raise ValueError("Time-zone information could not be serialised: "
                                 "%s, please use another" % str(dtype.tz)) from e
    else:
        extra_metadata = None

    if isinstance(name, tuple):
        name = str(name)
    elif not isinstance(name, str):
        raise TypeError(
            'Column name must be a string. Got column {} of type {}'.format(
                name, type(name).__name__
            )
        )

    return {
        'name': name,
        'field_name': name,
        # Translate pandas' inferred label; labels without an entry in this
        # mapping are passed through unchanged.
        'pandas_type': {
            'string': 'unicode',
            'datetime64': (
                'datetimetz' if hasattr(dtype, 'tz')
                else 'datetime'
            ),
            'integer': str(dtype),
            'floating': str(dtype),
        }.get(inferred_dtype, inferred_dtype),
        'numpy_type': get_numpy_type(dtype),
        'metadata': extra_metadata,
    }
|
|
|
|
|
|
def get_numpy_type(dtype):
    """Return the numpy type name to record for ``dtype``.

    - categorical dtypes give ``'category'``;
    - dtypes whose name contains ``"Int"`` give that name lower-cased;
    - ``"boolean"`` gives ``"bool"`` and ``"string"`` gives ``"object"``;
    - anything else gives ``str(dtype)``.
    """
    if isinstance(dtype, pd.CategoricalDtype):
        return 'category'
    label = str(dtype)
    if "Int" in label:
        return label.lower()
    renamed = {"boolean": "bool", "string": "object"}
    return renamed.get(label, label)
|
|
|
|
|
|
def get_file_scheme(paths):
    """For the given row groups, figure out the partitioning scheme.

    Parameters
    ----------
    paths: list of str
        normally from row_group.columns[0].file_path

    Returns
    -------
    'empty': no rgs at all
    'simple': all rgs in a single file
    'flat': multiple files in one directory
    'hive': directories are all `key=value`; all files are at the same
        directory depth
    'drill': assume directory names are labels, and field names are of the
        form dir0, dir1; all files are at the same directory depth
    'other': none of the above, assume no partitioning
    """
    if not paths:
        return 'empty'
    if set(paths) == {None}:
        return 'simple'
    if None in paths:
        return 'other'

    split_paths = [path.split('/') for path in paths]
    depths = {len(pieces) for pieces in split_paths}
    if len(depths) > 1:
        return 'other'
    if depths == {1}:
        return 'flat'

    # hive needs an "=" strictly inside every directory name, i.e. neither
    # as its first nor as its last character
    for pieces in split_paths:
        for folder in pieces[:-1]:
            if "=" not in folder[1:-1]:
                return "drill"
    return "hive"
|
|
|
|
|
|
def join_path(*path):
    """Join path pieces with '/'.

    Falsy pieces are skipped; each remaining piece is converted to ``str``,
    has its backslashes turned into '/', and loses any trailing '/'.
    """
    cleaned = []
    for piece in path:
        if not piece:
            continue
        cleaned.append(str(piece).replace("\\", "/").rstrip("/"))
    return "/".join(cleaned)
|
|
|
|
|
|
def _strip_path_tail(paths) -> set:
|
|
return {path.rsplit("/", 1)[0] if "/" in path else "" for path in paths}
|
|
|
|
|
|
# Comparison operators, keyed by their textual symbol ("=" is accepted as a
# synonym of "==").
ops = {
    # equality
    "==": operator.eq,
    "=": operator.eq,
    "!=": operator.ne,
    # ordering
    ">": operator.gt,
    ">=": operator.ge,
    "<": operator.lt,
    "<=": operator.le,
}
|
|
|
|
|
|
def norm_col_name(name, is_index:bool=None):
    """Normalise a column name that may be a tuple.

    A tuple becomes its first element when ``is_index`` is truthy, and its
    ``str`` representation otherwise. Any other name is returned as is.
    """
    if not isinstance(name, tuple):
        return name
    return name[0] if is_index else str(name)
|
|
|
|
|
|
def get_fs(fn, open_with, mkdirs):
    """Recover an fsspec filesystem from a recognised ``open_with`` callable.

    ``open_with`` is recognised by the text
    ``"FastParquetImpl.write.<locals>.<lambda>"`` appearing in its ``str``
    form. In that case the ``storage_options`` captured in its closure are
    used to build the filesystem for ``fn``.

    Returns
    -------
    fs, fn, open_with, mkdirs
        For an unrecognised ``open_with``: ``None`` plus the three arguments
        unchanged. Otherwise: the filesystem, the path as returned by
        ``url_to_fs``, ``fs.open``, and ``mkdirs`` (or, if that was falsy, a
        function creating directories through ``fs``).
    """
    if "FastParquetImpl.write.<locals>.<lambda>" not in str(open_with):
        return None, fn, open_with, mkdirs

    import inspect

    captured = inspect.getclosurevars(open_with).nonlocals
    so = captured["storage_options"] or {}
    fs, fn = fsspec.core.url_to_fs(fn, **so)
    if not mkdirs:
        mkdirs = lambda d: fs.mkdirs(d, exist_ok=True)
    return fs, fn, fs.open, mkdirs
|
|
|