Import python venv for stability
This commit is contained in:
@@ -0,0 +1,557 @@
|
||||
from collections import defaultdict
|
||||
import copy
|
||||
from packaging.version import Version
|
||||
from functools import lru_cache
|
||||
import io
|
||||
import struct
|
||||
import os
|
||||
import operator
|
||||
import re
|
||||
import numbers
|
||||
import zoneinfo
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import fsspec
|
||||
|
||||
from fastparquet import parquet_thrift
|
||||
from fastparquet.cencoding import ThriftObject
|
||||
from fastparquet import __version__
|
||||
|
||||
# Installed pandas version, parsed with packaging's Version.
PANDAS_VERSION = Version(pd.__version__)
# String naming this library and its version; presumably written into the
# parquet "created_by" field -- confirm at the use site.
created_by = f"fastparquet-python version {__version__} (build 0)"
|
||||
|
||||
|
||||
class ParquetException(Exception):
    """Raised when a parquet file being read has an unexpected data format."""
|
||||
|
||||
|
||||
def default_mkdirs(f):
    """Create directory ``f`` (and missing parents); a no-op if it exists."""
    os.makedirs(f, exist_ok=True)
|
||||
|
||||
|
||||
# strftime/strptime layout "YYYYmmdd_HHMMSS.ffffff"; val_from_meta uses it as
# the fallback format when parsing datetime64[ns] values.
PATH_DATE_FMT = '%Y%m%d_%H%M%S.%f'
|
||||
|
||||
|
||||
def path_string(o):
    """Return the text form of ``o``.

    pandas Timestamps are rendered in ISO 8601 format; everything else goes
    through ``str``.
    """
    return o.isoformat() if isinstance(o, pd.Timestamp) else str(o)
|
||||
|
||||
|
||||
# Default file opener (the builtin ``open``); used as the default value of
# ``open_with`` in metadata_from_many.
default_open = open
|
||||
|
||||
|
||||
def default_remove(paths):
    """Delete every path in ``paths``, best effort.

    Paths that cannot be removed (missing file, permission problem, ...) are
    silently skipped.
    """
    for target in paths:
        try:
            os.unlink(target)
        except OSError:  # IOError is an alias of OSError in Python 3
            continue
|
||||
|
||||
|
||||
def val_from_meta(x, meta):
    """Convert ``x`` to the type described by a column metadata block.

    Parameters
    ----------
    x : object
        Value to convert, typically a string.
    meta : dict
        Must provide the ``'pandas_type'`` and ``'numpy_type'`` entries.

    Returns
    -------
    ``x`` unchanged for categoricals, a bool for boolean columns, otherwise
    ``x`` passed through the numpy scalar type. Values of
    ``datetime64[ns]`` columns that numpy rejects are parsed with
    ``PATH_DATE_FMT``.

    Raises
    ------
    ValueError
        If the conversion fails and the column is not ``datetime64[ns]``.
    """
    truthy = [True, "true", "True", 't', "T", 1, "1"]
    try:
        if meta['pandas_type'] == 'categorical':
            return x
        np_type = np.dtype(meta['numpy_type'])
        if np_type == "bool":
            return x in truthy
        return np_type.type(x)
    except ValueError:
        if meta['numpy_type'] != 'datetime64[ns]':
            raise
        return pd.to_datetime(x, format=PATH_DATE_FMT)
|
||||
|
||||
|
||||
def val_to_num(x, meta=None):
    """Parse a string as a number, date or timedelta if possible.

    When ``meta`` is given (and truthy) the conversion is driven by that
    metadata; otherwise the type is guessed from the value itself.
    """
    return val_from_meta(x, meta) if meta else _val_to_num(x)
|
||||
|
||||
|
||||
@lru_cache(1000)
|
||||
def _val_to_num(x):
|
||||
if isinstance(x, numbers.Real):
|
||||
return x
|
||||
if x in ['now', 'NOW', 'TODAY', '']:
|
||||
return x
|
||||
if type(x) == str and x.lower() == 'nan':
|
||||
return x
|
||||
if x == "True":
|
||||
return True
|
||||
if x == "False":
|
||||
return False
|
||||
try:
|
||||
return int(x, base=10)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
return float(x)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
return pd.Timestamp(x)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
# TODO: determine the valid usecases for this, then try to limit the set
|
||||
# ofstrings which may get inadvertently converted to timedeltas
|
||||
return pd.Timedelta(x)
|
||||
except:
|
||||
return x
|
||||
|
||||
|
||||
def ensure_bytes(s):
    """Return ``s`` UTF-8 encoded if it is a ``str``; otherwise unchanged."""
    if isinstance(s, str):
        return s.encode('utf-8')
    return s
|
||||
|
||||
|
||||
def ensure_str(b, *, ignore_error=False):
    """Return ``b`` as ``str``, decoding it as UTF-8 when necessary.

    Parameters
    ----------
    b : str, bytes or object
        Value to convert; ``str`` is returned untouched.
    ignore_error : bool (False)
        If True, a value that cannot be decoded (invalid UTF-8, or no
        ``decode`` method) is returned unchanged instead of raising.

    Raises
    ------
    UnicodeDecodeError, AttributeError
        When decoding fails and ``ignore_error`` is False.
    """
    if isinstance(b, str):
        return b
    try:
        decoded = b.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        if ignore_error:
            return b
        raise
    return decoded
|
||||
|
||||
|
||||
def check_column_names(columns, *args):
    """Ensure that parameters listing column names have corresponding columns.

    Only arguments that are lists or tuples are checked; anything else is
    ignored.

    Raises
    ------
    ValueError
        If a checked argument names a column that is not in ``columns``.
    """
    template = ("Following columns were requested but are "
                "not available: %s.\n"
                "All requested columns: %s\n"
                "Available columns: %s"
                "")
    for requested in args:
        if not isinstance(requested, (tuple, list)):
            continue
        absent = set(requested) - set(columns)
        if absent:
            raise ValueError(template % (absent, requested, columns))
|
||||
|
||||
|
||||
def reset_row_idx(data: pd.DataFrame) -> pd.DataFrame:
    """Reset row (multi-)index as column(s) of the DataFrame.

    Multi-index are stored in columns, one per index level, each as a
    categorical built from the level's values and codes.

    Parameters
    ----------
    data : dataframe

    Returns
    -------
    dataframe
        New dataframe whose former index levels are regular columns and
        whose index is a default integer index. The input is not modified.
    """
    if isinstance(data.index, pd.MultiIndex):
        for name, cats, codes in zip(data.index.names, data.index.levels,
                                     data.index.codes):
            data = data.assign(**{name: pd.Categorical.from_codes(codes,
                                                                  cats)})
        # The levels now live in columns; drop the MultiIndex itself.
        # reset_index returns a new frame, so the result must be re-bound
        # (it was previously discarded, leaving the MultiIndex in place).
        data = data.reset_index(drop=True)
    else:
        data = data.reset_index()
    return data
|
||||
|
||||
|
||||
def metadata_from_many(file_list, verify_schema=False, open_with=default_open,
                       root=False, fs=None):
    """
    Given list of parquet files, make a FileMetaData that points to them

    Parameters
    ----------
    file_list: list of paths of parquet files
        Alternatively a list of ParquetFile instances; mixing paths and
        instances is rejected.
    verify_schema: bool (False)
        Whether to assert that the schemas in each file are identical
    open_with: function
        Use this to open each path.
    root: str
        Top of the dataset's directory tree, for cases where it can't be
        automatically inferred. The default ``False`` means "infer it".
    fs: fsspec.AbstractFileSystem
        Used in preference to open_with, if given

    Returns
    -------
    basepath: the root path that other paths are relative to
    fmd: metadata thrift structure

    Raises
    ------
    ValueError
        If ``file_list`` mixes ParquetFile instances with other objects, or
        if ``verify_schema`` is set and a schema differs from the first
        file's.
    """
    # imported here rather than at module level
    # (presumably to avoid a circular import -- confirm)
    from fastparquet import api

    # legacy == True: every file is opened as a ParquetFile ("pfs" is set).
    # legacy == False: only the first file is opened ("pf0"); the remaining
    # footers are fetched in bulk through ``fs`` ("pieces" is set).
    legacy = True
    if all(isinstance(pf, api.ParquetFile) for pf in file_list):
        pfs = file_list
        file_list = [pf.fn for pf in pfs]
    elif all(not isinstance(pf, api.ParquetFile) for pf in file_list):

        if verify_schema or fs is None or len(file_list) < 3:
            pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
        else:
            # activate new code path here
            f0 = file_list[0]
            pf0 = api.ParquetFile(f0, open_with=open_with)
            if pf0.file_scheme not in ['empty', 'simple']:
                # set of directories, revert
                pfs = [pf0] + [api.ParquetFile(fn, open_with=open_with) for fn in file_list[1:]]
            else:
                # permits concurrent fetch of footers; needs fsspec >= 2021.6
                # First guess for how many trailing bytes to read per file:
                # 1.4 times the first file's footer size.
                size = int(1.4 * pf0._head_size)
                pieces = fs.cat(file_list[1:], start=-size)
                # Footer length is the little-endian integer stored in the
                # four bytes before the final four bytes of each file; 8 is
                # added to cover those trailing eight bytes.
                sizes = {path: int.from_bytes(piece[-8:-4], "little") + 8 for
                         path, piece in pieces.items()}
                # Files whose footer did not fit in the first read are read
                # again, using the largest footer size seen.
                not_bigenough = [path for path, s in sizes.items() if s > size]
                if not_bigenough:
                    new_pieces = fs.cat(not_bigenough, start=-max(sizes.values()))
                    pieces.update(new_pieces)
                pieces = {k: _get_fmd(v) for k, v in pieces.items()}
                pieces = [(fn, pieces[fn]) for fn in file_list[1:]]  # recover ordering
                legacy = False
    else:
        raise ValueError("Merge requires all ParquetFile instances or none")
    # From here on file_list holds paths relative to basepath.
    basepath, file_list = analyse_paths(file_list, root=root)

    if legacy:
        # legacy code path
        if verify_schema:
            for pf in pfs[1:]:
                if pf._schema != pfs[0]._schema:
                    raise ValueError('Incompatible schemas')

        fmd = copy.copy(pfs[0].fmd)  # we inherit "created by" field
        rgs = []

        for pf, fn in zip(pfs, file_list):
            if pf.file_scheme not in ['simple', 'empty']:
                # Input already references several data files: prefix each
                # chunk's existing file_path with this input's relative path.
                for rg in pf.row_groups:
                    # shallow copies, so the source ParquetFile's row group
                    # and chunk objects are not the ones being re-pathed
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = '/'.join(
                            [fn, chunk.file_path if isinstance(chunk.file_path, str) else chunk.file_path.decode()]
                        )
                    rgs.append(rg)

            else:
                # Single-file input: every chunk points at that file.
                for rg in pf.row_groups:
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = fn
                    rgs.append(rg)

        fmd.row_groups = rgs
        fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
        return basepath, fmd

    # New code path: pf0.fmd is modified in place and returned. Only the
    # first column chunk of each row group gets a file_path here.
    for rg in pf0.fmd.row_groups:
        # chunks of first file, which would have file_path=None
        rg.columns[0].file_path = f0[len(basepath):].lstrip("/")

    rgs0 = pf0.fmd.row_groups
    for k, v in pieces:
        # Set file paths on other files
        if len(v.schema) > len(pf0.fmd.schema):
            # or was UPDATED with supercast
            pf0.fmd.schema = v.schema
        rgs = v.row_groups or []
        for rg in rgs:
            rg.columns[0].file_path = k[len(basepath):].lstrip("/")
        rgs0.extend(rgs)
    pf0.fmd.row_groups = rgs0
    pf0.fmd.num_rows = sum(rg.num_rows for rg in pf0.fmd.row_groups)
    return basepath, pf0.fmd
|
||||
|
||||
|
||||
def _get_fmd(inbytes):
    """Decode the FileMetaData footer found at the end of ``inbytes``.

    The footer length is read as a little-endian 32-bit integer located
    eight bytes before the end of the buffer; that many bytes, ending at
    the same position, are then handed to ``from_buffer``.
    """
    from .cencoding import from_buffer

    stream = io.BytesIO(inbytes)
    # The last 8 bytes hold the footer length followed by 4 more bytes.
    stream.seek(-8, io.SEEK_END)
    (footer_len,) = struct.unpack('<i', stream.read(4))
    stream.seek(-(footer_len + 8), io.SEEK_END)
    footer = stream.read(footer_len)
    return from_buffer(footer, "FileMetaData")
|
||||
|
||||
|
||||
def update_custom_metadata(obj, custom_metadata : dict):
    """Update custom metadata stored in thrift object or parquet file.

    How each entry of ``custom_metadata`` is applied depends on its value
    and on whether its key already exists in the stored key-value metadata:

    - Key not present and value is not `None`: the pair is appended.
    - Key present and value is not `None`: the stored pair is replaced.
    - Value is `None`: nothing is added, and a stored pair with that key
      is removed.

    Parameters
    ----------
    obj : metadata ThriftObject or parquet file
        Thrift object or parquet file which metadata is to update.
    custom_metadata : dict
        Key-value metadata to update in thrift object.
        The values must be strings or binary. To pass a dictionary,
        serialize it as json string then encode it in binary.

    Notes
    -----
    Key-value metadata are expected binary encoded. This function ensures it
    is.
    """
    is_thrift = isinstance(obj, ThriftObject)
    # The metadata lives on the object itself, or on its ``fmd`` attribute
    # when a parquet file was given.
    holder = obj if is_thrift else obj.fmd
    entries = holder.key_value_metadata
    if entries is None:
        entries = []

    # Keys of ``entries``, kept position-aligned with it.
    known_keys = [entry.key for entry in entries]
    for key, value in custom_metadata.items():
        raw_key = ensure_bytes(key)
        if raw_key not in known_keys:
            if value is not None:
                entries.append(parquet_thrift.KeyValue(
                    key=raw_key, value=ensure_bytes(value)))
            continue
        position = known_keys.index(raw_key)
        if value is None:
            # Drop the pair, and its key so positions stay aligned.
            del entries[position]
            del known_keys[position]
        else:
            entries[position] = parquet_thrift.KeyValue(
                key=raw_key, value=ensure_bytes(value))

    holder.key_value_metadata = entries
    if not is_thrift:
        # Reset '_kvm' to refresh 'key_value_metadata' cached property.
        obj._kvm = None
|
||||
|
||||
|
||||
# simple cache to avoid re compile every time
seps = {}


def ex_from_sep(sep):
    """Generate regex for category folder matching.

    The returned pattern matches ``key=value`` where ``key`` consists of
    letters, digits and underscores and ``value`` runs up to the next
    separator. Compiled patterns are cached per separator in ``seps``.

    Parameters
    ----------
    sep : str
        Path separator.

    Returns
    -------
    compiled regular expression with two groups: key and value
    """
    if sep not in seps:
        if sep in r'\^$.|?*+()[]':
            # Regex-special separator: the value may contain neither a
            # backslash nor the separator. The separator must be escaped
            # before it is placed in the character class, otherwise
            # a backslash yields an unterminated set and "]" closes the
            # class early.
            pattern = r"([a-zA-Z_0-9]+)=([^\\{}]+)".format(re.escape(sep))
        else:
            pattern = "([a-zA-Z_0-9]+)=([^{}]+)".format(sep)
        seps[sep] = re.compile(pattern)
    return seps[sep]
|
||||
|
||||
|
||||
def analyse_paths(file_list, root=False):
    """Consolidate list of file-paths into parquet relative paths.

    Parameters
    ----------
    file_list : list of paths
    root : str or False
        Base directory that every path must start with. When ``False`` the
        base is derived from the paths themselves: the longest leading run
        of path components shared by all of them, never including a path's
        final component.

    Returns
    -------
    basepath : str
        The base directory, with '/' separators.
    out_list : list of str
        Each input path relative to ``basepath``, with '/' separators.
    """
    split_paths = [join_path(fn).split('/') for fn in file_list]
    if root is False:
        # Start from the first file's directory and shrink it so it stays a
        # prefix of every path.
        common = split_paths[0][:-1]
        for parts in split_paths:
            # Cut at the first differing component; without one, cut so
            # that the path's own last component is never part of the base.
            cut = next(
                (pos for pos, (lhs, rhs) in enumerate(zip(common, parts))
                 if lhs != rhs),
                len(parts) - 1,
            )
            common = common[:cut]
        depth = len(common)
    else:
        common = join_path(root).split('/')
        depth = len(common)
        assert all(parts[:depth] == common for parts in split_paths
                   ), "All paths must begin with the given root"

    # '/'.join() rather than join_path, to mirror the split('/') above
    relative = ['/'.join(parts[depth:]) for parts in split_paths]
    return '/'.join(common), relative
|
||||
|
||||
|
||||
def infer_dtype(column):
    """Return pandas' inferred type label for the values of ``column``.

    Uses ``pd.api.types.infer_dtype`` (with ``skipna=False``) and falls back
    to ``pd.lib.infer_dtype`` if that raises AttributeError.
    """
    try:
        kind = pd.api.types.infer_dtype(column, skipna=False)
    except AttributeError:
        kind = pd.lib.infer_dtype(column)
    return kind
|
||||
|
||||
|
||||
def groupby_types(iterable):
    """Group the items of ``iterable`` by their exact type.

    Returns
    -------
    defaultdict(list)
        Maps each type to the list of items of that type, in the order
        they were encountered.
    """
    buckets = defaultdict(list)
    for item in iterable:
        bucket = buckets[type(item)]
        bucket.append(item)
    return buckets
|
||||
|
||||
|
||||
def get_column_metadata(column, name, object_dtype=None):
    """Produce pandas column metadata block

    Parameters
    ----------
    column : object exposing ``dtype`` (and ``cat`` for categorical dtypes)
        Presumably a pandas Series or Index -- confirm against callers.
    name : str or tuple
        Column name; a tuple is converted with ``str``.
    object_dtype : str, optional
        For columns of dtype "object": if this is one of "utf8", "bytes",
        "bool", "int", "json" or "bson", the pandas type is taken from a
        fixed mapping instead of being inferred from the values.

    Returns
    -------
    dict with keys 'name', 'field_name', 'pandas_type', 'numpy_type' and
    'metadata'

    Raises
    ------
    ValueError
        If the time zone of a tz-aware datetime column cannot be serialised.
    TypeError
        If ``name`` is neither a string nor a tuple.
    """
    inferred_dtypes = {
        "utf8": "unicode",
        "bytes": "bytes",
        "bool": "bool",
        "int": "int",
        "json": "object",
        "bson": "object"
    }
    dtype = column.dtype
    if object_dtype in inferred_dtypes and dtype == "object":
        inferred_dtype = inferred_dtypes.get(object_dtype, "mixed")
    else:
        inferred_dtype = infer_dtype(column)
    if str(dtype) == "bool":
        # pandas accidentally calls this "boolean"
        inferred_dtype = "bool"

    if isinstance(dtype, pd.CategoricalDtype):
        extra_metadata = {
            'num_categories': len(column.cat.categories),
            'ordered': column.cat.ordered,
        }
        # from here on, describe the integer codes rather than the categories
        dtype = column.cat.codes.dtype
    elif isinstance(dtype, pd.DatetimeTZDtype):
        if isinstance(dtype.tz, zoneinfo.ZoneInfo):
            extra_metadata = {'timezone': dtype.tz.key}
        else:
            # Other tz implementations: try several ways of getting a name,
            # in order. Any failure ends in the ValueError below.
            try:
                stz = str(dtype.tz)
                if "UTC" in stz and ":" in stz:
                    # str.strip removes leading/trailing "U", "T" and "C"
                    # characters, e.g. "UTC+01:00" -> "+01:00"
                    extra_metadata = {'timezone': stz.strip("UTC")}
                elif len(str(stz)) == 3:  # like "UTC", "CET", ...
                    extra_metadata = {'timezone': str(stz)}
                elif getattr(dtype.tz, "zone", False):
                    extra_metadata = {'timezone': dtype.tz.zone}
                elif "pytz" not in stz:
                    # Used as a validity check of ``stz``; the result is
                    # discarded.
                    # NOTE(review): the series is created tz-aware
                    # (utc=True) and pandas documents tz_localize as raising
                    # on tz-aware data -- confirm this branch can succeed.
                    pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz)
                    extra_metadata = {'timezone': stz}
                elif "Offset" in stz:
                    # fixed offset: whole hours only, formatted like "+01:00"
                    extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"}
                else:
                    raise KeyError
            except Exception as e:
                raise ValueError("Time-zone information could not be serialised: "
                                 "%s, please use another" % str(dtype.tz)) from e
    else:
        extra_metadata = None

    if isinstance(name, tuple):
        name = str(name)
    elif not isinstance(name, str):
        raise TypeError(
            'Column name must be a string. Got column {} of type {}'.format(
                name, type(name).__name__
            )
        )

    return {
        'name': name,
        'field_name': name,
        # translate some inferred labels; any other label is used as-is
        'pandas_type': {
            'string': 'unicode',
            'datetime64': (
                'datetimetz' if hasattr(dtype, 'tz')
                else 'datetime'
            ),
            'integer': str(dtype),
            'floating': str(dtype),
        }.get(inferred_dtype, inferred_dtype),
        'numpy_type': get_numpy_type(dtype),
        'metadata': extra_metadata,
    }
|
||||
|
||||
|
||||
def get_numpy_type(dtype):
    """Return the type name to record for ``dtype``.

    Categoricals give ``'category'``; dtypes whose name contains "Int" give
    the lower-cased name; "boolean" and "string" map to "bool" and "object";
    any other dtype gives its ``str`` form.
    """
    if isinstance(dtype, pd.CategoricalDtype):
        return 'category'
    label = str(dtype)
    if "Int" in label:
        return label.lower()
    return {"boolean": "bool", "string": "object"}.get(label, label)
|
||||
|
||||
|
||||
def get_file_scheme(paths):
    """For the given row groups, figure out the partitioning scheme.

    Parameters
    ----------
    paths: list of str
        normally from row_group.columns[0].file_path

    Returns
    -------
    'empty': no rgs at all
    'simple': all rgs in a single file
    'flat': multiple files in one directory
    'hive': directories are all `key=value`; all files are at the same
        directory depth
    'drill': assume directory names are labels, and field names are of the
        form dir0, dir1; all files are at the same directory depth
    'other': none of the above, assume no partitioning
    """
    if not paths:
        return 'empty'
    if set(paths) == {None}:
        return 'simple'
    if None in paths:
        return 'other'

    split_paths = [p.split('/') for p in paths]
    depths = {len(parts) for parts in split_paths}
    if len(depths) > 1:
        return 'other'
    if depths == {1}:
        return 'flat'

    # hive requires "=" strictly inside every directory name, i.e. neither
    # as its first nor as its last character
    for parts in split_paths:
        for directory in parts[:-1]:
            if "=" not in directory[1:-1]:
                return "drill"
    return "hive"
|
||||
|
||||
|
||||
def join_path(*path):
    """Join path pieces with '/'.

    Falsy pieces are skipped. Each remaining piece is converted to ``str``,
    has backslashes turned into '/' and trailing '/' removed.
    """
    cleaned = []
    for piece in path:
        if not piece:
            continue
        cleaned.append(str(piece).replace("\\", "/").rstrip("/"))
    return "/".join(cleaned)
|
||||
|
||||
|
||||
def _strip_path_tail(paths) -> set:
|
||||
return {path.rsplit("/", 1)[0] if "/" in path else "" for path in paths}
|
||||
|
||||
|
||||
# Comparison operator symbols mapped to the matching two-argument function
# from the ``operator`` module; "=" is accepted as a synonym of "==".
ops = {
    "==": operator.eq,
    "=": operator.eq,
    "!=": operator.ne,
    ">": operator.gt,
    ">=": operator.ge,
    "<": operator.lt,
    "<=": operator.le
}
|
||||
|
||||
|
||||
def norm_col_name(name, is_index:bool=None):
    """Normalise a column name that may be a tuple.

    Non-tuple names are returned unchanged. For a tuple, the first element
    is returned when ``is_index`` is truthy, otherwise ``str(name)``.
    """
    if not isinstance(name, tuple):
        return name
    return name[0] if is_index else str(name)
|
||||
|
||||
|
||||
def get_fs(fn, open_with, mkdirs):
    """Resolve a filesystem from ``open_with`` where possible.

    If ``open_with`` is the lambda created inside ``FastParquetImpl.write``
    (recognised by its repr), the ``storage_options`` captured in its
    closure are used to build an fsspec filesystem for ``fn``. In that case
    ``fn`` is replaced by the path fsspec returns, ``open_with`` by the
    filesystem's ``open`` and, if ``mkdirs`` was not supplied, ``mkdirs`` by
    a wrapper around the filesystem's ``mkdirs``. Otherwise the arguments
    are passed through with ``fs`` set to None.

    Returns
    -------
    fs, fn, open_with, mkdirs
    """
    marker = "FastParquetImpl.write.<locals>.<lambda>"
    if marker not in str(open_with):
        return None, fn, open_with, mkdirs

    import inspect

    closure = inspect.getclosurevars(open_with)
    options = closure.nonlocals["storage_options"] or {}
    fs, fn = fsspec.core.url_to_fs(fn, **options)
    make_dirs = mkdirs or (lambda d: fs.mkdirs(d, exist_ok=True))
    return fs, fn, fs.open, make_dirs
|
||||
|
||||
Reference in New Issue
Block a user