Import python venv for stability

2026-02-15 21:24:16 -08:00
parent 1343e93a59
commit 7d784705c9
4997 changed files with 1628270 additions and 0 deletions
@@ -0,0 +1,557 @@
+from collections import defaultdict
+import copy
+from packaging.version import Version
+from functools import lru_cache
+import io
+import struct
+import os
+import operator
+import re
+import numbers
+import zoneinfo
+
+import numpy as np
+import pandas as pd
+
+import fsspec
+
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import ThriftObject
+from fastparquet import __version__
+
+PANDAS_VERSION = Version(pd.__version__)
+created_by = f"fastparquet-python version {__version__} (build 0)"
+
+
+class ParquetException(Exception):
+    """Generic Exception related to unexpected data format when
+     reading parquet file."""
+    pass
+
+
+def default_mkdirs(f):
+    os.makedirs(f, exist_ok=True)
+
+
+PATH_DATE_FMT = '%Y%m%d_%H%M%S.%f'
+
+
+def path_string(o):
+    if isinstance(o, pd.Timestamp):
+        return o.isoformat()
+    return str(o)
+
+
+default_open = open
+
+
+def default_remove(paths):
+    for path in paths:
+        try:
+            os.unlink(path)
+        except IOError:
+            pass
+    
+
+def val_from_meta(x, meta):
+    try:
+        if meta['pandas_type'] == 'categorical':
+            return x
+        t = np.dtype(meta['numpy_type'])
+        if t == "bool":
+            return x in [True, "true", "True", 't', "T", 1, "1"]
+        return np.dtype(t).type(x)
+    except ValueError:
+        if meta['numpy_type'] == 'datetime64[ns]':
+            return pd.to_datetime(x, format=PATH_DATE_FMT)
+        else:
+            raise
+
+
+def val_to_num(x, meta=None):
+    """Parse a string as a number, date or timedelta if possible"""
+    if meta:
+        return val_from_meta(x, meta)
+    return _val_to_num(x)
+
+
+@lru_cache(1000)
+def _val_to_num(x):
+    if isinstance(x, numbers.Real):
+        return x
+    if x in ['now', 'NOW', 'TODAY', '']:
+        return x
+    if type(x) == str and x.lower() == 'nan':
+        return x
+    if x == "True":
+        return True
+    if x == "False":
+        return False
+    try:
+        return int(x, base=10)
+    except:
+        pass
+    try:
+        return float(x)
+    except:
+        pass
+    try:
+        return pd.Timestamp(x)
+    except:
+        pass
+    try:
+        # TODO: determine the valid usecases for this, then try to limit the set
+        #  ofstrings which may get inadvertently converted to timedeltas
+        return pd.Timedelta(x)
+    except:
+        return x
+
+
+def ensure_bytes(s):
+    return s.encode('utf-8') if isinstance(s, str) else s
+
+
+def ensure_str(b, *, ignore_error=False):
+    if isinstance(b, str):
+        return b
+    else:
+        try:
+            return b.decode('utf-8')
+        except (UnicodeDecodeError, AttributeError):
+            if not ignore_error:
+                raise
+            return b
+
+
+def check_column_names(columns, *args):
+    """Ensure that parameters listing column names have corresponding columns"""
+    for arg in args:
+        if isinstance(arg, (tuple, list)):
+            missing = set(arg) - set(columns)
+            if missing:
+                raise ValueError("Following columns were requested but are "
+                                 "not available: %s.\n"
+                                 "All requested columns: %s\n"
+                                 "Available columns: %s"
+                                 "" % (missing, arg, columns))
+
+
+def reset_row_idx(data: pd.DataFrame) -> pd.DataFrame:
+    """Reset row (multi-)index as column(s) of the DataFrame.
+
+    Multi-index are stored in columns, one per index level.
+
+    Parameters
+    ----------
+    data : dataframe
+
+    Returns
+    -------
+    dataframe
+    """
+    if isinstance(data.index, pd.MultiIndex):
+        for name, cats, codes in zip(data.index.names, data.index.levels,
+                                     data.index.codes):
+            data = data.assign(**{name: pd.Categorical.from_codes(codes,
+                                                                  cats)})
+        data.reset_index(drop=True)
+    else:
+        data = data.reset_index()
+    return data
+
+
+def metadata_from_many(file_list, verify_schema=False, open_with=default_open,
+                       root=False, fs=None):
+    """
+    Given list of parquet files, make a FileMetaData that points to them
+
+    Parameters
+    ----------
+    file_list: list of paths of parquet files
+    verify_schema: bool (False)
+        Whether to assert that the schemas in each file are identical
+    open_with: function
+        Use this to open each path.
+    root: str
+        Top of the dataset's directory tree, for cases where it can't be
+        automatically inferred.
+    fs: fsspsec.AbstractFileSystem
+        Used in preference to open_with, if given
+
+    Returns
+    -------
+    basepath: the root path that other paths are relative to
+    fmd: metadata thrift structure
+    """
+    from fastparquet import api
+
+    legacy = True
+    if all(isinstance(pf, api.ParquetFile) for pf in file_list):
+        pfs = file_list
+        file_list = [pf.fn for pf in pfs]
+    elif all(not isinstance(pf, api.ParquetFile) for pf in file_list):
+
+        if verify_schema or fs is None or len(file_list) < 3:
+            pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
+        else:
+            # activate new code path here
+            f0 = file_list[0]
+            pf0 = api.ParquetFile(f0, open_with=open_with)
+            if pf0.file_scheme not in ['empty', 'simple']:
+                # set of directories, revert
+                pfs = [pf0] + [api.ParquetFile(fn, open_with=open_with) for fn in file_list[1:]]
+            else:
+                # permits concurrent fetch of footers; needs fsspec >= 2021.6
+                size = int(1.4 * pf0._head_size)
+                pieces = fs.cat(file_list[1:], start=-size)
+                sizes = {path: int.from_bytes(piece[-8:-4], "little") + 8 for
+                         path, piece in pieces.items()}
+                not_bigenough = [path for path, s in sizes.items() if s > size]
+                if not_bigenough:
+                    new_pieces = fs.cat(not_bigenough, start=-max(sizes.values()))
+                    pieces.update(new_pieces)
+                pieces = {k: _get_fmd(v) for k, v in pieces.items()}
+                pieces = [(fn, pieces[fn]) for fn in file_list[1:]]  # recover ordering
+                legacy = False
+    else:
+        raise ValueError("Merge requires all ParquetFile instances or none")
+    basepath, file_list = analyse_paths(file_list, root=root)
+
+    if legacy:
+        # legacy code path
+        if verify_schema:
+            for pf in pfs[1:]:
+                if pf._schema != pfs[0]._schema:
+                    raise ValueError('Incompatible schemas')
+
+        fmd = copy.copy(pfs[0].fmd)  # we inherit "created by" field
+        rgs = []
+
+        for pf, fn in zip(pfs, file_list):
+            if pf.file_scheme not in ['simple', 'empty']:
+                for rg in pf.row_groups:
+                    rg = copy.copy(rg)
+                    rg.columns = [copy.copy(c) for c in rg.columns]
+                    for chunk in rg.columns:
+                        chunk.file_path = '/'.join(
+                            [fn, chunk.file_path if isinstance(chunk.file_path, str) else chunk.file_path.decode()]
+                        )
+                    rgs.append(rg)
+
+            else:
+                for rg in pf.row_groups:
+                    rg = copy.copy(rg)
+                    rg.columns = [copy.copy(c) for c in rg.columns]
+                    for chunk in rg.columns:
+                        chunk.file_path = fn
+                    rgs.append(rg)
+
+        fmd.row_groups = rgs
+        fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
+        return basepath, fmd
+
+    for rg in pf0.fmd.row_groups:
+        # chunks of first file, which would have file_path=None
+        rg.columns[0].file_path = f0[len(basepath):].lstrip("/")
+
+    rgs0 = pf0.fmd.row_groups
+    for k, v in pieces:
+        # Set file paths on other files
+        if len(v.schema) > len(pf0.fmd.schema):
+            # or was UPDATED with supercast
+            pf0.fmd.schema = v.schema
+        rgs = v.row_groups or []
+        for rg in rgs:
+            rg.columns[0].file_path = k[len(basepath):].lstrip("/")
+        rgs0.extend(rgs)
+    pf0.fmd.row_groups = rgs0
+    pf0.fmd.num_rows = sum(rg.num_rows for rg in pf0.fmd.row_groups)
+    return basepath, pf0.fmd
+
+
+def _get_fmd(inbytes):
+    from .cencoding import from_buffer
+
+    f = io.BytesIO(inbytes)
+    f.seek(-8, 2)
+    head_size = struct.unpack('<i', f.read(4))[0]
+    f.seek(-(head_size + 8), 2)
+    data = f.read(head_size)
+    return from_buffer(data, "FileMetaData")
+
+
+def update_custom_metadata(obj, custom_metadata : dict):
+    """Update custom metadata stored in thrift object or parquet file.
+
+    Update strategy depends if key found in new custom metadata is also found
+    in already existing custom metadata within thrift object, as well as its
+    value.
+        
+      - If not found in existing, it is added.
+      - If found in existing, it is updated.
+      - If its value is `None`, it is not added, and if found in existing,
+        it is removed from existing.
+
+    Parameters
+    ----------
+    obj : metadata ThriftObject or parquet file
+        Thrift object or parquet file which metadata is to update.
+    custom_metadata : dict
+        Key-value metadata to update in thrift object.
+        The values must be strings or binary. To pass a dictionary, serialize it as json string then encode it in binary.
+    Notes
+    -----
+    Key-value metadata are expected binary encoded. This function ensures it
+    is.
+    """
+    kvm = (obj.key_value_metadata if isinstance(obj, ThriftObject)
+           else obj.fmd.key_value_metadata)
+    
+    if kvm is None:
+        kvm = []
+
+    # Spare list of keys.
+    kvm_keys = [item.key for item in kvm]
+    for key, value in custom_metadata.items():
+        key_b = ensure_bytes(key)
+        if key_b in kvm_keys:
+            idx = kvm_keys.index(key_b)
+            if value is None:
+                # Remove item.
+                del kvm[idx]
+                # Update 'kvm_keys' as well, for keeping indexing
+                # up-to-date.
+                del kvm_keys[idx]
+            else:
+                # Replace item.
+                kvm[idx] = parquet_thrift.KeyValue(key=key_b,
+                                                   value=ensure_bytes(value))
+        elif value is not None:
+            kvm.append(parquet_thrift.KeyValue(key=key_b,
+                                               value=ensure_bytes(value)))
+    if isinstance(obj, ThriftObject):
+        obj.key_value_metadata = kvm
+    else:
+        obj.fmd.key_value_metadata = kvm
+        # Reset '_kvm' to refresh 'key_value_metadata' cached property.
+        obj._kvm = None
+
+
+# simple cache to avoid re compile every time
+seps = {}
+
+
+def ex_from_sep(sep):
+    """Generate regex for category folder matching"""
+    if sep not in seps:
+        if sep in r'\^$.|?*+()[]':
+            s = re.compile(r"([a-zA-Z_0-9]+)=([^\\{}]+)".format(sep))
+        else:
+            s = re.compile("([a-zA-Z_0-9]+)=([^{}]+)".format(sep))
+        seps[sep] = s
+    return seps[sep]
+
+
+def analyse_paths(file_list, root=False):
+    """Consolidate list of file-paths into  parquet relative paths"""
+    path_parts_list = [join_path(fn).split('/') for fn in file_list]
+    if root is False:
+        basepath = path_parts_list[0][:-1]
+        for i, path_parts in enumerate(path_parts_list):
+            j = len(path_parts) - 1
+            for k, (base_part, path_part) in enumerate(
+                    zip(basepath, path_parts)):
+                if base_part != path_part:
+                    j = k
+                    break
+            basepath = basepath[:j]
+        l = len(basepath)
+
+    else:
+        basepath = join_path(root).split('/')
+        l = len(basepath)
+        assert all(p[:l] == basepath for p in path_parts_list
+                   ), "All paths must begin with the given root"
+    out_list = []
+    for path_parts in path_parts_list:
+        out_list.append('/'.join(path_parts[l:]))  # use '/'.join() instead of join_path to be consistent with split('/')
+
+    return '/'.join(basepath), out_list  # use '/'.join() instead of join_path to be consistent with split('/')
+
+
+def infer_dtype(column):
+    try:
+        return pd.api.types.infer_dtype(column, skipna=False)
+    except AttributeError:
+        return pd.lib.infer_dtype(column)
+
+
+def groupby_types(iterable):
+    groups = defaultdict(list)
+    for x in iterable:
+        groups[type(x)].append(x)
+    return groups
+
+
+def get_column_metadata(column, name, object_dtype=None):
+    """Produce pandas column metadata block"""
+    inferred_dtypes = {
+        "utf8": "unicode",
+        "bytes": "bytes",
+        "bool": "bool",
+        "int": "int",
+        "json": "object",
+        "bson": "object"
+    }
+    dtype = column.dtype
+    if object_dtype in inferred_dtypes and dtype == "object":
+        inferred_dtype = inferred_dtypes.get(object_dtype, "mixed")
+    else:
+        inferred_dtype = infer_dtype(column)
+    if str(dtype) == "bool":
+        # pandas accidentally calls this "boolean"
+        inferred_dtype = "bool"
+
+    if isinstance(dtype, pd.CategoricalDtype):
+        extra_metadata = {
+            'num_categories': len(column.cat.categories),
+            'ordered': column.cat.ordered,
+        }
+        dtype = column.cat.codes.dtype
+    elif isinstance(dtype, pd.DatetimeTZDtype):
+        if isinstance(dtype.tz, zoneinfo.ZoneInfo):
+            extra_metadata = {'timezone': dtype.tz.key}
+        else:
+            try:
+                stz = str(dtype.tz)
+                if "UTC" in stz and ":" in stz:
+                    extra_metadata = {'timezone': stz.strip("UTC")}
+                elif len(str(stz)) == 3:  # like "UTC", "CET", ...
+                    extra_metadata = {'timezone': str(stz)}
+                elif getattr(dtype.tz, "zone", False):
+                    extra_metadata = {'timezone': dtype.tz.zone}
+                elif "pytz" not in stz:
+                    pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz)
+                    extra_metadata = {'timezone': stz}
+                elif "Offset" in stz:
+                    extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"}
+                else:
+                    raise KeyError
+            except Exception as e:
+                raise ValueError("Time-zone information could not be serialised: "
+                                "%s, please use another" % str(dtype.tz)) from e
+    else:
+        extra_metadata = None
+
+    if isinstance(name, tuple):
+        name = str(name)
+    elif not isinstance(name, str):
+        raise TypeError(
+            'Column name must be a string. Got column {} of type {}'.format(
+                name, type(name).__name__
+            )
+        )
+
+    return {
+        'name': name,
+        'field_name': name,
+        'pandas_type': {
+            'string': 'unicode',
+            'datetime64': (
+                'datetimetz' if hasattr(dtype, 'tz')
+                else 'datetime'
+            ),
+            'integer': str(dtype),
+            'floating': str(dtype),
+        }.get(inferred_dtype, inferred_dtype),
+        'numpy_type': get_numpy_type(dtype),
+        'metadata': extra_metadata,
+    }
+
+
+def get_numpy_type(dtype):
+    if isinstance(dtype, pd.CategoricalDtype):
+        return 'category'
+    elif "Int" in str(dtype):
+        return str(dtype).lower()
+    elif str(dtype) == "boolean":
+        return "bool"
+    elif str(dtype) == "string":
+        return "object"
+    else:
+        return str(dtype)
+
+
+def get_file_scheme(paths):
+    """For the given row groups, figure out if the partitioning scheme
+
+    Parameters
+    ----------
+    paths: list of str
+        normally from row_group.columns[0].file_path
+
+    Returns
+    -------
+    'empty': no rgs at all
+    'simple': all rgs in a single file
+    'flat': multiple files in one directory
+    'hive': directories are all `key=value`; all files are at the same
+        directory depth
+    'drill': assume directory names are labels, and field names are of the
+        form dir0, dir1; all files are at the same directory depth
+    'other': none of the above, assume no partitioning
+    """
+    if not paths:
+        return 'empty'
+    if set(paths) == {None}:
+        return 'simple'
+    if None in paths:
+        return 'other'
+    parts = [p.split('/') for p in paths]
+    lens = [len(p) for p in parts]
+    if len(set(lens)) > 1:
+        return 'other'
+    if set(lens) == {1}:
+        return 'flat'
+    matches = all(all("=" in p[1:-1] for p in part[:-1]) for part in parts)
+    return "hive" if matches else "drill"
+
+
+def join_path(*path):
+    return "/".join([str(p).replace("\\", "/").rstrip("/") for p in path if p])
+
+
+def _strip_path_tail(paths) -> set:
+    return {path.rsplit("/", 1)[0] if "/" in path else "" for path in paths}
+
+
+ops = {
+    "==": operator.eq,
+    "=": operator.eq,
+    "!=": operator.ne,
+    ">": operator.gt,
+    ">=": operator.ge,
+    "<": operator.lt,
+    "<=": operator.le
+}
+
+
+def norm_col_name(name, is_index:bool=None):
+    if isinstance(name, tuple):
+        if is_index:
+            return name[0]
+        else:
+            return str(name)
+    return name
+
+
+def get_fs(fn, open_with, mkdirs):
+    fs = None
+    if "FastParquetImpl.write.<locals>.<lambda>" in str(open_with):
+        import inspect
+        so = inspect.getclosurevars(open_with).nonlocals["storage_options"] or {}
+        fs, fn = fsspec.core.url_to_fs(fn, **so)
+        open_with = fs.open
+        mkdirs = mkdirs or (lambda d: fs.mkdirs(d, exist_ok=True))
+    return fs, fn, open_with, mkdirs
+