AI-Stock-Trader/WebServer/AIPython/python/lib/python3.11/site-packages/fastparquet/util.py

from collections import defaultdict
import copy
from packaging.version import Version
from functools import lru_cache
import io
import struct
import os
import operator
import re
import numbers
import zoneinfo

import numpy as np
import pandas as pd

import fsspec

from fastparquet import parquet_thrift
from fastparquet.cencoding import ThriftObject
from fastparquet import __version__

PANDAS_VERSION = Version(pd.__version__)
created_by = f"fastparquet-python version {__version__} (build 0)"


class ParquetException(Exception):
    """Generic Exception related to unexpected data format when
     reading parquet file."""
    pass


def default_mkdirs(f):
    os.makedirs(f, exist_ok=True)


PATH_DATE_FMT = '%Y%m%d_%H%M%S.%f'


def path_string(o):
    if isinstance(o, pd.Timestamp):
        return o.isoformat()
    return str(o)


default_open = open


def default_remove(paths):
    for path in paths:
        try:
            os.unlink(path)
        except IOError:
            pass


def val_from_meta(x, meta):
    try:
        if meta['pandas_type'] == 'categorical':
            return x
        t = np.dtype(meta['numpy_type'])
        if t == "bool":
            return x in [True, "true", "True", 't', "T", 1, "1"]
        return np.dtype(t).type(x)
    except ValueError:
        if meta['numpy_type'] == 'datetime64[ns]':
            return pd.to_datetime(x, format=PATH_DATE_FMT)
        else:
            raise


def val_to_num(x, meta=None):
    """Parse a string as a number, date or timedelta if possible"""
    if meta:
        return val_from_meta(x, meta)
    return _val_to_num(x)


@lru_cache(1000)
def _val_to_num(x):
    if isinstance(x, numbers.Real):
        return x
    if x in ['now', 'NOW', 'TODAY', '']:
        return x
    if type(x) == str and x.lower() == 'nan':
        return x
    if x == "True":
        return True
    if x == "False":
        return False
    try:
        return int(x, base=10)
    except:
        pass
    try:
        return float(x)
    except:
        pass
    try:
        return pd.Timestamp(x)
    except:
        pass
    try:
        # TODO: determine the valid usecases for this, then try to limit the set
        #  ofstrings which may get inadvertently converted to timedeltas
        return pd.Timedelta(x)
    except:
        return x


def ensure_bytes(s):
    return s.encode('utf-8') if isinstance(s, str) else s


def ensure_str(b, *, ignore_error=False):
    if isinstance(b, str):
        return b
    else:
        try:
            return b.decode('utf-8')
        except (UnicodeDecodeError, AttributeError):
            if not ignore_error:
                raise
            return b


def check_column_names(columns, *args):
    """Ensure that parameters listing column names have corresponding columns"""
    for arg in args:
        if isinstance(arg, (tuple, list)):
            missing = set(arg) - set(columns)
            if missing:
                raise ValueError("Following columns were requested but are "
                                 "not available: %s.\n"
                                 "All requested columns: %s\n"
                                 "Available columns: %s"
                                 "" % (missing, arg, columns))


def reset_row_idx(data: pd.DataFrame) -> pd.DataFrame:
    """Reset row (multi-)index as column(s) of the DataFrame.

    Multi-index are stored in columns, one per index level.

    Parameters
    ----------
    data : dataframe

    Returns
    -------
    dataframe
    """
    if isinstance(data.index, pd.MultiIndex):
        for name, cats, codes in zip(data.index.names, data.index.levels,
                                     data.index.codes):
            data = data.assign(**{name: pd.Categorical.from_codes(codes,
                                                                  cats)})
        data.reset_index(drop=True)
    else:
        data = data.reset_index()
    return data


def metadata_from_many(file_list, verify_schema=False, open_with=default_open,
                       root=False, fs=None):
    """
    Given list of parquet files, make a FileMetaData that points to them

    Parameters
    ----------
    file_list: list of paths of parquet files
    verify_schema: bool (False)
        Whether to assert that the schemas in each file are identical
    open_with: function
        Use this to open each path.
    root: str
        Top of the dataset's directory tree, for cases where it can't be
        automatically inferred.
    fs: fsspsec.AbstractFileSystem
        Used in preference to open_with, if given

    Returns
    -------
    basepath: the root path that other paths are relative to
    fmd: metadata thrift structure
    """
    from fastparquet import api

    legacy = True
    if all(isinstance(pf, api.ParquetFile) for pf in file_list):
        pfs = file_list
        file_list = [pf.fn for pf in pfs]
    elif all(not isinstance(pf, api.ParquetFile) for pf in file_list):

        if verify_schema or fs is None or len(file_list) < 3:
            pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
        else:
            # activate new code path here
            f0 = file_list[0]
            pf0 = api.ParquetFile(f0, open_with=open_with)
            if pf0.file_scheme not in ['empty', 'simple']:
                # set of directories, revert
                pfs = [pf0] + [api.ParquetFile(fn, open_with=open_with) for fn in file_list[1:]]
            else:
                # permits concurrent fetch of footers; needs fsspec >= 2021.6
                size = int(1.4 * pf0._head_size)
                pieces = fs.cat(file_list[1:], start=-size)
                sizes = {path: int.from_bytes(piece[-8:-4], "little") + 8 for
                         path, piece in pieces.items()}
                not_bigenough = [path for path, s in sizes.items() if s > size]
                if not_bigenough:
                    new_pieces = fs.cat(not_bigenough, start=-max(sizes.values()))
                    pieces.update(new_pieces)
                pieces = {k: _get_fmd(v) for k, v in pieces.items()}
                pieces = [(fn, pieces[fn]) for fn in file_list[1:]]  # recover ordering
                legacy = False
    else:
        raise ValueError("Merge requires all ParquetFile instances or none")
    basepath, file_list = analyse_paths(file_list, root=root)

    if legacy:
        # legacy code path
        if verify_schema:
            for pf in pfs[1:]:
                if pf._schema != pfs[0]._schema:
                    raise ValueError('Incompatible schemas')

        fmd = copy.copy(pfs[0].fmd)  # we inherit "created by" field
        rgs = []

        for pf, fn in zip(pfs, file_list):
            if pf.file_scheme not in ['simple', 'empty']:
                for rg in pf.row_groups:
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = '/'.join(
                            [fn, chunk.file_path if isinstance(chunk.file_path, str) else chunk.file_path.decode()]
                        )
                    rgs.append(rg)

            else:
                for rg in pf.row_groups:
                    rg = copy.copy(rg)
                    rg.columns = [copy.copy(c) for c in rg.columns]
                    for chunk in rg.columns:
                        chunk.file_path = fn
                    rgs.append(rg)

        fmd.row_groups = rgs
        fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
        return basepath, fmd

    for rg in pf0.fmd.row_groups:
        # chunks of first file, which would have file_path=None
        rg.columns[0].file_path = f0[len(basepath):].lstrip("/")

    rgs0 = pf0.fmd.row_groups
    for k, v in pieces:
        # Set file paths on other files
        if len(v.schema) > len(pf0.fmd.schema):
            # or was UPDATED with supercast
            pf0.fmd.schema = v.schema
        rgs = v.row_groups or []
        for rg in rgs:
            rg.columns[0].file_path = k[len(basepath):].lstrip("/")
        rgs0.extend(rgs)
    pf0.fmd.row_groups = rgs0
    pf0.fmd.num_rows = sum(rg.num_rows for rg in pf0.fmd.row_groups)
    return basepath, pf0.fmd


def _get_fmd(inbytes):
    from .cencoding import from_buffer

    f = io.BytesIO(inbytes)
    f.seek(-8, 2)
    head_size = struct.unpack('<i', f.read(4))[0]
    f.seek(-(head_size + 8), 2)
    data = f.read(head_size)
    return from_buffer(data, "FileMetaData")


def update_custom_metadata(obj, custom_metadata : dict):
    """Update custom metadata stored in thrift object or parquet file.

    Update strategy depends if key found in new custom metadata is also found
    in already existing custom metadata within thrift object, as well as its
    value.

      - If not found in existing, it is added.
      - If found in existing, it is updated.
      - If its value is `None`, it is not added, and if found in existing,
        it is removed from existing.

    Parameters
    ----------
    obj : metadata ThriftObject or parquet file
        Thrift object or parquet file which metadata is to update.
    custom_metadata : dict
        Key-value metadata to update in thrift object.
        The values must be strings or binary. To pass a dictionary, serialize it as json string then encode it in binary.
    Notes
    -----
    Key-value metadata are expected binary encoded. This function ensures it
    is.
    """
    kvm = (obj.key_value_metadata if isinstance(obj, ThriftObject)
           else obj.fmd.key_value_metadata)

    if kvm is None:
        kvm = []

    # Spare list of keys.
    kvm_keys = [item.key for item in kvm]
    for key, value in custom_metadata.items():
        key_b = ensure_bytes(key)
        if key_b in kvm_keys:
            idx = kvm_keys.index(key_b)
            if value is None:
                # Remove item.
                del kvm[idx]
                # Update 'kvm_keys' as well, for keeping indexing
                # up-to-date.
                del kvm_keys[idx]
            else:
                # Replace item.
                kvm[idx] = parquet_thrift.KeyValue(key=key_b,
                                                   value=ensure_bytes(value))
        elif value is not None:
            kvm.append(parquet_thrift.KeyValue(key=key_b,
                                               value=ensure_bytes(value)))
    if isinstance(obj, ThriftObject):
        obj.key_value_metadata = kvm
    else:
        obj.fmd.key_value_metadata = kvm
        # Reset '_kvm' to refresh 'key_value_metadata' cached property.
        obj._kvm = None


# simple cache to avoid re compile every time
seps = {}


def ex_from_sep(sep):
    """Generate regex for category folder matching"""
    if sep not in seps:
        if sep in r'\^$.|?*+()[]':
            s = re.compile(r"([a-zA-Z_0-9]+)=([^\\{}]+)".format(sep))
        else:
            s = re.compile("([a-zA-Z_0-9]+)=([^{}]+)".format(sep))
        seps[sep] = s
    return seps[sep]


def analyse_paths(file_list, root=False):
    """Consolidate list of file-paths into  parquet relative paths"""
    path_parts_list = [join_path(fn).split('/') for fn in file_list]
    if root is False:
        basepath = path_parts_list[0][:-1]
        for i, path_parts in enumerate(path_parts_list):
            j = len(path_parts) - 1
            for k, (base_part, path_part) in enumerate(
                    zip(basepath, path_parts)):
                if base_part != path_part:
                    j = k
                    break
            basepath = basepath[:j]
        l = len(basepath)

    else:
        basepath = join_path(root).split('/')
        l = len(basepath)
        assert all(p[:l] == basepath for p in path_parts_list
                   ), "All paths must begin with the given root"
    out_list = []
    for path_parts in path_parts_list:
        out_list.append('/'.join(path_parts[l:]))  # use '/'.join() instead of join_path to be consistent with split('/')

    return '/'.join(basepath), out_list  # use '/'.join() instead of join_path to be consistent with split('/')


def infer_dtype(column):
    try:
        return pd.api.types.infer_dtype(column, skipna=False)
    except AttributeError:
        return pd.lib.infer_dtype(column)


def groupby_types(iterable):
    groups = defaultdict(list)
    for x in iterable:
        groups[type(x)].append(x)
    return groups


def get_column_metadata(column, name, object_dtype=None):
    """Produce pandas column metadata block"""
    inferred_dtypes = {
        "utf8": "unicode",
        "bytes": "bytes",
        "bool": "bool",
        "int": "int",
        "json": "object",
        "bson": "object"
    }
    dtype = column.dtype
    if object_dtype in inferred_dtypes and dtype == "object":
        inferred_dtype = inferred_dtypes.get(object_dtype, "mixed")
    else:
        inferred_dtype = infer_dtype(column)
    if str(dtype) == "bool":
        # pandas accidentally calls this "boolean"
        inferred_dtype = "bool"

    if isinstance(dtype, pd.CategoricalDtype):
        extra_metadata = {
            'num_categories': len(column.cat.categories),
            'ordered': column.cat.ordered,
        }
        dtype = column.cat.codes.dtype
    elif isinstance(dtype, pd.DatetimeTZDtype):
        if isinstance(dtype.tz, zoneinfo.ZoneInfo):
            extra_metadata = {'timezone': dtype.tz.key}
        else:
            try:
                stz = str(dtype.tz)
                if "UTC" in stz and ":" in stz:
                    extra_metadata = {'timezone': stz.strip("UTC")}
                elif len(str(stz)) == 3:  # like "UTC", "CET", ...
                    extra_metadata = {'timezone': str(stz)}
                elif getattr(dtype.tz, "zone", False):
                    extra_metadata = {'timezone': dtype.tz.zone}
                elif "pytz" not in stz:
                    pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz)
                    extra_metadata = {'timezone': stz}
                elif "Offset" in stz:
                    extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"}
                else:
                    raise KeyError
            except Exception as e:
                raise ValueError("Time-zone information could not be serialised: "
                                "%s, please use another" % str(dtype.tz)) from e
    else:
        extra_metadata = None

    if isinstance(name, tuple):
        name = str(name)
    elif not isinstance(name, str):
        raise TypeError(
            'Column name must be a string. Got column {} of type {}'.format(
                name, type(name).__name__
            )
        )

    return {
        'name': name,
        'field_name': name,
        'pandas_type': {
            'string': 'unicode',
            'datetime64': (
                'datetimetz' if hasattr(dtype, 'tz')
                else 'datetime'
            ),
            'integer': str(dtype),
            'floating': str(dtype),
        }.get(inferred_dtype, inferred_dtype),
        'numpy_type': get_numpy_type(dtype),
        'metadata': extra_metadata,
    }


def get_numpy_type(dtype):
    if isinstance(dtype, pd.CategoricalDtype):
        return 'category'
    elif "Int" in str(dtype):
        return str(dtype).lower()
    elif str(dtype) == "boolean":
        return "bool"
    elif str(dtype) == "string":
        return "object"
    else:
        return str(dtype)


def get_file_scheme(paths):
    """For the given row groups, figure out if the partitioning scheme

    Parameters
    ----------
    paths: list of str
        normally from row_group.columns[0].file_path

    Returns
    -------
    'empty': no rgs at all
    'simple': all rgs in a single file
    'flat': multiple files in one directory
    'hive': directories are all `key=value`; all files are at the same
        directory depth
    'drill': assume directory names are labels, and field names are of the
        form dir0, dir1; all files are at the same directory depth
    'other': none of the above, assume no partitioning
    """
    if not paths:
        return 'empty'
    if set(paths) == {None}:
        return 'simple'
    if None in paths:
        return 'other'
    parts = [p.split('/') for p in paths]
    lens = [len(p) for p in parts]
    if len(set(lens)) > 1:
        return 'other'
    if set(lens) == {1}:
        return 'flat'
    matches = all(all("=" in p[1:-1] for p in part[:-1]) for part in parts)
    return "hive" if matches else "drill"


def join_path(*path):
    return "/".join([str(p).replace("\\", "/").rstrip("/") for p in path if p])


def _strip_path_tail(paths) -> set:
    return {path.rsplit("/", 1)[0] if "/" in path else "" for path in paths}


ops = {
    "==": operator.eq,
    "=": operator.eq,
    "!=": operator.ne,
    ">": operator.gt,
    ">=": operator.ge,
    "<": operator.lt,
    "<=": operator.le
}


def norm_col_name(name, is_index:bool=None):
    if isinstance(name, tuple):
        if is_index:
            return name[0]
        else:
            return str(name)
    return name


def get_fs(fn, open_with, mkdirs):
    fs = None
    if "FastParquetImpl.write.<locals>.<lambda>" in str(open_with):
        import inspect
        so = inspect.getclosurevars(open_with).nonlocals["storage_options"] or {}
        fs, fn = fsspec.core.url_to_fs(fn, **so)
        open_with = fs.open
        mkdirs = mkdirs or (lambda d: fs.mkdirs(d, exist_ok=True))
    return fs, fn, open_with, mkdirs