import re from collections import OrderedDict from packaging.version import Version import numpy as np from pandas import ( Categorical, DataFrame, Series, CategoricalIndex, RangeIndex, Index, MultiIndex, DatetimeIndex, CategoricalDtype, DatetimeTZDtype ) from pandas.core.arrays.masked import BaseMaskedDtype import warnings from fastparquet.util import PANDAS_VERSION class Dummy(object): pass def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None, columns_dtype=None): """ Create empty DataFrame to assign into In the simplest case, will return a Pandas dataframe of the given size, with columns of the given names and types. The second return value `views` is a dictionary of numpy arrays into which you can assign values that show up in the dataframe. For categorical columns, you get two views to assign into: if the column name is "col", you get both "col" (the category codes) and "col-catdef" (the category labels). For a single categorical index, you should use the `.set_categories` method of the appropriate "-catdef" columns, passing an Index of values ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)`` Multi-indexes work a lot like categoricals, even if the types of each index are not themselves categories, and will also have "-catdef" entries in the views. However, these will be Dummy instances, providing only a ``.set_categories`` method, to be used as above. Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string of None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo]} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integers, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. index_types: list of str For one of more index columns, make them have this type. See general description, above, for caveats about multi-indexing. If None, the index will be the default RangeIndex. index_names: list of str Names of the index column(s), if using timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. file_has_columns: bool, default False for files that are filtered but had columns before Returns ------- - dataframe with correct shape and data-types - list of numpy views, in order, of the columns of the dataframe. Assign to this. """ views = {} timezones = timezones or {} if isinstance(types, str): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[str(col)] = Categorical.from_codes([], categories=cat(col)) elif isinstance(t, BaseMaskedDtype): # pandas masked types arr_type = t.construct_array_type() df[str(col)] = arr_type( values=np.empty(0, dtype=t.numpy_dtype), mask=np.empty(0, dtype=np.bool_), copy=False ) else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base if ("M" in str(t) or "time" in str(t)) and "[" not in str(t): t = str(t) + "[ns]" d = np.empty(0, dtype=t) if d.dtype.kind == "M" and str(col) in timezones: try: z = tz_to_dt_tz(timezones[str(col)]) d = Series(d).dt.tz_localize(z) except: warnings.warn("Inferring time-zone from %s in column %s " "failed, using time-zone-agnostic" "" % (timezones[str(col)], col)) df[str(col)] = d columns = Index(df.keys(), dtype=columns_dtype) if columns_dtype is not None else None df = DataFrame(df, columns=columns) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': # https://github.com/dask/fastparquet/issues/576#issuecomment-805579337 temp = Categorical.from_codes([], categories=cat(col)) vals = np.zeros(size, dtype=temp.codes.dtype) c = Categorical.from_codes(vals, dtype=temp.dtype) index = CategoricalIndex(c) views[col] = vals views[col+'-catdef'] = index._data else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base # Initialize datetime index to zero: uninitialized data might fail # validation due to being an out-of-bounds datetime. xref # https://github.com/dask/fastparquet/issues/778 dtype = np.dtype(t) if dtype.kind == "M": d = np.zeros(size, dtype=dtype) # 1) create the DatetimeIndex in UTC as no datetime conversion is needed and # it works with d uninitialised data (no NonExistentTimeError or AmbiguousTimeError) # 2) convert to timezone (if UTC=noop, if None=remove tz, if other=change tz) if str(col) in timezones: index = DatetimeIndex(d, tz="UTC").tz_convert( tz_to_dt_tz(timezones[str(col)])) else: index = DatetimeIndex(d, tz=None) d = index._data._ndarray else: d = np.empty(size, dtype=dtype) index = Index(d) views[col] = d else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() index._codes = list() index._names = list(index_names) for i, col in enumerate(index_names): index._levels.append(Index([None])) def set_cats(values, i=i, col=col, **kwargs): values.name = col if index._levels[i][0] is None: index._levels[i] = values elif not index._levels[i].equals(values): raise RuntimeError("Different dictionaries encountered" " while building categorical") x = Dummy() x._set_categories = set_cats x._multiindex = True d = np.zeros(size, dtype=int) if PANDAS_VERSION >= Version("0.24.0"): index._codes = list(index._codes) + [d] else: index._labels.append(d) views[col] = d views[col+'-catdef'] = x # Patch our blocks with desired-length arrays. Kids: don't try this at home. mgr = df._mgr for block in mgr.blocks: bvalues = block.values shape = list(bvalues.shape) shape[-1] = size if isinstance(bvalues, Categorical): code = np.full(fill_value=-1, shape=shape, dtype=bvalues.codes.dtype) values = Categorical.from_codes(codes=code, dtype=bvalues.dtype) elif isinstance(bvalues.dtype, DatetimeTZDtype): dt = "M8[ns]" if PANDAS_VERSION.major < 2 else f'M8[{bvalues.dtype.unit}]' values = np.zeros(shape=shape, dtype=dt) values = type(bvalues)._from_sequence(values.view("int64"), copy=False, dtype=bvalues.dtype) else: if not isinstance(bvalues, np.ndarray): # e.g. DatetimeLikeBlock backed by DatetimeArray/TimedeltaArray if bvalues.dtype.kind == "m": dt = "m8[ns]" if PANDAS_VERSION.major < 2 else bvalues.dtype values = np.zeros(shape=shape, dtype=dt) values = type(bvalues)._from_sequence(values.view("int64"), copy=False, dtype=bvalues.dtype) elif bvalues.dtype.kind == "M": dt = "M8[ns]" if PANDAS_VERSION.major < 2 else bvalues.dtype values = np.zeros(shape=shape, dtype=dt) values = type(bvalues)._from_sequence(values.view("int64"), copy=False, dtype=bvalues.dtype) elif str(bvalues.dtype)[0] in {"I", "U"} or str(bvalues.dtype) == "boolean": arr_type = bvalues.dtype.construct_array_type() values = arr_type( values=np.empty(size, dtype=bvalues.dtype.numpy_dtype), mask=np.zeros(size, dtype=np.bool_) ) else: raise NotImplementedError else: values = np.empty(shape=shape, dtype=bvalues.dtype) block.values = values mgr.axes[-1] = index # create views for block in df._mgr.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if isinstance(dtype, CategoricalDtype): views[col] = block.values._codes views[col+'-catdef'] = block.values elif getattr(block.dtype, 'tz', None): arr = block.values._ndarray if len(arr.shape) > 1: # pandas >= 1.3 does this for some reason arr = arr.squeeze(axis=0) views[col] = arr elif str(dtype)[0] in {"I", "U"} or str(dtype) == "boolean": views[col] = block.values else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views def tz_to_dt_tz(z): if ":" in z: import datetime hours, mins = z.split(":", 1) sign = z.startswith("-") z = int(hours) * 3600 z += (1, -1)[sign] * int(mins) * 60 z = datetime.timezone(datetime.timedelta(seconds=z)) return z