# Source: AI-Stock-Trader/WebServer/AIPython/python/lib/python3.11/site-packages/fastparquet/schema.py
# (repository-viewer listing: 204 lines, 6.9 KiB, Python)

"""Utils for working with the parquet thrift models."""
from collections import OrderedDict
from fastparquet import parquet_thrift
def schema_tree(schema, i=0):
    """Link a flat list of schema elements into a tree, in place.

    Starting at ``schema[i]``, an ordered ``"children"`` mapping (child name
    -> child element) is attached to that element and, recursively, to every
    descendant that declares a non-zero ``num_children``. Children are read
    from the positions immediately following their parent.

    Parameters
    ----------
    schema : sequence
        Schema elements exposing ``name`` and ``num_children`` attributes and
        supporting ``element["children"]`` item access.
    i : int
        Index of the element to use as the root of this (sub)tree.

    Returns
    -------
    int
        The index of the last element consumed when the root has children,
        otherwise ``i + 1``.
    """
    root = schema[i]
    root["children"] = OrderedDict()
    children = root["children"]
    # A leaf may report ``num_children`` as None; treat it as zero so the
    # comparison below cannot raise TypeError.
    expected = root.num_children or 0
    while len(children) < expected:
        i += 1
        child = schema[i]
        children[child.name] = child
        if child.num_children not in [None, 0]:
            # The recursive call hands back the index of the last element it
            # consumed, so the next sibling is read from the slot after it.
            i = schema_tree(schema, i)
    if root.num_children:
        return i
    return i + 1
def schema_to_text(root, indent=None):
    """Render the schema tree rooted at ``root`` as indented text.

    Each element becomes one line of the form ``<prefix>- <name>: <parts>``
    where ``parts`` lists, comma separated, the physical type, the logical
    type, the converted type and the repetition type that are set.

    Parameters
    ----------
    root : schema element
        Element to render; its ``"children"`` (if present) are rendered
        recursively on the following lines.
    indent : iterable of str, optional
        Prefix fragments inherited from the ancestors. It is not modified.

    Returns
    -------
    str
        The multi-line text, without a trailing newline.
    """
    # A fresh list per call: a shared mutable default would stay polluted if
    # an exception interrupted the recursion.
    prefix = [] if indent is None else list(indent)

    parts = []
    if root.type is not None:
        parts.append(parquet_thrift.Type._VALUES_TO_NAMES[root.type])
    logical = root.logicalType
    if logical is not None:
        # NOTE(review): relies on dir() listing only the logical-type field
        # names (presumably a custom __dir__) - confirm.
        for key in dir(logical):
            if getattr(logical, key) is None:
                continue
            if key == "TIMESTAMP":
                unit_obj = logical.TIMESTAMP.unit
                unit = [k for k in dir(unit_obj)
                        if getattr(unit_obj, k) is not None][0]
                parts.append(f"TIMESTAMP[{unit}]")
            else:
                # extra parameters possible here
                parts.append(key)
            # Only the first attribute that is set is reported.
            break
    if root.converted_type is not None:
        parts.append(
            parquet_thrift.ConvertedType._VALUES_TO_NAMES[root.converted_type])
    if root.repetition_type is not None:
        parts.append(
            parquet_thrift.FieldRepetitionType._VALUES_TO_NAMES[
                root.repetition_type])

    lines = ["".join(prefix) + '- ' + root.name + ": " + ', '.join(parts)]
    if hasattr(root, 'children'):
        children = list(root["children"].values())
        last = len(children) - 1
        for pos, child in enumerate(children):
            # Below the last child no vertical bar is drawn; the blank
            # fragment has the same width as '| ' so deeper levels stay
            # aligned.
            fragment = '  ' if pos == last else '| '
            lines.append(schema_to_text(child, prefix + [fragment]))
    return '\n'.join(lines)
def flatten(schema, root, name_parts=None):
    """Register nested elements on ``root`` under dotted path names.

    Walks ``schema["children"]`` and stores entries in ``root["children"]``
    keyed by the ``'.'``-joined path of names leading to them:

    * a child without children of its own is stored directly;
    * a child whose converted type is LIST or MAP is stored as a whole;
    * any other child group is recursed into and then tagged with
      ``item["isflat"] = True``.

    Nothing below a group whose repetition type is REPEATED is registered.

    Parameters
    ----------
    schema : schema element
        Element whose children are examined; ignored if it has no
        ``children`` attribute.
    root : schema element
        Element whose ``"children"`` mapping receives the dotted entries.
    name_parts : list of str, optional
        Names of the ancestors between ``root`` and ``schema``.

    Returns
    -------
    None
    """
    if not hasattr(schema, 'children'):
        return
    prefix = [] if name_parts is None else list(name_parts)
    if schema is not root:
        prefix = prefix + [schema.name]
    # The repetition type of ``schema`` does not change while iterating, so
    # test it once instead of once per child.
    if schema.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
        return
    whole_groups = [parquet_thrift.ConvertedType.LIST,
                    parquet_thrift.ConvertedType.MAP]
    # Iterate over a copy: when ``schema`` is ``root`` the mapping being
    # walked is the same one that receives new keys.
    for name, item in schema["children"].copy().items():
        is_leaf = len(getattr(item, 'children', [])) == 0
        if is_leaf or item.converted_type in whole_groups:
            root["children"]['.'.join(prefix + [name])] = item
        else:
            flatten(item, root, prefix)
            # NOTE(review): tagging only the groups that were recursed into
            # is reconstructed from context - confirm against upstream.
            item["isflat"] = True
class SchemaHelper(object):
    """Utility providing convenience methods for schema_elements."""

    # Defining __eq__ already makes instances unhashable; say so explicitly.
    __hash__ = None

    def __init__(self, schema_elements):
        """Initialize with the specified schema_elements.

        Element names given as bytes are decoded to ``str`` in place, the
        first element becomes ``self.root``, and the elements are linked
        into a tree (``schema_tree``) which is then flattened onto the root
        (``flatten``).
        """
        self.schema_elements = schema_elements
        for se in schema_elements:
            try:
                se.name = se.name.decode()
            except AttributeError:
                pass  # already a str
        self.root = schema_elements[0]
        self.schema_elements_by_name = {se.name: se for se in schema_elements}
        schema_tree(schema_elements)
        self._text = None
        flatten(self.root, self.root)

    @property
    def text(self):
        """Text rendering of the schema, computed once and then cached."""
        if self._text is None:
            self._text = schema_to_text(self.schema_elements[0])
        return self._text

    def __eq__(self, other):
        """Compare by ``schema_elements``; defer for unrelated objects."""
        try:
            theirs = other.schema_elements
        except AttributeError:
            # Let Python try the reflected operation instead of crashing.
            return NotImplemented
        return self.schema_elements == theirs

    def __ne__(self, other):
        """Inverse of ``__eq__``, passing ``NotImplemented`` through."""
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def __str__(self):
        return self.text

    def __repr__(self):
        return f"<Parquet Schema with {len(self.schema_elements)} entries>"

    def _path_elements(self, name):
        """Lazily yield each element met while descending from the root.

        ``name`` is a dotted string or an iterable of name parts. A missing
        part raises ``KeyError`` only once the walk reaches it.
        """
        if isinstance(name, str):
            name = name.split('.')
        element = self.root
        for part in name:
            element = element["children"][part]
            yield element

    def schema_element(self, name):
        """Get the schema element with the given name or path"""
        element = self.root
        if isinstance(name, str):
            name = name.split('.')
        for part in name:
            element = element["children"][part]
        return element

    def is_required(self, name):
        """Return true if the schema element with the given name is required.

        Every element along the path must have repetition type REQUIRED; the
        walk stops at the first one that does not.
        """
        required = parquet_thrift.FieldRepetitionType.REQUIRED
        return not any(element.repetition_type != required
                       for element in self._path_elements(name))

    def max_repetition_level(self, parts):
        """Get the max repetition level for the given schema path.

        This is the number of elements along the path whose repetition type
        is REPEATED.
        """
        repeated = parquet_thrift.FieldRepetitionType.REPEATED
        return sum(1 for element in self._path_elements(parts)
                   if element.repetition_type == repeated)

    def max_definition_level(self, parts):
        """Get the max definition level for the given schema path.

        This is the number of elements along the path whose repetition type
        is anything other than REQUIRED.
        """
        required = parquet_thrift.FieldRepetitionType.REQUIRED
        return sum(1 for element in self._path_elements(parts)
                   if element.repetition_type != required)
def _is_list_like(helper, column):
if len(column.meta_data.path_in_schema) < 3:
return False
se = helper.schema_element(
column.meta_data.path_in_schema[:-2])
ct = se.converted_type
if ct != parquet_thrift.ConvertedType.LIST:
return False
if len(se["children"]) > 1:
return False
se2 = list(se["children"].values())[0]
if len(se2["children"]) > 1:
return False
if se2.repetition_type != parquet_thrift.FieldRepetitionType.REPEATED:
return False
se3 = list(se2["children"].values())[0]
if se3.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
return False
return True
def _is_map_like(helper, column):
if len(column.meta_data.path_in_schema) < 3:
return False
se = helper.schema_element(
column.meta_data.path_in_schema[:-2])
ct = se.converted_type
if ct != parquet_thrift.ConvertedType.MAP:
return False
if len(se["children"]) > 1:
return False
se2 = list(se["children"].values())[0]
if len(se2["children"]) != 2:
return False
if se2.repetition_type != parquet_thrift.FieldRepetitionType.REPEATED:
return False
if set(se2["children"]) != {'key', 'value'}:
return False
se3 = se2["children"]['key']
if se3.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED:
return False
se3 = se2["children"]['value']
if se3.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
return False
return True