Import python venv for stability
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
# ruff: noqa: TC004
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# import modules that have public classes/functions
|
||||
from pandas.io import (
|
||||
formats,
|
||||
json,
|
||||
stata,
|
||||
)
|
||||
|
||||
# mark only those modules as public
|
||||
__all__ = ["formats", "json", "stata"]
|
||||
@@ -0,0 +1,169 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Literal,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat import (
|
||||
pa_version_under18p0,
|
||||
pa_version_under19p0,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
|
||||
import pandas as pd
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Hashable,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
import pyarrow
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeArg,
|
||||
DtypeBackend,
|
||||
)
|
||||
|
||||
|
||||
def _arrow_dtype_mapping() -> dict:
    """
    Build the pyarrow-type to pandas-extension-dtype lookup table.

    Returns
    -------
    dict
        Keys are pyarrow data types (signed/unsigned integers, bool,
        float32/float64, string and large_string); values are the
        corresponding pandas extension dtypes. A new dict is created on
        every call, so callers may add entries to it.
    """
    pa = import_optional_dependency("pyarrow")
    return {
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.bool_(): pd.BooleanDtype(),
        # ``pa.string()`` used to be listed twice in this literal; a single
        # entry yields exactly the same dict.
        pa.string(): pd.StringDtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.large_string(): pd.StringDtype(),
    }
|
||||
|
||||
|
||||
def _arrow_string_types_mapper() -> Callable:
    """
    Return a ``types_mapper`` callable for pyarrow string types.

    The callable maps ``pa.string()`` and ``pa.large_string()`` (plus
    ``pa.string_view()`` when ``pa_version_under18p0`` is false) to
    ``pd.StringDtype(na_value=np.nan)`` and returns ``None`` for any other
    pyarrow type.
    """
    pa = import_optional_dependency("pyarrow")

    string_types = [pa.string(), pa.large_string()]
    if not pa_version_under18p0:
        string_types.append(pa.string_view())

    lookup = {
        arrow_type: pd.StringDtype(na_value=np.nan) for arrow_type in string_types
    }
    return lookup.get
|
||||
|
||||
|
||||
def _lossless_int_types_mapper(pa) -> Callable:
    """Return a ``types_mapper`` sending pyarrow int8-int64 to pandas IntegerDtype."""
    return {
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
    }.get


def arrow_table_to_pandas(
    table: pyarrow.Table,
    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
    null_to_int64: bool = False,
    to_pandas_kwargs: dict | None = None,
    dtype: DtypeArg | None = None,
    names: Sequence[Hashable] | None = None,
) -> pd.DataFrame:
    """
    Convert a pyarrow Table to a DataFrame, honouring ``dtype_backend``.

    Parameters
    ----------
    table : pyarrow.Table
        The table to convert.
    dtype_backend : {"numpy_nullable", "pyarrow", "numpy"} or lib.no_default
        Selects the ``types_mapper`` handed to ``Table.to_pandas``:

        * ``"numpy_nullable"``: the mapping from ``_arrow_dtype_mapping``.
        * ``"pyarrow"``: ``pd.ArrowDtype``.
        * otherwise, when ``using_string_dtype()`` is true: the string mapper
          if ``pa_version_under19p0``, else the integer mapper when ``dtype``
          is given, else no mapper.
        * ``lib.no_default`` / ``"numpy"``: the integer mapper when ``dtype``
          is given, else no mapper.
    null_to_int64 : bool, default False
        With ``"numpy_nullable"``, additionally map the pyarrow null type to
        ``Int64`` (to match other engines - only for the CSV parser).
    to_pandas_kwargs : dict, optional
        Extra keyword arguments forwarded to ``Table.to_pandas``.
    dtype : dtype or dict of dtypes, optional
        User-requested dtype(s), applied by ``_post_convert_dtypes``.
    names : sequence of hashable, optional
        Column names, forwarded to ``_post_convert_dtypes``.

    Returns
    -------
    DataFrame

    Raises
    ------
    NotImplementedError
        If ``dtype_backend`` is not one of the handled values and
        ``using_string_dtype()`` is false.
    """
    pa = import_optional_dependency("pyarrow")

    to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs

    types_mapper: type[pd.ArrowDtype] | None | Callable
    if dtype_backend == "numpy_nullable":
        mapping = _arrow_dtype_mapping()
        if null_to_int64:
            # Modify the default mapping to also map null to Int64
            # (to match other engines - only for CSV parser)
            mapping[pa.null()] = pd.Int64Dtype()
        types_mapper = mapping.get
    elif dtype_backend == "pyarrow":
        types_mapper = pd.ArrowDtype
    elif using_string_dtype():
        if pa_version_under19p0:
            types_mapper = _arrow_string_types_mapper()
        elif dtype is not None:
            # GH#56136 Avoid lossy conversion to float64.
            # _post_convert_dtypes converts the integer columns the user did
            # not ask for back to their numpy dtype afterwards.
            types_mapper = _lossless_int_types_mapper(pa)
        else:
            types_mapper = None
    elif dtype_backend is lib.no_default or dtype_backend == "numpy":
        if dtype is not None:
            # GH#56136 Avoid lossy conversion to float64 (see above).
            types_mapper = _lossless_int_types_mapper(pa)
        else:
            types_mapper = None
    else:
        raise NotImplementedError

    df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
    return _post_convert_dtypes(df, dtype_backend, dtype, names)
|
||||
|
||||
|
||||
def _post_convert_dtypes(
    df: pd.DataFrame,
    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
    dtype: DtypeArg | None,
    names: Sequence[Hashable] | None,
) -> pd.DataFrame:
    """
    Apply the user-requested ``dtype`` to a freshly converted DataFrame.

    Nothing happens unless ``dtype`` is given and ``dtype_backend`` is
    ``lib.no_default`` or ``"numpy"``.

    Parameters
    ----------
    df : DataFrame
        Frame to convert. When ``dtype`` is a dict and ``names`` is given,
        its columns are renamed in place to ``names``.
    dtype_backend : {"numpy_nullable", "pyarrow", "numpy"} or lib.no_default
        Backend requested by the caller.
    dtype : dtype or dict of dtypes, optional
        Target dtype(s). A dict passed by the caller is never modified.
    names : sequence of hashable, optional
        Replacement column labels, only used when ``dtype`` is a dict.

    Returns
    -------
    DataFrame
        ``df`` itself when no conversion applies, else the ``astype`` result.

    Raises
    ------
    ValueError
        If ``DataFrame.astype`` raises ``TypeError``.
    """
    if dtype is not None and (
        dtype_backend is lib.no_default or dtype_backend == "numpy"
    ):
        # GH#56136 apply any user-provided dtype, and convert any IntegerDtype
        #  columns the user didn't explicitly ask for.
        if isinstance(dtype, dict):
            if names is not None:
                df.columns = names

            # Work on a copy: the original code added keys to the mapping the
            # caller passed in, leaking implementation details back to them.
            requested = dict(dtype)

            cmp_dtypes = {
                pd.Int8Dtype(),
                pd.Int16Dtype(),
                pd.Int32Dtype(),
                pd.Int64Dtype(),
            }
            for col in df.columns:
                if col not in requested and df[col].dtype in cmp_dtypes:
                    # Any key that the user didn't explicitly specify
                    #  that got converted to IntegerDtype now gets converted
                    #  to numpy dtype.
                    requested[col] = df[col].dtype.numpy_dtype

            # Ignore non-existent columns from dtype mapping
            # like other parsers do
            dtype = {
                key: pandas_dtype(value)
                for key, value in requested.items()
                if key in df.columns
            }

        else:
            dtype = pandas_dtype(dtype)

        try:
            df = df.astype(dtype)
        except TypeError as err:
            # GH#44901 reraise to keep api consistent
            raise ValueError(str(err)) from err

    return df
|
||||
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Data I/O API
|
||||
"""
|
||||
|
||||
from pandas.io.clipboards import read_clipboard
|
||||
from pandas.io.excel import (
|
||||
ExcelFile,
|
||||
ExcelWriter,
|
||||
read_excel,
|
||||
)
|
||||
from pandas.io.feather_format import read_feather
|
||||
from pandas.io.html import read_html
|
||||
from pandas.io.iceberg import read_iceberg
|
||||
from pandas.io.json import read_json
|
||||
from pandas.io.orc import read_orc
|
||||
from pandas.io.parquet import read_parquet
|
||||
from pandas.io.parsers import (
|
||||
read_csv,
|
||||
read_fwf,
|
||||
read_table,
|
||||
)
|
||||
from pandas.io.pickle import (
|
||||
read_pickle,
|
||||
to_pickle,
|
||||
)
|
||||
from pandas.io.pytables import (
|
||||
HDFStore,
|
||||
read_hdf,
|
||||
)
|
||||
from pandas.io.sas import read_sas
|
||||
from pandas.io.spss import read_spss
|
||||
from pandas.io.sql import (
|
||||
read_sql,
|
||||
read_sql_query,
|
||||
read_sql_table,
|
||||
)
|
||||
from pandas.io.stata import read_stata
|
||||
from pandas.io.xml import read_xml
|
||||
|
||||
__all__ = [
|
||||
"ExcelFile",
|
||||
"ExcelWriter",
|
||||
"HDFStore",
|
||||
"read_clipboard",
|
||||
"read_csv",
|
||||
"read_excel",
|
||||
"read_feather",
|
||||
"read_fwf",
|
||||
"read_hdf",
|
||||
"read_html",
|
||||
"read_iceberg",
|
||||
"read_json",
|
||||
"read_orc",
|
||||
"read_parquet",
|
||||
"read_pickle",
|
||||
"read_sas",
|
||||
"read_spss",
|
||||
"read_sql",
|
||||
"read_sql_query",
|
||||
"read_sql_table",
|
||||
"read_stata",
|
||||
"read_table",
|
||||
"read_xml",
|
||||
"to_pickle",
|
||||
]
|
||||
+747
@@ -0,0 +1,747 @@
|
||||
"""
|
||||
Pyperclip
|
||||
|
||||
A cross-platform clipboard module for Python,
|
||||
with copy & paste functions for plain text.
|
||||
By Al Sweigart al@inventwithpython.com
|
||||
Licence at LICENSES/PYPERCLIP_LICENSE
|
||||
|
||||
Usage:
|
||||
import pyperclip
|
||||
pyperclip.copy('The text to be copied to the clipboard.')
|
||||
spam = pyperclip.paste()
|
||||
|
||||
if not pyperclip.is_available():
|
||||
print("Copy functionality unavailable!")
|
||||
|
||||
On Windows, no additional modules are needed.
|
||||
On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli
|
||||
commands. (These commands should come with OS X.).
|
||||
On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via
|
||||
package manager.
|
||||
For example, in Debian:
|
||||
sudo apt-get install xclip
|
||||
sudo apt-get install xsel
|
||||
sudo apt-get install wl-clipboard
|
||||
|
||||
Otherwise on Linux, you will need the PyQt5 modules installed.
|
||||
|
||||
This module does not work with PyGObject yet.
|
||||
|
||||
Cygwin is currently not supported.
|
||||
|
||||
Security Note: This module runs programs with these names:
|
||||
- pbcopy
|
||||
- pbpaste
|
||||
- xclip
|
||||
- xsel
|
||||
- wl-copy/wl-paste
|
||||
- klipper
|
||||
- qdbus
|
||||
A malicious user could rename or add programs with these names, tricking
|
||||
Pyperclip into running them with whatever permissions the Python process has.
|
||||
|
||||
"""
|
||||
|
||||
__version__ = "1.8.2"
|
||||
|
||||
|
||||
import contextlib
|
||||
import ctypes
|
||||
from ctypes import (
|
||||
c_size_t,
|
||||
c_wchar,
|
||||
c_wchar_p,
|
||||
get_errno,
|
||||
sizeof,
|
||||
)
|
||||
import os
|
||||
import platform
|
||||
from shutil import which as _executable_exists
|
||||
import subprocess
|
||||
import time
|
||||
import warnings
|
||||
|
||||
from pandas.errors import (
|
||||
PyperclipException,
|
||||
PyperclipWindowsException,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
# `import PyQt4` sys.exit()s if DISPLAY is not in the environment.
|
||||
# Thus, we need to detect the presence of $DISPLAY manually
|
||||
# and not load PyQt4 if it is absent.
|
||||
HAS_DISPLAY = os.getenv("DISPLAY")
|
||||
|
||||
EXCEPT_MSG = """
|
||||
Pyperclip could not find a copy/paste mechanism for your system.
|
||||
For more information, please visit
|
||||
https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error
|
||||
"""
|
||||
|
||||
ENCODING = "utf-8"
|
||||
|
||||
|
||||
class PyperclipTimeoutException(PyperclipException):
    """Clipboard wait exceeded its timeout; a ``PyperclipException`` subclass."""
|
||||
|
||||
|
||||
def _stringifyText(text) -> str:
    """
    Coerce a clipboard payload to ``str``.

    Only ``str``, ``int``, ``float`` and ``bool`` values are accepted; any
    other type raises ``PyperclipException``.
    """
    if isinstance(text, (str, int, float, bool)):
        return str(text)

    rejected = type(text).__name__
    raise PyperclipException(
        "only str, int, float, and bool values "
        f"can be copied to the clipboard, not {rejected}"
    )
|
||||
|
||||
|
||||
def init_osx_pbcopy_clipboard():
    """Return the ``(copy, paste)`` pair that shells out to ``pbcopy``/``pbpaste``."""

    def copy_osx_pbcopy(text):
        # Non-str values are converted to str before being piped in.
        text = _stringifyText(text)
        command = ["pbcopy", "w"]
        with subprocess.Popen(command, stdin=subprocess.PIPE, close_fds=True) as proc:
            proc.communicate(input=text.encode(ENCODING))

    def paste_osx_pbcopy():
        command = ["pbpaste", "r"]
        with subprocess.Popen(command, stdout=subprocess.PIPE, close_fds=True) as proc:
            captured = proc.communicate()[0]
        return captured.decode(ENCODING)

    return copy_osx_pbcopy, paste_osx_pbcopy
|
||||
|
||||
|
||||
def init_osx_pyobjc_clipboard():
    """
    Return the ``(copy, paste)`` pair built on the ``Foundation``/``AppKit``
    modules, which are looked up as module globals when the pair is called.
    """

    def copy_osx_pyobjc(text):
        """Copy string argument to clipboard"""
        # Non-str values are converted to str first.
        as_str = _stringifyText(text)
        ns_string = Foundation.NSString.stringWithString_(as_str).nsstring()
        payload = ns_string.dataUsingEncoding_(Foundation.NSUTF8StringEncoding)
        pasteboard = AppKit.NSPasteboard.generalPasteboard()
        pasteboard.declareTypes_owner_([AppKit.NSStringPboardType], None)
        pasteboard.setData_forType_(payload, AppKit.NSStringPboardType)

    def paste_osx_pyobjc():
        """Returns contents of clipboard"""
        pasteboard = AppKit.NSPasteboard.generalPasteboard()
        return pasteboard.stringForType_(AppKit.NSStringPboardType)

    return copy_osx_pyobjc, paste_osx_pyobjc
|
||||
|
||||
|
||||
def init_qt_clipboard():
    """
    Return the ``(copy, paste)`` pair backed by a Qt ``QApplication``.

    ``QApplication`` is imported from qtpy, then PyQt5, then PyQt4 (the first
    that imports wins) and published as a module global. An already running
    application instance is reused; otherwise a new one is created.
    """
    global QApplication
    # $DISPLAY should exist

    # Try to import from qtpy, but if that fails try PyQt5 then PyQt4
    try:
        from qtpy.QtWidgets import QApplication
    except ImportError:
        try:
            from PyQt5.QtWidgets import QApplication
        except ImportError:
            from PyQt4.QtGui import QApplication

    qt_app = QApplication.instance()
    if qt_app is None:
        qt_app = QApplication([])

    def copy_qt(text):
        # Non-str values are converted to str first.
        qt_app.clipboard().setText(_stringifyText(text))

    def paste_qt() -> str:
        return str(qt_app.clipboard().text())

    return copy_qt, paste_qt
|
||||
|
||||
|
||||
def init_xclip_clipboard():
    """
    Return the ``(copy, paste)`` pair that shells out to ``xclip``.

    Both callables take ``primary=False``; when true the ``"p"`` selection is
    used instead of the default ``"c"`` selection.
    """
    DEFAULT_SELECTION = "c"
    PRIMARY_SELECTION = "p"

    def copy_xclip(text, primary=False):
        # Non-str values are converted to str first.
        text = _stringifyText(text)
        selection = PRIMARY_SELECTION if primary else DEFAULT_SELECTION
        command = ["xclip", "-selection", selection]
        with subprocess.Popen(command, stdin=subprocess.PIPE, close_fds=True) as proc:
            proc.communicate(input=text.encode(ENCODING))

    def paste_xclip(primary=False):
        selection = PRIMARY_SELECTION if primary else DEFAULT_SELECTION
        command = ["xclip", "-selection", selection, "-o"]
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=True,
        ) as proc:
            captured = proc.communicate()[0]
        # Intentionally ignore extraneous output on stderr when clipboard is empty
        return captured.decode(ENCODING)

    return copy_xclip, paste_xclip
|
||||
|
||||
|
||||
def init_xsel_clipboard():
    """
    Return the ``(copy, paste)`` pair that shells out to ``xsel``.

    Both callables take ``primary=False``; when true the ``-p`` flag is passed
    to ``xsel`` instead of the default ``-b``.
    """
    DEFAULT_SELECTION = "-b"
    PRIMARY_SELECTION = "-p"

    def copy_xsel(text, primary=False):
        # Non-str values are converted to str first.
        text = _stringifyText(text)
        flag = PRIMARY_SELECTION if primary else DEFAULT_SELECTION
        command = ["xsel", flag, "-i"]
        with subprocess.Popen(command, stdin=subprocess.PIPE, close_fds=True) as proc:
            proc.communicate(input=text.encode(ENCODING))

    def paste_xsel(primary=False):
        flag = PRIMARY_SELECTION if primary else DEFAULT_SELECTION
        command = ["xsel", flag, "-o"]
        with subprocess.Popen(command, stdout=subprocess.PIPE, close_fds=True) as proc:
            captured = proc.communicate()[0]
        return captured.decode(ENCODING)

    return copy_xsel, paste_xsel
|
||||
|
||||
|
||||
def init_wl_clipboard():
    """
    Return the ``(copy, paste)`` pair that shells out to ``wl-copy``/``wl-paste``.

    Both callables take ``primary=False``; when true ``-p`` is appended to the
    command line.
    """
    PRIMARY_SELECTION = "-p"

    def copy_wl(text, primary=False):
        text = _stringifyText(text)  # Converts non-str values to str.
        args = ["wl-copy"]
        if primary:
            args.append(PRIMARY_SELECTION)
        if not text:
            # Nothing to pipe in: run ``wl-copy --clear`` instead and let
            # check_call raise if it exits with a non-zero status.
            args.append("--clear")
            subprocess.check_call(args, close_fds=True)
        else:
            # Manage the child with ``with`` like every other backend in this
            # module so its pipes are always closed and the process reaped.
            with subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True) as p:
                p.communicate(input=text.encode(ENCODING))

    def paste_wl(primary=False):
        args = ["wl-paste", "-n"]
        if primary:
            args.append(PRIMARY_SELECTION)
        with subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True) as p:
            stdout, _stderr = p.communicate()
        return stdout.decode(ENCODING)

    return copy_wl, paste_wl
|
||||
|
||||
|
||||
def init_klipper_clipboard():
    """
    Return the ``(copy, paste)`` pair that drives Klipper through ``qdbus``.
    """

    def copy_klipper(text):
        # Non-str values are converted to str first.
        text = _stringifyText(text)
        # The payload travels as a command-line argument, not on stdin.
        command = [
            "qdbus",
            "org.kde.klipper",
            "/klipper",
            "setClipboardContents",
            text.encode(ENCODING),
        ]
        with subprocess.Popen(command, stdin=subprocess.PIPE, close_fds=True) as proc:
            proc.communicate(input=None)

    def paste_klipper():
        command = ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"]
        with subprocess.Popen(command, stdout=subprocess.PIPE, close_fds=True) as proc:
            raw = proc.communicate()[0]

        # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
        # TODO: https://github.com/asweigart/pyperclip/issues/43
        contents = raw.decode(ENCODING)
        # even if blank, Klipper will append a newline at the end
        assert len(contents) > 0
        # make sure that newline is there
        assert contents.endswith("\n")
        # Drop exactly one trailing newline (the check still matters when
        # asserts are disabled).
        return contents[:-1] if contents.endswith("\n") else contents

    return copy_klipper, paste_klipper
|
||||
|
||||
|
||||
def init_dev_clipboard_clipboard():
    """
    Return the ``(copy, paste)`` pair that reads and writes ``/dev/clipboard``.
    """

    def copy_dev_clipboard(text):
        # Non-str values are converted to str first.
        text = _stringifyText(text)

        # Both conditions only warn; the text is written regardless.
        if text == "":
            warnings.warn(
                "Pyperclip cannot copy a blank string to the clipboard on Cygwin. "
                "This is effectively a no-op.",
                stacklevel=find_stack_level(),
            )
        if "\r" in text:
            warnings.warn(
                "Pyperclip cannot handle \\r characters on Cygwin.",
                stacklevel=find_stack_level(),
            )

        with open("/dev/clipboard", "w", encoding="utf-8") as device:
            device.write(text)

    def paste_dev_clipboard() -> str:
        with open("/dev/clipboard", encoding="utf-8") as device:
            return device.read()

    return copy_dev_clipboard, paste_dev_clipboard
|
||||
|
||||
|
||||
def init_no_clipboard():
    """
    Return a ``(copy, paste)`` pair of placeholders for "no clipboard found".

    Each placeholder is falsy and raises ``PyperclipException`` when called.
    """

    class ClipboardUnavailable:
        """Falsy callable that always raises ``PyperclipException``."""

        def __call__(self, *args, **kwargs):
            raise PyperclipException(EXCEPT_MSG)

        def __bool__(self) -> bool:
            return False

    copy_placeholder = ClipboardUnavailable()
    paste_placeholder = ClipboardUnavailable()
    return copy_placeholder, paste_placeholder
|
||||
|
||||
|
||||
# Windows-related clipboard functions:
|
||||
class CheckedCall:
    """
    Wrap a callable so that a falsy result with a non-zero ``get_errno()``
    raises ``PyperclipWindowsException``.

    Attribute assignments on the wrapper are forwarded to the wrapped
    callable.
    """

    def __init__(self, f) -> None:
        # Store ``f`` on the wrapper itself; our own __setattr__ would
        # forward the assignment to the wrapped callable instead.
        object.__setattr__(self, "f", f)

    def __call__(self, *args):
        result = self.f(*args)
        # errno is only consulted when the call returned a falsy value.
        if result or not get_errno():
            return result
        raise PyperclipWindowsException("Error calling " + self.f.__name__)

    def __setattr__(self, key, value):
        setattr(self.f, key, value)
|
||||
|
||||
|
||||
def init_windows_clipboard():
    """
    Return the ``(copy, paste)`` pair that calls the Win32 clipboard API
    through ``ctypes``.

    The ``ctypes.wintypes`` names imported here are published as module
    globals. Most Win32 functions are wrapped in ``CheckedCall``; assigning
    ``argtypes``/``restype`` on a wrapper sets them on the wrapped function.
    """
    global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
    global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
    from ctypes.wintypes import (
        BOOL,
        DWORD,
        HANDLE,
        HGLOBAL,
        HINSTANCE,
        HMENU,
        HWND,
        INT,
        LPCSTR,
        LPVOID,
        UINT,
    )

    windll = ctypes.windll
    msvcrt = ctypes.CDLL("msvcrt")

    safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
    safeCreateWindowExA.argtypes = [
        DWORD,
        LPCSTR,
        LPCSTR,
        DWORD,
        INT,
        INT,
        INT,
        INT,
        HWND,
        HMENU,
        HINSTANCE,
        LPVOID,
    ]
    safeCreateWindowExA.restype = HWND

    safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
    safeDestroyWindow.argtypes = [HWND]
    safeDestroyWindow.restype = BOOL

    # OpenClipboard is deliberately left unwrapped: clipboard() below retries
    # it and raises its own PyperclipWindowsException on failure.
    OpenClipboard = windll.user32.OpenClipboard
    OpenClipboard.argtypes = [HWND]
    OpenClipboard.restype = BOOL

    safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
    safeCloseClipboard.argtypes = []
    safeCloseClipboard.restype = BOOL

    safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
    safeEmptyClipboard.argtypes = []
    safeEmptyClipboard.restype = BOOL

    safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
    safeGetClipboardData.argtypes = [UINT]
    safeGetClipboardData.restype = HANDLE

    safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
    safeSetClipboardData.argtypes = [UINT, HANDLE]
    safeSetClipboardData.restype = HANDLE

    safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
    safeGlobalAlloc.argtypes = [UINT, c_size_t]
    safeGlobalAlloc.restype = HGLOBAL

    safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
    safeGlobalLock.argtypes = [HGLOBAL]
    safeGlobalLock.restype = LPVOID

    safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
    safeGlobalUnlock.argtypes = [HGLOBAL]
    safeGlobalUnlock.restype = BOOL

    wcslen = CheckedCall(msvcrt.wcslen)
    wcslen.argtypes = [c_wchar_p]
    wcslen.restype = UINT

    GMEM_MOVEABLE = 0x0002
    CF_UNICODETEXT = 13

    @contextlib.contextmanager
    def window():
        """
        Context that provides a valid Windows hwnd.

        The window is destroyed again when the context exits.
        """
        # we really just need the hwnd, so setting "STATIC"
        # as predefined lpClass is just fine.
        hwnd = safeCreateWindowExA(
            0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
        )
        try:
            yield hwnd
        finally:
            safeDestroyWindow(hwnd)

    @contextlib.contextmanager
    def clipboard(hwnd):
        """
        Context manager that opens the clipboard and prevents
        other applications from modifying the clipboard content.

        Raises PyperclipWindowsException if the clipboard could not be
        opened before the retry deadline; closes the clipboard on exit.
        """
        # We may not get the clipboard handle immediately because
        # some other application is accessing it (?)
        # We try for at least 500ms to get the clipboard.
        t = time.time() + 0.5
        success = False
        while time.time() < t:
            success = OpenClipboard(hwnd)
            if success:
                break
            time.sleep(0.01)
        if not success:
            raise PyperclipWindowsException("Error calling OpenClipboard")

        try:
            yield
        finally:
            safeCloseClipboard()

    def copy_windows(text):
        # This function is heavily based on
        # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard

        text = _stringifyText(text)  # Converts non-str values to str.

        with window() as hwnd:
            # http://msdn.com/ms649048
            # If an application calls OpenClipboard with hwnd set to NULL,
            # EmptyClipboard sets the clipboard owner to NULL;
            # this causes SetClipboardData to fail.
            # => We need a valid hwnd to copy something.
            with clipboard(hwnd):
                safeEmptyClipboard()

                # An empty string only empties the clipboard.
                if text:
                    # http://msdn.com/ms649051
                    # If the hMem parameter identifies a memory object,
                    # the object must have been allocated using the
                    # function with the GMEM_MOVEABLE flag.
                    # +1 presumably makes room for the terminating NUL.
                    count = wcslen(text) + 1
                    handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
                    locked_handle = safeGlobalLock(handle)

                    ctypes.memmove(
                        c_wchar_p(locked_handle),
                        c_wchar_p(text),
                        count * sizeof(c_wchar),
                    )

                    safeGlobalUnlock(handle)
                    safeSetClipboardData(CF_UNICODETEXT, handle)

    def paste_windows():
        # Reading does not need an owner window, so hwnd is None here.
        with clipboard(None):
            handle = safeGetClipboardData(CF_UNICODETEXT)
            if not handle:
                # GetClipboardData may return NULL with errno == NO_ERROR
                # if the clipboard is empty.
                # (Also, it may return a handle to an empty buffer,
                # but technically that's not empty)
                return ""
            return c_wchar_p(handle).value

    return copy_windows, paste_windows
|
||||
|
||||
|
||||
def init_wsl_clipboard():
    """
    Return the ``(copy, paste)`` pair that shells out to ``clip.exe`` for
    copying and ``powershell.exe ... Get-Clipboard`` for pasting.
    """

    def copy_wsl(text):
        # Non-str values are converted to str first.
        text = _stringifyText(text)
        command = ["clip.exe"]
        with subprocess.Popen(command, stdin=subprocess.PIPE, close_fds=True) as proc:
            proc.communicate(input=text.encode(ENCODING))

    def paste_wsl():
        command = ["powershell.exe", "-command", "Get-Clipboard"]
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=True,
        ) as proc:
            captured = proc.communicate()[0]
        # WSL appends "\r\n" to the contents.
        trimmed = captured[:-2]
        return trimmed.decode(ENCODING)

    return copy_wsl, paste_wsl
|
||||
|
||||
|
||||
# Automatic detection of clipboard mechanisms
|
||||
# and importing is done in determine_clipboard():
|
||||
def determine_clipboard():
    """
    Determine the OS/platform and return the matching ``(copy, paste)`` pair.

    The checks run in this order: Cygwin (only when ``/dev/clipboard``
    exists), Windows, WSL (Linux with ``wslconfig.exe`` on the path), macOS,
    then - only when ``HAS_DISPLAY`` is set - wl-clipboard, xsel, xclip,
    klipper and Qt. If nothing matches, the pair from ``init_no_clipboard()``
    is returned.

    This function does not assign the module-level ``copy``/``paste`` itself;
    callers do that with the returned pair.
    """
    global Foundation, AppKit, qtpy, PyQt4, PyQt5

    # Setup for the CYGWIN platform:
    if (
        "cygwin" in platform.system().lower()
    ):  # Cygwin has a variety of values returned by platform.system(),
        # such as 'CYGWIN_NT-6.1'
        # FIXME(pyperclip#55): pyperclip currently does not support Cygwin,
        # see https://github.com/asweigart/pyperclip/issues/55
        # Without /dev/clipboard, detection falls through to the checks below.
        if os.path.exists("/dev/clipboard"):
            warnings.warn(
                "Pyperclip's support for Cygwin is not perfect, "
                "see https://github.com/asweigart/pyperclip/issues/55",
                stacklevel=find_stack_level(),
            )
            return init_dev_clipboard_clipboard()

    # Setup for the WINDOWS platform:
    elif os.name == "nt" or platform.system() == "Windows":
        return init_windows_clipboard()

    # Setup for WSL: a Linux system that can see wslconfig.exe.
    if platform.system() == "Linux":
        if _executable_exists("wslconfig.exe"):
            return init_wsl_clipboard()

    # Setup for the macOS platform:
    if os.name == "mac" or platform.system() == "Darwin":
        try:
            import AppKit
            import Foundation  # check if pyobjc is installed
        except ImportError:
            return init_osx_pbcopy_clipboard()
        else:
            return init_osx_pyobjc_clipboard()

    # Setup for the LINUX platform:
    if HAS_DISPLAY:
        if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"):
            return init_wl_clipboard()
        if _executable_exists("xsel"):
            return init_xsel_clipboard()
        if _executable_exists("xclip"):
            return init_xclip_clipboard()
        if _executable_exists("klipper") and _executable_exists("qdbus"):
            return init_klipper_clipboard()

        try:
            # qtpy is a small abstraction layer that lets you write applications
            # using a single api call to either PyQt or PySide.
            # https://pypi.python.org/project/QtPy
            import qtpy  # check if qtpy is installed
        except ImportError:
            # If qtpy isn't installed, fall back on importing PyQt5,
            # and then PyQt4.
            try:
                import PyQt5  # check if PyQt5 is installed
            except ImportError:
                try:
                    import PyQt4  # check if PyQt4 is installed
                except ImportError:
                    pass  # We want to fail fast for all non-ImportError exceptions.
                else:
                    return init_qt_clipboard()
            else:
                return init_qt_clipboard()
        else:
            return init_qt_clipboard()

    return init_no_clipboard()
|
||||
|
||||
|
||||
def set_clipboard(clipboard):
    """
    Explicitly select the clipboard mechanism used by copy() and paste().

    The "clipboard mechanism" is how copy() and paste() talk to the operating
    system. ``clipboard`` must be one of:
     - pbcopy
     - pyobjc (default on macOS)
     - qt
     - xclip
     - xsel
     - wl-clipboard
     - klipper
     - windows (default on Windows)
     - no (this is what is set when no clipboard mechanism can be found)

    Raises ValueError for any other value.
    """
    global copy, paste

    clipboard_types = {
        "pbcopy": init_osx_pbcopy_clipboard,
        "pyobjc": init_osx_pyobjc_clipboard,
        "qt": init_qt_clipboard,  # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5'
        "xclip": init_xclip_clipboard,
        "xsel": init_xsel_clipboard,
        "wl-clipboard": init_wl_clipboard,
        "klipper": init_klipper_clipboard,
        "windows": init_windows_clipboard,
        "no": init_no_clipboard,
    }

    initializer = clipboard_types.get(clipboard)
    if initializer is None:
        allowed = ", ".join(repr(name) for name in clipboard_types)
        raise ValueError(f"Argument must be one of {allowed}")

    # Sets pyperclip's copy() and paste() functions:
    copy, paste = initializer()
|
||||
|
||||
|
||||
def lazy_load_stub_copy(text):
    """
    Placeholder for copy() that resolves the real implementation on first use.

    Calling it runs determine_clipboard(), rebinds the module-level ``copy``
    and ``paste`` to the detected pair, and then forwards ``text`` to the new
    ``copy``.

    Deferring detection lets users import the module and call
    set_clipboard() to choose a mechanism themselves before any automatic
    detection (which might, say, pull in a memory-heavy Qt module) has run.
    If they simply call copy() or paste() first, whatever
    determine_clipboard() picks is used.
    """
    global copy, paste
    detected_pair = determine_clipboard()
    copy, paste = detected_pair
    return copy(text)
|
||||
|
||||
|
||||
def lazy_load_stub_paste():
    """
    Placeholder for paste() that resolves the real implementation on first use.

    Calling it runs determine_clipboard(), rebinds the module-level ``copy``
    and ``paste`` to the detected pair, and then returns the result of the
    new ``paste``.

    Deferring detection lets users import the module and call
    set_clipboard() to choose a mechanism themselves before any automatic
    detection (which might, say, pull in a memory-heavy Qt module) has run.
    If they simply call copy() or paste() first, whatever
    determine_clipboard() picks is used.
    """
    global copy, paste
    detected_pair = determine_clipboard()
    copy, paste = detected_pair
    return paste()
|
||||
|
||||
|
||||
def is_available() -> bool:
    """
    Report whether ``copy`` and ``paste`` have both been replaced.

    Returns False while either still refers to its lazy-loading stub.
    NOTE: the placeholders from init_no_clipboard() are not the stubs, so
    this returns True for them as well.
    """
    still_lazy = copy == lazy_load_stub_copy or paste == lazy_load_stub_paste
    return not still_lazy
|
||||
|
||||
|
||||
# Initially, copy() and paste() are the lazy-loading stubs. The first call to
# either one replaces both module-level names with the pair returned by
# determine_clipboard(); set_clipboard() replaces them as well.
copy, paste = lazy_load_stub_copy, lazy_load_stub_paste
|
||||
|
||||
|
||||
def waitForPaste(timeout=None):
    """Block until the clipboard holds a non-empty text string and return it.

    The clipboard is polled with paste() every 0.01 seconds. If ``timeout``
    (in seconds) is given and elapses first, PyperclipTimeoutException is
    raised; with ``timeout=None`` the call waits indefinitely."""
    began = time.time()
    found = paste()
    while found == "":
        time.sleep(0.01)

        # The deadline is only checked after a sleep, never before the
        # first poll.
        if timeout is not None and time.time() > began + timeout:
            raise PyperclipTimeoutException(
                f"waitForPaste() timed out after {timeout!s} seconds."
            )
        found = paste()
    return found
|
||||
|
||||
|
||||
def waitForNewPaste(timeout=None):
    """Block until the clipboard text changes and return the new text.

    The text present when the function is first called is the baseline; the
    clipboard is then polled with paste() every 0.01 seconds until it
    differs. If ``timeout`` (in seconds) is given and elapses first,
    PyperclipTimeoutException is raised; with ``timeout=None`` the call waits
    indefinitely."""
    began = time.time()
    baseline = paste()
    latest = paste()
    while latest == baseline:
        time.sleep(0.01)

        # The deadline is only checked after a sleep, never before the
        # first comparison.
        if timeout is not None and time.time() > began + timeout:
            raise PyperclipTimeoutException(
                f"waitForNewPaste() timed out after {timeout!s} seconds."
            )
        latest = paste()
    return latest
|
||||
|
||||
|
||||
__all__ = [
|
||||
"copy",
|
||||
"paste",
|
||||
"waitForPaste",
|
||||
"waitForNewPaste",
|
||||
"set_clipboard",
|
||||
"determine_clipboard",
|
||||
]
|
||||
|
||||
# pandas aliases
# NOTE: these are bound once, here, to whatever ``paste``/``copy`` refer to
# at this point in the module; later rebinding of the module-level
# ``copy``/``paste`` names does not change the aliases.
clipboard_get = paste
clipboard_set = copy
|
||||
@@ -0,0 +1,200 @@
|
||||
"""io on the clipboard"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from io import StringIO
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.util._decorators import set_module
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.dtypes.generic import ABCDataFrame
|
||||
|
||||
from pandas import (
|
||||
get_option,
|
||||
option_context,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import DtypeBackend
|
||||
|
||||
|
||||
@set_module("pandas")
def read_clipboard(
    sep: str = r"\s+",
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
):  # pragma: no cover
    r"""
    Read text from clipboard and pass to :func:`~pandas.read_csv`.

    Parses clipboard contents similar to how CSV files are parsed
    using :func:`~pandas.read_csv`.

    Parameters
    ----------
    sep : str, default '\\s+'
        A string or regex delimiter. The default of ``'\\s+'`` denotes
        one or more whitespace characters.

    dtype_backend : {'numpy_nullable', 'pyarrow'}
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). If not specified, the default behavior
        is to not use nullable data types. If specified, the behavior
        is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
        * ``"pyarrow"``: returns pyarrow-backed nullable
          :class:`ArrowDtype` :class:`DataFrame`

        .. versionadded:: 2.0

    **kwargs
        See :func:`~pandas.read_csv` for the full argument list.

    Returns
    -------
    DataFrame
        A parsed :class:`~pandas.DataFrame` object.

    Raises
    ------
    NotImplementedError
        If an ``encoding`` other than utf-8 is passed.
    ValueError
        If ``sep`` is not a string and no tab separator was inferred.

    See Also
    --------
    DataFrame.to_clipboard : Copy object to the system clipboard.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    >>> df.to_clipboard()  # doctest: +SKIP
    >>> pd.read_clipboard()  # doctest: +SKIP
         A  B  C
    0    1  2  3
    1    4  5  6
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise NotImplementedError("reading from clipboard only supports utf-8 encoding")

    check_dtype_backend(dtype_backend)

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv

    text = clipboard_get()

    # Try to decode (if needed, as "text" might already be a string here).
    # "encoding" was popped from kwargs above, so use the local value; looking
    # it up in kwargs again would always yield None.
    try:
        text = text.decode(encoding or get_option("display.encoding"))
    except AttributeError:
        pass

    # Excel copies into clipboard with \t separation
    # inspect no more than the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split("\n")[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4

    counts = {x.lstrip(" ").count("\t") for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = "\t"
        # check the number of leading tabs in the first line
        # to account for index columns
        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
        if index_length != 0:
            kwargs.setdefault("index_col", list(range(index_length)))

    elif not isinstance(sep, str):
        raise ValueError(f"{sep=} must be a string")

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get("engine") is None:
        kwargs["engine"] = "python"
    elif len(sep) > 1 and kwargs.get("engine") == "c":
        warnings.warn(
            "read_clipboard with regex separator does not work properly with c engine.",
            stacklevel=find_stack_level(),
        )

    return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs)
|
||||
|
||||
|
||||
def to_clipboard(
    obj, excel: bool | None = True, sep: str | None = None, **kwargs
) -> None:  # pragma: no cover
    """
    Write a text representation of ``obj`` to the system clipboard.

    In excel mode the object is serialised with ``obj.to_csv`` so that the
    clipboard content can be pasted into a spreadsheet; otherwise a plain
    string rendering is copied.

    Parameters
    ----------
    obj : object
        The object to write to the clipboard.
    excel : bool or None, default True
        If True (or None), write ``obj`` in csv format using ``sep``.
        If False, write a string representation of ``obj`` instead.
    sep : str, optional
        Field separator used in excel mode; defaults to a tab. A warning is
        emitted if it is given while ``excel`` is False.
    **kwargs
        ``encoding`` is consumed here and must be utf-8 (or None). Remaining
        keywords are passed to ``obj.to_csv`` in excel mode, and to
        ``obj.to_string`` when a DataFrame is written as plain text.

    Raises
    ------
    ValueError
        If an ``encoding`` other than utf-8 is passed.

    Notes
    -----
    If a ``TypeError`` is raised while writing in excel mode, a warning is
    emitted and the plain string representation is copied instead.
    """
    requested_encoding = kwargs.pop("encoding", "utf-8")

    # the clipboard backend only deals in utf-8; reject anything else early
    if requested_encoding is not None:
        if requested_encoding.lower().replace("-", "") != "utf8":
            raise ValueError("clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_set

    use_excel = True if excel is None else excel

    if use_excel:
        try:
            delimiter = "\t" if sep is None else sep
            csv_buffer = StringIO()

            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(csv_buffer, sep=delimiter, encoding="utf-8", **kwargs)
            clipboard_set(csv_buffer.getvalue())
            return
        except TypeError:
            warnings.warn(
                "to_clipboard in excel mode requires a single character separator.",
                stacklevel=find_stack_level(),
            )
    elif sep is not None:
        warnings.warn(
            "to_clipboard with excel=False ignores the sep argument.",
            stacklevel=find_stack_level(),
        )

    if not isinstance(obj, ABCDataFrame):
        rendered = str(obj)
    else:
        # str(df) has various unhelpful defaults, like truncation
        with option_context("display.max_colwidth", None):
            rendered = obj.to_string(**kwargs)
    clipboard_set(rendered)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,19 @@
|
||||
from pandas.io.excel._base import (
|
||||
ExcelFile,
|
||||
ExcelWriter,
|
||||
read_excel,
|
||||
)
|
||||
from pandas.io.excel._odswriter import ODSWriter as _ODSWriter
|
||||
from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter
|
||||
from pandas.io.excel._util import register_writer
|
||||
from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter
|
||||
|
||||
__all__ = ["ExcelFile", "ExcelWriter", "read_excel"]
|
||||
|
||||
|
||||
register_writer(_OpenpyxlWriter)
|
||||
|
||||
register_writer(_XlsxWriter)
|
||||
|
||||
|
||||
register_writer(_ODSWriter)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,129 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import (
|
||||
date,
|
||||
datetime,
|
||||
time,
|
||||
timedelta,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
TypeAlias,
|
||||
)
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from python_calamine import (
|
||||
CalamineSheet,
|
||||
CalamineWorkbook,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
NaTType,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
|
||||
_CellValue: TypeAlias = int | float | str | bool | time | date | datetime | timedelta
|
||||
|
||||
|
||||
class CalamineReader(BaseExcelReader["CalamineWorkbook"]):
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using calamine engine (xlsx/xls/xlsb/ods).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path to be parsed or
|
||||
an open readable stream.
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g.
|
||||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
|
||||
are forwarded to ``urllib.request.Request`` as header options. For other
|
||||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
|
||||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
|
||||
details, and for more examples on storage options refer `here
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html?
|
||||
highlight=storage_options#reading-writing-remote-files>`_.
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("python_calamine")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[CalamineWorkbook]:
|
||||
from python_calamine import CalamineWorkbook
|
||||
|
||||
return CalamineWorkbook
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any
|
||||
) -> CalamineWorkbook:
|
||||
from python_calamine import load_workbook
|
||||
|
||||
return load_workbook(
|
||||
filepath_or_buffer,
|
||||
**engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
from python_calamine import SheetTypeEnum
|
||||
|
||||
return [
|
||||
sheet.name
|
||||
for sheet in self.book.sheets_metadata
|
||||
if sheet.typ == SheetTypeEnum.WorkSheet
|
||||
]
|
||||
|
||||
def get_sheet_by_name(self, name: str) -> CalamineSheet:
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book.get_sheet_by_name(name)
|
||||
|
||||
def get_sheet_by_index(self, index: int) -> CalamineSheet:
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
return self.book.get_sheet_by_index(index)
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet: CalamineSheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar | NaTType | time]]:
|
||||
def _convert_cell(value: _CellValue) -> Scalar | NaTType | time:
|
||||
if isinstance(value, float):
|
||||
val = int(value)
|
||||
if val == value:
|
||||
return val
|
||||
else:
|
||||
return value
|
||||
elif isinstance(value, (datetime, timedelta)):
|
||||
# Return as-is to match openpyxl behavior (GH#59186)
|
||||
return value
|
||||
elif isinstance(value, date):
|
||||
# Convert date to datetime to match openpyxl behavior (GH#59186)
|
||||
return datetime(value.year, value.month, value.day)
|
||||
elif isinstance(value, time):
|
||||
return value
|
||||
|
||||
return value
|
||||
|
||||
rows: list[list[_CellValue]] = sheet.to_python(
|
||||
skip_empty_area=False, nrows=file_rows_needed
|
||||
)
|
||||
data = [[_convert_cell(cell) for cell in row] for row in rows]
|
||||
|
||||
return data
|
||||
@@ -0,0 +1,249 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
from pandas._libs.tslibs.nattype import NaTType
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
class ODFReader(BaseExcelReader["OpenDocument"]):
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Read tables out of OpenDocument formatted files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path to be parsed or
|
||||
an open readable stream.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("odf")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[OpenDocument]:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
return OpenDocument
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> OpenDocument:
|
||||
from odf.opendocument import load
|
||||
|
||||
return load(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def empty_value(self) -> str:
|
||||
"""Property for compat with other readers."""
|
||||
return ""
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
"""Return a list of sheet names present in the document"""
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return [t.getAttribute("name") for t in tables]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return tables[index]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
|
||||
for table in tables:
|
||||
if table.getAttribute("name") == name:
|
||||
return table
|
||||
|
||||
self.close()
|
||||
raise ValueError(f"sheet {name} not found")
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar | NaTType]]:
|
||||
"""
|
||||
Parse an ODF Table into a list of lists
|
||||
"""
|
||||
from odf.table import (
|
||||
CoveredTableCell,
|
||||
TableCell,
|
||||
TableRow,
|
||||
)
|
||||
|
||||
covered_cell_name = CoveredTableCell().qname
|
||||
table_cell_name = TableCell().qname
|
||||
cell_names = {covered_cell_name, table_cell_name}
|
||||
|
||||
sheet_rows = sheet.getElementsByType(TableRow)
|
||||
empty_rows = 0
|
||||
max_row_len = 0
|
||||
|
||||
table: list[list[Scalar | NaTType]] = []
|
||||
|
||||
for sheet_row in sheet_rows:
|
||||
empty_cells = 0
|
||||
table_row: list[Scalar | NaTType] = []
|
||||
|
||||
for sheet_cell in sheet_row.childNodes:
|
||||
if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
|
||||
if sheet_cell.qname == table_cell_name:
|
||||
value = self._get_cell_value(sheet_cell)
|
||||
else:
|
||||
value = self.empty_value
|
||||
|
||||
column_repeat = self._get_column_repeat(sheet_cell)
|
||||
|
||||
# Queue up empty values, writing only if content succeeds them
|
||||
if value == self.empty_value:
|
||||
empty_cells += column_repeat
|
||||
else:
|
||||
table_row.extend([self.empty_value] * empty_cells)
|
||||
empty_cells = 0
|
||||
table_row.extend([value] * column_repeat)
|
||||
|
||||
if max_row_len < len(table_row):
|
||||
max_row_len = len(table_row)
|
||||
|
||||
row_repeat = self._get_row_repeat(sheet_row)
|
||||
if len(table_row) == 0:
|
||||
empty_rows += row_repeat
|
||||
else:
|
||||
# add blank rows to our table
|
||||
table.extend([[self.empty_value]] * empty_rows)
|
||||
empty_rows = 0
|
||||
table.extend(table_row for _ in range(row_repeat))
|
||||
if file_rows_needed is not None and len(table) >= file_rows_needed:
|
||||
break
|
||||
|
||||
# Make our table square
|
||||
for row in table:
|
||||
if len(row) < max_row_len:
|
||||
row.extend([self.empty_value] * (max_row_len - len(row)))
|
||||
|
||||
return table
|
||||
|
||||
def _get_row_repeat(self, row) -> int:
|
||||
"""
|
||||
Return number of times this row was repeated
|
||||
Repeating an empty row appeared to be a common way
|
||||
of representing sparse rows in the table.
|
||||
"""
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
|
||||
|
||||
def _get_column_repeat(self, cell) -> int:
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
|
||||
|
||||
def _get_cell_value(self, cell) -> Scalar | NaTType:
|
||||
from odf.namespaces import OFFICENS
|
||||
|
||||
if str(cell) == "#N/A":
|
||||
return np.nan
|
||||
|
||||
cell_type = cell.attributes.get((OFFICENS, "value-type"))
|
||||
if cell_type == "boolean":
|
||||
if str(cell) == "TRUE":
|
||||
return True
|
||||
return False
|
||||
if cell_type is None:
|
||||
return self.empty_value
|
||||
elif cell_type == "float":
|
||||
# GH5394
|
||||
cell_value = float(cell.attributes.get((OFFICENS, "value")))
|
||||
val = int(cell_value)
|
||||
if val == cell_value:
|
||||
return val
|
||||
return cell_value
|
||||
elif cell_type == "percentage":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "string":
|
||||
return self._get_cell_string_value(cell)
|
||||
elif cell_type == "currency":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "date":
|
||||
cell_value = cell.attributes.get((OFFICENS, "date-value"))
|
||||
return pd.Timestamp(cell_value)
|
||||
elif cell_type == "time":
|
||||
stamp = pd.Timestamp(str(cell))
|
||||
# cast needed here because Scalar doesn't include datetime.time
|
||||
return cast(Scalar, stamp.time())
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(f"Unrecognized type {cell_type}")
|
||||
|
||||
def _get_cell_string_value(self, cell) -> str:
|
||||
"""
|
||||
Find and decode OpenDocument text:s tags that represent
|
||||
a run length encoded sequence of space characters.
|
||||
"""
|
||||
from odf.element import Element
|
||||
from odf.namespaces import TEXTNS
|
||||
from odf.office import Annotation
|
||||
from odf.text import S
|
||||
|
||||
office_annotation = Annotation().qname
|
||||
text_s = S().qname
|
||||
|
||||
value = []
|
||||
|
||||
for fragment in cell.childNodes:
|
||||
if isinstance(fragment, Element):
|
||||
if fragment.qname == text_s:
|
||||
spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
|
||||
value.append(" " * spaces)
|
||||
elif fragment.qname == office_annotation:
|
||||
continue
|
||||
else:
|
||||
# recursive impl needed in case of nested fragments
|
||||
# with multiple spaces
|
||||
# https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
|
||||
value.append(self._get_cell_string_value(fragment))
|
||||
else:
|
||||
value.append(str(fragment).strip("\n"))
|
||||
return "".join(value)
|
||||
@@ -0,0 +1,362 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
import datetime
|
||||
import json
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
DefaultDict,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from odf.opendocument import OpenDocumentSpreadsheet
|
||||
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
from pandas.io.formats.excel import ExcelCell
|
||||
|
||||
|
||||
class ODSWriter(ExcelWriter):
    # ExcelWriter implementation producing OpenDocument spreadsheets through
    # the ``odf`` package.
    _engine = "odf"
    _supported_extensions = (".ods",)

    def __init__(  # pyright: ignore[reportInconsistentConstructor]
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        from odf.opendocument import OpenDocumentSpreadsheet

        if mode == "a":
            raise ValueError("Append mode is not supported with odf!")

        # merged keyword arguments configure the new, empty document
        engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
        self._book = OpenDocumentSpreadsheet(**engine_kwargs)

        super().__init__(
            path,
            mode=mode,
            storage_options=storage_options,
            if_sheet_exists=if_sheet_exists,
            engine_kwargs=engine_kwargs,
        )

        # maps a serialised style dict to the name of its registered style
        self._style_dict: dict[str, str] = {}

    @property
    def book(self) -> OpenDocumentSpreadsheet:
        """
        Book instance of class odf.opendocument.OpenDocumentSpreadsheet.

        This attribute can be used to access engine-specific features.
        """
        return self._book

    @property
    def sheets(self) -> dict[str, Any]:
        """Mapping of sheet names to sheet objects."""
        from odf.table import Table

        return {
            table.getAttribute("name"): table
            for table in self.book.getElementsByType(Table)
        }

    def _save(self) -> None:
        """
        Save workbook to disk.
        """
        spreadsheet = self.book.spreadsheet
        for table in self.sheets.values():
            spreadsheet.addElement(table)
        self.book.save(self._handles.handle)

    def _write_cells(
        self,
        cells: list[ExcelCell],
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
        autofilter_range: str | None = None,
    ) -> None:
        """
        Write the frame cells using odf

        Cells are written into the sheet named ``sheet_name`` (created if it
        does not exist yet), offset by ``startrow`` empty rows and
        ``startcol`` empty leading cells per row.

        Raises
        ------
        ValueError
            If ``autofilter_range`` is given.
        """
        if autofilter_range:
            raise ValueError("Autofilter is not supported with odf!")

        from odf.table import (
            Table,
            TableCell,
            TableRow,
        )
        from odf.text import P

        sheet_name = self._get_sheet_name(sheet_name)
        assert sheet_name is not None

        if sheet_name not in self.sheets:
            wks = Table(name=sheet_name)
            self.book.spreadsheet.addElement(wks)
        else:
            wks = self.sheets[sheet_name]

        if validate_freeze_panes(freeze_panes):
            freeze_panes = cast(tuple[int, int], freeze_panes)
            self._create_freeze_panes(sheet_name, freeze_panes)

        # leading empty rows push the data down to ``startrow``
        for _ in range(startrow):
            wks.addElement(TableRow())

        rows: DefaultDict = defaultdict(TableRow)
        col_count: DefaultDict = defaultdict(int)

        for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)):
            row_element = rows[cell.row]
            written = col_count[cell.row]

            # only add empty cells if the row is still empty
            if not written:
                for _ in range(startcol):
                    row_element.addElement(TableCell())

            # fill with empty cells if needed
            padding = max(cell.col - written, 0)
            for _ in range(padding):
                row_element.addElement(TableCell())

            pvalue, tc = self._make_table_cell(cell)
            row_element.addElement(tc)
            # account for the padding cells plus the cell just written
            col_count[cell.row] = written + padding + 1
            tc.addElement(P(text=pvalue))

        # add all rows to the sheet
        if rows:
            last_row = max(rows)
            for row_nr in range(last_row + 1):
                # row numbers without cells yield a new empty TableRow
                wks.addElement(rows[row_nr])

    def _make_table_cell_attributes(self, cell: ExcelCell) -> dict[str, int | str]:
        """Convert cell attributes to OpenDocument attributes

        Parameters
        ----------
        cell : ExcelCell
            Spreadsheet cell data

        Returns
        -------
        attributes : Dict[str, Union[int, str]]
            Dictionary with attributes and attribute values
        """
        attributes: dict[str, int | str] = {}

        style_name = self._process_style(cell.style)
        if style_name is not None:
            attributes["stylename"] = style_name

        is_merged = cell.mergestart is not None and cell.mergeend is not None
        if is_merged:
            attributes["numberrowsspanned"] = max(1, cell.mergestart)
            attributes["numbercolumnsspanned"] = cell.mergeend
        return attributes

    def _make_table_cell(self, cell: ExcelCell) -> tuple[object, Any]:
        """Convert cell data to an OpenDocument spreadsheet cell

        Parameters
        ----------
        cell : ExcelCell
            Spreadsheet cell data

        Returns
        -------
        pvalue, cell : Tuple[str, TableCell]
            Display value, Cell value
        """
        from odf.table import TableCell

        attributes = self._make_table_cell_attributes(cell)
        val, _ = self._value_with_fmt(cell.val)

        # bool must be tested before the numeric fallback at the end
        if isinstance(val, bool):
            stored = str(val).lower()
            shown = str(val).upper()
            boolean_cell = TableCell(
                valuetype="boolean",
                booleanvalue=stored,
                attributes=attributes,
            )
            return shown, boolean_cell

        # datetime is a subclass of date, so it has to be tested first
        if isinstance(val, datetime.datetime):
            # Fast formatting
            stored = val.isoformat()
            # Slow but locale-dependent
            shown = val.strftime("%c")
            return (
                shown,
                TableCell(valuetype="date", datevalue=stored, attributes=attributes),
            )

        if isinstance(val, datetime.date):
            # Fast formatting
            stored = f"{val.year}-{val.month:02d}-{val.day:02d}"
            # Slow but locale-dependent
            shown = val.strftime("%x")
            return (
                shown,
                TableCell(valuetype="date", datevalue=stored, attributes=attributes),
            )

        if isinstance(val, str):
            string_cell = TableCell(
                valuetype="string",
                stringvalue=val,
                attributes=attributes,
            )
            return val, string_cell

        # everything else is written as a float cell
        float_cell = TableCell(
            valuetype="float",
            value=val,
            attributes=attributes,
        )
        return val, float_cell

    @overload
    def _process_style(self, style: dict[str, Any]) -> str: ...

    @overload
    def _process_style(self, style: None) -> None: ...

    def _process_style(self, style: dict[str, Any] | None) -> str | None:
        """Convert a style dictionary to an OpenDocument style sheet

        Parameters
        ----------
        style : Dict
            Style dictionary

        Returns
        -------
        style_key : str
            Unique style key for later reference in sheet
        """
        from odf.style import (
            ParagraphProperties,
            Style,
            TableCellProperties,
            TextProperties,
        )

        if style is None:
            return None

        # identical style dicts serialise identically and reuse one style
        style_key = json.dumps(style)
        known_name = self._style_dict.get(style_key)
        if style_key in self._style_dict:
            return known_name

        name = f"pd{len(self._style_dict) + 1}"
        self._style_dict[style_key] = name
        odf_style = Style(name=name, family="table-cell")

        if "font" in style:
            if style["font"].get("bold", False):
                odf_style.addElement(TextProperties(fontweight="bold"))

        if "borders" in style:
            thickness_translation = {"thin": "0.75pt solid #000000"}
            for side, thickness in style["borders"].items():
                border_props = TableCellProperties(
                    attributes={f"border{side}": thickness_translation[thickness]}
                )
                odf_style.addElement(border_props)

        if "alignment" in style:
            alignment = style["alignment"]
            horizontal = alignment.get("horizontal")
            if horizontal:
                odf_style.addElement(ParagraphProperties(textalign=horizontal))
            vertical = alignment.get("vertical")
            if vertical:
                odf_style.addElement(TableCellProperties(verticalalign=vertical))

        self.book.styles.addElement(odf_style)
        return name

    def _create_freeze_panes(
        self, sheet_name: str, freeze_panes: tuple[int, int]
    ) -> None:
        """
        Create freeze panes in the sheet.

        Parameters
        ----------
        sheet_name : str
            Name of the spreadsheet
        freeze_panes : tuple of (int, int)
            Freeze pane location x and y
        """
        from odf.config import (
            ConfigItem,
            ConfigItemMapEntry,
            ConfigItemMapIndexed,
            ConfigItemMapNamed,
            ConfigItemSet,
        )

        # Build the nested settings path:
        # view-settings -> Views -> (entry) -> Tables -> <sheet_name>
        view_settings = ConfigItemSet(name="ooo:view-settings")
        self.book.settings.addElement(view_settings)

        views = ConfigItemMapIndexed(name="Views")
        view_settings.addElement(views)

        view_entry = ConfigItemMapEntry()
        views.addElement(view_entry)

        tables = ConfigItemMapNamed(name="Tables")
        view_entry.addElement(tables)

        sheet_entry = ConfigItemMapEntry(name=sheet_name)
        tables.addElement(sheet_entry)

        split_x = str(freeze_panes[0])
        split_y = str(freeze_panes[1])
        item_specs = (
            ("HorizontalSplitMode", "short", "2"),
            ("VerticalSplitMode", "short", "2"),
            ("HorizontalSplitPosition", "int", split_x),
            ("VerticalSplitPosition", "int", split_y),
            ("PositionRight", "int", split_x),
            ("PositionBottom", "int", split_y),
        )
        for item_name, item_type, item_text in item_specs:
            sheet_entry.addElement(
                ConfigItem(name=item_name, type=item_type, text=item_text)
            )
|
||||
@@ -0,0 +1,646 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import mmap
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import (
|
||||
BaseExcelReader,
|
||||
ExcelWriter,
|
||||
)
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.descriptors.serialisable import Serialisable
|
||||
from openpyxl.styles import Fill
|
||||
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
|
||||
class OpenpyxlWriter(ExcelWriter):
    """
    Excel writer that produces ``.xlsx`` / ``.xlsm`` files through openpyxl.

    The ``_convert_to_*`` classmethods translate pandas style dictionaries
    into the corresponding openpyxl style objects.
    """

    _engine = "openpyxl"
    _supported_extensions = (".xlsx", ".xlsm")

    def __init__(  # pyright: ignore[reportInconsistentConstructor]
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> None:
        """
        Open (append mode) or create (write mode) the underlying workbook.

        ``engine_kwargs`` and ``**kwargs`` are merged and forwarded to
        ``openpyxl.load_workbook`` (when the mode contains ``"r+"``) or to
        ``openpyxl.workbook.Workbook`` (otherwise). ``engine``,
        ``date_format`` and ``datetime_format`` are accepted but not used
        by this method.
        """
        # Use the openpyxl module as the Excel writer.
        from openpyxl.workbook import Workbook

        engine_kwargs = combine_kwargs(engine_kwargs, kwargs)

        super().__init__(
            path,
            mode=mode,
            storage_options=storage_options,
            if_sheet_exists=if_sheet_exists,
            engine_kwargs=engine_kwargs,
        )

        # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
        # the file and later write to it
        if "r+" in self._mode:  # Load from existing workbook
            from openpyxl import load_workbook

            try:
                self._book = load_workbook(self._handles.handle, **engine_kwargs)
            except TypeError:
                # Bad engine kwargs: release the handle before propagating.
                self._handles.handle.close()
                raise
            # Rewind so the later save overwrites from the start of the file.
            self._handles.handle.seek(0)
        else:
            # Create workbook object with default optimized_write=True.
            try:
                self._book = Workbook(**engine_kwargs)
            except TypeError:
                self._handles.handle.close()
                raise

            # Drop the first worksheet of the freshly created workbook so that
            # only sheets written through pandas end up in the file.
            if self.book.worksheets:
                self.book.remove(self.book.worksheets[0])

    @property
    def book(self) -> Workbook:
        """
        Book instance of class openpyxl.workbook.Workbook.

        This attribute can be used to access engine-specific features.
        """
        return self._book

    @property
    def sheets(self) -> dict[str, Any]:
        """Mapping of sheet names to sheet objects."""
        result = {name: self.book[name] for name in self.book.sheetnames}
        return result

    def _save(self) -> None:
        """
        Save workbook to disk.
        """
        self.book.save(self._handles.handle)
        if "r+" in self._mode and not isinstance(self._handles.handle, mmap.mmap):
            # truncate file to the written content
            self._handles.handle.truncate()

    @classmethod
    def _convert_to_style_kwargs(
        cls, style_dict: dict[str, Serialisable]
    ) -> dict[str, Serialisable]:
        """
        Convert a style_dict to a set of kwargs suitable for initializing
        or updating-on-copy an openpyxl v2 style object.

        Parameters
        ----------
        style_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'font'
                'fill'
                'border' ('borders')
                'alignment'
                'number_format'
                'protection'

        Returns
        -------
        style_kwargs : dict
            A dict with the same, normalized keys as ``style_dict`` but each
            value has been replaced with a native openpyxl style object of the
            appropriate class. Keys without a matching ``_convert_to_<key>``
            converter, and keys whose converted value is falsy, are omitted.
        """
        _style_key_map = {"borders": "border"}

        style_kwargs: dict[str, Serialisable] = {}
        for k, v in style_dict.items():
            k = _style_key_map.get(k, k)
            # Unknown keys fall back to a converter that yields None.
            _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
            new_v = _conv_to_x(v)
            if new_v:
                style_kwargs[k] = new_v

        return style_kwargs

    @classmethod
    def _convert_to_color(cls, color_spec):
        """
        Convert ``color_spec`` to an openpyxl v2 Color object.

        Parameters
        ----------
        color_spec : str, dict
            A 32-bit ARGB hex string, or a dict with zero or more of the
            following keys.
                'rgb'
                'indexed'
                'auto'
                'theme'
                'tint'
                'index'
                'type'

        Returns
        -------
        color : openpyxl.styles.Color
        """
        from openpyxl.styles import Color

        if isinstance(color_spec, str):
            return Color(color_spec)
        else:
            return Color(**color_spec)

    @classmethod
    def _convert_to_font(cls, font_dict):
        """
        Convert ``font_dict`` to an openpyxl v2 Font object.

        Parameters
        ----------
        font_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'name'
                'size' ('sz')
                'bold' ('b')
                'italic' ('i')
                'underline' ('u')
                'strikethrough' ('strike')
                'color'
                'vertAlign' ('vertalign')
                'charset'
                'scheme'
                'family'
                'outline'
                'shadow'
                'condense'

        Returns
        -------
        font : openpyxl.styles.Font
        """
        from openpyxl.styles import Font

        _font_key_map = {
            "sz": "size",
            "b": "bold",
            "i": "italic",
            "u": "underline",
            "strike": "strikethrough",
            "vertalign": "vertAlign",
        }

        font_kwargs = {}
        for k, v in font_dict.items():
            k = _font_key_map.get(k, k)
            if k == "color":
                v = cls._convert_to_color(v)
            font_kwargs[k] = v

        return Font(**font_kwargs)

    @classmethod
    def _convert_to_stop(cls, stop_seq):
        """
        Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
        suitable for initializing the ``GradientFill`` ``stop`` parameter.

        Parameters
        ----------
        stop_seq : iterable
            An iterable that yields objects suitable for consumption by
            ``_convert_to_color``.

        Returns
        -------
        stop : list of openpyxl.styles.Color
        """
        # Materialize the result: the documented contract is a list, and a
        # lazy ``map`` object has no length and can only be consumed once.
        return [cls._convert_to_color(color_spec) for color_spec in stop_seq]

    @classmethod
    def _convert_to_fill(cls, fill_dict: dict[str, Any]) -> Fill:
        """
        Convert ``fill_dict`` to an openpyxl v2 Fill object.

        Parameters
        ----------
        fill_dict : dict
            A dict with one or more of the following keys (or their synonyms),
                'fill_type' ('patternType', 'patterntype')
                'start_color' ('fgColor', 'fgcolor')
                'end_color' ('bgColor', 'bgcolor')
            or one or more of the following keys (or their synonyms).
                'type' ('fill_type')
                'degree'
                'left'
                'right'
                'top'
                'bottom'
                'stop'

        Returns
        -------
        fill : openpyxl.styles.Fill
            A ``PatternFill`` when the collected keyword arguments are
            accepted by it, otherwise a ``GradientFill``.
        """
        from openpyxl.styles import (
            GradientFill,
            PatternFill,
        )

        _pattern_fill_key_map = {
            "patternType": "fill_type",
            "patterntype": "fill_type",
            "fgColor": "start_color",
            "fgcolor": "start_color",
            "bgColor": "end_color",
            "bgcolor": "end_color",
        }

        _gradient_fill_key_map = {"fill_type": "type"}

        pfill_kwargs = {}
        gfill_kwargs = {}
        for k, v in fill_dict.items():
            pk = _pattern_fill_key_map.get(k)
            gk = _gradient_fill_key_map.get(k)
            if pk in ["start_color", "end_color"]:
                v = cls._convert_to_color(v)
            # NOTE(review): ``gk`` can only be None or "type" with the map
            # above, so this branch is never taken and 'stop' values are
            # forwarded unconverted. Left as-is to preserve behaviour.
            if gk == "stop":
                v = cls._convert_to_stop(v)
            if pk:
                pfill_kwargs[pk] = v
            elif gk:
                gfill_kwargs[gk] = v
            else:
                # Key is not a known synonym: offer it to both fill types.
                pfill_kwargs[k] = v
                gfill_kwargs[k] = v

        try:
            return PatternFill(**pfill_kwargs)
        except TypeError:
            return GradientFill(**gfill_kwargs)

    @classmethod
    def _convert_to_side(cls, side_spec):
        """
        Convert ``side_spec`` to an openpyxl v2 Side object.

        Parameters
        ----------
        side_spec : str, dict
            A string specifying the border style, or a dict with zero or more
            of the following keys (or their synonyms).
                'style' ('border_style')
                'color'

        Returns
        -------
        side : openpyxl.styles.Side
        """
        from openpyxl.styles import Side

        _side_key_map = {"border_style": "style"}

        if isinstance(side_spec, str):
            return Side(style=side_spec)

        side_kwargs = {}
        for k, v in side_spec.items():
            k = _side_key_map.get(k, k)
            if k == "color":
                v = cls._convert_to_color(v)
            side_kwargs[k] = v

        return Side(**side_kwargs)

    @classmethod
    def _convert_to_border(cls, border_dict):
        """
        Convert ``border_dict`` to an openpyxl v2 Border object.

        Parameters
        ----------
        border_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'left'
                'right'
                'top'
                'bottom'
                'diagonal'
                'diagonal_direction'
                'vertical'
                'horizontal'
                'diagonalUp' ('diagonalup')
                'diagonalDown' ('diagonaldown')
                'outline'

        Returns
        -------
        border : openpyxl.styles.Border
        """
        from openpyxl.styles import Border

        _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}

        border_kwargs = {}
        for k, v in border_dict.items():
            k = _border_key_map.get(k, k)
            if k == "color":
                v = cls._convert_to_color(v)
            if k in ["left", "right", "top", "bottom", "diagonal"]:
                v = cls._convert_to_side(v)
            border_kwargs[k] = v

        return Border(**border_kwargs)

    @classmethod
    def _convert_to_alignment(cls, alignment_dict):
        """
        Convert ``alignment_dict`` to an openpyxl v2 Alignment object.

        Parameters
        ----------
        alignment_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'horizontal'
                'vertical'
                'text_rotation'
                'wrap_text'
                'shrink_to_fit'
                'indent'

        Returns
        -------
        alignment : openpyxl.styles.Alignment
        """
        from openpyxl.styles import Alignment

        return Alignment(**alignment_dict)

    @classmethod
    def _convert_to_number_format(cls, number_format_dict):
        """
        Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
        initializer.

        Parameters
        ----------
        number_format_dict : dict
            A dict with zero or more of the following keys.
                'format_code' : str

        Returns
        -------
        number_format : str
        """
        return number_format_dict["format_code"]

    @classmethod
    def _convert_to_protection(cls, protection_dict):
        """
        Convert ``protection_dict`` to an openpyxl v2 Protection object.

        Parameters
        ----------
        protection_dict : dict
            A dict with zero or more of the following keys.
                'locked'
                'hidden'

        Returns
        -------
        protection : openpyxl.styles.Protection
        """
        from openpyxl.styles import Protection

        return Protection(**protection_dict)

    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
        autofilter_range: str | None = None,
    ) -> None:
        """
        Write ``cells`` into the worksheet called ``sheet_name``.

        The target sheet is reused, replaced or created according to the
        writer mode and ``if_sheet_exists``. ``startrow`` / ``startcol`` are
        zero-based offsets added to each cell position; ``freeze_panes`` and
        ``autofilter_range`` are applied to the sheet when given.

        Raises
        ------
        ValueError
            If the sheet exists in append mode and ``if_sheet_exists`` is
            ``'error'`` or is not one of the supported options.
        """
        # Write the frame cells using openpyxl.
        sheet_name = self._get_sheet_name(sheet_name)

        # Converted styles keyed by ``str(cell.style)`` so each distinct style
        # dictionary is converted only once.
        _style_cache: dict[str, dict[str, Serialisable]] = {}

        if sheet_name in self.sheets and self._if_sheet_exists != "new":
            if "r+" in self._mode:
                if self._if_sheet_exists == "replace":
                    old_wks = self.sheets[sheet_name]
                    target_index = self.book.index(old_wks)
                    del self.book[sheet_name]
                    wks = self.book.create_sheet(sheet_name, target_index)
                elif self._if_sheet_exists == "error":
                    raise ValueError(
                        f"Sheet '{sheet_name}' already exists and "
                        f"if_sheet_exists is set to 'error'."
                    )
                elif self._if_sheet_exists == "overlay":
                    wks = self.sheets[sheet_name]
                else:
                    raise ValueError(
                        f"'{self._if_sheet_exists}' is not valid for if_sheet_exists. "
                        "Valid options are 'error', 'new', 'replace' and 'overlay'."
                    )
            else:
                wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name

        if validate_freeze_panes(freeze_panes):
            freeze_panes = cast(tuple[int, int], freeze_panes)
            # openpyxl rows/columns are 1-based, hence the +1.
            wks.freeze_panes = wks.cell(
                row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
            )

        for cell in cells:
            xcell = wks.cell(
                row=startrow + cell.row + 1, column=startcol + cell.col + 1
            )
            xcell.value, fmt = self._value_with_fmt(cell.val)
            if fmt:
                xcell.number_format = fmt

            style_kwargs: dict[str, Serialisable] | None = {}
            if cell.style:
                key = str(cell.style)
                style_kwargs = _style_cache.get(key)
                if style_kwargs is None:
                    style_kwargs = self._convert_to_style_kwargs(cell.style)
                    _style_cache[key] = style_kwargs

            if style_kwargs:
                for k, v in style_kwargs.items():
                    setattr(xcell, k, v)

            if cell.mergestart is not None and cell.mergeend is not None:
                wks.merge_cells(
                    start_row=startrow + cell.row + 1,
                    start_column=startcol + cell.col + 1,
                    end_column=startcol + cell.mergeend + 1,
                    end_row=startrow + cell.mergestart + 1,
                )

                # When cells are merged only the top-left cell is preserved
                # The behaviour of the other cells in a merged range is
                # undefined
                if style_kwargs:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            xcell = wks.cell(column=col, row=row)
                            for k, v in style_kwargs.items():
                                setattr(xcell, k, v)

        if autofilter_range:
            wks.auto_filter.ref = autofilter_range
|
||||
|
||||
|
||||
class OpenpyxlReader(BaseExcelReader["Workbook"]):
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using openpyxl engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object or Workbook
|
||||
Object to be parsed.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("openpyxl")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[Workbook]:
|
||||
from openpyxl import Workbook
|
||||
|
||||
return Workbook
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> Workbook:
|
||||
from openpyxl import load_workbook
|
||||
|
||||
default_kwargs = {"read_only": True, "data_only": True, "keep_links": False}
|
||||
|
||||
return load_workbook(
|
||||
filepath_or_buffer,
|
||||
**(default_kwargs | engine_kwargs),
|
||||
)
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
return [sheet.title for sheet in self.book.worksheets]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book[name]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
return self.book.worksheets[index]
|
||||
|
||||
def _convert_cell(self, cell) -> Scalar:
|
||||
from openpyxl.cell.cell import (
|
||||
TYPE_ERROR,
|
||||
TYPE_NUMERIC,
|
||||
)
|
||||
|
||||
if cell.value is None:
|
||||
return "" # compat with xlrd
|
||||
elif cell.data_type == TYPE_ERROR:
|
||||
return np.nan
|
||||
elif cell.data_type == TYPE_NUMERIC:
|
||||
val = int(cell.value)
|
||||
if val == cell.value:
|
||||
return val
|
||||
return float(cell.value)
|
||||
|
||||
return cell.value
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar]]:
|
||||
if self.book.read_only:
|
||||
sheet.reset_dimensions()
|
||||
|
||||
data: list[list[Scalar]] = []
|
||||
last_row_with_data = -1
|
||||
for row_number, row in enumerate(sheet.rows):
|
||||
converted_row = [self._convert_cell(cell) for cell in row]
|
||||
while converted_row and converted_row[-1] == "":
|
||||
# trim trailing empty elements
|
||||
converted_row.pop()
|
||||
if converted_row:
|
||||
last_row_with_data = row_number
|
||||
data.append(converted_row)
|
||||
if file_rows_needed is not None and len(data) >= file_rows_needed:
|
||||
break
|
||||
|
||||
# Trim trailing empty rows
|
||||
data = data[: last_row_with_data + 1]
|
||||
|
||||
if len(data) > 0:
|
||||
# extend rows to max width
|
||||
max_width = max(len(data_row) for data_row in data)
|
||||
if min(len(data_row) for data_row in data) < max_width:
|
||||
empty_cell: list[Scalar] = [""]
|
||||
data = [
|
||||
data_row + (max_width - len(data_row)) * empty_cell
|
||||
for data_row in data
|
||||
]
|
||||
|
||||
return data
|
||||
@@ -0,0 +1,131 @@
|
||||
# pyright: reportMissingImports=false
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pyxlsb import Workbook
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
|
||||
|
||||
class PyxlsbReader(BaseExcelReader["Workbook"]):
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using pyxlsb engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object, or Workbook
|
||||
Object to be parsed.
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g.
|
||||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
|
||||
are forwarded to ``urllib.request.Request`` as header options. For other
|
||||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
|
||||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
|
||||
details, and for more examples on storage options refer `here
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html?
|
||||
highlight=storage_options#reading-writing-remote-files>`_.
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("pyxlsb")
|
||||
# This will call load_workbook on the filepath or buffer
|
||||
# And set the result to the book-attribute
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[Workbook]:
|
||||
from pyxlsb import Workbook
|
||||
|
||||
return Workbook
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> Workbook:
|
||||
from pyxlsb import open_workbook
|
||||
|
||||
# TODO: hack in buffer capability
|
||||
# This might need some modifications to the Pyxlsb library
|
||||
# Actual work for opening it is in xlsbpackage.py, line 20-ish
|
||||
|
||||
return open_workbook(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
return self.book.sheets
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book.get_sheet(name)
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
# pyxlsb sheets are indexed from 1 onwards
|
||||
# There's a fix for this in the source, but the pypi package doesn't have it
|
||||
return self.book.get_sheet(index + 1)
|
||||
|
||||
def _convert_cell(self, cell) -> Scalar:
|
||||
# TODO: there is no way to distinguish between floats and datetimes in pyxlsb
|
||||
# This means that there is no way to read datetime types from an xlsb file yet
|
||||
if cell.v is None:
|
||||
return "" # Prevents non-named columns from not showing up as Unnamed: i
|
||||
if isinstance(cell.v, float):
|
||||
val = int(cell.v)
|
||||
if val == cell.v:
|
||||
return val
|
||||
else:
|
||||
return float(cell.v)
|
||||
|
||||
return cell.v
|
||||
|
||||
def get_sheet_data(
|
||||
self,
|
||||
sheet,
|
||||
file_rows_needed: int | None = None,
|
||||
) -> list[list[Scalar]]:
|
||||
data: list[list[Scalar]] = []
|
||||
previous_row_number = -1
|
||||
# When sparse=True the rows can have different lengths and empty rows are
|
||||
# not returned. The cells are namedtuples of row, col, value (r, c, v).
|
||||
for row in sheet.rows(sparse=True):
|
||||
row_number = row[0].r
|
||||
converted_row = [self._convert_cell(cell) for cell in row]
|
||||
while converted_row and converted_row[-1] == "":
|
||||
# trim trailing empty elements
|
||||
converted_row.pop()
|
||||
if converted_row:
|
||||
data.extend([[]] * (row_number - previous_row_number - 1))
|
||||
data.append(converted_row)
|
||||
previous_row_number = row_number
|
||||
if file_rows_needed is not None and len(data) >= file_rows_needed:
|
||||
break
|
||||
if data:
|
||||
# extend rows to max_width
|
||||
max_width = max(len(data_row) for data_row in data)
|
||||
if min(len(data_row) for data_row in data) < max_width:
|
||||
empty_cell: list[Scalar] = [""]
|
||||
data = [
|
||||
data_row + (max_width - len(data_row)) * empty_cell
|
||||
for data_row in data
|
||||
]
|
||||
return data
|
||||
@@ -0,0 +1,328 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Hashable,
|
||||
Iterable,
|
||||
MutableMapping,
|
||||
Sequence,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
TypeVar,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer,
|
||||
is_list_like,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
|
||||
ExcelWriter_t = type[ExcelWriter]
|
||||
usecols_func = TypeVar("usecols_func", bound=Callable[[Hashable], object])
|
||||
|
||||
_writers: MutableMapping[str, ExcelWriter_t] = {}
|
||||
|
||||
|
||||
def register_writer(klass: ExcelWriter_t) -> None:
    """
    Register an Excel writer class under its engine name.

    You must use this method to integrate with ``to_excel``.

    Parameters
    ----------
    klass : ExcelWriter
        Writer class; it is stored in the registry under ``klass._engine``.

    Raises
    ------
    ValueError
        If ``klass`` is not callable.
    """
    if callable(klass):
        _writers[klass._engine] = klass
        return
    raise ValueError("Can only register callables as engines")
|
||||
|
||||
|
||||
def get_default_engine(ext: str, mode: Literal["reader", "writer"] = "reader") -> str:
    """
    Look up the default engine name for an Excel file extension.

    Parameters
    ----------
    ext : str
        The excel file extension for which to get the default engine.
    mode : str {'reader', 'writer'}
        Whether to get the default engine for reading or writing.
        Either 'reader' or 'writer'

    Returns
    -------
    str
        The default engine for the extension.
    """
    reader_by_ext = {
        "xlsx": "openpyxl",
        "xlsm": "openpyxl",
        "xlsb": "pyxlsb",
        "xls": "xlrd",
        "ods": "odf",
    }
    writer_by_ext = {
        "xlsx": "openpyxl",
        "xlsm": "openpyxl",
        "xlsb": "pyxlsb",
        "ods": "odf",
    }
    assert mode in ["reader", "writer"]
    if mode != "writer":
        return reader_by_ext[ext]

    # Prefer xlsxwriter over openpyxl if installed
    if import_optional_dependency("xlsxwriter", errors="warn"):
        writer_by_ext["xlsx"] = "xlsxwriter"
    return writer_by_ext[ext]
|
||||
|
||||
|
||||
def get_writer(engine_name: str) -> ExcelWriter_t:
    """
    Return the writer class registered under ``engine_name``.

    Raises
    ------
    ValueError
        If no writer has been registered under that name.
    """
    try:
        writer_cls = _writers[engine_name]
    except KeyError as err:
        raise ValueError(f"No Excel writer '{engine_name}'") from err
    return writer_cls
|
||||
|
||||
|
||||
def _excel2num(x: str) -> int:
|
||||
"""
|
||||
Convert Excel column name like 'AB' to 0-based column index.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : str
|
||||
The Excel column name to convert to a 0-based column index.
|
||||
|
||||
Returns
|
||||
-------
|
||||
num : int
|
||||
The column index corresponding to the name.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
Part of the Excel column name was invalid.
|
||||
"""
|
||||
index = 0
|
||||
|
||||
for c in x.upper().strip():
|
||||
cp = ord(c)
|
||||
|
||||
if cp < ord("A") or cp > ord("Z"):
|
||||
raise ValueError(f"Invalid column name: {x}")
|
||||
|
||||
index = index * 26 + cp - ord("A") + 1
|
||||
|
||||
return index - 1
|
||||
|
||||
|
||||
def _range2cols(areas: str) -> list[int]:
    """
    Expand a comma separated list of column names and ranges into indices.

    Parameters
    ----------
    areas : str
        A string containing a sequence of column ranges (or areas).

    Returns
    -------
    cols : list
        A list of 0-based column indices.

    Examples
    --------
    >>> _range2cols("A:E")
    [0, 1, 2, 3, 4]
    >>> _range2cols("A,C,Z:AB")
    [0, 2, 25, 26, 27]
    """
    indices: list[int] = []

    for area in areas.split(","):
        if ":" not in area:
            indices.append(_excel2num(area))
            continue
        # Only the first two ':'-separated parts are used as range bounds.
        start_name, stop_name = area.split(":")[:2]
        indices.extend(range(_excel2num(start_name), _excel2num(stop_name) + 1))

    return indices
|
||||
|
||||
|
||||
@overload
def maybe_convert_usecols(usecols: str | list[int]) -> list[int]: ...


@overload
def maybe_convert_usecols(usecols: list[str]) -> list[str]: ...


@overload
def maybe_convert_usecols(usecols: usecols_func) -> usecols_func: ...


@overload
def maybe_convert_usecols(usecols: None) -> None: ...


def maybe_convert_usecols(
    usecols: str | list[int] | list[str] | usecols_func | None,
) -> None | list[int] | list[str] | usecols_func:
    """
    Normalize `usecols` into a form the parsers in `parsers.py` accept.

    Parameters
    ----------
    usecols : object
        The use-columns object to potentially convert.

    Returns
    -------
    converted : object
        A list of column indices when `usecols` is a string of Excel column
        names/ranges; otherwise `usecols` unchanged.

    Raises
    ------
    ValueError
        If `usecols` is an integer.
    """
    if usecols is None:
        return usecols

    if is_integer(usecols):
        raise ValueError(
            "Passing an integer for `usecols` is no longer supported.  "
            "Please pass in a list of int from 0 to `usecols` inclusive instead."
        )

    return _range2cols(usecols) if isinstance(usecols, str) else usecols
|
||||
|
||||
|
||||
@overload
def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]: ...


@overload
def validate_freeze_panes(freeze_panes: None) -> Literal[False]: ...


def validate_freeze_panes(freeze_panes: tuple[int, int] | None) -> bool:
    """
    Tell whether ``freeze_panes`` should be applied to the output sheet.

    Returns False when ``freeze_panes`` is None and True when it is a pair of
    integers; anything else raises ValueError.
    """
    if freeze_panes is None:
        # freeze_panes wasn't specified, return False so it won't be applied
        # to output sheet
        return False

    is_int_pair = len(freeze_panes) == 2 and all(
        isinstance(item, int) for item in freeze_panes
    )
    if not is_int_pair:
        raise ValueError(
            "freeze_panes must be of form (row, column) "
            "where row and column are integers"
        )
    return True
|
||||
|
||||
|
||||
def fill_mi_header(
    row: list[Hashable], control_row: list[bool]
) -> tuple[list[Hashable], list[bool]]:
    """
    Forward fill blank entries in row but only inside the same parent index.

    Used for creating headers in Multiindex. Both lists are modified in place
    and also returned.

    Parameters
    ----------
    row : list
        List of items in a single row.
    control_row : list of bool
        Helps to determine if particular column is in same parent index as the
        previous value. Used to stop propagation of empty cells between
        different indexes.

    Returns
    -------
    Returns changed row and control_row
    """
    carried = row[0]
    for pos in range(1, len(row)):
        current = row[pos]
        if not control_row[pos]:
            # A new parent index starts here: restart the fill value.
            carried = current

        if current == "" or current is None:
            row[pos] = carried
        else:
            control_row[pos] = False
            carried = current

    return row, control_row
|
||||
|
||||
|
||||
def pop_header_name(
    row: list[Hashable], index_col: int | Sequence[int]
) -> tuple[Hashable | None, list[Hashable]]:
    """
    Extract the header name from a row during MultiIndex parsing.

    Parameters
    ----------
    row : list
        The data row to parse for the header name.
    index_col : int, list
        The index columns for our data. Assumed to be non-null. When a list
        is given, the largest entry is the position that is read.

    Returns
    -------
    header_name : str
        The extracted header name, or None when that entry is "".
    trimmed_row : list
        A copy of the data row with the header name replaced by "".
    """
    if is_list_like(index_col):
        assert isinstance(index_col, Iterable)
        pos = max(index_col)
    else:
        assert not isinstance(index_col, Iterable)
        pos = index_col

    name = row[pos]
    if name == "":
        name = None

    # Pop out header name and fill w/blank.
    trimmed = list(row[:pos])
    trimmed.append("")
    trimmed.extend(row[pos + 1 :])
    return name, trimmed
|
||||
|
||||
|
||||
def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict:
    """
    Merge the two sources of keyword arguments for the backend engine.

    Use of kwargs is deprecated, this function is solely for use in 1.3 and should
    be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs
    or kwargs must be None or empty respectively.

    Parameters
    ----------
    engine_kwargs: dict
        kwargs to be passed through to the engine.
    kwargs: dict
        kwargs to be passed through to the engine (deprecated)

    Returns
    -------
    engine_kwargs combined with kwargs; entries in kwargs win on duplicate
    keys and neither input is modified.
    """
    merged = {} if engine_kwargs is None else engine_kwargs.copy()
    merged.update(kwargs)
    return merged
|
||||
@@ -0,0 +1,147 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import time
|
||||
import math
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from xlrd import Book
|
||||
|
||||
from pandas._typing import (
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
|
||||
|
||||
class XlrdReader(BaseExcelReader["Book"]):
    def __init__(
        self,
        filepath_or_buffer,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : str, path object or Workbook
            Object to be parsed.
        storage_options : dict, optional
            Extra options that make sense for a particular storage connection,
            e.g. host, port, username, password, etc. For HTTP(S) URLs the
            key-value pairs are forwarded to ``urllib.request.Request`` as
            header options. For other URLs (e.g. starting with "s3://", and
            "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
            Please see ``fsspec`` and ``urllib`` for more details, and for more
            examples on storage options refer `here <https://pandas.pydata.org/
            pandas-docs/stable/user_guide/io.html?
            highlight=storage_options#reading-writing-remote-files>`__.
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.
        """
        # Check that xlrd is importable before any base-class setup happens.
        import_optional_dependency(
            "xlrd", extra="Install xlrd >= 2.0.1 for xls Excel support"
        )
        super().__init__(
            filepath_or_buffer,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    @property
    def _workbook_class(self) -> type[Book]:
        """The xlrd workbook class, imported lazily."""
        from xlrd import Book

        return Book

    def load_workbook(self, filepath_or_buffer, engine_kwargs) -> Book:
        """
        Open a workbook from a path or from an object exposing ``read``.

        Readable objects are consumed in full and handed to xlrd as
        ``file_contents``; anything else is passed to ``open_workbook`` as is.
        """
        from xlrd import open_workbook

        if not hasattr(filepath_or_buffer, "read"):
            return open_workbook(filepath_or_buffer, **engine_kwargs)

        contents = filepath_or_buffer.read()
        return open_workbook(file_contents=contents, **engine_kwargs)

    @property
    def sheet_names(self):
        """Sheet names as reported by the underlying workbook."""
        return self.book.sheet_names()

    def get_sheet_by_name(self, name):
        """Return the sheet called ``name`` after validating the name."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Return the sheet at position ``index`` after validating the index."""
        self.raise_if_bad_sheet_by_index(index)
        return self.book.sheet_by_index(index)

    def get_sheet_data(
        self, sheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar]]:
        """
        Convert the cells of ``sheet`` into a list of rows of Python scalars.

        Parameters
        ----------
        sheet : xlrd sheet
            Sheet to read.
        file_rows_needed : int, optional
            Upper bound on the number of rows to read; all rows when None.

        Returns
        -------
        list of list of scalar
        """
        from xlrd import (
            XL_CELL_BOOLEAN,
            XL_CELL_DATE,
            XL_CELL_ERROR,
            XL_CELL_NUMBER,
            xldate,
        )

        epoch1904 = self.book.datemode

        def _parse_cell(cell_contents, cell_typ):
            """
            converts the contents of the cell into a pandas appropriate object
            """
            if cell_typ == XL_CELL_DATE:
                # Use the newer xlrd datetime handling.
                try:
                    stamp = xldate.xldate_as_datetime(cell_contents, epoch1904)
                except OverflowError:
                    # Hand back the raw value when it cannot be converted.
                    return cell_contents

                # Excel doesn't distinguish between dates and time,
                # so we treat dates on the epoch as times only.
                # Also, Excel supports 1900 and 1904 epochs.
                epoch_day = (1904, 1, 1) if epoch1904 else (1899, 12, 31)
                if stamp.timetuple()[0:3] == epoch_day:
                    return time(
                        stamp.hour, stamp.minute, stamp.second, stamp.microsecond
                    )
                return stamp

            if cell_typ == XL_CELL_ERROR:
                return np.nan

            if cell_typ == XL_CELL_BOOLEAN:
                return bool(cell_contents)

            # GH5394 - Excel 'numbers' are always floats
            # it's a minimal perf hit and less surprising
            # GH54564 - don't attempt to convert NaN/Inf
            if cell_typ == XL_CELL_NUMBER and math.isfinite(cell_contents):
                as_int = int(cell_contents)
                if as_int == cell_contents:
                    return as_int

            return cell_contents

        total_rows = sheet.nrows
        row_count = (
            total_rows
            if file_rows_needed is None
            else min(total_rows, file_rows_needed)
        )

        data = []
        for i in range(row_count):
            cells = zip(sheet.row_values(i), sheet.row_types(i), strict=True)
            data.append([_parse_cell(value, typ) for value, typ in cells])
        return data
|
||||
+288
@@ -0,0 +1,288 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
|
||||
class _XlsxStyler:
|
||||
# Map from openpyxl-oriented styles to flatter xlsxwriter representation
|
||||
# Ordering necessary for both determinism and because some are keyed by
|
||||
# prefixes of others.
|
||||
STYLE_MAPPING: dict[str, list[tuple[tuple[str, ...], str]]] = {
|
||||
"font": [
|
||||
(("name",), "font_name"),
|
||||
(("sz",), "font_size"),
|
||||
(("size",), "font_size"),
|
||||
(("color", "rgb"), "font_color"),
|
||||
(("color",), "font_color"),
|
||||
(("b",), "bold"),
|
||||
(("bold",), "bold"),
|
||||
(("i",), "italic"),
|
||||
(("italic",), "italic"),
|
||||
(("u",), "underline"),
|
||||
(("underline",), "underline"),
|
||||
(("strike",), "font_strikeout"),
|
||||
(("vertAlign",), "font_script"),
|
||||
(("vertalign",), "font_script"),
|
||||
],
|
||||
"number_format": [(("format_code",), "num_format"), ((), "num_format")],
|
||||
"protection": [(("locked",), "locked"), (("hidden",), "hidden")],
|
||||
"alignment": [
|
||||
(("horizontal",), "align"),
|
||||
(("vertical",), "valign"),
|
||||
(("text_rotation",), "rotation"),
|
||||
(("wrap_text",), "text_wrap"),
|
||||
(("indent",), "indent"),
|
||||
(("shrink_to_fit",), "shrink"),
|
||||
],
|
||||
"fill": [
|
||||
(("patternType",), "pattern"),
|
||||
(("patterntype",), "pattern"),
|
||||
(("fill_type",), "pattern"),
|
||||
(("start_color", "rgb"), "fg_color"),
|
||||
(("fgColor", "rgb"), "fg_color"),
|
||||
(("fgcolor", "rgb"), "fg_color"),
|
||||
(("start_color",), "fg_color"),
|
||||
(("fgColor",), "fg_color"),
|
||||
(("fgcolor",), "fg_color"),
|
||||
(("end_color", "rgb"), "bg_color"),
|
||||
(("bgColor", "rgb"), "bg_color"),
|
||||
(("bgcolor", "rgb"), "bg_color"),
|
||||
(("end_color",), "bg_color"),
|
||||
(("bgColor",), "bg_color"),
|
||||
(("bgcolor",), "bg_color"),
|
||||
],
|
||||
"border": [
|
||||
(("color", "rgb"), "border_color"),
|
||||
(("color",), "border_color"),
|
||||
(("style",), "border"),
|
||||
(("top", "color", "rgb"), "top_color"),
|
||||
(("top", "color"), "top_color"),
|
||||
(("top", "style"), "top"),
|
||||
(("top",), "top"),
|
||||
(("right", "color", "rgb"), "right_color"),
|
||||
(("right", "color"), "right_color"),
|
||||
(("right", "style"), "right"),
|
||||
(("right",), "right"),
|
||||
(("bottom", "color", "rgb"), "bottom_color"),
|
||||
(("bottom", "color"), "bottom_color"),
|
||||
(("bottom", "style"), "bottom"),
|
||||
(("bottom",), "bottom"),
|
||||
(("left", "color", "rgb"), "left_color"),
|
||||
(("left", "color"), "left_color"),
|
||||
(("left", "style"), "left"),
|
||||
(("left",), "left"),
|
||||
],
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def convert(cls, style_dict, num_format_str=None) -> dict[str, Any]:
|
||||
"""
|
||||
converts a style_dict to an xlsxwriter format dict
|
||||
|
||||
Parameters
|
||||
----------
|
||||
style_dict : style dictionary to convert
|
||||
num_format_str : optional number format string
|
||||
"""
|
||||
# Create an XlsxWriter format object.
|
||||
props = {}
|
||||
|
||||
if num_format_str is not None:
|
||||
props["num_format"] = num_format_str
|
||||
|
||||
if style_dict is None:
|
||||
return props
|
||||
|
||||
if "borders" in style_dict:
|
||||
style_dict = style_dict.copy()
|
||||
style_dict["border"] = style_dict.pop("borders")
|
||||
|
||||
for style_group_key, style_group in style_dict.items():
|
||||
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
|
||||
# src is a sequence of keys into a nested dict
|
||||
# dst is a flat key
|
||||
if dst in props:
|
||||
continue
|
||||
v = style_group
|
||||
for k in src:
|
||||
try:
|
||||
v = v[k]
|
||||
except (KeyError, TypeError):
|
||||
break
|
||||
else:
|
||||
props[dst] = v
|
||||
|
||||
if isinstance(props.get("pattern"), str):
|
||||
# TODO: support other fill patterns
|
||||
props["pattern"] = 0 if props["pattern"] == "none" else 1
|
||||
|
||||
for k in ["border", "top", "right", "bottom", "left"]:
|
||||
if isinstance(props.get(k), str):
|
||||
try:
|
||||
props[k] = [
|
||||
"none",
|
||||
"thin",
|
||||
"medium",
|
||||
"dashed",
|
||||
"dotted",
|
||||
"thick",
|
||||
"double",
|
||||
"hair",
|
||||
"mediumDashed",
|
||||
"dashDot",
|
||||
"mediumDashDot",
|
||||
"dashDotDot",
|
||||
"mediumDashDotDot",
|
||||
"slantDashDot",
|
||||
].index(props[k])
|
||||
except ValueError:
|
||||
props[k] = 2
|
||||
|
||||
if isinstance(props.get("font_script"), str):
|
||||
props["font_script"] = ["baseline", "superscript", "subscript"].index(
|
||||
props["font_script"]
|
||||
)
|
||||
|
||||
if isinstance(props.get("underline"), str):
|
||||
props["underline"] = {
|
||||
"none": 0,
|
||||
"single": 1,
|
||||
"double": 2,
|
||||
"singleAccounting": 33,
|
||||
"doubleAccounting": 34,
|
||||
}[props["underline"]]
|
||||
|
||||
# GH 30107 - xlsxwriter uses different name
|
||||
if props.get("valign") == "center":
|
||||
props["valign"] = "vcenter"
|
||||
|
||||
return props
|
||||
|
||||
|
||||
class XlsxWriter(ExcelWriter):
    _engine = "xlsxwriter"
    _supported_extensions = (".xlsx",)

    def __init__(  # pyright: ignore[reportInconsistentConstructor]
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> None:
        """
        Set up an xlsxwriter-backed Excel writer.

        ``engine_kwargs`` merged with ``kwargs`` is forwarded to
        ``xlsxwriter.Workbook``. Append mode is rejected with ``ValueError``.
        """
        # Use the xlsxwriter module as the Excel writer.
        from xlsxwriter import Workbook

        engine_kwargs = combine_kwargs(engine_kwargs, kwargs)

        if mode == "a":
            raise ValueError("Append mode is not supported with xlsxwriter!")

        super().__init__(
            path,
            engine=engine,
            date_format=date_format,
            datetime_format=datetime_format,
            mode=mode,
            storage_options=storage_options,
            if_sheet_exists=if_sheet_exists,
            engine_kwargs=engine_kwargs,
        )

        try:
            self._book = Workbook(self._handles.handle, **engine_kwargs)
        except TypeError:
            # Close the already-opened handle before propagating the error.
            self._handles.handle.close()
            raise

    @property
    def book(self):
        """
        Book instance of class xlsxwriter.Workbook.

        This attribute can be used to access engine-specific features.
        """
        return self._book

    @property
    def sheets(self) -> dict[str, Any]:
        """The ``sheetnames`` attribute of the underlying workbook."""
        return self.book.sheetnames

    def _save(self) -> None:
        """
        Save workbook to disk.
        """
        self.book.close()

    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
        autofilter_range: str | None = None,
    ) -> None:
        """
        Write the frame cells into a worksheet using xlsxwriter.

        The worksheet is created when it does not exist yet. Cell positions
        are offset by ``startrow`` / ``startcol``. Formats are cached per
        call, keyed by the JSON dump of the cell style plus its number format.
        """
        sheet_name = self._get_sheet_name(sheet_name)

        worksheet = self.book.get_worksheet_by_name(sheet_name)
        if worksheet is None:
            worksheet = self.book.add_worksheet(sheet_name)

        # json.dumps(None) == "null": unstyled cells without a number format
        # are written with no format object at all.
        format_cache = {"null": None}

        if validate_freeze_panes(freeze_panes):
            worksheet.freeze_panes(*(freeze_panes))

        for cell in cells:
            val, fmt = self._value_with_fmt(cell.val)

            cache_key = json.dumps(cell.style)
            if fmt:
                cache_key += fmt

            if cache_key not in format_cache:
                format_cache[cache_key] = self.book.add_format(
                    _XlsxStyler.convert(cell.style, fmt)
                )
            cell_format = format_cache[cache_key]

            if cell.mergestart is None or cell.mergeend is None:
                worksheet.write(
                    startrow + cell.row, startcol + cell.col, val, cell_format
                )
            else:
                worksheet.merge_range(
                    startrow + cell.row,
                    startcol + cell.col,
                    startrow + cell.mergestart,
                    startcol + cell.mergeend,
                    val,
                    cell_format,
                )

        if autofilter_range:
            worksheet.autofilter(autofilter_range)
|
||||
@@ -0,0 +1,172 @@
|
||||
"""feather-format compat"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import Pandas4Warning
|
||||
from pandas.util._decorators import set_module
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.api import DataFrame
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
|
||||
def to_feather(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes],
    storage_options: StorageOptions | None = None,
    **kwargs: Any,
) -> None:
    """
    Write a DataFrame to the binary Feather format.

    Parameters
    ----------
    df : DataFrame
        Frame to serialize.
    path : str, path object, or file-like object
        Destination, opened in binary write mode.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
        are forwarded to ``urllib.request.Request`` as header options. For other
        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
        details, and for more examples on storage options refer `here
        <https://pandas.pydata.org/docs/user_guide/io.html?
        highlight=storage_options#reading-writing-remote-files>`_.
    **kwargs :
        Additional keywords passed to `pyarrow.feather.write_feather`.

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame.
    """
    # Check for pyarrow before importing from it.
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    handle_manager = get_handle(
        path, "wb", storage_options=storage_options, is_text=False
    )
    with handle_manager as handles:
        feather.write_feather(df, handles.handle, **kwargs)
|
||||
|
||||
|
||||
@set_module("pandas")
def read_feather(
    path: FilePath | ReadBuffer[bytes],
    columns: Sequence[Hashable] | None = None,
    use_threads: bool = True,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    """
    Load a feather-format object from the file path.

    Feather is particularly useful for scenarios that require efficient
    serialization and deserialization of tabular data. It supports
    schema preservation, making it a reliable choice for use cases
    such as sharing data between Python and R, or persisting intermediate
    results during data processing pipelines. This method provides additional
    flexibility with options for selective column reading, thread parallelism,
    and choosing the backend for data types.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, gs and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.feather``.
    columns : sequence, default None
        If not provided, all columns are read.
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
        are forwarded to ``urllib.request.Request`` as header options. For other
        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
        details, and for more examples on storage options refer `here
        <https://pandas.pydata.org/docs/user_guide/io.html?
        highlight=storage_options#reading-writing-remote-files>`_.

    dtype_backend : {'numpy_nullable', 'pyarrow'}
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). If not specified, the default behavior
        is to not use nullable data types. If specified, the behavior
        is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`.
        * ``"pyarrow"``: returns pyarrow-backed nullable
          :class:`ArrowDtype` :class:`DataFrame`

        .. versionadded:: 2.0

    Returns
    -------
    DataFrame
        DataFrame object stored in the file.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
    read_excel : Read an Excel file into a pandas DataFrame.
    read_spss : Read an SPSS file into a pandas DataFrame.
    read_orc : Load an ORC object into a pandas DataFrame.
    read_sas : Read SAS file into a pandas DataFrame.

    Examples
    --------
    >>> df = pd.read_feather("path/to/file.feather")  # doctest: +SKIP
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    # import utils to register the pyarrow extension types
    import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401

    check_dtype_backend(dtype_backend)

    with get_handle(
        path, "rb", storage_options=storage_options, is_text=False
    ) as handles:
        # Both read paths below take the same column/thread options.
        read_options = {"columns": columns, "use_threads": bool(use_threads)}

        if dtype_backend is lib.no_default and not using_string_dtype():
            # No backend requested: let pyarrow build the DataFrame directly,
            # ignoring the "make_block is deprecated" warning while it does.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    "make_block is deprecated",
                    Pandas4Warning,
                )
                return feather.read_feather(handles.handle, **read_options)

        # Otherwise read an arrow table and convert it for the chosen backend.
        arrow_table = feather.read_table(handles.handle, **read_options)
        return arrow_table_to_pandas(arrow_table, dtype_backend=dtype_backend)
|
||||
@@ -0,0 +1,9 @@
|
||||
# ruff: noqa: TC004
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# import modules that have public classes/functions
|
||||
from pandas.io.formats import style
|
||||
|
||||
# and mark only those modules as public
|
||||
__all__ = ["style"]
|
||||
+157
@@ -0,0 +1,157 @@
|
||||
# GH37967: Enable the use of CSS named colors, as defined in
|
||||
# matplotlib.colors.CSS4_COLORS, when exporting to Excel.
|
||||
# This data has been copied here, instead of being imported from matplotlib,
|
||||
# so that the ``to_excel`` methods do not require matplotlib.
|
||||
# source: matplotlib._color_data (3.3.3)
|
||||
from __future__ import annotations
|
||||
|
||||
CSS4_COLORS = {
|
||||
"aliceblue": "F0F8FF",
|
||||
"antiquewhite": "FAEBD7",
|
||||
"aqua": "00FFFF",
|
||||
"aquamarine": "7FFFD4",
|
||||
"azure": "F0FFFF",
|
||||
"beige": "F5F5DC",
|
||||
"bisque": "FFE4C4",
|
||||
"black": "000000",
|
||||
"blanchedalmond": "FFEBCD",
|
||||
"blue": "0000FF",
|
||||
"blueviolet": "8A2BE2",
|
||||
"brown": "A52A2A",
|
||||
"burlywood": "DEB887",
|
||||
"cadetblue": "5F9EA0",
|
||||
"chartreuse": "7FFF00",
|
||||
"chocolate": "D2691E",
|
||||
"coral": "FF7F50",
|
||||
"cornflowerblue": "6495ED",
|
||||
"cornsilk": "FFF8DC",
|
||||
"crimson": "DC143C",
|
||||
"cyan": "00FFFF",
|
||||
"darkblue": "00008B",
|
||||
"darkcyan": "008B8B",
|
||||
"darkgoldenrod": "B8860B",
|
||||
"darkgray": "A9A9A9",
|
||||
"darkgreen": "006400",
|
||||
"darkgrey": "A9A9A9",
|
||||
"darkkhaki": "BDB76B",
|
||||
"darkmagenta": "8B008B",
|
||||
"darkolivegreen": "556B2F",
|
||||
"darkorange": "FF8C00",
|
||||
"darkorchid": "9932CC",
|
||||
"darkred": "8B0000",
|
||||
"darksalmon": "E9967A",
|
||||
"darkseagreen": "8FBC8F",
|
||||
"darkslateblue": "483D8B",
|
||||
"darkslategray": "2F4F4F",
|
||||
"darkslategrey": "2F4F4F",
|
||||
"darkturquoise": "00CED1",
|
||||
"darkviolet": "9400D3",
|
||||
"deeppink": "FF1493",
|
||||
"deepskyblue": "00BFFF",
|
||||
"dimgray": "696969",
|
||||
"dimgrey": "696969",
|
||||
"dodgerblue": "1E90FF",
|
||||
"firebrick": "B22222",
|
||||
"floralwhite": "FFFAF0",
|
||||
"forestgreen": "228B22",
|
||||
"fuchsia": "FF00FF",
|
||||
"gainsboro": "DCDCDC",
|
||||
"ghostwhite": "F8F8FF",
|
||||
"gold": "FFD700",
|
||||
"goldenrod": "DAA520",
|
||||
"gray": "808080",
|
||||
"green": "008000",
|
||||
"greenyellow": "ADFF2F",
|
||||
"grey": "808080",
|
||||
"honeydew": "F0FFF0",
|
||||
"hotpink": "FF69B4",
|
||||
"indianred": "CD5C5C",
|
||||
"indigo": "4B0082",
|
||||
"ivory": "FFFFF0",
|
||||
"khaki": "F0E68C",
|
||||
"lavender": "E6E6FA",
|
||||
"lavenderblush": "FFF0F5",
|
||||
"lawngreen": "7CFC00",
|
||||
"lemonchiffon": "FFFACD",
|
||||
"lightblue": "ADD8E6",
|
||||
"lightcoral": "F08080",
|
||||
"lightcyan": "E0FFFF",
|
||||
"lightgoldenrodyellow": "FAFAD2",
|
||||
"lightgray": "D3D3D3",
|
||||
"lightgreen": "90EE90",
|
||||
"lightgrey": "D3D3D3",
|
||||
"lightpink": "FFB6C1",
|
||||
"lightsalmon": "FFA07A",
|
||||
"lightseagreen": "20B2AA",
|
||||
"lightskyblue": "87CEFA",
|
||||
"lightslategray": "778899",
|
||||
"lightslategrey": "778899",
|
||||
"lightsteelblue": "B0C4DE",
|
||||
"lightyellow": "FFFFE0",
|
||||
"lime": "00FF00",
|
||||
"limegreen": "32CD32",
|
||||
"linen": "FAF0E6",
|
||||
"magenta": "FF00FF",
|
||||
"maroon": "800000",
|
||||
"mediumaquamarine": "66CDAA",
|
||||
"mediumblue": "0000CD",
|
||||
"mediumorchid": "BA55D3",
|
||||
"mediumpurple": "9370DB",
|
||||
"mediumseagreen": "3CB371",
|
||||
"mediumslateblue": "7B68EE",
|
||||
"mediumspringgreen": "00FA9A",
|
||||
"mediumturquoise": "48D1CC",
|
||||
"mediumvioletred": "C71585",
|
||||
"midnightblue": "191970",
|
||||
"mintcream": "F5FFFA",
|
||||
"mistyrose": "FFE4E1",
|
||||
"moccasin": "FFE4B5",
|
||||
"navajowhite": "FFDEAD",
|
||||
"navy": "000080",
|
||||
"oldlace": "FDF5E6",
|
||||
"olive": "808000",
|
||||
"olivedrab": "6B8E23",
|
||||
"orange": "FFA500",
|
||||
"orangered": "FF4500",
|
||||
"orchid": "DA70D6",
|
||||
"palegoldenrod": "EEE8AA",
|
||||
"palegreen": "98FB98",
|
||||
"paleturquoise": "AFEEEE",
|
||||
"palevioletred": "DB7093",
|
||||
"papayawhip": "FFEFD5",
|
||||
"peachpuff": "FFDAB9",
|
||||
"peru": "CD853F",
|
||||
"pink": "FFC0CB",
|
||||
"plum": "DDA0DD",
|
||||
"powderblue": "B0E0E6",
|
||||
"purple": "800080",
|
||||
"rebeccapurple": "663399",
|
||||
"red": "FF0000",
|
||||
"rosybrown": "BC8F8F",
|
||||
"royalblue": "4169E1",
|
||||
"saddlebrown": "8B4513",
|
||||
"salmon": "FA8072",
|
||||
"sandybrown": "F4A460",
|
||||
"seagreen": "2E8B57",
|
||||
"seashell": "FFF5EE",
|
||||
"sienna": "A0522D",
|
||||
"silver": "C0C0C0",
|
||||
"skyblue": "87CEEB",
|
||||
"slateblue": "6A5ACD",
|
||||
"slategray": "708090",
|
||||
"slategrey": "708090",
|
||||
"snow": "FFFAFA",
|
||||
"springgreen": "00FF7F",
|
||||
"steelblue": "4682B4",
|
||||
"tan": "D2B48C",
|
||||
"teal": "008080",
|
||||
"thistle": "D8BFD8",
|
||||
"tomato": "FF6347",
|
||||
"turquoise": "40E0D0",
|
||||
"violet": "EE82EE",
|
||||
"wheat": "F5DEB3",
|
||||
"white": "FFFFFF",
|
||||
"whitesmoke": "F5F5F5",
|
||||
"yellow": "FFFF00",
|
||||
"yellowgreen": "9ACD32",
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
"""
|
||||
Internal module for console introspection
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from shutil import get_terminal_size
|
||||
|
||||
|
||||
def get_console_size() -> tuple[int | None, int | None]:
    """
    Return console size as tuple = (width, height).

    The ``display.width`` and ``display.max_rows`` options are used when they
    are set to a truthy value. Otherwise a fallback is used: the option
    defaults inside an IPython zmq frontend, the detected terminal size in any
    other interactive session, and None in a non-interactive session.
    """
    from pandas import get_option

    display_width = get_option("display.width")
    display_height = get_option("display.max_rows")

    # Consider
    #  interactive shell terminal, can detect term size
    #  interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
    #  size non-interactive script, should disregard term size

    # in addition
    #  width,height have default values, but setting to 'None' signals
    #  should use Auto-Detection, But only in interactive shell-terminal.
    #  Simple. yeah.

    if not in_interactive_session():
        fallback_width = fallback_height = None
    elif in_ipython_frontend():
        # sane defaults for interactive non-shell terminal
        # match default for width,height in config_init
        from pandas._config.config import get_default_val

        fallback_width = get_default_val("display.width")
        fallback_height = get_default_val("display.max_rows")
    else:
        # pure terminal
        fallback_width, fallback_height = get_terminal_size()

    # Note if the User sets width/Height to None (auto-detection)
    # and we're in a script (non-inter), this will return (None,None)
    # caller needs to deal.
    width = display_width or fallback_width
    height = display_height or fallback_height
    return width, height
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Detect our environment
|
||||
|
||||
|
||||
def in_interactive_session() -> bool:
    """
    Check if we're running in an interactive shell.

    Returns
    -------
    bool
        True if running under python/ipython interactive shell, or when the
        ``mode.sim_interactive`` option is enabled.
    """
    from pandas import get_option

    def check_main() -> bool:
        try:
            import __main__ as main
        except ModuleNotFoundError:
            return get_option("mode.sim_interactive")
        # A __main__ module without __file__ is treated as interactive.
        if not hasattr(main, "__file__"):
            return True
        return get_option("mode.sim_interactive")

    try:
        # error: Name '__IPYTHON__' is not defined
        ipython_flag = __IPYTHON__  # type: ignore[name-defined]
    except NameError:
        return check_main()
    return ipython_flag or check_main()
|
||||
|
||||
|
||||
def in_ipython_frontend() -> bool:
    """
    Check if we're inside an IPython zmq frontend.

    Returns
    -------
    bool
        True when ``get_ipython`` is defined and the type name of the object
        it returns contains "zmq" (case-insensitive); False otherwise.
    """
    try:
        # error: Name 'get_ipython' is not defined
        shell = get_ipython()  # type: ignore[name-defined]
    except NameError:
        return False
    return "zmq" in str(type(shell)).lower()
|
||||
@@ -0,0 +1,425 @@
|
||||
"""
|
||||
Utilities for interpreting CSS from Stylers for formatting non-HTML outputs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pandas.errors import CSSWarning
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Generator,
|
||||
Iterable,
|
||||
Iterator,
|
||||
)
|
||||
|
||||
|
||||
def _side_expander(prop_fmt: str) -> Callable:
|
||||
"""
|
||||
Wrapper to expand shorthand property into top, right, bottom, left properties
|
||||
|
||||
Parameters
|
||||
----------
|
||||
side : str
|
||||
The border side to expand into properties
|
||||
|
||||
Returns
|
||||
-------
|
||||
function: Return to call when a 'border(-{side}): {value}' string is encountered
|
||||
"""
|
||||
|
||||
def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]:
|
||||
"""
|
||||
Expand shorthand property into side-specific property (top, right, bottom, left)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
prop (str): CSS property name
|
||||
value (str): String token for property
|
||||
|
||||
Yields
|
||||
------
|
||||
Tuple (str, str): Expanded property, value
|
||||
"""
|
||||
tokens = value.split()
|
||||
try:
|
||||
mapping = self.SIDE_SHORTHANDS[len(tokens)]
|
||||
except KeyError:
|
||||
warnings.warn(
|
||||
f'Could not expand "{prop}: {value}"',
|
||||
CSSWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return
|
||||
for key, idx in zip(self.SIDES, mapping, strict=True):
|
||||
yield prop_fmt.format(key), tokens[idx]
|
||||
|
||||
return expand
|
||||
|
||||
|
||||
def _border_expander(side: str = "") -> Callable:
|
||||
"""
|
||||
Wrapper to expand 'border' property into border color, style, and width properties
|
||||
|
||||
Parameters
|
||||
----------
|
||||
side : str
|
||||
The border side to expand into properties
|
||||
|
||||
Returns
|
||||
-------
|
||||
function: Return to call when a 'border(-{side}): {value}' string is encountered
|
||||
"""
|
||||
if side != "":
|
||||
side = f"-{side}"
|
||||
|
||||
def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]:
|
||||
"""
|
||||
Expand border into color, style, and width tuples
|
||||
|
||||
Parameters
|
||||
----------
|
||||
prop : str
|
||||
CSS property name passed to styler
|
||||
value : str
|
||||
Value passed to styler for property
|
||||
|
||||
Yields
|
||||
------
|
||||
Tuple (str, str): Expanded property, value
|
||||
"""
|
||||
tokens = value.split()
|
||||
if len(tokens) == 0 or len(tokens) > 3:
|
||||
warnings.warn(
|
||||
f'Too many tokens provided to "{prop}" (expected 1-3)',
|
||||
CSSWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
# TODO: Can we use current color as initial value to comply with CSS standards?
|
||||
border_declarations = {
|
||||
f"border{side}-color": "black",
|
||||
f"border{side}-style": "none",
|
||||
f"border{side}-width": "medium",
|
||||
}
|
||||
for token in tokens:
|
||||
if token.lower() in self.BORDER_STYLES:
|
||||
border_declarations[f"border{side}-style"] = token
|
||||
elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS):
|
||||
border_declarations[f"border{side}-width"] = token
|
||||
else:
|
||||
border_declarations[f"border{side}-color"] = token
|
||||
# TODO: Warn user if item entered more than once (e.g. "border: red green")
|
||||
|
||||
# Per CSS, "border" will reset previous "border-*" definitions
|
||||
yield from self.atomize(border_declarations.items())
|
||||
|
||||
return expand
|
||||
|
||||
|
||||
class CSSResolver:
    """
    A callable for parsing and resolving CSS to atomic properties.
    """

    # unit -> (next unit, multiplier); size_to_pt follows the chain until "pt"
    UNIT_RATIOS = {
        "pt": ("pt", 1),
        "em": ("em", 1),
        "rem": ("pt", 12),
        "ex": ("em", 0.5),
        # 'ch':
        "px": ("pt", 0.75),
        "pc": ("pt", 12),
        "in": ("pt", 72),
        "cm": ("in", 1 / 2.54),
        "mm": ("in", 1 / 25.4),
        "q": ("mm", 0.25),
        "!!default": ("em", 0),
    }

    FONT_SIZE_RATIOS = {
        **UNIT_RATIOS,
        "%": ("em", 0.01),
        "xx-small": ("rem", 0.5),
        "x-small": ("rem", 0.625),
        "small": ("rem", 0.8),
        "medium": ("rem", 1),
        "large": ("rem", 1.125),
        "x-large": ("rem", 1.5),
        "xx-large": ("rem", 2),
        "smaller": ("em", 1 / 1.2),
        "larger": ("em", 1.2),
        "!!default": ("em", 1),
    }

    MARGIN_RATIOS = {**UNIT_RATIOS, "none": ("pt", 0)}

    BORDER_WIDTH_RATIOS = {
        **UNIT_RATIOS,
        "none": ("pt", 0),
        "thick": ("px", 4),
        "medium": ("px", 2),
        "thin": ("px", 1),
        # Default: medium only if solid
    }

    BORDER_STYLES = [
        "none",
        "hidden",
        "dotted",
        "dashed",
        "solid",
        "double",
        "groove",
        "ridge",
        "inset",
        "outset",
        "mediumdashdot",
        "dashdotdot",
        "hair",
        "mediumdashdotdot",
        "dashdot",
        "slantdashdot",
        "mediumdashed",
    ]

    # number of shorthand tokens -> token index used for each entry of SIDES
    SIDE_SHORTHANDS = {
        1: [0, 0, 0, 0],
        2: [0, 1, 0, 1],
        3: [0, 1, 2, 1],
        4: [0, 1, 2, 3],
    }

    SIDES = ("top", "right", "bottom", "left")

    # shorthand property name -> expander called as expander(self, prop, value)
    CSS_EXPANSIONS = {
        "border": _border_expander(""),
        **{
            f"border-{edge}": _border_expander(edge)
            for edge in ["top", "right", "bottom", "left"]
        },
        **{
            f"border-{facet}": _side_expander(f"border-{{:s}}-{facet}")
            for facet in ["color", "style", "width"]
        },
        "margin": _side_expander("margin-{:s}"),
        "padding": _side_expander("padding-{:s}"),
    }

    def __call__(
        self,
        declarations: str | Iterable[tuple[str, str]],
        inherited: dict[str, str] | None = None,
    ) -> dict[str, str]:
        """
        Resolve the given declarations to atomic properties.

        Parameters
        ----------
        declarations : str | Iterable[tuple[str, str]]
            A CSS string or set of CSS declaration tuples
            e.g. "font-weight: bold; background: blue" or
            {("font-weight", "bold"), ("background", "blue")}
        inherited : dict, optional
            Atomic properties indicating the inherited style context in which
            declarations is to be resolved. ``inherited`` should already
            be resolved, i.e. valid output of this method.

        Returns
        -------
        dict
            Atomic CSS 2.2 properties.

        Examples
        --------
        >>> resolve = CSSResolver()
        >>> inherited = {"font-family": "serif", "font-weight": "bold"}
        >>> out = resolve(
        ...     '''
        ...               border-color: BLUE RED;
        ...               font-size: 1em;
        ...               font-size: 2em;
        ...               font-weight: normal;
        ...               font-weight: inherit;
        ...               ''',
        ...     inherited,
        ... )
        >>> sorted(out.items())  # doctest: +NORMALIZE_WHITESPACE
        [('border-bottom-color', 'blue'),
         ('border-left-color', 'red'),
         ('border-right-color', 'red'),
         ('border-top-color', 'blue'),
         ('font-family', 'serif'),
         ('font-size', '24pt'),
         ('font-weight', 'bold')]
        """
        pairs = self.parse(declarations) if isinstance(declarations, str) else declarations
        resolved = dict(self.atomize(pairs))
        context = {} if inherited is None else inherited

        resolved = self._update_initial(resolved, context)
        resolved = self._update_font_size(resolved, context)
        return self._update_other_units(resolved)

    def _update_initial(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        """
        Step 1: merge inherited properties and resolve "inherit"/"initial".

        ``props`` is extended in place with inherited entries it lacks; the
        returned dict drops every property whose final value is "initial"
        or None.
        """
        for name, inherited_value in inherited.items():
            props.setdefault(name, inherited_value)

        resolved: dict[str, str] = {}
        for name, raw in props.items():
            final = inherited.get(name, "initial") if raw == "inherit" else raw
            if final in ("initial", None):
                # we do not define a complete initial stylesheet
                continue
            resolved[name] = final
        return resolved

    def _update_font_size(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        """
        Step 2: convert a truthy "font-size" entry to points, using the
        inherited font size (if any) as the value of one em.
        """
        declared = props.get("font-size")
        if not declared:
            return props
        props["font-size"] = self.size_to_pt(
            declared,
            self._get_font_size(inherited),
            conversions=self.FONT_SIZE_RATIOS,
        )
        return props

    def _get_font_size(self, props: dict[str, str]) -> float | None:
        """Return the "font-size" of ``props`` as a float, or None if unset/empty."""
        declared = props.get("font-size")
        if not declared:
            return None
        return self._get_float_font_size_from_pt(declared)

    def _get_float_font_size_from_pt(self, font_size_string: str) -> float:
        """Convert a string that ends in "pt" (e.g. "12pt") to its float value."""
        assert font_size_string.endswith("pt")
        digits = font_size_string.rstrip("pt")
        return float(digits)

    def _update_other_units(self, props: dict[str, str]) -> dict[str, str]:
        """
        Convert per-side border widths, margins and paddings to points.

        The (already resolved) font size of ``props`` is used as one em.
        """
        em_size = self._get_font_size(props)
        # 3. TODO: resolve other font-relative units
        for side in self.SIDES:
            targets = (
                (f"border-{side}-width", self.BORDER_WIDTH_RATIOS),
                # TODO: support %
                (f"margin-{side}", self.MARGIN_RATIOS),
                (f"padding-{side}", self.MARGIN_RATIOS),
            )
            for name, table in targets:
                if name in props:
                    props[name] = self.size_to_pt(
                        props[name],
                        em_pt=em_size,
                        conversions=table,
                    )
        return props

    def size_to_pt(
        self, in_val: str, em_pt: float | None = None, conversions: dict = UNIT_RATIOS
    ) -> str:
        """
        Convert a CSS size string to a string in points, e.g. "1px" -> "0.750000pt".

        Parameters
        ----------
        in_val : str
            Size such as "2em", "12px" or a keyword present in ``conversions``.
        em_pt : float, optional
            Size of one em in points; when None, "em" is treated as "rem".
        conversions : dict
            Mapping of unit -> (next unit, multiplier).

        Returns
        -------
        str
            The size in points. Unparseable input emits a CSSWarning and
            resolves the "!!default" entry of ``conversions`` instead.
        """

        def _fallback() -> str:
            warnings.warn(
                f"Unhandled size: {in_val!r}",
                CSSWarning,
                stacklevel=find_stack_level(),
            )
            return self.size_to_pt("1!!default", conversions=conversions)

        parsed = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val)
        if parsed is None:
            return _fallback()

        number, unit = parsed.group(1), parsed.group(2)
        if number == "":
            # hack for 'large' etc.
            magnitude = 1
        else:
            try:
                magnitude = float(number)
            except ValueError:
                return _fallback()

        while unit != "pt":
            if unit == "em":
                if em_pt is not None:
                    magnitude *= em_pt
                    unit = "pt"
                else:
                    unit = "rem"
                continue

            if unit not in conversions:
                return _fallback()
            unit, factor = conversions[unit]
            magnitude *= factor

        magnitude = round(magnitude, 5)
        whole = int(magnitude)
        # integral sizes are printed without a fractional part
        return f"{whole:d}pt" if whole == magnitude else f"{magnitude:f}pt"

    def atomize(self, declarations: Iterable) -> Generator[tuple[str, str]]:
        """
        Lower-case each (prop, value) pair and expand shorthand properties
        listed in ``CSS_EXPANSIONS``; other pairs are yielded unchanged.
        """
        for name, text in declarations:
            name = name.lower()
            text = text.lower()
            expander = self.CSS_EXPANSIONS.get(name)
            if expander is None:
                yield name, text
            else:
                yield from expander(self, name, text)

    def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]:
        """
        Generates (prop, value) pairs from declarations.

        In a future version may generate parsed tokens from tinycss/tinycss2

        Parameters
        ----------
        declarations_str : str
        """
        for chunk in declarations_str.split(";"):
            if not chunk.strip():
                continue
            name, colon, text = chunk.partition(":")
            name = name.strip().lower()
            # TODO: don't lowercase case sensitive parts of values (strings)
            text = text.strip().lower()
            if not colon:
                warnings.warn(
                    f"Ill-formatted attribute: expected a colon in {chunk!r}",
                    CSSWarning,
                    stacklevel=find_stack_level(),
                )
                continue
            yield name, text
|
||||
@@ -0,0 +1,336 @@
|
||||
"""
|
||||
Module for formatting output data into CSV files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Sequence,
|
||||
)
|
||||
import csv as csvlib
|
||||
import os
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import writers as libwriters
|
||||
from pandas._typing import SequenceNotStr
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDatetimeIndex,
|
||||
ABCIndex,
|
||||
ABCMultiIndex,
|
||||
ABCPeriodIndex,
|
||||
)
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
from pandas.core.indexes.api import Index
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
FloatFormatType,
|
||||
IndexLabel,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.io.formats.format import DataFrameFormatter
|
||||
|
||||
|
||||
_DEFAULT_CHUNKSIZE_CELLS = 100_000
|
||||
|
||||
|
||||
class CSVFormatter:
|
||||
cols: npt.NDArray[np.object_]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter: DataFrameFormatter,
|
||||
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
|
||||
sep: str = ",",
|
||||
cols: Sequence[Hashable] | None = None,
|
||||
index_label: IndexLabel | None = None,
|
||||
mode: str = "w",
|
||||
encoding: str | None = None,
|
||||
errors: str = "strict",
|
||||
compression: CompressionOptions = "infer",
|
||||
quoting: int | None = None,
|
||||
lineterminator: str | None = "\n",
|
||||
chunksize: int | None = None,
|
||||
quotechar: str | None = '"',
|
||||
date_format: str | None = None,
|
||||
doublequote: bool = True,
|
||||
escapechar: str | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
self.fmt = formatter
|
||||
|
||||
self.obj = self.fmt.frame
|
||||
|
||||
self.filepath_or_buffer = path_or_buf
|
||||
self.encoding = encoding
|
||||
self.compression: CompressionOptions = compression
|
||||
self.mode = mode
|
||||
self.storage_options = storage_options
|
||||
|
||||
self.sep = sep
|
||||
self.index_label = self._initialize_index_label(index_label)
|
||||
self.errors = errors
|
||||
self.quoting = quoting or csvlib.QUOTE_MINIMAL
|
||||
self.doublequote = doublequote
|
||||
self.escapechar = escapechar
|
||||
self.quotechar = self._initialize_quotechar(quotechar)
|
||||
self.lineterminator = lineterminator or os.linesep
|
||||
self.date_format = date_format
|
||||
self.cols = self._initialize_columns(cols)
|
||||
self.chunksize = self._initialize_chunksize(chunksize)
|
||||
|
||||
@property
|
||||
def na_rep(self) -> str:
|
||||
return self.fmt.na_rep
|
||||
|
||||
@property
|
||||
def float_format(self) -> FloatFormatType | None:
|
||||
return self.fmt.float_format
|
||||
|
||||
@property
|
||||
def decimal(self) -> str:
|
||||
return self.fmt.decimal
|
||||
|
||||
@property
|
||||
def header(self) -> bool | SequenceNotStr[str]:
|
||||
return self.fmt.header
|
||||
|
||||
@property
|
||||
def index(self) -> bool:
|
||||
return self.fmt.index
|
||||
|
||||
def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
|
||||
if index_label is not False:
|
||||
if index_label is None:
|
||||
return self._get_index_label_from_obj()
|
||||
elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
|
||||
# given a string for a DF with Index
|
||||
return [index_label]
|
||||
return index_label
|
||||
|
||||
def _get_index_label_from_obj(self) -> Sequence[Hashable]:
|
||||
if isinstance(self.obj.index, ABCMultiIndex):
|
||||
return self._get_index_label_multiindex()
|
||||
else:
|
||||
return self._get_index_label_flat()
|
||||
|
||||
def _get_index_label_multiindex(self) -> Sequence[Hashable]:
|
||||
return [name or "" for name in self.obj.index.names]
|
||||
|
||||
def _get_index_label_flat(self) -> Sequence[Hashable]:
|
||||
index_label = self.obj.index.name
|
||||
return [""] if index_label is None else [index_label]
|
||||
|
||||
def _initialize_quotechar(self, quotechar: str | None) -> str | None:
|
||||
if self.quoting != csvlib.QUOTE_NONE or self.escapechar is not None:
|
||||
# prevents crash in _csv
|
||||
return quotechar
|
||||
return None
|
||||
|
||||
@property
|
||||
def has_mi_columns(self) -> bool:
|
||||
return bool(isinstance(self.obj.columns, ABCMultiIndex))
|
||||
|
||||
def _initialize_columns(
|
||||
self, cols: Iterable[Hashable] | None
|
||||
) -> npt.NDArray[np.object_]:
|
||||
# validate mi options
|
||||
if self.has_mi_columns:
|
||||
if cols is not None:
|
||||
msg = "cannot specify cols with a MultiIndex on the columns"
|
||||
raise TypeError(msg)
|
||||
|
||||
if cols is not None:
|
||||
if isinstance(cols, ABCIndex):
|
||||
cols = cols._get_values_for_csv(**self._number_format)
|
||||
else:
|
||||
cols = list(cols)
|
||||
self.obj = self.obj.loc[:, cols]
|
||||
|
||||
# update columns to include possible multiplicity of dupes
|
||||
# and make sure cols is just a list of labels
|
||||
new_cols = self.obj.columns
|
||||
return new_cols._get_values_for_csv(**self._number_format)
|
||||
|
||||
def _initialize_chunksize(self, chunksize: int | None) -> int:
|
||||
if chunksize is None:
|
||||
return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
|
||||
return int(chunksize)
|
||||
|
||||
@property
|
||||
def _number_format(self) -> dict[str, Any]:
|
||||
"""Dictionary used for storing number formatting settings."""
|
||||
return {
|
||||
"na_rep": self.na_rep,
|
||||
"float_format": self.float_format,
|
||||
"date_format": self.date_format,
|
||||
"quoting": self.quoting,
|
||||
"decimal": self.decimal,
|
||||
}
|
||||
|
||||
@cache_readonly
|
||||
def data_index(self) -> Index:
|
||||
data_index = self.obj.index
|
||||
if (
|
||||
isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
|
||||
and self.date_format is not None
|
||||
):
|
||||
data_index = Index(
|
||||
[x.strftime(self.date_format) if notna(x) else "" for x in data_index]
|
||||
)
|
||||
elif isinstance(data_index, ABCMultiIndex):
|
||||
data_index = data_index.remove_unused_levels()
|
||||
return data_index
|
||||
|
||||
@property
|
||||
def nlevels(self) -> int:
|
||||
if self.index:
|
||||
return getattr(self.data_index, "nlevels", 1)
|
||||
else:
|
||||
return 0
|
||||
|
||||
@property
|
||||
def _has_aliases(self) -> bool:
|
||||
return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
|
||||
|
||||
@property
|
||||
def _need_to_save_header(self) -> bool:
|
||||
return bool(self._has_aliases or self.header)
|
||||
|
||||
@property
|
||||
def write_cols(self) -> SequenceNotStr[Hashable]:
|
||||
if self._has_aliases:
|
||||
assert not isinstance(self.header, bool)
|
||||
if len(self.header) != len(self.cols):
|
||||
raise ValueError(
|
||||
f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
|
||||
)
|
||||
return self.header
|
||||
else:
|
||||
# self.cols is an ndarray derived from Index._get_values_for_csv,
|
||||
# so its entries are strings, i.e. hashable
|
||||
return cast(SequenceNotStr[Hashable], self.cols)
|
||||
|
||||
@property
|
||||
def encoded_labels(self) -> list[Hashable]:
|
||||
encoded_labels: list[Hashable] = []
|
||||
|
||||
if self.index and self.index_label:
|
||||
assert isinstance(self.index_label, Sequence)
|
||||
encoded_labels = list(self.index_label)
|
||||
|
||||
if not self.has_mi_columns or self._has_aliases:
|
||||
encoded_labels += list(self.write_cols)
|
||||
|
||||
return encoded_labels
|
||||
|
||||
def save(self) -> None:
|
||||
"""
|
||||
Create the writer & save.
|
||||
"""
|
||||
# apply compression and byte/text conversion
|
||||
with get_handle(
|
||||
self.filepath_or_buffer,
|
||||
self.mode,
|
||||
encoding=self.encoding,
|
||||
errors=self.errors,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
) as handles:
|
||||
# Note: self.encoding is irrelevant here
|
||||
# error: Argument "quoting" to "writer" has incompatible type "int";
|
||||
# expected "Literal[0, 1, 2, 3]"
|
||||
self.writer = csvlib.writer(
|
||||
handles.handle,
|
||||
lineterminator=self.lineterminator,
|
||||
delimiter=self.sep,
|
||||
quoting=self.quoting, # type: ignore[arg-type]
|
||||
doublequote=self.doublequote,
|
||||
escapechar=self.escapechar,
|
||||
quotechar=self.quotechar,
|
||||
)
|
||||
|
||||
self._save()
|
||||
|
||||
def _save(self) -> None:
|
||||
if self._need_to_save_header:
|
||||
self._save_header()
|
||||
self._save_body()
|
||||
|
||||
def _save_header(self) -> None:
|
||||
if not self.has_mi_columns or self._has_aliases:
|
||||
self.writer.writerow(self.encoded_labels)
|
||||
else:
|
||||
for row in self._generate_multiindex_header_rows():
|
||||
self.writer.writerow(row)
|
||||
|
||||
def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
|
||||
columns = self.obj.columns
|
||||
for i in range(columns.nlevels):
|
||||
# we need at least 1 index column to write our col names
|
||||
col_line = []
|
||||
if self.index:
|
||||
# name is the first column
|
||||
col_line.append(columns.names[i])
|
||||
|
||||
if isinstance(self.index_label, list) and len(self.index_label) > 1:
|
||||
col_line.extend([""] * (len(self.index_label) - 1))
|
||||
|
||||
col_line.extend(columns._get_level_values(i))
|
||||
yield col_line
|
||||
|
||||
# Write out the index line if it's not empty.
|
||||
# Otherwise, we will print out an extraneous
|
||||
# blank line between the mi and the data rows.
|
||||
if self.encoded_labels and set(self.encoded_labels) != {""}:
|
||||
yield self.encoded_labels + [""] * len(columns)
|
||||
|
||||
def _save_body(self) -> None:
|
||||
nrows = len(self.data_index)
|
||||
chunks = (nrows // self.chunksize) + 1
|
||||
for i in range(chunks):
|
||||
start_i = i * self.chunksize
|
||||
end_i = min(start_i + self.chunksize, nrows)
|
||||
if start_i >= end_i:
|
||||
break
|
||||
self._save_chunk(start_i, end_i)
|
||||
|
||||
def _save_chunk(self, start_i: int, end_i: int) -> None:
|
||||
# create the data for a chunk
|
||||
slicer = slice(start_i, end_i)
|
||||
df = self.obj.iloc[slicer]
|
||||
|
||||
res = df._get_values_for_csv(**self._number_format)
|
||||
data = list(res._iter_column_arrays())
|
||||
|
||||
ix = (
|
||||
self.data_index[slicer]._get_values_for_csv(**self._number_format)
|
||||
if self.nlevels != 0
|
||||
else np.empty(end_i - start_i)
|
||||
)
|
||||
libwriters.write_csv_rows(
|
||||
data,
|
||||
ix,
|
||||
self.nlevels,
|
||||
self.cols,
|
||||
self.writer,
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,657 @@
|
||||
"""
|
||||
Module for formatting output data in HTML.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from textwrap import dedent
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Final,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas import (
|
||||
MultiIndex,
|
||||
option_context,
|
||||
)
|
||||
|
||||
from pandas.io.common import is_url
|
||||
from pandas.io.formats.format import (
|
||||
DataFrameFormatter,
|
||||
get_level_lengths,
|
||||
)
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
)
|
||||
|
||||
|
||||
class HTMLFormatter:
|
||||
"""
|
||||
Internal class for formatting output data in html.
|
||||
This class is intended for shared functionality between
|
||||
DataFrame.to_html() and DataFrame._repr_html_().
|
||||
Any logic in common with other output formatting methods
|
||||
should ideally be inherited from classes in format.py
|
||||
and this class responsible for only producing html markup.
|
||||
"""
|
||||
|
||||
indent_delta: Final = 2
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter: DataFrameFormatter,
|
||||
classes: str | list[str] | tuple[str, ...] | None = None,
|
||||
border: int | bool | None = None,
|
||||
table_id: str | None = None,
|
||||
render_links: bool = False,
|
||||
) -> None:
|
||||
self.fmt = formatter
|
||||
self.classes = classes
|
||||
|
||||
self.frame = self.fmt.frame
|
||||
self.columns = self.fmt.tr_frame.columns
|
||||
self.elements: list[str] = []
|
||||
self.bold_rows = self.fmt.bold_rows
|
||||
self.escape = self.fmt.escape
|
||||
self.show_dimensions = self.fmt.show_dimensions
|
||||
if border is None or border is True:
|
||||
border = cast(int, get_option("display.html.border"))
|
||||
elif not border:
|
||||
border = None
|
||||
|
||||
self.border = border
|
||||
self.table_id = table_id
|
||||
self.render_links = render_links
|
||||
|
||||
self.col_space = {}
|
||||
is_multi_index = isinstance(self.columns, MultiIndex)
|
||||
for column, value in self.fmt.col_space.items():
|
||||
col_space_value = f"{value}px" if isinstance(value, int) else value
|
||||
self.col_space[column] = col_space_value
|
||||
# GH 53885: Handling case where column is index
|
||||
# Flatten the data in the multi index and add in the map
|
||||
if is_multi_index and isinstance(column, tuple):
|
||||
for column_index in column:
|
||||
self.col_space[str(column_index)] = col_space_value
|
||||
|
||||
def to_string(self) -> str:
|
||||
lines = self.render()
|
||||
if any(isinstance(x, str) for x in lines):
|
||||
lines = [str(x) for x in lines]
|
||||
return "\n".join(lines)
|
||||
|
||||
def render(self) -> list[str]:
|
||||
self._write_table()
|
||||
|
||||
if self.should_show_dimensions:
|
||||
by = chr(215) # × # noqa: RUF003
|
||||
self.write(
|
||||
f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
|
||||
)
|
||||
|
||||
return self.elements
|
||||
|
||||
@property
|
||||
def should_show_dimensions(self) -> bool:
|
||||
return self.fmt.should_show_dimensions
|
||||
|
||||
@property
|
||||
def show_row_idx_names(self) -> bool:
|
||||
return self.fmt.show_row_idx_names
|
||||
|
||||
@property
|
||||
def show_col_idx_names(self) -> bool:
|
||||
return self.fmt.show_col_idx_names
|
||||
|
||||
@property
|
||||
def row_levels(self) -> int:
|
||||
if self.fmt.index:
|
||||
# showing (row) index
|
||||
return self.frame.index.nlevels
|
||||
elif self.show_col_idx_names:
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# If the row index is not displayed a column of
|
||||
# blank cells need to be included before the DataFrame values.
|
||||
return 1
|
||||
# not showing (row) index
|
||||
return 0
|
||||
|
||||
def _get_columns_formatted_values(self) -> Iterable:
|
||||
return self.columns
|
||||
|
||||
@property
|
||||
def is_truncated(self) -> bool:
|
||||
return self.fmt.is_truncated
|
||||
|
||||
@property
|
||||
def ncols(self) -> int:
|
||||
return len(self.fmt.tr_frame.columns)
|
||||
|
||||
def write(self, s: Any, indent: int = 0) -> None:
|
||||
rs = pprint_thing(s)
|
||||
self.elements.append(" " * indent + rs)
|
||||
|
||||
def write_th(
|
||||
self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
|
||||
) -> None:
|
||||
"""
|
||||
Method for writing a formatted <th> cell.
|
||||
|
||||
If col_space is set on the formatter then that is used for
|
||||
the value of min-width.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s : object
|
||||
The data to be written inside the cell.
|
||||
header : bool, default False
|
||||
Set to True if the <th> is for use inside <thead>. This will
|
||||
cause min-width to be set if there is one.
|
||||
indent : int, default 0
|
||||
The indentation level of the cell.
|
||||
tags : str, default None
|
||||
Tags to include in the cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A written <th> cell.
|
||||
"""
|
||||
col_space = self.col_space.get(s, None)
|
||||
|
||||
if header and col_space is not None:
|
||||
tags = tags or ""
|
||||
tags += f'style="min-width: {col_space};"'
|
||||
|
||||
self._write_cell(s, kind="th", indent=indent, tags=tags)
|
||||
|
||||
def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
|
||||
self._write_cell(s, kind="td", indent=indent, tags=tags)
|
||||
|
||||
def _write_cell(
|
||||
self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
|
||||
) -> None:
|
||||
if tags is not None:
|
||||
start_tag = f"<{kind} {tags}>"
|
||||
else:
|
||||
start_tag = f"<{kind}>"
|
||||
|
||||
if self.escape:
|
||||
# escape & first to prevent double escaping of &
|
||||
esc = {"&": r"&", "<": r"<", ">": r">"}
|
||||
else:
|
||||
esc = {}
|
||||
|
||||
rs = pprint_thing(s, escape_chars=esc).strip()
|
||||
# replace spaces betweens strings with non-breaking spaces
|
||||
rs = rs.replace(" ", " ")
|
||||
|
||||
if self.render_links and is_url(rs):
|
||||
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
|
||||
start_tag += f'<a href="{rs_unescaped}" target="_blank">'
|
||||
end_a = "</a>"
|
||||
else:
|
||||
end_a = ""
|
||||
|
||||
self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
|
||||
|
||||
def write_tr(
|
||||
self,
|
||||
line: Iterable,
|
||||
indent: int = 0,
|
||||
indent_delta: int = 0,
|
||||
header: bool = False,
|
||||
align: str | None = None,
|
||||
tags: dict[int, str] | None = None,
|
||||
nindex_levels: int = 0,
|
||||
) -> None:
|
||||
if tags is None:
|
||||
tags = {}
|
||||
|
||||
if align is None:
|
||||
self.write("<tr>", indent)
|
||||
else:
|
||||
self.write(f'<tr style="text-align: {align};">', indent)
|
||||
indent += indent_delta
|
||||
|
||||
for i, s in enumerate(line):
|
||||
val_tag = tags.get(i, None)
|
||||
if header or (self.bold_rows and i < nindex_levels):
|
||||
self.write_th(s, indent=indent, header=header, tags=val_tag)
|
||||
else:
|
||||
self.write_td(s, indent, tags=val_tag)
|
||||
|
||||
indent -= indent_delta
|
||||
self.write("</tr>", indent)
|
||||
|
||||
def _write_table(self, indent: int = 0) -> None:
|
||||
_classes = ["dataframe"] # Default class.
|
||||
use_mathjax = get_option("display.html.use_mathjax")
|
||||
if not use_mathjax:
|
||||
_classes.append("tex2jax_ignore")
|
||||
_classes.append("mathjax_ignore")
|
||||
if self.classes is not None:
|
||||
if isinstance(self.classes, str):
|
||||
self.classes = self.classes.split()
|
||||
if not isinstance(self.classes, (list, tuple)):
|
||||
raise TypeError(
|
||||
"classes must be a string, list, "
|
||||
f"or tuple, not {type(self.classes)}"
|
||||
)
|
||||
_classes.extend(self.classes)
|
||||
|
||||
if self.table_id is None:
|
||||
id_section = ""
|
||||
else:
|
||||
id_section = f' id="{self.table_id}"'
|
||||
|
||||
if self.border is None:
|
||||
border_attr = ""
|
||||
else:
|
||||
border_attr = f' border="{self.border}"'
|
||||
|
||||
self.write(
|
||||
f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
|
||||
indent,
|
||||
)
|
||||
|
||||
if self.fmt.header or self.show_row_idx_names:
|
||||
self._write_header(indent + self.indent_delta)
|
||||
|
||||
self._write_body(indent + self.indent_delta)
|
||||
|
||||
self.write("</table>", indent)
|
||||
|
||||
def _write_col_header(self, indent: int) -> None:
    """
    Write the header row(s) that hold the column labels.

    For MultiIndex columns one <tr> is written per column level, with
    ``colspan`` attributes for labels spanning several columns; otherwise a
    single <tr> is written. When the frame is truncated horizontally a "..."
    cell is inserted at the truncation position.

    Parameters
    ----------
    indent : int
        The indentation level of the row(s).
    """
    row: list[Hashable]
    is_truncated_horizontally = self.fmt.is_truncated_horizontally
    if isinstance(self.columns, MultiIndex):
        template = 'colspan="{span:d}" halign="left"'

        sentinel: lib.NoDefault | bool
        if self.fmt.sparsify:
            # GH3547
            sentinel = lib.no_default
        else:
            sentinel = False
        levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
        level_lengths = get_level_lengths(levels, sentinel)
        inner_lvl = len(level_lengths) - 1
        # records is used below as a mapping {cell position: colspan};
        # values holds the labels of this level
        for lnum, (records, values) in enumerate(
            zip(level_lengths, levels, strict=True)
        ):
            if is_truncated_horizontally:
                # modify the header lines
                ins_col = self.fmt.tr_col_num
                if self.fmt.sparsify:
                    recs_new = {}
                    # Increment tags after ... col.
                    for tag, span in list(records.items()):
                        if tag >= ins_col:
                            recs_new[tag + 1] = span
                        elif tag + span > ins_col:
                            # this span straddles the insertion point, so it
                            # grows by one cell
                            recs_new[tag] = span + 1
                            if lnum == inner_lvl:
                                values = (
                                    *values[:ins_col],
                                    "...",
                                    *values[ins_col:],
                                )
                            else:
                                # sparse col headers do not receive a ...
                                values = (
                                    *values[:ins_col],
                                    values[ins_col - 1],
                                    *values[ins_col:],
                                )
                        else:
                            recs_new[tag] = span
                        # if ins_col lies between tags, all col headers
                        # get ...
                        if tag + span == ins_col:
                            recs_new[ins_col] = 1
                            values = (*values[:ins_col], "...", *values[ins_col:])
                    records = recs_new
                    inner_lvl = len(level_lengths) - 1
                    if lnum == inner_lvl:
                        records[ins_col] = 1
                else:
                    recs_new = {}
                    # shift every cell at or after the insertion point right
                    for tag, span in list(records.items()):
                        if tag >= ins_col:
                            recs_new[tag + 1] = span
                        else:
                            recs_new[tag] = span
                    recs_new[ins_col] = 1
                    records = recs_new
                    values = [*values[:ins_col], "...", *values[ins_col:]]

            # see gh-22579
            # Column Offset Bug with to_html(index=False) with
            # MultiIndex Columns and Index.
            # Initially fill row with blank cells before column names.
            # TODO: Refactor to remove code duplication with code
            # block below for standard columns index.
            row = [""] * (self.row_levels - 1)
            if self.fmt.index or self.show_col_idx_names:
                # see gh-22747
                # If to_html(index_names=False) do not show columns
                # index names.
                # TODO: Refactor to use _get_column_name_list from
                # DataFrameFormatter class and create a
                # _get_formatted_column_labels function for code
                # parity with DataFrameFormatter class.
                if self.fmt.show_index_names:
                    name = self.columns.names[lnum]
                    row.append(pprint_thing(name or ""))
                else:
                    row.append("")

            tags = {}
            # j is the position of the next cell appended to row
            j = len(row)
            for i, v in enumerate(values):
                if i in records:
                    if records[i] > 1:
                        tags[j] = template.format(span=records[i])
                else:
                    # positions without a record emit no cell of their own
                    continue
                j += 1
                row.append(v)
            self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
    else:
        # see gh-22579
        # Column misalignment also occurs for
        # a standard index when the columns index is named.
        # Initially fill row with blank cells before column names.
        # TODO: Refactor to remove code duplication with code block
        # above for columns MultiIndex.
        row = [""] * (self.row_levels - 1)
        if self.fmt.index or self.show_col_idx_names:
            # see gh-22747
            # If to_html(index_names=False) do not show columns
            # index names.
            # TODO: Refactor to use _get_column_name_list from
            # DataFrameFormatter class.
            if self.fmt.show_index_names:
                row.append(self.columns.name or "")
            else:
                row.append("")
        row.extend(self._get_columns_formatted_values())
        align = self.fmt.justify

        if is_truncated_horizontally:
            ins_col = self.row_levels + self.fmt.tr_col_num
            row.insert(ins_col, "...")

        self.write_tr(row, indent, self.indent_delta, header=True, align=align)
|
||||
|
||||
def _write_row_header(self, indent: int) -> None:
    """
    Write the header row that carries the names of the row-index levels.

    The row holds one cell per index level name (a ``None`` name is shown
    as an empty string), followed by one empty cell per data column, plus
    one extra empty cell when the frame is truncated horizontally.

    Parameters
    ----------
    indent : int
        Indentation passed through to ``write_tr``.
    """
    truncated = self.fmt.is_truncated_horizontally
    name_cells = ["" if name is None else name for name in self.frame.index.names]
    # The extra blank cell lines up with the "..." column of the body rows.
    n_blank = self.ncols + (1 if truncated else 0)
    cells = name_cells + [""] * n_blank
    self.write_tr(cells, indent, self.indent_delta, header=True)
|
||||
|
||||
def _write_header(self, indent: int) -> None:
    """
    Write the ``<thead>`` section of the table.

    The column header is written when ``self.fmt.header`` is truthy and the
    row-index names row is written when ``self.show_row_idx_names`` is
    truthy; both are indented one ``indent_delta`` deeper than the tags.

    Parameters
    ----------
    indent : int
        Indentation of the ``<thead>`` / ``</thead>`` tags.
    """
    self.write("<thead>", indent)

    sections = (
        (self.fmt.header, self._write_col_header),
        (self.show_row_idx_names, self._write_row_header),
    )
    for enabled, writer in sections:
        if enabled:
            writer(indent + self.indent_delta)

    self.write("</thead>", indent)
|
||||
|
||||
def _get_formatted_values(self) -> dict[int, list[str]]:
    """
    Format every column of the frame with column-width truncation disabled.

    Returns
    -------
    dict of int to list of str
        Maps each column position to the strings produced by
        ``self.fmt.format_col`` for that column.
    """
    formatted: dict[int, list[str]] = {}
    # max_colwidth=None: cell text is not shortened while formatting for HTML.
    with option_context("display.max_colwidth", None):
        for col_num in range(self.ncols):
            formatted[col_num] = self.fmt.format_col(col_num)
    return formatted
|
||||
|
||||
def _write_body(self, indent: int) -> None:
    """
    Write the ``<tbody>`` section of the table.

    Rows are written hierarchically when the index is shown and is a
    MultiIndex; otherwise the regular row writer is used.

    Parameters
    ----------
    indent : int
        Indentation of the ``<tbody>`` / ``</tbody>`` tags; rows are written
        one ``indent_delta`` deeper.
    """
    self.write("<tbody>", indent)
    fmt_values = self._get_formatted_values()

    # write values
    hierarchical = self.fmt.index and isinstance(self.frame.index, MultiIndex)
    write_rows = (
        self._write_hierarchical_rows if hierarchical else self._write_regular_rows
    )
    write_rows(fmt_values, indent + self.indent_delta)

    self.write("</tbody>", indent)
|
||||
|
||||
def _write_regular_rows(
    self, fmt_values: Mapping[int, list[str]], indent: int
) -> None:
    """
    Write the body rows for a frame whose index is not written hierarchically.

    Parameters
    ----------
    fmt_values : Mapping of int to list of str
        Formatted cell strings, keyed by column position.
    indent : int
        Indentation passed through to ``write_tr``.
    """
    formatter = self.fmt
    truncate_h = formatter.is_truncated_horizontally
    truncate_v = formatter.is_truncated_vertically
    n_rows = len(formatter.tr_frame)

    if formatter.index:
        index_formatter = formatter._get_formatter("__index__")
        if index_formatter is None:
            # only reached with non-Multi index
            index_labels = formatter.tr_frame.index._format_flat(include_name=False)
        else:
            index_labels = formatter.tr_frame.index.map(index_formatter)

    cells: list[str] = []
    for row_num in range(n_rows):
        if truncate_v and row_num == formatter.tr_row_num:
            # Separator row: as many "..." cells as the row written just before.
            self.write_tr(
                ["..."] * len(cells),
                indent,
                self.indent_delta,
                tags=None,
                nindex_levels=self.row_levels,
            )

        cells = []
        if formatter.index:
            cells.append(index_labels[row_num])
        elif self.show_col_idx_names:
            # see gh-22579
            # Column misalignment also occurs for
            # a standard index when the columns index is named.
            # Add blank cell before data cells.
            cells.append("")
        cells.extend(fmt_values[col_num][row_num] for col_num in range(self.ncols))

        if truncate_h:
            cells.insert(formatter.tr_col_num + self.row_levels, "...")
        self.write_tr(
            cells, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
        )
|
||||
|
||||
def _write_hierarchical_rows(
|
||||
self, fmt_values: Mapping[int, list[str]], indent: int
|
||||
) -> None:
|
||||
template = 'rowspan="{span}" valign="top"'
|
||||
|
||||
is_truncated_horizontally = self.fmt.is_truncated_horizontally
|
||||
is_truncated_vertically = self.fmt.is_truncated_vertically
|
||||
frame = self.fmt.tr_frame
|
||||
nrows = len(frame)
|
||||
|
||||
assert isinstance(frame.index, MultiIndex)
|
||||
idx_values = frame.index._format_multi(sparsify=False, include_names=False)
|
||||
idx_values = list(zip(*idx_values, strict=True))
|
||||
|
||||
if self.fmt.sparsify:
|
||||
# GH3547
|
||||
sentinel = lib.no_default
|
||||
levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
|
||||
|
||||
level_lengths = get_level_lengths(levels, sentinel)
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
if is_truncated_vertically:
|
||||
# Insert ... row and adjust idx_values and
|
||||
# level_lengths to take this into account.
|
||||
ins_row = self.fmt.tr_row_num
|
||||
inserted = False
|
||||
for lnum, records in enumerate(level_lengths):
|
||||
rec_new = {}
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_row:
|
||||
rec_new[tag + 1] = span
|
||||
elif tag + span > ins_row:
|
||||
rec_new[tag] = span + 1
|
||||
|
||||
# GH 14882 - Make sure insertion done once
|
||||
if not inserted:
|
||||
dot_row = list(idx_values[ins_row - 1])
|
||||
dot_row[-1] = "..."
|
||||
idx_values.insert(ins_row, tuple(dot_row))
|
||||
inserted = True
|
||||
else:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
else:
|
||||
rec_new[tag] = span
|
||||
# If ins_row lies between tags, all cols idx cols
|
||||
# receive ...
|
||||
if tag + span == ins_row:
|
||||
rec_new[ins_row] = 1
|
||||
if lnum == 0:
|
||||
idx_values.insert(
|
||||
ins_row, tuple(["..."] * len(level_lengths))
|
||||
)
|
||||
|
||||
# GH 14882 - Place ... in correct level
|
||||
elif inserted:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
level_lengths[lnum] = rec_new
|
||||
|
||||
level_lengths[inner_lvl][ins_row] = 1
|
||||
for ix_col in fmt_values:
|
||||
fmt_values[ix_col].insert(ins_row, "...")
|
||||
nrows += 1
|
||||
|
||||
for i in range(nrows):
|
||||
row = []
|
||||
tags = {}
|
||||
|
||||
sparse_offset = 0
|
||||
j = 0
|
||||
for records, v in zip(level_lengths, idx_values[i], strict=True):
|
||||
if i in records:
|
||||
if records[i] > 1:
|
||||
tags[j] = template.format(span=records[i])
|
||||
else:
|
||||
sparse_offset += 1
|
||||
continue
|
||||
|
||||
j += 1
|
||||
row.append(v)
|
||||
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if is_truncated_horizontally:
|
||||
row.insert(
|
||||
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
|
||||
)
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=tags,
|
||||
nindex_levels=len(levels) - sparse_offset,
|
||||
)
|
||||
else:
|
||||
row = []
|
||||
for i in range(len(frame)):
|
||||
if is_truncated_vertically and i == (self.fmt.tr_row_num):
|
||||
str_sep_row = ["..."] * len(row)
|
||||
self.write_tr(
|
||||
str_sep_row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=self.row_levels,
|
||||
)
|
||||
|
||||
idx_values = list(
|
||||
zip(
|
||||
*frame.index._format_multi(sparsify=False, include_names=False),
|
||||
strict=True,
|
||||
)
|
||||
)
|
||||
row = []
|
||||
row.extend(idx_values[i])
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if is_truncated_horizontally:
|
||||
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=frame.index.nlevels,
|
||||
)
|
||||
|
||||
|
||||
class NotebookFormatter(HTMLFormatter):
    """
    Internal class for formatting output data in html for display in Jupyter
    Notebooks. This class is intended for functionality specific to
    DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
    """

    def _get_formatted_values(self) -> dict[int, list[str]]:
        """Return the formatted strings of every column, keyed by position."""
        formatted: dict[int, list[str]] = {}
        for col_num in range(self.ncols):
            formatted[col_num] = self.fmt.format_col(col_num)
        return formatted

    def _get_columns_formatted_values(self) -> list[str]:
        """Return the formatted column labels, without the columns name."""
        # only reached with non-Multi Index
        labels = self.columns._format_flat(include_name=False)
        return labels

    def write_style(self) -> None:
        """Write the ``<style scoped>`` block that styles the table headers."""
        # We use the "scoped" attribute here so that the desired
        # style properties for the data frame are not then applied
        # throughout the entire notebook.
        opening = """\
            <style scoped>"""
        closing = """\
            </style>"""
        rule = """\
                .dataframe %s {
                    %s: %s;
                }"""

        # Header alignment depends on whether the columns are hierarchical.
        if isinstance(self.columns, MultiIndex):
            header_props = [("thead tr th", "text-align", "left")]
            if self.show_row_idx_names:
                header_props.append(
                    ("thead tr:last-of-type th", "text-align", "right")
                )
        else:
            header_props = [("thead th", "text-align", "right")]

        element_props = [
            ("tbody tr th:only-of-type", "vertical-align", "middle"),
            ("tbody tr th", "vertical-align", "top"),
            *header_props,
        ]
        rules = "\n\n".join([rule % props for props in element_props])
        self.write(dedent(f"{opening}\n{rules}\n{closing}"))

    def render(self) -> list[str]:
        """
        Render the styled table wrapped in a ``<div>``.

        Returns
        -------
        list of str
            The accumulated output elements.
        """
        self.write("<div>")
        self.write_style()
        super().render()
        self.write("</div>")
        return self.elements
|
||||
@@ -0,0 +1,943 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import (
|
||||
ABC,
|
||||
abstractmethod,
|
||||
)
|
||||
import sys
|
||||
from textwrap import dedent
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas.io.formats import format as fmt
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Iterable,
|
||||
Iterator,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
Dtype,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
|
||||
# Docstring fragment describing the ``show_counts`` parameter; the description
# lines are indented one level below the parameter line.
show_counts_sub = "\n".join(
    (
        "show_counts : bool, optional",
        "    Whether to show the non-null counts. By default, this is shown",
        "    only if the DataFrame is smaller than",
        "    ``pandas.options.display.max_info_rows`` and",
        "    ``pandas.options.display.max_info_columns``. A value of True always",
        "    shows the counts, and False never shows the counts.",
    )
)
|
||||
|
||||
# "Examples" section substituted into the Series.info docstring.  The text is
# doctest input/output, so the spacing inside each line is significant.
series_examples_sub = dedent(
    """\
    >>> int_values = [1, 2, 3, 4, 5]
    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
    >>> s = pd.Series(text_values, index=int_values)
    >>> s.info()
    <class 'pandas.Series'>
    Index: 5 entries, 1 to 5
    Series name: None
    Non-Null Count  Dtype
    --------------  -----
    5 non-null      object
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Prints a summary excluding information about its values:

    >>> s.info(verbose=False)
    <class 'pandas.Series'>
    Index: 5 entries, 1 to 5
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Pipe output of Series.info to buffer instead of sys.stdout, get
    buffer content and writes to a text file:

    >>> import io
    >>> buffer = io.StringIO()
    >>> s.info(buf=buffer)
    >>> s = buffer.getvalue()
    >>> with open("df_info.txt", "w",
    ...           encoding="utf-8") as f:  # doctest: +SKIP
    ...     f.write(s)
    260

    The `memory_usage` parameter allows deep introspection mode, specially
    useful for big Series and fine-tune memory optimization:

    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
    >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
    >>> s.info()
    <class 'pandas.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 7.6+ MB

    >>> s.info(memory_usage='deep')
    <class 'pandas.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 55.3 MB"""
)
|
||||
|
||||
|
||||
# "See Also" entries substituted into the Series.info docstring.
series_see_also_sub = "\n".join(
    (
        "Series.describe: Generate descriptive statistics of Series.",
        "Series.memory_usage: Memory usage of Series.",
    )
)

# Description of the ``max_cols`` parameter for the Series.info docstring.
series_max_cols_sub = "\n".join(
    (
        "max_cols : int, optional",
        "    Unused, exists only for compatibility with DataFrame.info.",
    )
)
|
||||
|
||||
|
||||
# Substitution values used to fill the shared ``info`` docstring template for
# Series.
series_sub_kwargs = dict(
    klass="Series",
    type_sub="",
    max_cols_sub=series_max_cols_sub,
    show_counts_sub=show_counts_sub,
    examples_sub=series_examples_sub,
    see_also_sub=series_see_also_sub,
    version_added_sub="\n.. versionadded:: 1.4.0\n",
)
|
||||
|
||||
|
||||
def _put_str(s: str | Dtype, space: int) -> str:
    """
    Coerce a value to a string of a fixed width.

    The value is converted with ``str``, cut down to at most ``space``
    characters and then padded with spaces on the right up to ``space``
    characters.

    Parameters
    ----------
    s : Union[str, Dtype]
        Value to be formatted.
    space : int
        Width of the resulting string.

    Returns
    -------
    str
        String of the requested width.

    Examples
    --------
    >>> pd.io.formats.info._put_str("panda", 6)
    'panda '
    >>> pd.io.formats.info._put_str("panda", 4)
    'pand'
    """
    text = str(s)
    clipped = text[:space]
    return clipped.ljust(space)
|
||||
|
||||
|
||||
def _sizeof_fmt(num: float, size_qualifier: str) -> str:
    """
    Format a number of bytes as a human readable size.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit ("PB") is reached, and is shown with one decimal place.

    Parameters
    ----------
    num : float
        Size in bytes.
    size_qualifier : str
        Either empty, or '+' (if lower bound).

    Returns
    -------
    str
        Size in human readable format.

    Examples
    --------
    >>> _sizeof_fmt(23028, "")
    '22.5 KB'

    >>> _sizeof_fmt(23028, "+")
    '22.5+ KB'
    """
    units = ("bytes", "KB", "MB", "GB", "TB")
    for unit in units:
        if num < 1024.0:
            break
        num /= 1024.0
    else:
        # Still >= 1024 after the "TB" step: report in the largest unit.
        unit = "PB"
    return f"{num:3.1f}{size_qualifier} {unit}"
|
||||
|
||||
|
||||
def _initialize_memory_usage(
    memory_usage: bool | str | None = None,
) -> bool | str:
    """
    Resolve the memory-usage setting.

    An explicit value is returned unchanged; ``None`` falls back to the
    ``display.memory_usage`` option.
    """
    if memory_usage is not None:
        return memory_usage
    return get_option("display.memory_usage")
|
||||
|
||||
|
||||
class _BaseInfo(ABC):
    """
    Common interface of DataFrameInfo and SeriesInfo.

    Parameters
    ----------
    data : DataFrame or Series
        Object that is being summarized.
    memory_usage : bool or str, optional
        If "deep", introspect the data deeply by interrogating object dtypes
        for system-level memory consumption, and include it in the returned
        values.
    """

    data: DataFrame | Series
    memory_usage: bool | str

    @property
    @abstractmethod
    def dtypes(self) -> Iterable[Dtype]:
        """
        Dtypes.

        Returns
        -------
        dtypes : sequence
            Dtype of each of the DataFrame's columns (or one series column).
        """

    @property
    @abstractmethod
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping from dtype name to the number of columns of that dtype."""

    @property
    @abstractmethod
    def non_null_counts(self) -> list[int] | Series:
        """Non-null counts for all columns, or for the one column of a series."""

    @property
    @abstractmethod
    def memory_usage_bytes(self) -> int:
        """
        Memory usage in bytes.

        Returns
        -------
        memory_usage_bytes : int
            Object's total memory usage in bytes.
        """

    @property
    def memory_usage_string(self) -> str:
        """Human readable memory usage, terminated by a newline."""
        n_bytes = self.memory_usage_bytes
        qualifier = self.size_qualifier
        return f"{_sizeof_fmt(n_bytes, qualifier)}\n"

    @property
    def size_qualifier(self) -> str:
        """
        Suffix marking the reported memory usage as a lower bound.

        Returns "+" when memory usage is requested in non-deep mode and either
        an "object" dtype is present or the index reports that its memory
        usage is qualified; returns "" otherwise.
        """
        if not self.memory_usage or self.memory_usage == "deep":
            return ""
        # size_qualifier is just a best effort; not guaranteed to catch
        # all cases (e.g., it misses categorical data even with object
        # categories)
        if "object" in self.dtype_counts:
            return "+"
        if self.data.index._is_memory_usage_qualified:
            return "+"
        return ""

    @abstractmethod
    def render(
        self,
        *,
        buf: WriteBuffer[str] | None,
        max_cols: int | None,
        verbose: bool | None,
        show_counts: bool | None,
    ) -> None:
        pass
|
||||
|
||||
|
||||
class DataFrameInfo(_BaseInfo):
    """
    Info about a DataFrame, as needed to print its summary.

    Parameters
    ----------
    data : DataFrame
        DataFrame to summarize.
    memory_usage : bool or str, optional
        Memory-usage setting; ``None`` falls back to the
        ``display.memory_usage`` option.
    """

    def __init__(
        self,
        data: DataFrame,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: DataFrame = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping from dtype name to the number of columns of that dtype."""
        return _get_dataframe_dtype_counts(self.data)

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """
        Dtypes.

        Returns
        -------
        dtypes
            Dtype of each of the DataFrame's columns.
        """
        return self.data.dtypes

    @property
    def ids(self) -> Index:
        """
        Column names.

        Returns
        -------
        ids : Index
            DataFrame's column names.
        """
        return self.data.columns

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return len(self.ids)

    @property
    def non_null_counts(self) -> Series:
        """Non-null count of every column."""
        return self.data.count()

    @property
    def memory_usage_bytes(self) -> int:
        """Total memory usage of all columns plus the index, in bytes."""
        per_column = self.data.memory_usage(
            index=True, deep=self.memory_usage == "deep"
        )
        return per_column.sum()

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None,
        max_cols: int | None,
        verbose: bool | None,
        show_counts: bool | None,
    ) -> None:
        """Build the info table for the frame and write it to ``buf``."""
        _DataFrameInfoPrinter(
            info=self,
            max_cols=max_cols,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)
|
||||
|
||||
|
||||
class SeriesInfo(_BaseInfo):
    """
    Info about a Series, as needed to print its summary.

    Parameters
    ----------
    data : Series
        Series to summarize.
    memory_usage : bool or str, optional
        Memory-usage setting; ``None`` falls back to the
        ``display.memory_usage`` option.
    """

    def __init__(
        self,
        data: Series,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: Series = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None = None,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        """
        Build the info table for the series and write it to ``buf``.

        Raises
        ------
        ValueError
            If ``max_cols`` is given; it is only meaningful for DataFrame.info.
        """
        if max_cols is not None:
            msg = (
                "Argument `max_cols` can only be passed in "
                "DataFrame.info, not Series.info"
            )
            raise ValueError(msg)
        _SeriesInfoPrinter(
            info=self,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)

    @property
    def non_null_counts(self) -> list[int]:
        """Single-element list holding the non-null count of the series."""
        n_valid = self.data.count()
        return [n_valid]

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """Single-element list holding the dtype of the series."""
        series_dtype = self.data.dtypes
        return [series_dtype]

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Dtype counts, computed on the series wrapped in a DataFrame."""
        from pandas.core.frame import DataFrame

        as_frame = DataFrame(self.data)
        return _get_dataframe_dtype_counts(as_frame)

    @property
    def memory_usage_bytes(self) -> int:
        """Memory usage in bytes.

        Returns
        -------
        memory_usage_bytes : int
            Object's total memory usage in bytes.
        """
        return self.data.memory_usage(index=True, deep=self.memory_usage == "deep")
|
||||
|
||||
|
||||
class _InfoPrinterAbstract(ABC):
    """
    Class for printing dataframe or series info.

    Derives from ``ABC`` so that ``@abstractmethod`` on
    ``_create_table_builder`` is actually enforced: only subclasses that
    implement it can be instantiated.
    """

    def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
        """
        Write the info lines into a buffer.

        Parameters
        ----------
        buf : writable buffer, optional
            Target of the output; ``sys.stdout`` is used when None.
        """
        table_builder = self._create_table_builder()
        lines = table_builder.get_lines()
        if buf is None:  # pragma: no cover
            buf = sys.stdout
        fmt.buffer_put_lines(buf, lines)

    @abstractmethod
    def _create_table_builder(self) -> _TableBuilderAbstract:
        """Create instance of table builder."""
|
||||
|
||||
|
||||
class _DataFrameInfoPrinter(_InfoPrinterAbstract):
    """
    Printer of DataFrame info.

    Parameters
    ----------
    info : DataFrameInfo
        Instance of DataFrameInfo.
    max_cols : int, optional
        When to switch from the verbose to the truncated output.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: DataFrameInfo,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        # max_cols must be resolved first: the default for show_counts
        # depends on it.
        self.max_cols = self._initialize_max_cols(max_cols)
        self.show_counts = self._initialize_show_counts(show_counts)

    @property
    def max_rows(self) -> int:
        """Value of the ``display.max_info_rows`` option."""
        return get_option("display.max_info_rows")

    @property
    def exceeds_info_cols(self) -> bool:
        """Whether the number of columns is greater than ``max_cols``."""
        too_many = self.col_count > self.max_cols
        return bool(too_many)

    @property
    def exceeds_info_rows(self) -> bool:
        """Whether the number of rows is greater than ``max_rows``."""
        too_many = len(self.data) > self.max_rows
        return bool(too_many)

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return self.info.col_count

    def _initialize_max_cols(self, max_cols: int | None) -> int:
        """Return ``max_cols``, or ``display.max_info_columns`` when None."""
        if max_cols is not None:
            return max_cols
        return get_option("display.max_info_columns")

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """
        Return ``show_counts``; when None, counts are shown only if neither
        the column limit nor the row limit is exceeded.
        """
        if show_counts is not None:
            return show_counts
        return bool(not (self.exceeds_info_cols or self.exceeds_info_rows))

    def _create_table_builder(self) -> _DataFrameTableBuilder:
        """
        Create instance of table builder based on verbosity and display settings.
        """
        if not self.verbose:
            # ``verbose is False`` was requested explicitly; for any other
            # falsy value (e.g. None) the column limit decides.
            if self.verbose is False or self.exceeds_info_cols:
                return _DataFrameTableBuilderNonVerbose(info=self.info)
        return _DataFrameTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )
|
||||
|
||||
|
||||
class _SeriesInfoPrinter(_InfoPrinterAbstract):
    """Printer of Series info.

    Parameters
    ----------
    info : SeriesInfo
        Instance of SeriesInfo.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: SeriesInfo,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        self.show_counts = self._initialize_show_counts(show_counts)

    def _create_table_builder(self) -> _SeriesTableBuilder:
        """
        Create instance of table builder based on verbosity.
        """
        # Verbose output is the default: only a falsy value other than None
        # selects the short form.
        if not (self.verbose or self.verbose is None):
            return _SeriesTableBuilderNonVerbose(info=self.info)
        return _SeriesTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """Return ``show_counts``, defaulting to True when it is None."""
        return True if show_counts is None else show_counts
|
||||
|
||||
|
||||
class _TableBuilderAbstract(ABC):
    """
    Abstract builder of the info table.

    Subclasses collect the output in ``_lines`` and read everything they
    print from ``info``.
    """

    _lines: list[str]
    info: _BaseInfo

    @abstractmethod
    def get_lines(self) -> list[str]:
        """Product in a form of list of lines (strings)."""

    @property
    def data(self) -> DataFrame | Series:
        """Object being summarized."""
        return self.info.data

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """Dtypes of each of the DataFrame's columns."""
        return self.info.dtypes

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping from dtype name to the number of columns of that dtype."""
        return self.info.dtype_counts

    @property
    def display_memory_usage(self) -> bool:
        """Whether to display memory usage."""
        return bool(self.info.memory_usage)

    @property
    def memory_usage_string(self) -> str:
        """Memory usage string with proper size qualifier."""
        return self.info.memory_usage_string

    @property
    def non_null_counts(self) -> list[int] | Series:
        """Non-null counts as provided by ``info``."""
        return self.info.non_null_counts

    def add_object_type_line(self) -> None:
        """Append the string form of the type of the summarized object."""
        type_line = str(type(self.data))
        self._lines.append(type_line)

    def add_index_range_line(self) -> None:
        """Append the summary line of the index."""
        index_line = self.data.index._summary()
        self._lines.append(index_line)

    def add_dtypes_line(self) -> None:
        """Append the ``dtypes:`` line listing each dtype with its count."""
        parts = []
        # Sorted so that the output order does not depend on mapping order.
        for dtype_name, n_cols in sorted(self.dtype_counts.items()):
            parts.append(f"{dtype_name}({n_cols:d})")
        joined = ", ".join(parts)
        self._lines.append(f"dtypes: {joined}")
|
||||
|
||||
|
||||
class _DataFrameTableBuilder(_TableBuilderAbstract):
    """
    Abstract builder of the dataframe info table.

    Parameters
    ----------
    info : DataFrameInfo.
        Instance of DataFrameInfo.
    """

    def __init__(self, *, info: DataFrameInfo) -> None:
        self.info: DataFrameInfo = info

    def get_lines(self) -> list[str]:
        """Build and return the info lines; the list is rebuilt on each call."""
        self._lines = []
        fill = self._fill_non_empty_info if self.col_count != 0 else None
        if fill is None:
            self._fill_empty_info()
        else:
            fill()
        return self._lines

    def _fill_empty_info(self) -> None:
        """Add lines to the info table, pertaining to empty dataframe."""
        self.add_object_type_line()
        self.add_index_range_line()
        type_name = type(self.data).__name__
        self._lines.append(f"Empty {type_name}\n")

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""

    @property
    def data(self) -> DataFrame:
        """DataFrame."""
        return self.info.data

    @property
    def ids(self) -> Index:
        """Dataframe columns."""
        return self.info.ids

    @property
    def col_count(self) -> int:
        """Number of dataframe columns to be summarized."""
        return self.info.col_count

    def add_memory_usage_line(self) -> None:
        """Add line containing memory usage."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")
|
||||
|
||||
|
||||
class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder):
    """
    Builder of the short (non-verbose) dataframe info table.
    """

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""
        steps = (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_columns_summary_line,
            self.add_dtypes_line,
        )
        for add_line in steps:
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    def add_columns_summary_line(self) -> None:
        """Append the one-line summary of the columns index."""
        summary = self.ids._summary(name="Columns")
        self._lines.append(summary)
|
||||
|
||||
|
||||
class _TableBuilderVerboseMixin(_TableBuilderAbstract):
    """
    Mixin providing the table layout of the verbose info output.
    """

    SPACING: str = " " * 2
    strrows: Sequence[Sequence[str]]
    gross_column_widths: Sequence[int]
    with_counts: bool

    @property
    @abstractmethod
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""

    @property
    def header_column_widths(self) -> Sequence[int]:
        """Widths of header columns (only titles)."""
        return [len(title) for title in self.headers]

    def _get_gross_column_widths(self) -> Sequence[int]:
        """Get widths of columns containing both headers and actual content."""
        body_widths = self._get_body_column_widths()
        header_widths = self.header_column_widths
        return [
            max(header_width, body_width)
            for header_width, body_width in zip(
                header_widths, body_widths, strict=False
            )
        ]

    def _get_body_column_widths(self) -> Sequence[int]:
        """Get widths of table content columns."""
        # Transpose the rows so that each tuple holds the cells of one column.
        columns: Sequence[Sequence[str]] = list(zip(*self.strrows, strict=True))
        return [max(map(len, column)) for column in columns]

    def _gen_rows(self) -> Iterator[Sequence[str]]:
        """
        Return an iterator over the rows of the table body.

        Each element represents a row comprising a sequence of strings.
        """
        if not self.with_counts:
            return self._gen_rows_without_counts()
        return self._gen_rows_with_counts()

    @abstractmethod
    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""

    @abstractmethod
    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""

    def add_header_line(self) -> None:
        """Append the line with the column titles, padded to column width."""
        padded = []
        for title, width in zip(self.headers, self.gross_column_widths, strict=True):
            padded.append(_put_str(title, width))
        self._lines.append(self.SPACING.join(padded))

    def add_separator_line(self) -> None:
        """Append the dashed line; each run of dashes is as long as its title."""
        padded = []
        for title_width, width in zip(
            self.header_column_widths, self.gross_column_widths, strict=True
        ):
            padded.append(_put_str("-" * title_width, width))
        self._lines.append(self.SPACING.join(padded))

    def add_body_lines(self) -> None:
        """Append one line per table row, cells padded to column width."""
        for strrow in self.strrows:
            padded = []
            for cell, width in zip(strrow, self.gross_column_widths, strict=True):
                padded.append(_put_str(cell, width))
            self._lines.append(self.SPACING.join(padded))

    def _gen_non_null_counts(self) -> Iterator[str]:
        """Iterator with string representation of non-null counts."""
        yield from (f"{count} non-null" for count in self.non_null_counts)

    def _gen_dtypes(self) -> Iterator[str]:
        """Iterator with string representation of column dtypes."""
        yield from (pprint_thing(dtype) for dtype in self.dtypes)
|
||||
|
||||
|
||||
class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin):
    """
    Builder of the verbose dataframe info table (one row per column).
    """

    def __init__(
        self,
        *,
        info: DataFrameInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # The rows are materialized first because the column widths are
        # derived from them.
        rows = self._gen_rows()
        self.strrows: Sequence[Sequence[str]] = list(rows)
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""
        steps = (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_columns_summary_line,
            self.add_header_line,
            self.add_separator_line,
            self.add_body_lines,
            self.add_dtypes_line,
        )
        for add_lines in steps:
            add_lines()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if not self.with_counts:
            return [" # ", "Column", "Dtype"]
        return [" # ", "Column", "Non-Null Count", "Dtype"]

    def add_columns_summary_line(self) -> None:
        """Append the line stating the total number of columns."""
        n_cols = self.col_count
        self._lines.append(f"Data columns (total {n_cols} columns):")

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""
        parts = (
            self._gen_line_numbers(),
            self._gen_columns(),
            self._gen_dtypes(),
        )
        yield from zip(*parts, strict=True)

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""
        parts = (
            self._gen_line_numbers(),
            self._gen_columns(),
            self._gen_non_null_counts(),
            self._gen_dtypes(),
        )
        yield from zip(*parts, strict=True)

    def _gen_line_numbers(self) -> Iterator[str]:
        """Iterator with string representation of column numbers."""
        position = 0
        for _ in self.ids:
            yield f" {position}"
            position += 1

    def _gen_columns(self) -> Iterator[str]:
        """Iterator with string representation of column names."""
        yield from (pprint_thing(label) for label in self.ids)
|
||||
|
||||
|
||||
class _SeriesTableBuilder(_TableBuilderAbstract):
    """
    Abstract builder for series info table.

    Concrete subclasses provide ``_fill_non_empty_info``, which decides the
    lines that end up in the table.

    Parameters
    ----------
    info : SeriesInfo
        Instance of SeriesInfo.
    """

    def __init__(self, *, info: SeriesInfo) -> None:
        self.info: SeriesInfo = info

    def get_lines(self) -> list[str]:
        """Build the table from scratch and return its lines."""
        # Start from a fresh list on every call so repeated calls do not
        # accumulate lines.
        self._lines = []
        self._fill_non_empty_info()
        lines = self._lines
        return lines

    @property
    def data(self) -> Series:
        """Series."""
        info = self.info
        return info.data

    def add_memory_usage_line(self) -> None:
        """Add line containing memory usage."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
|
||||
|
||||
|
||||
class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder):
    """
    Series info table builder for non-verbose output.

    Emits only the summary lines: object type, index range, dtypes and,
    when enabled, memory usage.
    """

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
        steps = (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_dtypes_line,
        )
        for add_line in steps:
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()
|
||||
|
||||
|
||||
class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin):
    """
    Series info table builder for verbose output.

    Besides the summary lines it renders a small table whose rows hold the
    dtype and, when ``with_counts`` is true, the non-null count.
    """

    def __init__(
        self,
        *,
        info: SeriesInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # The body rows are materialised once, before the column widths
        # are computed.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
        steps = (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_series_name_line,
            self.add_header_line,
            self.add_separator_line,
            self.add_body_lines,
            self.add_dtypes_line,
        )
        for add_line in steps:
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    def add_series_name_line(self) -> None:
        """Append the line showing the name of the series."""
        series_name = self.data.name
        self._lines.append(f"Series name: {series_name}")

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        names = ["Dtype"]
        if self.with_counts:
            names.insert(0, "Non-Null Count")
        return names

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Yield single-cell rows, each holding one dtype string."""
        for dtype in self._gen_dtypes():
            yield [dtype]

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Yield (non-null count, dtype) string rows."""
        counts = self._gen_non_null_counts()
        dtypes = self._gen_dtypes()
        yield from zip(counts, dtypes, strict=True)
|
||||
|
||||
|
||||
def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
|
||||
"""
|
||||
Create mapping between datatypes and their number of occurrences.
|
||||
"""
|
||||
# groupby dtype.name to collect e.g. Categorical columns
|
||||
return df.dtypes.value_counts().groupby(lambda x: x.name).sum()
|
||||
@@ -0,0 +1,587 @@
|
||||
"""
|
||||
Printing tools.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
import sys
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
TypeAlias,
|
||||
TypeVar,
|
||||
)
|
||||
from unicodedata import east_asian_width
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas.core.dtypes.inference import is_sequence
|
||||
|
||||
from pandas.io.formats.console import get_console_size
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import ListLike
|
||||
EscapeChars: TypeAlias = Mapping[str, str] | Iterable[str]
|
||||
_KT = TypeVar("_KT")
|
||||
_VT = TypeVar("_VT")
|
||||
|
||||
|
||||
def adjoin(space: int, *lists: list[str], **kwargs: Any) -> str:
    """
    Glue several columns of strings together side by side.

    Each positional list is one column. Columns are left-justified, padded
    to a common width and joined row by row; rows are separated by newlines.

    Parameters
    ----------
    space : int
        number of spaces for padding
    lists : str
        list of str which being joined
    strlen : callable
        function used to calculate the length of each str. Needed for unicode
        handling.
    justfunc : callable
        function used to justify str. Needed for unicode handling.

    Returns
    -------
    str
    """
    strlen = kwargs.pop("strlen", len)
    justfunc = kwargs.pop("justfunc", _adj_justify)

    # Every column except the last is widened by ``space``.
    widths = [max(map(strlen, column)) + space for column in lists[:-1]]
    # The last column gets no extra spacing, and its width is measured with
    # the builtin ``len`` rather than ``strlen``.
    widths.append(max(map(len, lists[-1])))
    n_rows = max(map(len, lists))

    padded_columns = []
    for width, column in zip(widths, lists):
        justified = justfunc(column, width, mode="left")
        # Shorter columns are topped up with blank cells placed before the
        # column's own values.
        filler = [" " * width] * (n_rows - len(column))
        padded_columns.append(filler + justified)

    rows = zip(*padded_columns, strict=True)
    return "\n".join("".join(cells) for cells in rows)
|
||||
|
||||
|
||||
def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]:
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == "left":
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
|
||||
# Unicode consolidation
|
||||
# ---------------------
|
||||
#
|
||||
# pprinting utility functions for generating Unicode text or
|
||||
# bytes(3.x)/str(2.x) representations of objects.
|
||||
# Try to use these as much as possible rather than rolling your own.
|
||||
#
|
||||
# When to use
|
||||
# -----------
|
||||
#
|
||||
# 1) If you're writing code internal to pandas (no I/O directly involved),
|
||||
# use pprint_thing().
|
||||
#
|
||||
# It will always return unicode text which can be handled by other
|
||||
# parts of the package without breakage.
|
||||
#
|
||||
# 2) if you need to write something out to file, use
|
||||
# pprint_thing_encoded(encoding).
|
||||
#
|
||||
# If no encoding is specified, it defaults to utf-8. Since encoding pure
|
||||
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
|
||||
# working with straight ascii.
|
||||
|
||||
|
||||
def _pprint_seq(
|
||||
seq: ListLike, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any
|
||||
) -> str:
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
|
||||
bounds length of printed sequence, depending on options
|
||||
"""
|
||||
if isinstance(seq, set):
|
||||
fmt = "{{{body}}}"
|
||||
elif isinstance(seq, frozenset):
|
||||
fmt = "frozenset({{{body}}})"
|
||||
else:
|
||||
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
|
||||
|
||||
if max_seq_items is False:
|
||||
max_items = None
|
||||
else:
|
||||
max_items = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
s = iter(seq)
|
||||
# handle sets, no slicing
|
||||
r = []
|
||||
max_items_reached = False
|
||||
for i, item in enumerate(s):
|
||||
if (max_items is not None) and (i >= max_items):
|
||||
max_items_reached = True
|
||||
break
|
||||
r.append(pprint_thing(item, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))
|
||||
body = ", ".join(r)
|
||||
|
||||
if max_items_reached:
|
||||
body += ", ..."
|
||||
elif isinstance(seq, tuple) and len(seq) == 1:
|
||||
body += ","
|
||||
|
||||
return fmt.format(body=body)
|
||||
|
||||
|
||||
def _pprint_dict(
|
||||
seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any
|
||||
) -> str:
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
"""
|
||||
fmt = "{{{things}}}"
|
||||
pairs = []
|
||||
|
||||
pfmt = "{key}: {val}"
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
for k, v in list(seq.items())[:nitems]:
|
||||
pairs.append(
|
||||
pfmt.format(
|
||||
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
)
|
||||
)
|
||||
|
||||
if nitems < len(seq):
|
||||
return fmt.format(things=", ".join(pairs) + ", ...")
|
||||
else:
|
||||
return fmt.format(things=", ".join(pairs))
|
||||
|
||||
|
||||
def pprint_thing(
|
||||
thing: object,
|
||||
_nest_lvl: int = 0,
|
||||
escape_chars: EscapeChars | None = None,
|
||||
default_escapes: bool = False,
|
||||
quote_strings: bool = False,
|
||||
max_seq_items: int | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
This function is the sanctioned way of converting objects
|
||||
to a string representation and properly handles nested sequences.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
thing : anything to be formatted
|
||||
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
|
||||
with pprint_sequence, this argument is used to keep track of the
|
||||
current nesting level, and limit it.
|
||||
escape_chars : list[str] or Mapping[str, str], optional
|
||||
Characters to escape. If a Mapping is passed the values are the
|
||||
replacements
|
||||
default_escapes : bool, default False
|
||||
Whether the input escape characters replaces or adds to the defaults
|
||||
max_seq_items : int or None, default None
|
||||
Pass through to other pretty printers to limit sequence printing
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
"""
|
||||
|
||||
def as_escaped_string(
|
||||
thing: Any, escape_chars: EscapeChars | None = escape_chars
|
||||
) -> str:
|
||||
translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"}
|
||||
if isinstance(escape_chars, Mapping):
|
||||
if default_escapes:
|
||||
translate.update(escape_chars)
|
||||
else:
|
||||
translate = escape_chars # type: ignore[assignment]
|
||||
escape_chars = list(escape_chars.keys())
|
||||
else:
|
||||
escape_chars = escape_chars or ()
|
||||
|
||||
result = str(thing)
|
||||
for c in escape_chars:
|
||||
result = result.replace(c, translate[c])
|
||||
return result
|
||||
|
||||
if hasattr(thing, "__next__"):
|
||||
return str(thing)
|
||||
elif isinstance(thing, Mapping) and _nest_lvl < get_option(
|
||||
"display.pprint_nest_depth"
|
||||
):
|
||||
result = _pprint_dict(
|
||||
thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
|
||||
)
|
||||
elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
|
||||
result = _pprint_seq(
|
||||
# error: Argument 1 to "_pprint_seq" has incompatible type "object";
|
||||
# expected "ExtensionArray | ndarray[Any, Any] | Index | Series |
|
||||
# SequenceNotStr[Any] | range"
|
||||
thing, # type: ignore[arg-type]
|
||||
_nest_lvl,
|
||||
escape_chars=escape_chars,
|
||||
quote_strings=quote_strings,
|
||||
max_seq_items=max_seq_items,
|
||||
)
|
||||
elif isinstance(thing, str) and quote_strings:
|
||||
result = f"'{as_escaped_string(thing)}'"
|
||||
else:
|
||||
result = as_escaped_string(thing)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def pprint_thing_encoded(
    object: object, encoding: str = "utf-8", errors: str = "replace"
) -> bytes:
    """
    Render ``object`` with ``pprint_thing`` and encode the text to bytes.

    ``encoding`` and ``errors`` are passed straight to ``str.encode``.
    """
    return pprint_thing(object).encode(encoding, errors)
|
||||
|
||||
|
||||
def enable_data_resource_formatter(enable: bool) -> None:
    """
    Switch the IPython formatter for ``application/vnd.dataresource+json``.

    Does nothing unless IPython has been imported and an interactive shell
    is active. When enabling, the formatter is registered first if the
    mimetype has none yet; when disabling, an existing formatter is only
    marked as disabled, not removed.
    """
    if "IPython" not in sys.modules:
        # definitely not in IPython
        return
    from IPython import get_ipython

    # error: Call to untyped function "get_ipython" in typed context
    ip = get_ipython()  # type: ignore[no-untyped-call]
    if ip is None:
        # still not in IPython
        return

    formatters = ip.display_formatter.formatters
    mimetype = "application/vnd.dataresource+json"

    if not enable:
        # Disable the tableschema mime-type if it is registered.
        if mimetype in formatters:
            formatters[mimetype].enabled = False
        return

    if mimetype not in formatters:
        # define tableschema formatter
        from IPython.core.formatters import BaseFormatter
        from traitlets import ObjectName

        class TableSchemaFormatter(BaseFormatter):
            print_method = ObjectName("_repr_data_resource_")
            _return_type = (dict,)

        # register it:
        formatters[mimetype] = TableSchemaFormatter()

    # enable it if it's been disabled:
    formatters[mimetype].enabled = True
|
||||
|
||||
|
||||
def default_pprint(thing: Any, max_seq_items: int | None = None) -> str:
|
||||
return pprint_thing(
|
||||
thing,
|
||||
escape_chars=("\t", "\r", "\n"),
|
||||
quote_strings=True,
|
||||
max_seq_items=max_seq_items,
|
||||
)
|
||||
|
||||
|
||||
def format_object_summary(
    obj: ListLike,
    formatter: Callable,
    is_justify: bool = True,
    name: str | None = None,
    indent_for_name: bool = True,
    line_break_each_value: bool = False,
) -> str:
    """
    Return the formatted obj as a unicode string

    The values are rendered between square brackets, wrapped onto several
    lines when they do not fit the display width, and abbreviated with
    ``...`` when ``obj`` has more items than the ``display.max_seq_items``
    option allows.

    Parameters
    ----------
    obj : object
        must be iterable and support __getitem__
    formatter : callable
        string formatter for an element
    is_justify : bool
        should justify the display
    name : name, optional
        defaults to the class name of the obj
    indent_for_name : bool, default True
        Whether subsequent lines should be indented to
        align with the name.
    line_break_each_value : bool, default False
        If True, inserts a line break for each value of ``obj``.
        If False, only break lines when a line of values gets wider
        than the display width.

    Returns
    -------
    summary string
    """
    display_width, _ = get_console_size()
    if display_width is None:
        # No console width available: use the display.width option, or 80.
        display_width = get_option("display.width") or 80
    if name is None:
        name = type(obj).__name__

    # space1 is appended after the closing bracket when the summary spans
    # several lines; space2 is the prefix of every continuation line.
    if indent_for_name:
        name_len = len(name)
        space1 = f"\n{(' ' * (name_len + 1))}"
        space2 = f"\n{(' ' * (name_len + 2))}"
    else:
        space1 = "\n"
        space2 = "\n "  # space for the opening '['

    n = len(obj)
    if line_break_each_value:
        # If we want to vertically align on each value of obj, we need to
        # separate values by a line break and indent the values
        sep = ",\n " + " " * len(name)
    else:
        sep = ","
    max_seq_items = get_option("display.max_seq_items") or n

    # are we a truncated display
    is_truncated = n > max_seq_items

    # adj can optionally handle unicode eastern asian width
    adj = get_adjustment()

    def _extend_line(
        s: str, line: str, value: str, display_width: int, next_line_prefix: str
    ) -> tuple[str, str]:
        # Append ``value`` to the current ``line``; when that would reach
        # ``display_width``, flush the line into ``s`` first and restart it
        # from ``next_line_prefix``.
        if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
            s += line.rstrip()
            line = next_line_prefix
        line += value
        return s, line

    def best_len(values: list[str]) -> int:
        # Widest display length among ``values`` (0 for an empty list).
        if values:
            return max(adj.len(x) for x in values)
        else:
            return 0

    close = ", "

    if n == 0:
        summary = f"[]{close}"
    elif n == 1 and not line_break_each_value:
        first = formatter(obj[0])
        summary = f"[{first}]{close}"
    elif n == 2 and not line_break_each_value:
        first = formatter(obj[0])
        last = formatter(obj[-1])
        summary = f"[{first}, {last}]{close}"
    else:
        if max_seq_items == 1:
            # If max_seq_items=1 show only last element
            head = []
            tail = [formatter(x) for x in obj[-1:]]
        elif n > max_seq_items:
            # NOTE: ``n`` is rebound here; from now on it is the number of
            # items shown at each end, no longer ``len(obj)``.
            n = min(max_seq_items // 2, 10)
            head = [formatter(x) for x in obj[:n]]
            tail = [formatter(x) for x in obj[-n:]]
        else:
            head = []
            tail = [formatter(x) for x in obj]

        # adjust all values to max length if needed
        if is_justify:
            if line_break_each_value:
                # Justify each string in the values of head and tail, so the
                # strings will right align when head and tail are stacked
                # vertically.
                head, tail = _justify(head, tail)
            elif is_truncated or not (
                len(", ".join(head)) < display_width
                and len(", ".join(tail)) < display_width
            ):
                # Each string in head and tail should align with each other
                max_length = max(best_len(head), best_len(tail))
                head = [x.rjust(max_length) for x in head]
                tail = [x.rjust(max_length) for x in tail]
            # If we are not truncated and we are only a single
            # line, then don't justify

        if line_break_each_value:
            # Now head and tail are of type List[Tuple[str]]. Below we
            # convert them into List[str], so there will be one string per
            # value. Also truncate items horizontally if wider than
            # max_space
            max_space = display_width - len(space2)
            value = tail[0]
            max_items = 1
            # Find the largest item count whose rendering of the first tail
            # value fits in max_space; 1 is used when none fits.
            for num_items in reversed(range(1, len(value) + 1)):
                pprinted_seq = _pprint_seq(value, max_seq_items=num_items)
                if len(pprinted_seq) < max_space:
                    max_items = num_items
                    break
            head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
            tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]

        summary = ""
        line = space2

        for head_value in head:
            word = head_value + sep + " "
            summary, line = _extend_line(summary, line, word, display_width, space2)

        if is_truncated:
            # remove trailing space of last line
            summary += line.rstrip() + space2 + "..."
            line = space2

        for tail_item in tail[:-1]:
            word = tail_item + sep + " "
            summary, line = _extend_line(summary, line, word, display_width, space2)

        # last value: no sep added + 1 space of width used for trailing ','
        summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
        summary += line

        # right now close is either '' or ', '
        # Now we want to include the ']', but not the maybe space.
        close = "]" + close.rstrip(" ")
        summary += close

        if len(summary) > (display_width) or line_break_each_value:
            summary += space1
        else:  # one row
            summary += " "

        # remove initial space
        summary = "[" + summary[len(space2) :]

    return summary
|
||||
|
||||
|
||||
def _justify(
|
||||
head: list[Sequence[str]], tail: list[Sequence[str]]
|
||||
) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]:
|
||||
"""
|
||||
Justify items in head and tail, so they are right-aligned when stacked.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
head : list-like of list-likes of strings
|
||||
tail : list-like of list-likes of strings
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple of list of tuples of strings
|
||||
Same as head and tail, but items are right aligned when stacked
|
||||
vertically.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _justify([["a", "b"]], [["abc", "abcd"]])
|
||||
([(' a', ' b')], [('abc', 'abcd')])
|
||||
"""
|
||||
combined = head + tail
|
||||
|
||||
# For each position for the sequences in ``combined``,
|
||||
# find the length of the largest string.
|
||||
max_length = [0] * len(combined[0])
|
||||
for inner_seq in combined:
|
||||
length = [len(item) for item in inner_seq]
|
||||
max_length = [max(x, y) for x, y in zip(max_length, length, strict=True)]
|
||||
|
||||
# justify each item in each list-like in head and tail using max_length
|
||||
head_tuples = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length, strict=True))
|
||||
for seq in head
|
||||
]
|
||||
tail_tuples = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length, strict=True))
|
||||
for seq in tail
|
||||
]
|
||||
return head_tuples, tail_tuples
|
||||
|
||||
|
||||
class PrettyDict(dict[_KT, _VT]):
    """Dict subclass whose ``repr`` is produced by ``pprint_thing``."""

    def __repr__(self) -> str:
        rendered = pprint_thing(self)
        return rendered
|
||||
|
||||
|
||||
class _TextAdjustment:
|
||||
def __init__(self) -> None:
|
||||
self.encoding = get_option("display.encoding")
|
||||
|
||||
def len(self, text: str) -> int:
|
||||
return len(text)
|
||||
|
||||
def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == "left":
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
def adjoin(self, space: int, *lists: Any, **kwargs: Any) -> str:
|
||||
return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs)
|
||||
|
||||
|
||||
class _EastAsianTextAdjustment(_TextAdjustment):
    """Text adjustment that measures strings by their East Asian display width."""

    def __init__(self) -> None:
        super().__init__()
        treat_ambiguous_as_wide = get_option("display.unicode.ambiguous_as_wide")
        self.ambiguous_width = 2 if treat_ambiguous_as_wide else 1

        # Definition of East Asian Width
        # https://unicode.org/reports/tr11/
        # Ambiguous width can be changed by option
        self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}

    def len(self, text: str) -> int:
        """
        Calculate display width considering unicode East Asian Width

        Non-string input falls back to the builtin ``len``.
        """
        if not isinstance(text, str):
            return len(text)

        total = 0
        for char in text:
            # Categories missing from the map use the ambiguous width.
            total += self._EAW_MAP.get(east_asian_width(char), self.ambiguous_width)
        return total

    def justify(
        self, texts: Iterable[str], max_len: int, mode: str = "right"
    ) -> list[str]:
        """
        Pad each text so that its display width reaches ``max_len``.

        ``mode`` is ``"left"`` or ``"center"``; any other value right-justifies.
        """
        method_name = {"left": "ljust", "center": "center"}.get(mode, "rjust")
        # re-calculate padding space per str considering East Asian Width:
        # the str padding methods count characters, so the target is shifted
        # by the gap between character count and display width.
        return [
            getattr(text, method_name)(max_len - self.len(text) + len(text))
            for text in texts
        ]
|
||||
|
||||
|
||||
def get_adjustment() -> _TextAdjustment:
    """
    Return the text adjustment helper selected by the
    ``display.unicode.east_asian_width`` option.
    """
    if get_option("display.unicode.east_asian_width"):
        return _EastAsianTextAdjustment()
    return _TextAdjustment()
|
||||
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Module for formatting output data in console (to string).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from shutil import get_terminal_size
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pandas.io.formats.format import DataFrameFormatter
|
||||
|
||||
|
||||
class StringFormatter:
|
||||
"""Formatter for string representation of a dataframe."""
|
||||
|
||||
def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None:
|
||||
self.fmt = fmt
|
||||
self.adj = fmt.adj
|
||||
self.frame = fmt.frame
|
||||
self.line_width = line_width
|
||||
|
||||
def to_string(self) -> str:
|
||||
text = self._get_string_representation()
|
||||
if self.fmt.should_show_dimensions:
|
||||
text = f"{text}{self.fmt.dimensions_info}"
|
||||
return text
|
||||
|
||||
def _get_strcols(self) -> list[list[str]]:
|
||||
strcols = self.fmt.get_strcols()
|
||||
if self.fmt.is_truncated:
|
||||
strcols = self._insert_dot_separators(strcols)
|
||||
return strcols
|
||||
|
||||
def _get_string_representation(self) -> str:
|
||||
if self.fmt.frame.empty:
|
||||
return self._empty_info_line
|
||||
|
||||
strcols = self._get_strcols()
|
||||
|
||||
if self.line_width is None:
|
||||
# no need to wrap around just print the whole frame
|
||||
return self.adj.adjoin(1, *strcols)
|
||||
|
||||
if self._need_to_wrap_around:
|
||||
return self._join_multiline(strcols)
|
||||
|
||||
return self._fit_strcols_to_terminal_width(strcols)
|
||||
|
||||
@property
|
||||
def _empty_info_line(self) -> str:
|
||||
return (
|
||||
f"Empty {type(self.frame).__name__}\n"
|
||||
f"Columns: {pprint_thing(self.frame.columns)}\n"
|
||||
f"Index: {pprint_thing(self.frame.index)}"
|
||||
)
|
||||
|
||||
@property
|
||||
def _need_to_wrap_around(self) -> bool:
|
||||
return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0)
|
||||
|
||||
def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]:
|
||||
str_index = self.fmt._get_formatted_index(self.fmt.tr_frame)
|
||||
index_length = len(str_index)
|
||||
|
||||
if self.fmt.is_truncated_horizontally:
|
||||
strcols = self._insert_dot_separator_horizontal(strcols, index_length)
|
||||
|
||||
if self.fmt.is_truncated_vertically:
|
||||
strcols = self._insert_dot_separator_vertical(strcols, index_length)
|
||||
|
||||
return strcols
|
||||
|
||||
@property
|
||||
def _adjusted_tr_col_num(self) -> int:
|
||||
return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num
|
||||
|
||||
def _insert_dot_separator_horizontal(
|
||||
self, strcols: list[list[str]], index_length: int
|
||||
) -> list[list[str]]:
|
||||
strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length)
|
||||
return strcols
|
||||
|
||||
def _insert_dot_separator_vertical(
|
||||
self, strcols: list[list[str]], index_length: int
|
||||
) -> list[list[str]]:
|
||||
n_header_rows = index_length - len(self.fmt.tr_frame)
|
||||
row_num = self.fmt.tr_row_num
|
||||
for ix, col in enumerate(strcols):
|
||||
cwidth = self.adj.len(col[row_num])
|
||||
|
||||
if self.fmt.is_truncated_horizontally:
|
||||
is_dot_col = ix == self._adjusted_tr_col_num
|
||||
else:
|
||||
is_dot_col = False
|
||||
|
||||
if cwidth > 3 or is_dot_col:
|
||||
dots = "..."
|
||||
else:
|
||||
dots = ".."
|
||||
|
||||
if ix == 0 and self.fmt.index:
|
||||
dot_mode = "left"
|
||||
elif is_dot_col:
|
||||
cwidth = 4
|
||||
dot_mode = "right"
|
||||
else:
|
||||
dot_mode = "right"
|
||||
|
||||
dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0]
|
||||
col.insert(row_num + n_header_rows, dot_str)
|
||||
return strcols
|
||||
|
||||
def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
|
||||
lwidth = self.line_width
|
||||
adjoin_width = 1
|
||||
strcols = list(strcols_input)
|
||||
|
||||
if self.fmt.index:
|
||||
idx = strcols.pop(0)
|
||||
lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
|
||||
|
||||
col_widths = [
|
||||
np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
|
||||
for col in strcols
|
||||
]
|
||||
|
||||
assert lwidth is not None
|
||||
col_bins = _binify(col_widths, lwidth)
|
||||
nbins = len(col_bins)
|
||||
|
||||
str_lst = []
|
||||
start = 0
|
||||
for i, end in enumerate(col_bins):
|
||||
row = strcols[start:end]
|
||||
if self.fmt.index:
|
||||
row.insert(0, idx)
|
||||
if nbins > 1:
|
||||
nrows = len(row[-1])
|
||||
if end <= len(strcols) and i < nbins - 1:
|
||||
row.append([" \\"] + [" "] * (nrows - 1))
|
||||
else:
|
||||
row.append([" "] * nrows)
|
||||
str_lst.append(self.adj.adjoin(adjoin_width, *row))
|
||||
start = end
|
||||
return "\n\n".join(str_lst)
|
||||
|
||||
def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
|
||||
from pandas import Series
|
||||
|
||||
lines = self.adj.adjoin(1, *strcols).split("\n")
|
||||
max_len = Series(lines).str.len().max()
|
||||
# plus truncate dot col
|
||||
width, _ = get_terminal_size()
|
||||
dif = max_len - width
|
||||
# '+ 1' to avoid too wide repr (GH PR #17023)
|
||||
adj_dif = dif + 1
|
||||
col_lens = Series([Series(ele).str.len().max() for ele in strcols])
|
||||
n_cols = len(col_lens)
|
||||
counter = 0
|
||||
while adj_dif > 0 and n_cols > 1:
|
||||
counter += 1
|
||||
mid = round(n_cols / 2)
|
||||
mid_ix = col_lens.index[mid]
|
||||
col_len = col_lens[mid_ix]
|
||||
# adjoin adds one
|
||||
adj_dif -= col_len + 1
|
||||
col_lens = col_lens.drop(mid_ix)
|
||||
n_cols = len(col_lens)
|
||||
|
||||
# subtract index column
|
||||
max_cols_fitted = n_cols - self.fmt.index
|
||||
# GH-21180. Ensure that we print at least two.
|
||||
max_cols_fitted = max(max_cols_fitted, 2)
|
||||
self.fmt.max_cols_fitted = max_cols_fitted
|
||||
|
||||
# Call again _truncate to cut frame appropriately
|
||||
# and then generate string representation
|
||||
self.fmt.truncate()
|
||||
strcols = self._get_strcols()
|
||||
return self.adj.adjoin(1, *strcols)
|
||||
|
||||
|
||||
def _binify(cols: list[int], line_width: int) -> list[int]:
|
||||
adjoin_width = 1
|
||||
bins = []
|
||||
curr_width = 0
|
||||
i_last_column = len(cols) - 1
|
||||
for i, w in enumerate(cols):
|
||||
w_adjoined = w + adjoin_width
|
||||
curr_width += w_adjoined
|
||||
if i_last_column == i:
|
||||
wrap = curr_width + 1 > line_width and i > 0
|
||||
else:
|
||||
wrap = curr_width + 2 > line_width and i > 0
|
||||
if wrap:
|
||||
bins.append(i)
|
||||
curr_width = w_adjoined
|
||||
|
||||
bins.append(len(cols))
|
||||
return bins
|
||||
File diff suppressed because it is too large
Load Diff
+2681
File diff suppressed because it is too large
Load Diff
+16
@@ -0,0 +1,16 @@
|
||||
{# Update the html_style/table_structure.html documentation too #}
|
||||
{% if doctype_html %}
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="{{encoding}}">
|
||||
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
|
||||
</head>
|
||||
<body>
|
||||
{% include html_table_tpl %}
|
||||
</body>
|
||||
</html>
|
||||
{% elif not doctype_html %}
|
||||
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
|
||||
{% include html_table_tpl %}
|
||||
{% endif %}
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
{%- block before_style -%}{%- endblock before_style -%}
|
||||
{% block style %}
|
||||
<style type="text/css">
|
||||
{% block table_styles %}
|
||||
{% for s in table_styles %}
|
||||
#T_{{uuid}} {{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{% endfor %}
|
||||
{% endblock table_styles %}
|
||||
{% block before_cellstyle %}{% endblock before_cellstyle %}
|
||||
{% block cellstyle %}
|
||||
{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %}
|
||||
{% for s in cs %}
|
||||
{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}_{{selector}}{% endfor %} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% endblock cellstyle %}
|
||||
</style>
|
||||
{% endblock style %}
|
||||
+63
@@ -0,0 +1,63 @@
|
||||
{% block before_table %}{% endblock before_table %}
|
||||
{% block table %}
|
||||
{% if exclude_styles %}
|
||||
<table>
|
||||
{% else %}
|
||||
<table id="T_{{uuid}}"{% if table_attributes %} {{table_attributes}}{% endif %}>
|
||||
{% endif %}
|
||||
{% block caption %}
|
||||
{% if caption and caption is string %}
|
||||
<caption>{{caption}}</caption>
|
||||
{% elif caption and caption is sequence %}
|
||||
<caption>{{caption[0]}}</caption>
|
||||
{% endif %}
|
||||
{% endblock caption %}
|
||||
{% block thead %}
|
||||
<thead>
|
||||
{% block before_head_rows %}{% endblock %}
|
||||
{% for r in head %}
|
||||
{% block head_tr scoped %}
|
||||
<tr>
|
||||
{% if exclude_styles %}
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endblock head_tr %}
|
||||
{% endfor %}
|
||||
{% block after_head_rows %}{% endblock %}
|
||||
</thead>
|
||||
{% endblock thead %}
|
||||
{% block tbody %}
|
||||
<tbody>
|
||||
{% block before_rows %}{% endblock before_rows %}
|
||||
{% for r in body %}
|
||||
{% block tr scoped %}
|
||||
<tr>
|
||||
{% if exclude_styles %}
|
||||
{% for c in r %}{% if c.is_visible != False %}
|
||||
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}{% endfor %}
|
||||
{% else %}
|
||||
{% for c in r %}{% if c.is_visible != False %}
|
||||
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}{% endfor %}
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endblock tr %}
|
||||
{% endfor %}
|
||||
{% block after_rows %}{% endblock after_rows %}
|
||||
</tbody>
|
||||
{% endblock tbody %}
|
||||
</table>
|
||||
{% endblock table %}
|
||||
{% block after_table %}{% endblock after_table %}
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
{% if environment == "longtable" %}
|
||||
{% include "latex_longtable.tpl" %}
|
||||
{% else %}
|
||||
{% include "latex_table.tpl" %}
|
||||
{% endif %}
|
||||
+82
@@ -0,0 +1,82 @@
|
||||
\begin{longtable}
|
||||
{%- set position = parse_table(table_styles, 'position') %}
|
||||
{%- if position is not none %}
|
||||
[{{position}}]
|
||||
{%- endif %}
|
||||
{%- set column_format = parse_table(table_styles, 'column_format') %}
|
||||
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
|
||||
|
||||
{% for style in table_styles %}
|
||||
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format', 'label'] %}
|
||||
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% if caption and caption is string %}
|
||||
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}}
|
||||
{%- endif %} \\
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}}
|
||||
{%- endif %} \\
|
||||
{% else %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}} \\
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% set toprule = parse_table(table_styles, 'toprule') %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% set midrule = parse_table(table_styles, 'midrule') %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endfirsthead
|
||||
{% if caption and caption is string %}
|
||||
\caption[]{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %} \\
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %} \\
|
||||
{% endif %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endhead
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\multicolumn{% raw %}{{% endraw %}{{body[0]|length}}{% raw %}}{% endraw %}{r}{Continued on next page} \\
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endfoot
|
||||
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
|
||||
{% if bottomrule is not none %}
|
||||
\{{bottomrule}}
|
||||
{% endif %}
|
||||
\endlastfoot
|
||||
{% for row in body %}
|
||||
{% for c in row %}{% if not loop.first %} & {% endif %}
|
||||
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
|
||||
{%- endfor %} \\
|
||||
{% if clines and clines[loop.index] | length > 0 %}
|
||||
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
\end{longtable}
|
||||
{% raw %}{% endraw %}
|
||||
+57
@@ -0,0 +1,57 @@
|
||||
{% if environment or parse_wrap(table_styles, caption) %}
|
||||
\begin{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
|
||||
{%- set position = parse_table(table_styles, 'position') %}
|
||||
{%- if position is not none %}
|
||||
[{{position}}]
|
||||
{%- endif %}
|
||||
|
||||
{% set position_float = parse_table(table_styles, 'position_float') %}
|
||||
{% if position_float is not none%}
|
||||
\{{position_float}}
|
||||
{% endif %}
|
||||
{% if caption and caption is string %}
|
||||
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
|
||||
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
|
||||
|
||||
{% endif %}
|
||||
{% for style in table_styles %}
|
||||
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format'] %}
|
||||
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
\begin{tabular}
|
||||
{%- set column_format = parse_table(table_styles, 'column_format') %}
|
||||
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
|
||||
|
||||
{% set toprule = parse_table(table_styles, 'toprule') %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx, convert_css)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% set midrule = parse_table(table_styles, 'midrule') %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
{% for row in body %}
|
||||
{% for c in row %}{% if not loop.first %} & {% endif %}
|
||||
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align, False, convert_css)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
|
||||
{%- endfor %} \\
|
||||
{% if clines and clines[loop.index] | length > 0 %}
|
||||
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
|
||||
{% if bottomrule is not none %}
|
||||
\{{bottomrule}}
|
||||
{% endif %}
|
||||
\end{tabular}
|
||||
{% if environment or parse_wrap(table_styles, caption) %}
|
||||
\end{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
|
||||
|
||||
{% endif %}
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
{% for r in head %}
|
||||
{% for c in r %}{% if c["is_visible"] %}
|
||||
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
|
||||
{% endif %}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
{% for r in body %}
|
||||
{% for c in r %}{% if c["is_visible"] %}
|
||||
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
|
||||
{% endif %}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
#table(
|
||||
columns: {{ head[0] | length }},
|
||||
{% for r in head %}
|
||||
{% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
{% for r in body %}
|
||||
{% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
)
|
||||
@@ -0,0 +1,566 @@
|
||||
"""
|
||||
:mod:`pandas.io.formats.xml` is a module for formatting data in XML.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
import io
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
final,
|
||||
)
|
||||
|
||||
from pandas.errors import AbstractMethodError
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.xml import get_data_from_filepath
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class _BaseXMLFormatter:
|
||||
"""
|
||||
Subclass for formatting data in XML.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buffer : str or file-like
|
||||
This can be either a string of raw XML, a valid URL,
|
||||
file or file-like object.
|
||||
|
||||
index : bool
|
||||
Whether to include index in xml document.
|
||||
|
||||
row_name : str
|
||||
Name for root of xml document. Default is 'data'.
|
||||
|
||||
root_name : str
|
||||
Name for row elements of xml document. Default is 'row'.
|
||||
|
||||
na_rep : str
|
||||
Missing data representation.
|
||||
|
||||
attrs_cols : list
|
||||
List of columns to write as attributes in row element.
|
||||
|
||||
elem_cols : list
|
||||
List of columns to write as children in row element.
|
||||
|
||||
namespaces : dict
|
||||
The namespaces to define in XML document as dicts with key
|
||||
being namespace and value the URI.
|
||||
|
||||
prefix : str
|
||||
The prefix for each element in XML document including root.
|
||||
|
||||
encoding : str
|
||||
Encoding of xml object or document.
|
||||
|
||||
xml_declaration : bool
|
||||
Whether to include xml declaration at top line item in xml.
|
||||
|
||||
pretty_print : bool
|
||||
Whether to write xml document with line breaks and indentation.
|
||||
|
||||
stylesheet : str or file-like
|
||||
A URL, file, file-like object, or a raw string containing XSLT.
|
||||
|
||||
compression : str or dict, default 'infer'
|
||||
For on-the-fly compression of the output data. If 'infer' and 'path_or_buffer'
|
||||
is path-like, then detect compression from the following extensions: '.gz',
|
||||
'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
|
||||
(otherwise no compression).
|
||||
Set to ``None`` for no compression.
|
||||
Can also be a dict with key ``'method'`` set
|
||||
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
|
||||
and other key-value pairs are forwarded to
|
||||
``zipfile.ZipFile``, ``gzip.GzipFile``,
|
||||
``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
|
||||
``tarfile.TarFile``, respectively.
|
||||
As an example, the following could be passed for faster compression and to
|
||||
create a reproducible gzip archive:
|
||||
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
|
||||
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g.
|
||||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
|
||||
are forwarded to ``urllib.request.Request`` as header options. For other
|
||||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
|
||||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
|
||||
details, and for more examples on storage options refer `here
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html?
|
||||
highlight=storage_options#reading-writing-remote-files>`_.
|
||||
|
||||
See also
|
||||
--------
|
||||
pandas.io.formats.xml.EtreeXMLFormatter
|
||||
pandas.io.formats.xml.LxmlXMLFormatter
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
frame: DataFrame,
|
||||
path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
|
||||
index: bool = True,
|
||||
root_name: str | None = "data",
|
||||
row_name: str | None = "row",
|
||||
na_rep: str | None = None,
|
||||
attr_cols: list[str] | None = None,
|
||||
elem_cols: list[str] | None = None,
|
||||
namespaces: dict[str | None, str] | None = None,
|
||||
prefix: str | None = None,
|
||||
encoding: str = "utf-8",
|
||||
xml_declaration: bool | None = True,
|
||||
pretty_print: bool | None = True,
|
||||
stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
|
||||
compression: CompressionOptions = "infer",
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
self.frame = frame
|
||||
self.path_or_buffer = path_or_buffer
|
||||
self.index = index
|
||||
self.root_name = root_name
|
||||
self.row_name = row_name
|
||||
self.na_rep = na_rep
|
||||
self.attr_cols = attr_cols
|
||||
self.elem_cols = elem_cols
|
||||
self.namespaces = namespaces
|
||||
self.prefix = prefix
|
||||
self.encoding = encoding
|
||||
self.xml_declaration = xml_declaration
|
||||
self.pretty_print = pretty_print
|
||||
self.stylesheet = stylesheet
|
||||
self.compression: CompressionOptions = compression
|
||||
self.storage_options = storage_options
|
||||
|
||||
self.orig_cols = self.frame.columns.tolist()
|
||||
self.frame_dicts = self._process_dataframe()
|
||||
|
||||
self._validate_columns()
|
||||
self._validate_encoding()
|
||||
self.prefix_uri = self._get_prefix_uri()
|
||||
self._handle_indexes()
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
"""
|
||||
Build tree from data.
|
||||
|
||||
This method initializes the root and builds attributes and elements
|
||||
with optional namespaces.
|
||||
"""
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _validate_columns(self) -> None:
|
||||
"""
|
||||
Validate elems_cols and attrs_cols.
|
||||
|
||||
This method will check if columns is list-like.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* If value is not a list and less then length of nodes.
|
||||
"""
|
||||
if self.attr_cols and not is_list_like(self.attr_cols):
|
||||
raise TypeError(
|
||||
f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
|
||||
)
|
||||
|
||||
if self.elem_cols and not is_list_like(self.elem_cols):
|
||||
raise TypeError(
|
||||
f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
|
||||
)
|
||||
|
||||
@final
|
||||
def _validate_encoding(self) -> None:
|
||||
"""
|
||||
Validate encoding.
|
||||
|
||||
This method will check if encoding is among listed under codecs.
|
||||
|
||||
Raises
|
||||
------
|
||||
LookupError
|
||||
* If encoding is not available in codecs.
|
||||
"""
|
||||
|
||||
codecs.lookup(self.encoding)
|
||||
|
||||
@final
|
||||
def _process_dataframe(self) -> dict[int | str, dict[str, Any]]:
|
||||
"""
|
||||
Adjust Data Frame to fit xml output.
|
||||
|
||||
This method will adjust underlying data frame for xml output,
|
||||
including optionally replacing missing values and including indexes.
|
||||
"""
|
||||
|
||||
df = self.frame
|
||||
|
||||
if self.index:
|
||||
df = df.reset_index()
|
||||
|
||||
if self.na_rep is not None:
|
||||
df = df.fillna(self.na_rep)
|
||||
|
||||
return df.to_dict(orient="index")
|
||||
|
||||
@final
|
||||
def _handle_indexes(self) -> None:
|
||||
"""
|
||||
Handle indexes.
|
||||
|
||||
This method will add indexes into attr_cols or elem_cols.
|
||||
"""
|
||||
|
||||
if not self.index:
|
||||
return
|
||||
|
||||
first_key = next(iter(self.frame_dicts))
|
||||
indexes: list[str] = [
|
||||
x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
|
||||
]
|
||||
|
||||
if self.attr_cols:
|
||||
self.attr_cols = indexes + self.attr_cols
|
||||
|
||||
if self.elem_cols:
|
||||
self.elem_cols = indexes + self.elem_cols
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
"""
|
||||
Get uri of namespace prefix.
|
||||
|
||||
This method retrieves corresponding URI to prefix in namespaces.
|
||||
|
||||
Raises
|
||||
------
|
||||
KeyError
|
||||
*If prefix is not included in namespace dict.
|
||||
"""
|
||||
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _other_namespaces(self) -> dict:
|
||||
"""
|
||||
Define other namespaces.
|
||||
|
||||
This method will build dictionary of namespaces attributes
|
||||
for root element, conditionally with optional namespaces and
|
||||
prefix.
|
||||
"""
|
||||
|
||||
nmsp_dict: dict[str, str] = {}
|
||||
if self.namespaces:
|
||||
nmsp_dict = {
|
||||
f"xmlns{p if p == '' else f':{p}'}": n
|
||||
for p, n in self.namespaces.items()
|
||||
if n != self.prefix_uri[1:-1]
|
||||
}
|
||||
|
||||
return nmsp_dict
|
||||
|
||||
@final
|
||||
def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
|
||||
"""
|
||||
Create attributes of row.
|
||||
|
||||
This method adds attributes using attr_cols to row element and
|
||||
works with tuples for multindex or hierarchical columns.
|
||||
"""
|
||||
|
||||
if not self.attr_cols:
|
||||
return elem_row
|
||||
|
||||
for col in self.attr_cols:
|
||||
attr_name = self._get_flat_col_name(col)
|
||||
try:
|
||||
if not isna(d[col]):
|
||||
elem_row.attrib[attr_name] = str(d[col])
|
||||
except KeyError as err:
|
||||
raise KeyError(f"no valid column, {col}") from err
|
||||
return elem_row
|
||||
|
||||
@final
|
||||
def _get_flat_col_name(self, col: str | tuple) -> str:
|
||||
flat_col = col
|
||||
if isinstance(col, tuple):
|
||||
flat_col = (
|
||||
"".join([str(c) for c in col]).strip()
|
||||
if "" in col
|
||||
else "_".join([str(c) for c in col]).strip()
|
||||
)
|
||||
return f"{self.prefix_uri}{flat_col}"
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
|
||||
"""
|
||||
Create child elements of row.
|
||||
|
||||
This method adds child elements using elem_cols to row element and
|
||||
works with tuples for multindex or hierarchical columns.
|
||||
"""
|
||||
sub_element_cls = self._sub_element_cls
|
||||
|
||||
if not self.elem_cols:
|
||||
return
|
||||
|
||||
for col in self.elem_cols:
|
||||
elem_name = self._get_flat_col_name(col)
|
||||
try:
|
||||
val = None if isna(d[col]) or d[col] == "" else str(d[col])
|
||||
sub_element_cls(elem_row, elem_name).text = val
|
||||
except KeyError as err:
|
||||
raise KeyError(f"no valid column, {col}") from err
|
||||
|
||||
@final
|
||||
def write_output(self) -> str | None:
|
||||
xml_doc = self._build_tree()
|
||||
|
||||
if self.path_or_buffer is not None:
|
||||
with get_handle(
|
||||
self.path_or_buffer,
|
||||
"wb",
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
is_text=False,
|
||||
) as handles:
|
||||
handles.handle.write(xml_doc)
|
||||
return None
|
||||
|
||||
else:
|
||||
return xml_doc.decode(self.encoding).rstrip()
|
||||
|
||||
|
||||
class EtreeXMLFormatter(_BaseXMLFormatter):
|
||||
"""
|
||||
Class for formatting data in xml using Python standard library
|
||||
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
|
||||
"""
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
from xml.etree.ElementTree import (
|
||||
Element,
|
||||
SubElement,
|
||||
tostring,
|
||||
)
|
||||
|
||||
self.root = Element(
|
||||
f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces()
|
||||
)
|
||||
|
||||
for d in self.frame_dicts.values():
|
||||
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
|
||||
|
||||
if not self.attr_cols and not self.elem_cols:
|
||||
self.elem_cols = list(d.keys())
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
else:
|
||||
elem_row = self._build_attribs(d, elem_row)
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
self.out_xml = tostring(
|
||||
self.root,
|
||||
method="xml",
|
||||
encoding=self.encoding,
|
||||
xml_declaration=self.xml_declaration,
|
||||
)
|
||||
|
||||
if self.pretty_print:
|
||||
self.out_xml = self._prettify_tree()
|
||||
|
||||
if self.stylesheet is not None:
|
||||
raise ValueError(
|
||||
"To use stylesheet, you need lxml installed and selected as parser."
|
||||
)
|
||||
|
||||
return self.out_xml
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
from xml.etree.ElementTree import register_namespace
|
||||
|
||||
uri = ""
|
||||
if self.namespaces:
|
||||
for p, n in self.namespaces.items():
|
||||
if isinstance(p, str) and isinstance(n, str):
|
||||
register_namespace(p, n)
|
||||
if self.prefix:
|
||||
try:
|
||||
uri = f"{{{self.namespaces[self.prefix]}}}"
|
||||
except KeyError as err:
|
||||
raise KeyError(
|
||||
f"{self.prefix} is not included in namespaces"
|
||||
) from err
|
||||
elif "" in self.namespaces:
|
||||
uri = f"{{{self.namespaces['']}}}"
|
||||
else:
|
||||
uri = ""
|
||||
|
||||
return uri
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
from xml.etree.ElementTree import SubElement
|
||||
|
||||
return SubElement
|
||||
|
||||
def _prettify_tree(self) -> bytes:
|
||||
"""
|
||||
Output tree for pretty print format.
|
||||
|
||||
This method will pretty print xml with line breaks and indentation.
|
||||
"""
|
||||
|
||||
from xml.dom.minidom import parseString
|
||||
|
||||
dom = parseString(self.out_xml)
|
||||
|
||||
return dom.toprettyxml(indent=" ", encoding=self.encoding)
|
||||
|
||||
|
||||
class LxmlXMLFormatter(_BaseXMLFormatter):
|
||||
"""
|
||||
Class for formatting data in xml using Python standard library
|
||||
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self._convert_empty_str_key()
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
"""
|
||||
Build tree from data.
|
||||
|
||||
This method initializes the root and builds attributes and elements
|
||||
with optional namespaces.
|
||||
"""
|
||||
from lxml.etree import (
|
||||
Element,
|
||||
SubElement,
|
||||
tostring,
|
||||
)
|
||||
|
||||
self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
|
||||
|
||||
for d in self.frame_dicts.values():
|
||||
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
|
||||
|
||||
if not self.attr_cols and not self.elem_cols:
|
||||
self.elem_cols = list(d.keys())
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
else:
|
||||
elem_row = self._build_attribs(d, elem_row)
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
self.out_xml = tostring(
|
||||
self.root,
|
||||
pretty_print=self.pretty_print,
|
||||
method="xml",
|
||||
encoding=self.encoding,
|
||||
xml_declaration=self.xml_declaration,
|
||||
)
|
||||
|
||||
if self.stylesheet is not None:
|
||||
self.out_xml = self._transform_doc()
|
||||
|
||||
return self.out_xml
|
||||
|
||||
def _convert_empty_str_key(self) -> None:
|
||||
"""
|
||||
Replace zero-length string in `namespaces`.
|
||||
|
||||
This method will replace '' with None to align to `lxml`
|
||||
requirement that empty string prefixes are not allowed.
|
||||
"""
|
||||
|
||||
if self.namespaces and "" in self.namespaces.keys():
|
||||
self.namespaces[None] = self.namespaces.pop("", "default")
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
uri = ""
|
||||
if self.namespaces:
|
||||
if self.prefix:
|
||||
try:
|
||||
uri = f"{{{self.namespaces[self.prefix]}}}"
|
||||
except KeyError as err:
|
||||
raise KeyError(
|
||||
f"{self.prefix} is not included in namespaces"
|
||||
) from err
|
||||
elif "" in self.namespaces:
|
||||
uri = f"{{{self.namespaces['']}}}"
|
||||
else:
|
||||
uri = ""
|
||||
|
||||
return uri
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
from lxml.etree import SubElement
|
||||
|
||||
return SubElement
|
||||
|
||||
def _transform_doc(self) -> bytes:
|
||||
"""
|
||||
Parse stylesheet from file or buffer and run it.
|
||||
|
||||
This method will parse stylesheet object into tree for parsing
|
||||
conditionally by its specific object type, then transforms
|
||||
original tree with XSLT script.
|
||||
"""
|
||||
from lxml.etree import (
|
||||
XSLT,
|
||||
XMLParser,
|
||||
fromstring,
|
||||
parse,
|
||||
)
|
||||
|
||||
style_doc = self.stylesheet
|
||||
assert style_doc is not None # is ensured by caller
|
||||
|
||||
handle_data = get_data_from_filepath(
|
||||
filepath_or_buffer=style_doc,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
)
|
||||
|
||||
with handle_data as xml_data:
|
||||
curr_parser = XMLParser(encoding=self.encoding)
|
||||
|
||||
if isinstance(xml_data, io.StringIO):
|
||||
xsl_doc = fromstring(
|
||||
xml_data.getvalue().encode(self.encoding), parser=curr_parser
|
||||
)
|
||||
else:
|
||||
xsl_doc = parse(xml_data, parser=curr_parser)
|
||||
|
||||
transformer = XSLT(xsl_doc)
|
||||
new_doc = transformer(self.root)
|
||||
|
||||
return bytes(new_doc)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,155 @@
|
||||
from typing import (
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import set_module
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@set_module("pandas")
def read_iceberg(
    table_identifier: str,
    catalog_name: str | None = None,
    *,
    catalog_properties: dict[str, Any] | None = None,
    columns: list[str] | None = None,
    row_filter: str | None = None,
    case_sensitive: bool = True,
    snapshot_id: int | None = None,
    limit: int | None = None,
    scan_properties: dict[str, Any] | None = None,
) -> DataFrame:
    """
    Read an Apache Iceberg table into a pandas DataFrame.

    .. versionadded:: 3.0.0

    .. warning::

       read_iceberg is experimental and may change without warning.

    Parameters
    ----------
    table_identifier : str
        Table identifier.
    catalog_name : str, optional
        The name of the catalog.
    catalog_properties : dict of {str: str}, optional
        The properties that are used next to the catalog configuration.
    columns : list of str, optional
        A list of strings representing the column names to return in the output
        dataframe. By default all columns are selected.
    row_filter : str, optional
        A string that describes the desired rows. By default no rows are
        filtered out.
    case_sensitive : bool, default True
        If True column matching is case sensitive.
    snapshot_id : int, optional
        Snapshot ID to time travel to. By default the table will be scanned as of the
        current snapshot ID.
    limit : int, optional
        An integer representing the number of rows to return in the scan result.
        By default all matching rows will be fetched.
    scan_properties : dict of {str: obj}, optional
        Additional Table properties as a dictionary of string key value pairs to use
        for this scan.

    Returns
    -------
    DataFrame
        DataFrame based on the Iceberg table.

    See Also
    --------
    read_parquet : Read a Parquet file.

    Examples
    --------
    >>> df = pd.read_iceberg(
    ...     table_identifier="my_table",
    ...     catalog_name="my_catalog",
    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
    ...     row_filter="trip_distance >= 10.0",
    ...     columns=["VendorID", "tpep_pickup_datetime"],
    ... )  # doctest: +SKIP
    """
    catalog_module = import_optional_dependency("pyiceberg.catalog")
    expressions_module = import_optional_dependency("pyiceberg.expressions")

    catalog = catalog_module.load_catalog(
        catalog_name,
        **({} if catalog_properties is None else catalog_properties),
    )
    table = catalog.load_table(table_identifier)

    # Unset arguments fall back to "match every row" / "select every column".
    scan = table.scan(
        row_filter=(
            expressions_module.AlwaysTrue() if row_filter is None else row_filter
        ),
        selected_fields=("*",) if columns is None else tuple(columns),
        case_sensitive=case_sensitive,
        snapshot_id=snapshot_id,
        options={} if scan_properties is None else scan_properties,
        limit=limit,
    )
    return scan.to_pandas()
|
||||
|
||||
|
||||
def to_iceberg(
    df: DataFrame,
    table_identifier: str,
    catalog_name: str | None = None,
    *,
    catalog_properties: dict[str, Any] | None = None,
    location: str | None = None,
    append: bool = False,
    snapshot_properties: dict[str, str] | None = None,
) -> None:
    """
    Write a DataFrame to an Apache Iceberg table.

    The table is created if it does not exist yet.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    df : DataFrame
        The data to write.
    table_identifier : str
        Table identifier.
    catalog_name : str, optional
        The name of the catalog.
    catalog_properties : dict of {str: str}, optional
        The properties that are used next to the catalog configuration.
    location : str, optional
        Location for the table.
    append : bool, default False
        If ``True``, append data to the table, instead of replacing the content.
    snapshot_properties : dict of {str: str}, optional
        Custom properties to be added to the snapshot summary

    See Also
    --------
    read_iceberg : Read an Apache Iceberg table.
    DataFrame.to_parquet : Write a DataFrame in Parquet format.
    """
    pa = import_optional_dependency("pyarrow")
    catalog_module = import_optional_dependency("pyiceberg.catalog")

    catalog = catalog_module.load_catalog(
        catalog_name,
        **({} if catalog_properties is None else catalog_properties),
    )
    arrow_table = pa.Table.from_pandas(df)

    # `partition_spec`, `sort_order` and `properties` could be supported in
    # the future, but it may not be trivial without exposing PyIceberg objects
    table = catalog.create_table_if_not_exists(
        identifier=table_identifier,
        schema=arrow_table.schema,
        location=location,
    )

    properties = {} if snapshot_properties is None else snapshot_properties
    write = table.append if append else table.overwrite
    write(arrow_table, snapshot_properties=properties)
|
||||
@@ -0,0 +1,15 @@
|
||||
from pandas.io.json._json import (
|
||||
read_json,
|
||||
to_json,
|
||||
ujson_dumps,
|
||||
ujson_loads,
|
||||
)
|
||||
from pandas.io.json._table_schema import build_table_schema
|
||||
|
||||
__all__ = [
|
||||
"build_table_schema",
|
||||
"read_json",
|
||||
"to_json",
|
||||
"ujson_dumps",
|
||||
"ujson_loads",
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,648 @@
|
||||
# ---------------------------------------------------------------------
|
||||
# JSON normalization routines
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import (
|
||||
abc,
|
||||
defaultdict,
|
||||
)
|
||||
import copy
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
DefaultDict,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.writers import convert_json_to_lines
|
||||
from pandas.util._decorators import set_module
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pandas._typing import (
|
||||
IgnoreRaise,
|
||||
Scalar,
|
||||
)
|
||||
|
||||
|
||||
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        Serialized JSON text.

    Returns
    -------
    str
        ``s`` unchanged unless it is wrapped in ``[`` ... ``]``. For a JSON
        list, the text between the brackets is handed to
        ``convert_json_to_lines`` and that result is returned.
    """
    # Only JSON lists can be turned into lines; anything else is returned as
    # is.  Both brackets must be present: the earlier spelling
    # ``not s[0] == "[" and s[-1] == "]"`` negated only the first comparison,
    # so JSON objects lost their outer characters.  ``startswith``/``endswith``
    # also make the empty string a harmless no-op instead of an IndexError.
    if not (s.startswith("[") and s.endswith("]")):
        return s

    return convert_json_to_lines(s[1:-1])
|
||||
|
||||
|
||||
@overload
|
||||
def nested_to_record(
|
||||
ds: dict,
|
||||
prefix: str = ...,
|
||||
sep: str = ...,
|
||||
level: int = ...,
|
||||
max_level: int | None = ...,
|
||||
) -> dict[str, Any]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def nested_to_record(
|
||||
ds: list[dict],
|
||||
prefix: str = ...,
|
||||
sep: str = ...,
|
||||
level: int = ...,
|
||||
max_level: int | None = ...,
|
||||
) -> list[dict[str, Any]]: ...
|
||||
|
||||
|
||||
def nested_to_record(
|
||||
ds: dict | list[dict],
|
||||
prefix: str = "",
|
||||
sep: str = ".",
|
||||
level: int = 0,
|
||||
max_level: int | None = None,
|
||||
) -> dict[str, Any] | list[dict[str, Any]]:
|
||||
"""
|
||||
A simplified json_normalize
|
||||
|
||||
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
|
||||
it does not attempt to extract a subset of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ds : dict or list of dicts
|
||||
prefix: the prefix, optional, default: ""
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
level: int, optional, default: 0
|
||||
The number of levels in the json string.
|
||||
|
||||
max_level: int, optional, default: None
|
||||
The max depth to normalize.
|
||||
|
||||
Returns
|
||||
-------
|
||||
d - dict or list of dicts, matching `ds`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> nested_to_record(
|
||||
... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
|
||||
... )
|
||||
{\
|
||||
'flat1': 1, \
|
||||
'dict1.c': 1, \
|
||||
'dict1.d': 2, \
|
||||
'nested.e.c': 1, \
|
||||
'nested.e.d': 2, \
|
||||
'nested.d': 2\
|
||||
}
|
||||
"""
|
||||
singleton = False
|
||||
if isinstance(ds, dict):
|
||||
ds = [ds]
|
||||
singleton = True
|
||||
new_ds = []
|
||||
for d in ds:
|
||||
new_d = copy.deepcopy(d)
|
||||
for k, v in d.items():
|
||||
# each key gets renamed with prefix
|
||||
if not isinstance(k, str):
|
||||
k = str(k)
|
||||
if level == 0:
|
||||
newkey = k
|
||||
else:
|
||||
newkey = prefix + sep + k
|
||||
|
||||
# flatten if type is dict and
|
||||
# current dict level < maximum level provided and
|
||||
# only dicts gets recurse-flattened
|
||||
# only at level>1 do we rename the rest of the keys
|
||||
if not isinstance(v, dict) or (
|
||||
max_level is not None and level >= max_level
|
||||
):
|
||||
if level != 0: # so we skip copying for top level, common case
|
||||
v = new_d.pop(k)
|
||||
new_d[newkey] = v
|
||||
continue
|
||||
|
||||
v = new_d.pop(k)
|
||||
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
|
||||
new_ds.append(new_d)
|
||||
|
||||
if singleton:
|
||||
return new_ds[0]
|
||||
return new_ds
|
||||
|
||||
|
||||
def _normalize_json(
|
||||
data: Any,
|
||||
key_string: str,
|
||||
normalized_dict: dict[str, Any],
|
||||
separator: str,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Main recursive function
|
||||
Designed for the most basic use case of pd.json_normalize(data)
|
||||
intended as a performance improvement, see #15621
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Any
|
||||
Type dependent on types contained within nested Json
|
||||
key_string : str
|
||||
New key (with separator(s) in) for data
|
||||
normalized_dict : dict
|
||||
The new normalized/flattened Json dict
|
||||
separator : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
"""
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
new_key = f"{key_string}{separator}{key}"
|
||||
|
||||
if not key_string:
|
||||
new_key = new_key.removeprefix(separator)
|
||||
|
||||
_normalize_json(
|
||||
data=value,
|
||||
key_string=new_key,
|
||||
normalized_dict=normalized_dict,
|
||||
separator=separator,
|
||||
)
|
||||
else:
|
||||
normalized_dict[key_string] = data
|
||||
return normalized_dict
|
||||
|
||||
|
||||
def _normalize_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Flatten one record, listing its non-dict top-level entries first.

    Parameters
    ----------
    data : dict
        A single record.
    separator : str
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict
        The top-level entries whose value is not a dict, followed by the
        flattened entries produced from the dict-valued ones.
    """
    # Split the record in a single pass: plain values stay as they are,
    # dict values are collected for recursive flattening.
    flat_part: dict[str, Any] = {}
    nested_part: dict[str, Any] = {}
    for name, value in data.items():
        target = nested_part if isinstance(value, dict) else flat_part
        target[name] = value

    flattened = _normalize_json(
        data=nested_part,
        key_string="",
        normalized_dict={},
        separator=separator,
    )

    merged = dict(flat_part)
    merged.update(flattened)
    return merged
|
||||
|
||||
|
||||
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    An optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"). Unlike json_normalize
    and nested_to_record it does nothing clever, which makes it faster for
    the most basic use case, e.g. ``pd.json_normalize(data)``.

    Parameters
    ----------
    ds : dict or list of dicts
        The record(s) to flatten.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts
        A flat dict when ``ds`` is a dict, a list with one result per element
        when ``ds`` is a list, and an empty dict for any other input.

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # A dictionary is the expected input, as most JSON documents are objects.
    if isinstance(ds, dict):
        return _normalize_json_ordered(data=ds, separator=sep)

    # Lists are perfectly valid too: normalize each row independently.
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]

    return {}
|
||||
|
||||
|
||||
def _validate_meta(meta: str | list[str | list[str]] | None) -> None:
|
||||
"""
|
||||
Validate that meta parameter contains only strings or lists of strings.
|
||||
Parameters
|
||||
----------
|
||||
meta : str or list of str or list of list of str or None
|
||||
The meta parameter to validate.
|
||||
Raises
|
||||
------
|
||||
TypeError
|
||||
If meta contains elements that are not strings or lists of strings.
|
||||
"""
|
||||
if meta is None:
|
||||
return
|
||||
if isinstance(meta, str):
|
||||
return
|
||||
for item in meta:
|
||||
if isinstance(item, list):
|
||||
for subitem in item:
|
||||
if not isinstance(subitem, str):
|
||||
raise TypeError(
|
||||
"All elements in nested meta paths must be strings. "
|
||||
f"Found {type(subitem).__name__}: {subitem!r}"
|
||||
)
|
||||
elif not isinstance(item, str):
|
||||
raise TypeError(
|
||||
"All elements in 'meta' must be strings or lists of strings. "
|
||||
f"Found {type(item).__name__}: {item!r}"
|
||||
)
|
||||
|
||||
|
||||
@set_module("pandas")
def json_normalize(
    data: dict | list[dict] | Series,
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    This method is designed to transform semi-structured JSON data, such as nested
    dictionaries or lists, into a flat table. This is particularly useful when
    handling JSON-like data structures that contain deeply nested fields.

    Parameters
    ----------
    data : dict, list of dicts, or Series of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        String to prefix records with dotted path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        String to prefix records with dotted path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    DataFrame
        The normalized data, represented as a pandas DataFrame.

    Raises
    ------
    TypeError
        If ``meta`` contains something other than strings or lists of
        strings, if an element of an iterable ``data`` is not a dict, or if
        ``record_path`` points at a value that is neither a list nor null.
    KeyError
        If a ``record_path`` key is missing, or a ``meta`` key is missing
        and ``errors`` is not 'ignore'.
    ValueError
        If a metadata column name clashes with a record column name.
    NotImplementedError
        If ``data`` is neither a dict nor a non-string iterable.

    See Also
    --------
    DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data.
    Series : One-dimensional ndarray with axis labels (including time series).

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> series = pd.Series(data, index=pd.Index(["a", "b", "c"]))
    >>> pd.json_normalize(series)
        id        name  fitness.height  fitness.weight
    a  1.0   Cole Volk             130              60
    b  NaN    Mark Reg             130              60
    c  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """
    _validate_meta(meta)

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """
        Look up ``spec`` (one key, or a list of keys forming a path) in ``js``.

        A missing key raises KeyError, unless ``errors == "ignore"`` and the
        lookup is not for a record path, in which case ``np.nan`` is returned.
        """
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    # A null on the way down means the rest of the path is
                    # missing; report it the same way as an absent key.
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            # Record paths are mandatory, regardless of ``errors``.
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"Path must contain list or null, "
                    f"but got {type(result).__name__} at {spec!r}"
                )
        return result

    # Remember a Series' index so the result can be labelled with it.
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
        for item in data:
            if not isinstance(item, dict):
                msg = (
                    "All items in data must be of type dict, "
                    f"found {type(item).__name__}"
                )
                raise TypeError(msg)
    else:
        raise NotImplementedError(
            "data must be a dict, an iterable of dicts, or a Series of dicts; "
            f"got {type(data).__name__}"
        )

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep), index=index)

    if record_path is None:
        # Flatten only when at least one value is actually a dict.  The
        # previous ``any([...] for y in data)`` tested the truthiness of a
        # per-row list, so every non-empty row triggered a (deep-copying)
        # normalization pass even for flat records.
        if any(isinstance(x, dict) for y in data for x in y.values()):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:2}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        result = DataFrame(data, index=index)
        if record_prefix is not None:
            result = result.rename(columns=lambda x: f"{record_prefix}{x}")
        return result
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Every meta entry becomes a path (list of keys).
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    # Number of records pulled from each parent object; used to repeat the
    # metadata (and the Series index) once per record.
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        """Walk ``path`` through ``data``, collecting records and metadata."""
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                # Capture metadata that lives at this depth before descending.
                for val, key in zip(_meta, meta_keys, strict=True):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys, strict=True):
                    if level + 1 > len(val):
                        # Already captured on the way down.
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # GH 37782

        values = np.array(v, dtype=object)

        if values.ndim > 1:
            # GH 37782
            # np.array expanded nested sequences into extra dimensions; build
            # a 1-D object array holding each metadata value as one element.
            values = np.empty((len(v),), dtype=object)
            for i, val in enumerate(v):
                values[i] = val

        result[k] = values.repeat(lengths)
    if index is not None:
        result.index = index.repeat(lengths)
    return result
|
||||
+402
@@ -0,0 +1,402 @@
|
||||
"""
|
||||
Table Schema builders
|
||||
|
||||
https://specs.frictionlessdata.io/table-schema/
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas._config import option_context
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas._libs.json import ujson_loads
|
||||
from pandas._libs.tslibs import timezones
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.base import _registry as registry
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_integer_dtype,
|
||||
is_numeric_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
ExtensionDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.core.common as com
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
DtypeObj,
|
||||
JSONSerializable,
|
||||
)
|
||||
|
||||
from pandas import Series
|
||||
from pandas.core.indexes.multi import MultiIndex
|
||||
|
||||
|
||||
# Version string stored under the "pandas_version" key of schemas produced by
# build_table_schema.
TABLE_SCHEMA_VERSION = "1.4.0"
|
||||
|
||||
|
||||
def as_json_table_type(x: DtypeObj) -> str:
|
||||
"""
|
||||
Convert a NumPy / pandas type to its corresponding json_table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : np.dtype or ExtensionDtype
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
the Table Schema data types
|
||||
|
||||
Notes
|
||||
-----
|
||||
This table shows the relationship between NumPy / pandas dtypes,
|
||||
and Table Schema dtypes.
|
||||
|
||||
============== =================
|
||||
Pandas type Table Schema type
|
||||
============== =================
|
||||
int64 integer
|
||||
float64 number
|
||||
bool boolean
|
||||
datetime64[ns] datetime
|
||||
timedelta64[ns] duration
|
||||
object str
|
||||
categorical any
|
||||
=============== =================
|
||||
"""
|
||||
if is_integer_dtype(x):
|
||||
return "integer"
|
||||
elif is_bool_dtype(x):
|
||||
return "boolean"
|
||||
elif is_numeric_dtype(x):
|
||||
return "number"
|
||||
elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
|
||||
return "datetime"
|
||||
elif lib.is_np_dtype(x, "m"):
|
||||
return "duration"
|
||||
elif is_string_dtype(x):
|
||||
return "string"
|
||||
else:
|
||||
return "any"
|
||||
|
||||
|
||||
def set_default_names(data):
|
||||
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
|
||||
if com.all_not_none(*data.index.names):
|
||||
nms = data.index.names
|
||||
if len(nms) == 1 and data.index.name == "index":
|
||||
warnings.warn(
|
||||
"Index name of 'index' is not round-trippable.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
|
||||
warnings.warn(
|
||||
"Index names beginning with 'level_' are not round-trippable.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return data
|
||||
|
||||
data = data.copy(deep=False)
|
||||
if data.index.nlevels > 1:
|
||||
data.index.names = com.fill_missing_names(data.index.names)
|
||||
else:
|
||||
data.index.name = data.index.name or "index"
|
||||
return data
|
||||
|
||||
|
||||
def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Build the Table Schema field descriptor for ``arr``.

    Parameters
    ----------
    arr : object with ``dtype`` and ``name`` attributes
        Callers in this module pass a Series or an Index.

    Returns
    -------
    dict
        Always holds ``"name"`` (``"values"`` when ``arr.name`` is None) and
        ``"type"``. Depending on the dtype it may also hold ``"constraints"``
        and ``"ordered"`` (categorical), ``"freq"`` (period), ``"tz"``
        (timezone-aware datetime) or ``"extDtype"`` (other extension dtypes).
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if isinstance(dtype, CategoricalDtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif isinstance(dtype, PeriodDtype):
        field["freq"] = dtype.freq.freqstr
    elif isinstance(dtype, DatetimeTZDtype):
        # Only a timezone that resolves to a string is written out.
        if timezones.is_utc(dtype.tz):
            tz_name = "UTC"
        else:
            tz_name = timezones.get_timezone(dtype.tz)
        if isinstance(tz_name, str):
            field["tz"] = tz_name
    elif isinstance(dtype, ExtensionDtype):
        field["extDtype"] = dtype.name
    return field
|
||||
|
||||
|
||||
def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
|
||||
"""
|
||||
Converts a JSON field descriptor into its corresponding NumPy / pandas type
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field
|
||||
A JSON field descriptor
|
||||
|
||||
Returns
|
||||
-------
|
||||
dtype
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the type of the provided field is unknown or currently unsupported
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
|
||||
'int64'
|
||||
|
||||
>>> convert_json_field_to_pandas_type(
|
||||
... {
|
||||
... "name": "a_categorical",
|
||||
... "type": "any",
|
||||
... "constraints": {"enum": ["a", "b", "c"]},
|
||||
... "ordered": True,
|
||||
... }
|
||||
... )
|
||||
CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=str)
|
||||
|
||||
>>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
|
||||
'datetime64[ns]'
|
||||
|
||||
>>> convert_json_field_to_pandas_type(
|
||||
... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
|
||||
... )
|
||||
'datetime64[ns, US/Central]'
|
||||
"""
|
||||
typ = field["type"]
|
||||
if typ == "string":
|
||||
return field.get("extDtype", None)
|
||||
elif typ == "integer":
|
||||
return field.get("extDtype", "int64")
|
||||
elif typ == "number":
|
||||
return field.get("extDtype", "float64")
|
||||
elif typ == "boolean":
|
||||
return field.get("extDtype", "bool")
|
||||
elif typ == "duration":
|
||||
return "timedelta64"
|
||||
elif typ == "datetime":
|
||||
if field.get("tz"):
|
||||
return f"datetime64[ns, {field['tz']}]"
|
||||
elif field.get("freq"):
|
||||
# GH#9586 rename frequency M to ME for offsets
|
||||
offset = to_offset(field["freq"])
|
||||
freq = PeriodDtype(offset)._freqstr
|
||||
# GH#47747 using datetime over period to minimize the change surface
|
||||
return f"period[{freq}]"
|
||||
else:
|
||||
return "datetime64[ns]"
|
||||
elif typ == "any":
|
||||
if "constraints" in field and "ordered" in field:
|
||||
return CategoricalDtype(
|
||||
categories=field["constraints"]["enum"], ordered=field["ordered"]
|
||||
)
|
||||
elif "extDtype" in field:
|
||||
return registry.find(field["extDtype"])
|
||||
else:
|
||||
return "object"
|
||||
|
||||
raise ValueError(f"Unsupported or invalid field type: {typ}")
|
||||
|
||||
|
||||
def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    This method is a utility to generate a JSON-serializable schema
    representation of a pandas Series or DataFrame, compatible with the
    Table Schema specification. It enables structured data to be shared
    and validated in various applications, ensuring consistency and
    interoperability.

    Parameters
    ----------
    data : Series or DataFrame
        The input data for which the table schema is to be created.
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict
        A dictionary representing the Table schema.

    See Also
    --------
    DataFrame.to_json : Convert the object to a JSON string.
    read_json : Convert a JSON string to pandas object.

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='D', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string', 'extDtype': 'str'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    if index is True:
        data = set_default_names(data)

    field_descriptors = []

    # Index fields come first: one per level for a MultiIndex.
    if index:
        if data.index.nlevels > 1:
            data.index = cast("MultiIndex", data.index)
            level_pairs = zip(data.index.levels, data.index.names, strict=True)
            for level_values, level_name in level_pairs:
                level_field = convert_pandas_type_to_json_field(level_values)
                level_field["name"] = level_name
                field_descriptors.append(level_field)
        else:
            field_descriptors.append(convert_pandas_type_to_json_field(data.index))

    # Then one field per column, or a single field for a Series.
    if data.ndim > 1:
        field_descriptors.extend(
            convert_pandas_type_to_json_field(col) for _, col in data.items()
        )
    else:
        field_descriptors.append(convert_pandas_type_to_json_field(data))

    schema: dict[str, Any] = {"fields": field_descriptors}

    if index and data.index.is_unique and primary_key is None:
        # No explicit key: a unique index serves as the primary key.
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema
|
||||
|
||||
|
||||
def parse_table_schema(json, precise_float: bool) -> DataFrame:
    """
    Reconstruct a DataFrame from a JSON document written with ``orient="table"``.

    Parameters
    ----------
    json :
        A JSON table-schema document containing a ``"schema"`` entry (with a
        ``"fields"`` list and, optionally, a ``"primaryKey"``) and a
        ``"data"`` entry.
    precise_float : bool
        Flag controlling precision when decoding strings to double values,
        as dictated by ``read_json``. It is forwarded to the JSON decoder.

    Returns
    -------
    DataFrame
        Frame whose columns follow the order of the schema fields, cast to
        the dtypes derived from each field, and indexed by the schema's
        primary key when one is present.

    Raises
    ------
    NotImplementedError
        If any field maps to ``timedelta64``; ISO-formatted Timedelta data
        cannot be read back yet. Errors raised while converting a field to a
        pandas dtype propagate unchanged.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json : Public entry point for reading JSON.

    Notes
    -----
    :func:`DataFrame.to_json` writes the string ``'index'`` for a name-less
    :class:`Index` and strings starting with ``'level_'`` for name-less
    :class:`MultiIndex` levels. When reading, those placeholders are turned
    back into ``None``; consequently an :class:`Index` really named
    ``'index'`` or :class:`MultiIndex` levels whose names start with
    ``'level_'`` cannot be round-tripped.
    """
    payload = ujson_loads(json, precise_float=precise_float)
    schema = payload["schema"]
    field_specs = schema["fields"]

    # Build the frame first, selecting columns in schema order.
    ordered_names = [spec["name"] for spec in field_specs]
    frame = DataFrame(payload["data"], columns=ordered_names)[ordered_names]

    target_dtypes = {}
    for spec in field_specs:
        target_dtypes[spec["name"]] = convert_json_field_to_pandas_type(spec)

    # There is no ISO constructor for Timedelta yet, so refuse such data.
    if "timedelta64" in target_dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data'
        )

    with option_context("future.distinguish_nan_and_na", False):
        frame = frame.astype(target_dtypes)

    if "primaryKey" not in schema:
        return frame

    frame = frame.set_index(schema["primaryKey"])
    index_names = frame.index.names
    if len(index_names) != 1:
        # MultiIndex: placeholder level names become anonymous levels again.
        frame.index.names = [
            None if label.startswith("level_") else label for label in index_names
        ]
    elif frame.index.name == "index":
        # Flat index: 'index' is the placeholder for "no name".
        frame.index.name = None

    return frame
|
||||
@@ -0,0 +1,243 @@
|
||||
"""orc compat"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
)
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import set_module
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.indexes.api import default_index
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import (
|
||||
get_handle,
|
||||
is_fsspec_url,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import fsspec
|
||||
import pyarrow.fs
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
|
||||
@set_module("pandas")
def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    ORC (Optimized Row Columnar) is a columnar storage format. The file is
    read through ``pyarrow.orc`` and the resulting Arrow table is converted
    to a pandas DataFrame. A subset of columns can be selected, the file can
    live on a local, fsspec or pyarrow filesystem, and the result can be
    backed by ``numpy_nullable`` or ``pyarrow`` dtypes.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be
        a URL. Valid URL schemes include http, ftp, s3, and file. For file
        URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns
        list. This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). If not specified, the default behavior
        is to not use nullable data types. If specified, the behavior
        is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
        * ``"pyarrow"``: returns pyarrow-backed nullable
          :class:`ArrowDtype` :class:`DataFrame`

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the orc file.

        .. versionadded:: 2.1.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame
        DataFrame based on the ORC file.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
    read_excel : Read an Excel file into a pandas DataFrame.
    read_spss : Read an SPSS file into a pandas DataFrame.
    read_sas : Load a SAS file into a pandas DataFrame.
    read_feather : Load a feather-format object into a pandas DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.

    If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
    a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
    pyarrow or fsspec filesystem object into the filesystem keyword to override this
    behavior.

    Examples
    --------
    >>> result = pd.read_orc("example_pa.orc")  # doctest: +SKIP
    """
    pyarrow_orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as io_handles:
        # Default to the handle pandas opened; a pyarrow filesystem may
        # replace it below.
        data_source = io_handles.handle

        if filesystem is None and is_fsspec_url(path):
            pa = import_optional_dependency("pyarrow")
            pyarrow_fs = import_optional_dependency("pyarrow.fs")
            try:
                filesystem, data_source = pyarrow_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                # pyarrow cannot resolve this URI; keep using the open handle.
                pass

        arrow_table = pyarrow_orc.read_table(
            source=data_source,
            columns=columns,
            filesystem=filesystem,
            **kwargs,
        )

    return arrow_table_to_pandas(arrow_table, dtype_backend=dtype_backend)
|
||||
|
||||
|
||||
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow, or the dataframe does not have an unnamed
        default index.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # Resolve the optional arguments to concrete values.
    preserve_index = (df.index.names[0] is not None) if index is None else index
    writer_options = {} if engine_kwargs is None else engine_kwargs

    # Only an unnamed default index is accepted, because the index itself
    # is not serialized.
    has_default_index = df.index.equals(default_index(len(df)))
    if not has_default_index:
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )
    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    # Without a destination, write into memory and hand back the bytes.
    returns_bytes = path is None
    sink = io.BytesIO() if returns_bytes else path

    with get_handle(sink, "wb", is_text=False) as io_handles:
        try:
            arrow_table = pa.Table.from_pandas(df, preserve_index=preserve_index)
            orc.write_table(arrow_table, io_handles.handle, **writer_options)
        except (TypeError, pa.ArrowNotImplementedError) as err:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if not returns_bytes:
        return None
    assert isinstance(sink, io.BytesIO)  # For mypy
    return sink.getvalue()
|
||||
@@ -0,0 +1,680 @@
|
||||
"""parquet compat"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
)
|
||||
from warnings import (
|
||||
catch_warnings,
|
||||
filterwarnings,
|
||||
)
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
AbstractMethodError,
|
||||
Pandas4Warning,
|
||||
)
|
||||
from pandas.util._decorators import set_module
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
get_option,
|
||||
)
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import (
|
||||
IOHandles,
|
||||
get_handle,
|
||||
is_fsspec_url,
|
||||
is_url,
|
||||
stringify_path,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ParquetCompressionOptions,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
|
||||
def get_engine(engine: str) -> BaseImpl:
    """
    Return the parquet implementation matching ``engine``.

    Parameters
    ----------
    engine : str
        ``'pyarrow'``, ``'fastparquet'`` or ``'auto'``. ``'auto'`` is first
        replaced by the ``io.parquet.engine`` option; if that is still
        ``'auto'``, pyarrow is tried first and fastparquet second.

    Returns
    -------
    BaseImpl
        An instance of the selected engine wrapper.

    Raises
    ------
    ImportError
        In ``'auto'`` mode, when neither engine can be instantiated. The
        message lists the error raised by each attempt.
    ValueError
        If ``engine`` is not one of the supported names.
    """
    if engine == "auto":
        engine = get_option("io.parquet.engine")

    if engine == "pyarrow":
        return PyArrowImpl()
    if engine == "fastparquet":
        return FastParquetImpl()
    if engine != "auto":
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    # Still "auto": probe the engines in order of preference and remember
    # why each one could not be used.
    failures = []
    for candidate in (PyArrowImpl, FastParquetImpl):
        try:
            return candidate()
        except ImportError as err:
            failures.append("\n - " + str(err))

    error_msgs = "".join(failures)
    raise ImportError(
        "Unable to find a usable engine; "
        "tried using: 'pyarrow', 'fastparquet'.\n"
        "A suitable version of "
        "pyarrow or fastparquet is required for parquet "
        "support.\n"
        "Trying to import the above resulted in these errors:"
        f"{error_msgs}"
    )
|
||||
|
||||
|
||||
def _get_path_or_handle(
    path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
    fs: Any,
    storage_options: StorageOptions | None = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[
    FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
]:
    """
    File handling for PyArrow.

    Parameters
    ----------
    path : str, path object or file-like object
        Location to read from or write to.
    fs : pyarrow or fsspec filesystem, or None
        Filesystem supplied by the caller, validated here.
    storage_options : dict, optional
        Options forwarded to fsspec or to ``get_handle``.
    mode : str, default "rb"
        Mode used if a handle has to be opened.
    is_dir : bool, default False
        Whether the caller knows ``path`` designates a directory; no handle
        is opened in that case.

    Returns
    -------
    tuple
        ``(path_or_handle, handles, filesystem)``. ``handles`` is ``None``
        unless a handle was opened here, in which case the caller is
        responsible for closing it and ``filesystem`` is ``None``.

    Raises
    ------
    NotImplementedError
        If ``storage_options`` is combined with a pyarrow filesystem.
    ValueError
        If ``fs`` is neither a pyarrow nor an fsspec filesystem, or if
        ``storage_options`` is given for a buffer or unsupported URL.
    """
    target = stringify_path(path)

    if fs is not None:
        # Both libraries are optional; a missing one simply cannot match.
        pyarrow_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
        fsspec_mod = import_optional_dependency("fsspec", errors="ignore")
        if pyarrow_fs is not None and isinstance(fs, pyarrow_fs.FileSystem):
            if storage_options:
                raise NotImplementedError(
                    "storage_options not supported with a pyarrow FileSystem."
                )
        elif fsspec_mod is None or not isinstance(
            fs, fsspec_mod.spec.AbstractFileSystem
        ):
            raise ValueError(
                f"filesystem must be a pyarrow or fsspec FileSystem, "
                f"not a {type(fs).__name__}"
            )

    if fs is None and is_fsspec_url(target):
        if storage_options is None:
            # Prefer pyarrow's native filesystem when no options are given.
            pa = import_optional_dependency("pyarrow")
            pyarrow_fs = import_optional_dependency("pyarrow.fs")

            try:
                fs, target = pyarrow_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass
        if fs is None:
            # pyarrow was skipped or could not resolve the URI: use fsspec.
            fsspec_mod = import_optional_dependency("fsspec")
            fs, target = fsspec_mod.core.url_to_fs(target, **(storage_options or {}))
    elif storage_options and (not is_url(target) or mode != "rb"):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError("storage_options passed with buffer, or non-supported URL")

    if fs or is_dir or not isinstance(target, str) or os.path.isdir(target):
        # A filesystem, a buffer or a directory: nothing to open here.
        return target, None, fs

    # use get_handle only when we are very certain that it is not a directory
    # fsspec resources can also point to directories
    # this branch is used for example when reading from non-fsspec URLs
    handles = get_handle(target, mode, is_text=False, storage_options=storage_options)
    return handles.handle, handles, None
|
||||
|
||||
|
||||
class BaseImpl:
    """Common interface of the parquet engine wrappers."""

    @staticmethod
    def validate_dataframe(df: DataFrame) -> None:
        """Raise ``ValueError`` unless ``df`` is a DataFrame."""
        if isinstance(df, DataFrame):
            return
        raise ValueError("to_parquet only supports IO with DataFrames")

    def write(self, df: DataFrame, path, compression, **kwargs) -> None:
        """Write ``df`` to ``path``; must be overridden by each engine."""
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs) -> DataFrame:
        """Read ``path`` into a DataFrame; must be overridden by each engine."""
        raise AbstractMethodError(self)
|
||||
|
||||
|
||||
class PyArrowImpl(BaseImpl):
    """Parquet reader/writer backed by ``pyarrow``."""

    def __init__(self) -> None:
        """Import pyarrow (raising ImportError if unusable) and keep it as ``api``."""
        import_optional_dependency(
            "pyarrow", extra="pyarrow is required for parquet support."
        )
        import pyarrow.parquet

        # import utils to register the pyarrow extension types
        import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401

        self.api = pyarrow

    def write(
        self,
        df: DataFrame,
        path: FilePath | WriteBuffer[bytes],
        compression: ParquetCompressionOptions = "snappy",
        index: bool | None = None,
        storage_options: StorageOptions | None = None,
        partition_cols: list[str] | None = None,
        filesystem=None,
        **kwargs,
    ) -> None:
        """
        Write ``df`` as parquet through pyarrow.

        ``schema`` is taken out of ``kwargs`` and used for the Arrow
        conversion. Non-empty ``df.attrs`` are stored as JSON under the
        ``PANDAS_ATTRS`` schema-metadata key. With ``partition_cols`` a
        dataset of several files is written under ``path``; otherwise a
        single file is written. Remaining ``kwargs`` go to the pyarrow
        writer.
        """
        self.validate_dataframe(df)

        conversion_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
        if index is not None:
            conversion_kwargs["preserve_index"] = index

        table = self.api.Table.from_pandas(df, **conversion_kwargs)

        if df.attrs:
            # Persist DataFrame.attrs next to the metadata pyarrow generated.
            attrs_entry = {"PANDAS_ATTRS": json.dumps(df.attrs)}
            table = table.replace_schema_metadata(
                {**table.schema.metadata, **attrs_entry}
            )

        sink, handles, filesystem = _get_path_or_handle(
            path,
            filesystem,
            storage_options=storage_options,
            mode="wb",
            is_dir=partition_cols is not None,
        )

        # For a buffered writer that carries a file name, hand pyarrow the
        # name instead of the writer object.
        if isinstance(sink, io.BufferedWriter) and hasattr(sink, "name"):
            sink_name = sink.name
            if isinstance(sink_name, bytes):
                sink = sink_name.decode()
            elif isinstance(sink_name, str):
                sink = sink_name

        try:
            if partition_cols is None:
                # write to single output file
                self.api.parquet.write_table(
                    table,
                    sink,
                    compression=compression,
                    filesystem=filesystem,
                    **kwargs,
                )
            else:
                # writes to multiple files under the given path
                self.api.parquet.write_to_dataset(
                    table,
                    sink,
                    compression=compression,
                    partition_cols=partition_cols,
                    filesystem=filesystem,
                    **kwargs,
                )
        finally:
            if handles is not None:
                handles.close()

    def read(
        self,
        path,
        columns=None,
        filters=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        to_pandas_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> DataFrame:
        """
        Read parquet data through pyarrow and return a DataFrame.

        ``use_pandas_metadata`` is always forced to ``True``. If the file's
        schema metadata contains ``PANDAS_ATTRS``, it is decoded from JSON
        and assigned to the result's ``attrs``. Any handle opened for the
        read is closed before returning.
        """
        kwargs["use_pandas_metadata"] = True

        source, handles, filesystem = _get_path_or_handle(
            path,
            filesystem,
            storage_options=storage_options,
            mode="rb",
        )
        try:
            arrow_table = self.api.parquet.read_table(
                source,
                columns=columns,
                filesystem=filesystem,
                filters=filters,
                **kwargs,
            )
            with catch_warnings():
                filterwarnings(
                    "ignore",
                    "make_block is deprecated",
                    Pandas4Warning,
                )
                result = arrow_table_to_pandas(
                    arrow_table,
                    dtype_backend=dtype_backend,
                    to_pandas_kwargs=to_pandas_kwargs,
                )

            schema_metadata = arrow_table.schema.metadata
            if schema_metadata and b"PANDAS_ATTRS" in schema_metadata:
                result.attrs = json.loads(schema_metadata[b"PANDAS_ATTRS"])
            return result
        finally:
            if handles is not None:
                handles.close()
|
||||
|
||||
|
||||
class FastParquetImpl(BaseImpl):
    """Parquet reader/writer backed by ``fastparquet``."""

    def __init__(self) -> None:
        """Import fastparquet (raising ImportError if unusable) and keep it as ``api``."""
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        self.api = import_optional_dependency(
            "fastparquet", extra="fastparquet is required for parquet support."
        )

    def write(
        self,
        df: DataFrame,
        path,
        compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
        index=None,
        partition_cols=None,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        **kwargs,
    ) -> None:
        """
        Write ``df`` as parquet through fastparquet.

        ``partition_on`` in ``kwargs`` is accepted as an alias of
        ``partition_cols`` but both may not be given together. Partitioned
        output uses the ``"hive"`` file scheme. ``filesystem`` is not
        supported, and ``storage_options`` is only accepted for fsspec URLs.
        """
        self.validate_dataframe(df)

        if "partition_on" in kwargs:
            if partition_cols is not None:
                raise ValueError(
                    "Cannot use both partition_on and "
                    "partition_cols. Use partition_cols for partitioning data"
                )
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if filesystem is not None:
            raise NotImplementedError(
                "filesystem is not implemented for the fastparquet engine."
            )

        # cannot use get_handle as write() does not accept file buffers
        path = stringify_path(path)
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            def _open_for_write(path, _):
                # if filesystem is provided by fsspec, file must be opened
                # in 'wb' mode, whatever mode is requested.
                return fsspec.open(path, "wb", **(storage_options or {})).open()

            kwargs["open_with"] = _open_for_write
        elif storage_options:
            raise ValueError(
                "storage_options passed with file object or non-fsspec file path"
            )

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )

    def read(
        self,
        path,
        columns=None,
        filters=None,
        storage_options: StorageOptions | None = None,
        filesystem=None,
        to_pandas_kwargs: dict | None = None,
        **kwargs,
    ) -> DataFrame:
        """
        Read parquet data through fastparquet and return a DataFrame.

        ``dtype_backend``, ``filesystem`` and ``to_pandas_kwargs`` are
        rejected because this engine does not support them. Any handle
        opened for the read is closed before returning.
        """
        dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
        # We are disabling nullable dtypes for fastparquet pending discussion
        parquet_kwargs: dict[str, Any] = {"pandas_nulls": False}

        if dtype_backend is not lib.no_default:
            raise ValueError(
                "The 'dtype_backend' argument is not supported for the "
                "fastparquet engine"
            )
        if filesystem is not None:
            raise NotImplementedError(
                "filesystem is not implemented for the fastparquet engine."
            )
        if to_pandas_kwargs is not None:
            raise NotImplementedError(
                "to_pandas_kwargs is not implemented for the fastparquet engine."
            )

        path = stringify_path(path)
        handles = None
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            opened = fsspec.open(path, "rb", **(storage_options or {}))
            parquet_kwargs["fs"] = opened.fs
        elif isinstance(path, str) and not os.path.isdir(path):
            # use get_handle only when we are very certain that it is not a directory
            # fsspec resources can also point to directories
            # this branch is used for example when reading from non-fsspec URLs
            handles = get_handle(
                path, "rb", is_text=False, storage_options=storage_options
            )
            path = handles.handle

        try:
            parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
            with catch_warnings():
                filterwarnings(
                    "ignore",
                    "make_block is deprecated",
                    Pandas4Warning,
                )
                return parquet_file.to_pandas(
                    columns=columns, filters=filters, **kwargs
                )
        finally:
            if handles is not None:
                handles.close()
|
||||
|
||||
|
||||
def to_parquet(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    engine: str = "auto",
    compression: ParquetCompressionOptions = "snappy",
    index: bool | None = None,
    storage_options: StorageOptions | None = None,
    partition_cols: list[str] | None = None,
    filesystem: Any = None,
    **kwargs,
) -> bytes | None:
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
        The frame to serialize.
    path : str, path object, file-like object, or None, default None
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``write()`` function. If None, the result
        is returned as bytes. If a string, it will be used as Root Directory
        path when writing a partitioned dataset. The engine fastparquet does
        not accept file-like objects.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.

        When using the ``'pyarrow'`` engine and no storage options are provided
        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
        Use the filesystem keyword with an instantiated fsspec filesystem
        if you wish to use its implementation.
    compression : {'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``True`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value
        pairs are forwarded to ``urllib.request.Request`` as header options.
        For other URLs (e.g. starting with "s3://", and "gcs://") the
        key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec``
        and ``urllib`` for more details, and for more examples on storage
        options refer `here <https://pandas.pydata.org/docs/user_guide/io.html?
        highlight=storage_options#reading-writing-remote-files>`_.
    partition_cols : str or list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
        A single string is treated as a one-element list.
        Must be None if path is not a string.
    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when writing the parquet file. Only implemented
        for ``engine="pyarrow"``.

        .. versionadded:: 2.1.0

    **kwargs
        Additional keyword arguments passed to the engine:

        * For ``engine="pyarrow"``: passed to :func:`pyarrow.parquet.write_table`
          or :func:`pyarrow.parquet.write_to_dataset` (when using partition_cols)
        * For ``engine="fastparquet"``: passed to :func:`fastparquet.write`

    Returns
    -------
    bytes if no path argument is provided else None
    """
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]
    writer = get_engine(engine)

    # Without a destination, serialize into memory and return the bytes.
    destination: FilePath | WriteBuffer[bytes] = (
        path if path is not None else io.BytesIO()
    )

    writer.write(
        df,
        destination,
        compression=compression,
        index=index,
        partition_cols=partition_cols,
        storage_options=storage_options,
        filesystem=filesystem,
        **kwargs,
    )

    if path is not None:
        return None
    assert isinstance(destination, io.BytesIO)
    return destination.getvalue()
|
||||
|
||||
|
||||
@set_module("pandas")
def read_parquet(
    path: FilePath | ReadBuffer[bytes],
    engine: str = "auto",
    columns: list[str] | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: Any = None,
    filters: list[tuple] | list[list[tuple]] | None = None,
    to_pandas_kwargs: dict | None = None,
    **kwargs,
) -> DataFrame:
    """
    Load a parquet object from the file path, returning a DataFrame.

    The function automatically handles reading the data from a parquet file
    and creates a DataFrame with the appropriate structure.

    Parameters
    ----------
    path : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.

        When using the ``'pyarrow'`` engine and no storage options are provided
        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
        Use the filesystem keyword with an instantiated fsspec filesystem
        if you wish to use its implementation.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value
        pairs are forwarded to ``urllib.request.Request`` as header options.
        For other URLs (e.g. starting with "s3://", and "gcs://") the
        key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec``
        and ``urllib`` for more details, and for more examples on storage
        options refer `here <https://pandas.pydata.org/docs/user_guide/io.html?
        highlight=storage_options#reading-writing-remote-files>`_.
    dtype_backend : {'numpy_nullable', 'pyarrow'}
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). If not specified, the default behavior
        is to not use nullable data types. If specified, the behavior
        is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
        * ``"pyarrow"``: returns pyarrow-backed nullable
          :class:`ArrowDtype` :class:`DataFrame`

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the parquet file. Only implemented
        for ``engine="pyarrow"``.

        .. versionadded:: 2.1.0

    filters : List[Tuple] or List[List[Tuple]], default None
        To filter out data.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, >, >=, <, <=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an `AND` operation.
        The outer list combines these sets of filters through an `OR`
        operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between set of filters is to be conducted.

        Using this argument will NOT result in row-wise filtering of the final
        partitions unless ``engine="pyarrow"`` is also specified. For
        other engines, filtering is only performed at the partition level, that is,
        to prevent the loading of some row-groups and/or files.

        .. versionadded:: 2.1.0

    to_pandas_kwargs : dict | None, default None
        Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
        when ``engine="pyarrow"``.

        .. versionadded:: 3.0.0

    **kwargs
        Additional keyword arguments passed to the engine:

        * For ``engine="pyarrow"``: passed to :func:`pyarrow.parquet.read_table`
        * For ``engine="fastparquet"``: passed to
          :meth:`fastparquet.ParquetFile.to_pandas`

    Returns
    -------
    DataFrame
        DataFrame based on parquet file.

    See Also
    --------
    DataFrame.to_parquet : Create a parquet object that serializes a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> df_parquet_bytes = original_df.to_parquet()
    >>> from io import BytesIO
    >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
    >>> restored_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> restored_df.equals(original_df)
    True
    >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"])
    >>> restored_bar
       bar
    0    5
    1    6
    2    7
    3    8
    4    9
    >>> restored_bar.equals(original_df[["bar"]])
    True

    The function uses `kwargs` that are passed directly to the engine.
    In the following example, we use the `filters` argument of the pyarrow
    engine to filter the rows of the DataFrame.

    Since `pyarrow` is the default engine, we can omit the `engine` argument.
    Note that the `filters` argument is implemented by the `pyarrow` engine,
    which can benefit from multithreading and also potentially be more
    economical in terms of memory.

    >>> sel = [("foo", ">", 2)]
    >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel)
    >>> restored_part
       foo  bar
    0    3    8
    1    4    9
    """

    # Resolve the engine implementation first, then validate ``dtype_backend``;
    # both happen before the engine's ``read`` is invoked.
    impl = get_engine(engine)
    check_dtype_backend(dtype_backend)

    # All remaining work is delegated to the selected engine.
    return impl.read(
        path,
        columns=columns,
        filters=filters,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
        filesystem=filesystem,
        to_pandas_kwargs=to_pandas_kwargs,
        **kwargs,
    )
|
||||
@@ -0,0 +1,9 @@
|
||||
from pandas.io.parsers.readers import (
|
||||
TextFileReader,
|
||||
TextParser,
|
||||
read_csv,
|
||||
read_fwf,
|
||||
read_table,
|
||||
)
|
||||
|
||||
__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]
|
||||
+328
@@ -0,0 +1,328 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
Pandas4Warning,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
from pandas.util._exceptions import (
|
||||
find_stack_level,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
pandas_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.inference import is_integer
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.parsers.base_parser import ParserBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow as pa
|
||||
|
||||
from pandas._typing import ReadBuffer
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class ArrowParserWrapper(ParserBase):
    """
    Wrapper for the pyarrow engine for read_csv()

    The keyword arguments given to the constructor are validated, renamed to
    their ``pyarrow.csv`` equivalents and split into read/parse/convert
    options; :meth:`read` then parses ``src`` and post-processes the result
    into a DataFrame.
    """

    def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
        """
        Parameters
        ----------
        src : ReadBuffer[bytes]
            The object handed to ``pyarrow.csv.read_csv``.
        **kwds
            Parser options. ``"na_values"`` must be present (it is read
            unconditionally by ``_parse_kwds``).
        """
        super().__init__(kwds)
        self.kwds = kwds
        self.src = src

        self._parse_kwds()

    def _parse_kwds(self) -> None:
        """
        Validates keywords before passing to pyarrow.

        Raises
        ------
        ValueError
            If ``na_values`` is a dict.
        """
        encoding: str | None = self.kwds.get("encoding")
        self.encoding = "utf-8" if encoding is None else encoding

        na_values = self.kwds["na_values"]
        if isinstance(na_values, dict):
            raise ValueError(
                "The pyarrow engine doesn't support passing a dict for na_values"
            )
        self.na_values = list(na_values)

    def _get_pyarrow_options(self) -> None:
        """
        Rename some arguments to pass to pyarrow

        Populates ``self.parse_options``, ``self.convert_options`` and
        ``self.read_options`` from ``self.kwds``.
        """
        mapping = {
            "usecols": "include_columns",
            "na_values": "null_values",
            "escapechar": "escape_char",
            "skip_blank_lines": "ignore_empty_lines",
            "decimal": "decimal_point",
            "quotechar": "quote_char",
        }
        for pandas_name, pyarrow_name in mapping.items():
            # Only rename options that were actually supplied (not None).
            if self.kwds.get(pandas_name) is not None:
                self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)

        # Date format handling
        # If we get a string, we need to convert it into a list for pyarrow
        # If we get a dict, we want to parse those separately
        date_format = self.date_format
        if isinstance(date_format, str):
            date_format = [date_format]
        else:
            # In case of dict, we don't want to propagate through, so
            # just set to pyarrow default of None

            # Ideally, in future we disable pyarrow dtype inference (read in as string)
            # to prevent misreads.
            date_format = None
        self.kwds["timestamp_parsers"] = date_format

        self.parse_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None
            and option_name
            in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
        }

        on_bad_lines = self.kwds.get("on_bad_lines")
        if on_bad_lines is not None:
            if callable(on_bad_lines):
                self.parse_options["invalid_row_handler"] = on_bad_lines
            elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
                self.parse_options["invalid_row_handler"] = (
                    None  # PyArrow raises an exception by default
                )
            elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:

                def handle_warning(invalid_row) -> str:
                    # Emit a ParserWarning describing the row, then skip it.
                    warnings.warn(
                        f"Expected {invalid_row.expected_columns} columns, but found "
                        f"{invalid_row.actual_columns}: {invalid_row.text}",
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )
                    return "skip"

                self.parse_options["invalid_row_handler"] = handle_warning
            elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
                self.parse_options["invalid_row_handler"] = lambda _: "skip"

        self.convert_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None
            and option_name
            in (
                "include_columns",
                "null_values",
                "true_values",
                "false_values",
                "decimal_point",
                "timestamp_parsers",
            )
        }
        self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
        # autogenerated column names are prefixed with 'f' in pyarrow.csv
        if self.header is None and "include_columns" in self.convert_options:
            self.convert_options["include_columns"] = [
                f"f{n}" for n in self.convert_options["include_columns"]
            ]

        self.read_options = {
            "autogenerate_column_names": self.header is None,
            "skip_rows": self.header
            if self.header is not None
            else self.kwds["skiprows"],
            "encoding": self.encoding,
        }

    def _get_convert_options(self):
        """
        Build ``pyarrow.csv.ConvertOptions`` from ``self.convert_options``.

        If pyarrow rejects the options with a ``TypeError``, ``usecols`` and
        ``na_values`` are re-validated so that a more specific error is raised
        where possible; otherwise the original ``TypeError`` propagates.
        """
        pyarrow_csv = import_optional_dependency("pyarrow.csv")

        try:
            convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
        except TypeError as err:
            include = self.convert_options.get("include_columns", None)
            if include is not None:
                self._validate_usecols(include)

            nulls = self.convert_options.get("null_values", set())
            if not lib.is_list_like(nulls) or not all(
                isinstance(x, str) for x in nulls
            ):
                raise TypeError(
                    "The 'pyarrow' engine requires all na_values to be strings"
                ) from err

            raise

        return convert_options

    def _adjust_column_names(self, table: pa.Table) -> bool:
        """
        Make ``self.names`` match the number of columns of a headerless table.

        Returns
        -------
        bool
            False if ``self.names`` had to be padded, True otherwise.
        """
        num_cols = len(table.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names(they will get
                # removed anyways) to expected length then.
                columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
                # list(...) so that non-list sequences of names can be padded too
                self.names = columns_prefix + list(self.names)
                multi_index_named = False
        return multi_index_named

    def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
        """
        Move the ``index_col`` columns of ``frame`` into its index.

        Per-column dtypes given for index columns (dict ``dtype`` only) are
        applied before the index is set and removed from ``self.dtype``.

        Raises
        ------
        ValueError
            If a non-integer ``index_col`` entry is not a column of ``frame``.
        """
        if self.index_col is not None:
            index_to_set = self.index_col.copy()
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    index_to_set[i] = frame.columns[item]
                # String case
                elif item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")

                # Process dtype for index_col and drop from dtypes.
                # Only a dict dtype carries per-column entries; a scalar dtype
                # has no ``.get`` and is handled later by ``_finalize_dtype``.
                if isinstance(self.dtype, dict):
                    key = item
                    new_dtype = self.dtype.get(key)
                    if new_dtype is None:
                        # Fall back to the resolved column label; for a
                        # positional ``item`` this is ``frame.columns[item]``.
                        key = index_to_set[i]
                        new_dtype = self.dtype.get(key)
                    if new_dtype is not None:
                        frame[key] = frame[key].astype(new_dtype)
                        del self.dtype[key]

            frame.set_index(index_to_set, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        return frame

    def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
        """
        Cast ``frame`` to the user-requested ``dtype``, if any.

        Raises
        ------
        ValueError
            If the cast raises ``TypeError`` (re-raised for API consistency).
        """
        if self.dtype is not None:
            # Ignore non-existent columns from dtype mapping
            # like other parsers do
            if isinstance(self.dtype, dict):
                self.dtype = {
                    k: pandas_dtype(v)
                    for k, v in self.dtype.items()
                    if k in frame.columns
                }
            else:
                self.dtype = pandas_dtype(self.dtype)
            try:
                frame = frame.astype(self.dtype)
            except TypeError as err:
                # GH#44901 reraise to keep api consistent
                raise ValueError(str(err)) from err
        return frame

    def _finalize_pandas_output(
        self, frame: DataFrame, multi_index_named: bool
    ) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame : DataFrame
            The DataFrame to process.
        multi_index_named : bool
            Forwarded to ``_finalize_index``; when False and there is no
            header, the index names are cleared.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        frame = self._do_date_conversions(frame.columns, frame)
        frame = self._finalize_index(frame, multi_index_named)
        frame = self._finalize_dtype(frame)
        return frame

    def _validate_usecols(self, usecols) -> None:
        """
        Reject ``usecols`` values the pyarrow engine cannot handle.

        Raises
        ------
        ValueError
            If ``usecols`` is list-like with non-string entries, or a callable.
        """
        if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
            raise ValueError(
                "The pyarrow engine does not allow 'usecols' to be integer "
                "column positions. Pass a list of string column names instead."
            )
        elif callable(usecols):
            raise ValueError(
                "The pyarrow engine does not allow 'usecols' to be a callable."
            )

    def read(self) -> DataFrame:
        """
        Reads the contents of a CSV file into a DataFrame and
        processes it according to the kwargs passed in the
        constructor.

        Returns
        -------
        DataFrame
            The DataFrame created from the CSV file.

        Raises
        ------
        ParserError
            If pyarrow raises ``ArrowInvalid`` while reading.
        """
        pa = import_optional_dependency("pyarrow")
        pyarrow_csv = import_optional_dependency("pyarrow.csv")
        self._get_pyarrow_options()
        convert_options = self._get_convert_options()

        try:
            table = pyarrow_csv.read_csv(
                self.src,
                read_options=pyarrow_csv.ReadOptions(**self.read_options),
                parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
                convert_options=convert_options,
            )
        except pa.ArrowInvalid as e:
            raise ParserError(e) from e

        dtype_backend = self.kwds["dtype_backend"]

        # Convert all pa.null() cols -> float64 (non nullable)
        # else Int64 (nullable case, see below)
        if dtype_backend is lib.no_default:
            new_schema = table.schema
            new_type = pa.float64()
            for i, arrow_type in enumerate(table.schema.types):
                if pa.types.is_null(arrow_type):
                    new_schema = new_schema.set(
                        i, new_schema.field(i).with_type(new_type)
                    )

            table = table.cast(new_schema)

        multi_index_named = self._adjust_column_names(table)

        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                "make_block is deprecated",
                Pandas4Warning,
            )
            frame = arrow_table_to_pandas(
                table,
                dtype_backend=dtype_backend,
                null_to_int64=True,
                dtype=self.dtype,
                names=self.names,
            )

        if self.header is None:
            frame.columns = self.names

        return self._finalize_pandas_output(frame, multi_index_named)
|
||||
+997
@@ -0,0 +1,997 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from copy import copy
|
||||
import csv
|
||||
from enum import Enum
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
final,
|
||||
overload,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
parsers,
|
||||
)
|
||||
import pandas._libs.ops as libops
|
||||
from pandas._libs.parsers import STR_NA_VALUES
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_dict_like,
|
||||
is_float_dtype,
|
||||
is_integer,
|
||||
is_integer_dtype,
|
||||
is_list_like,
|
||||
is_object_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
StringDtype,
|
||||
)
|
||||
from pandas.core import algorithms
|
||||
from pandas.core.arrays import (
|
||||
ArrowExtensionArray,
|
||||
BaseMaskedArray,
|
||||
BooleanArray,
|
||||
FloatingArray,
|
||||
IntegerArray,
|
||||
)
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
default_index,
|
||||
ensure_index_from_sequences,
|
||||
)
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.tools import datetimes as tools
|
||||
|
||||
from pandas.io.common import is_potential_multi_index
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
DtypeArg,
|
||||
Hashable,
|
||||
HashableT,
|
||||
Scalar,
|
||||
SequenceT,
|
||||
)
|
||||
|
||||
|
||||
class ParserBase:
|
||||
class BadLineHandleMethod(Enum):
    """
    Ways a parser can react to a malformed line.

    ``ParserBase.__init__`` falls back to ``ERROR`` when no ``on_bad_lines``
    value is supplied.
    """

    # Treat the bad line as an error.
    ERROR = 0
    # Emit a warning about the bad line.
    WARN = 1
    # Skip the bad line.
    SKIP = 2
|
||||
|
||||
_implicit_index: bool
|
||||
_first_chunk: bool
|
||||
keep_default_na: bool
|
||||
dayfirst: bool
|
||||
cache_dates: bool
|
||||
usecols_dtype: str | None
|
||||
|
||||
def __init__(self, kwds) -> None:
    """
    Set up the parser state from the ``kwds`` mapping.

    Some options are removed from ``kwds`` (``parse_dates``, ``date_parser``,
    ``date_format``, ``dayfirst``, ``cache_dates``); the rest are only read.
    ``kwds`` must contain a ``"usecols"`` entry.

    Raises
    ------
    TypeError
        If ``parse_dates`` is neither None, a boolean nor a list.
    ValueError
        If a list-like ``header`` is combined with ``usecols``, ``names`` or
        an ``index_col`` that is not made up of integers only.
    """
    self._implicit_index = False

    # Plain lookups; these keys stay in ``kwds``.
    self.names = kwds.get("names")
    self.index_col = kwds.get("index_col", None)

    # Attributes that start out empty.
    self.orig_names: Sequence[Hashable] | None = None
    self.unnamed_cols: set = set()
    self.index_names: Sequence[Hashable] | None = None
    self.col_names: Sequence[Hashable] | None = None

    # ``parse_dates`` is normalised to a bool unless it is a list.
    date_spec = kwds.pop("parse_dates", False)
    if not isinstance(date_spec, list):
        if not (date_spec is None or lib.is_bool(date_spec)):
            raise TypeError(
                "Only booleans and lists are accepted for the 'parse_dates' parameter"
            )
        date_spec = bool(date_spec)
    self.parse_dates: bool | list = date_spec
    self.date_parser = kwds.pop("date_parser", lib.no_default)
    self.date_format = kwds.pop("date_format", None)
    self.dayfirst = kwds.pop("dayfirst", False)

    self.na_values = kwds.get("na_values")
    self.na_fvalues = kwds.get("na_fvalues")
    self.na_filter = kwds.get("na_filter", False)
    self.keep_default_na = kwds.get("keep_default_na", True)

    # Shallow-copied via ``copy`` rather than stored by reference.
    self.dtype = copy(kwds.get("dtype", None))
    self.converters = kwds.get("converters")
    self.dtype_backend = kwds.get("dtype_backend")

    self.true_values = kwds.get("true_values")
    self.false_values = kwds.get("false_values")
    self.cache_dates = kwds.pop("cache_dates", True)

    # validate header options for mi
    self.header = kwds.get("header")
    if is_list_like(self.header, allow_sets=False):
        if kwds.get("usecols"):
            raise ValueError(
                "cannot specify usecols when specifying a multi-index header"
            )
        if kwds.get("names"):
            raise ValueError(
                "cannot specify names when specifying a multi-index header"
            )

        # validate index_col that only contains integers;
        # afterwards index_col is pinned down as a list of ints
        if self.index_col is not None:
            if is_integer(self.index_col):
                self.index_col = [self.index_col]
            elif is_list_like(self.index_col, allow_sets=False) and all(
                map(is_integer, self.index_col)
            ):
                self.index_col = list(self.index_col)
            else:
                raise ValueError(
                    "index_col must only contain integers of column positions "
                    "when specifying a multi-index header"
                )

    self._first_chunk = True

    self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])

    # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
    # Normally, this arg would get pre-processed earlier on
    self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
|
||||
|
||||
def close(self) -> None:
    """Do nothing; the base implementation holds no resources to release."""
    return None
|
||||
|
||||
@final
|
||||
def _should_parse_dates(self, i: int) -> bool:
|
||||
if isinstance(self.parse_dates, bool):
|
||||
return self.parse_dates
|
||||
else:
|
||||
if self.index_names is not None:
|
||||
name = self.index_names[i]
|
||||
else:
|
||||
name = None
|
||||
j = i if self.index_col is None else self.index_col[i]
|
||||
|
||||
return (j in self.parse_dates) or (
|
||||
name is not None and name in self.parse_dates
|
||||
)
|
||||
|
||||
@final
|
||||
def _extract_multi_indexer_columns(
|
||||
self,
|
||||
header,
|
||||
index_names: Sequence[Hashable] | None,
|
||||
passed_names: bool = False,
|
||||
) -> tuple[
|
||||
Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
|
||||
]:
|
||||
"""
|
||||
Extract and return the names, index_names, col_names if the column
|
||||
names are a MultiIndex.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
header: list of lists
|
||||
The header rows
|
||||
index_names: list, optional
|
||||
The names of the future index
|
||||
passed_names: bool, default False
|
||||
A flag specifying if names where passed
|
||||
|
||||
"""
|
||||
if len(header) < 2:
|
||||
return header[0], index_names, None, passed_names
|
||||
|
||||
# the names are the tuples of the header that are not the index cols
|
||||
# 0 is the name of the index, assuming index_col is a list of column
|
||||
# numbers
|
||||
ic = self.index_col
|
||||
if ic is None:
|
||||
ic = []
|
||||
|
||||
if not isinstance(ic, (list, tuple, np.ndarray)):
|
||||
ic = [ic]
|
||||
sic = set(ic)
|
||||
|
||||
# clean the index_names
|
||||
index_names = header.pop(-1)
|
||||
index_names, _, _ = self._clean_index_names(index_names, self.index_col)
|
||||
|
||||
# extract the columns
|
||||
field_count = len(header[0])
|
||||
|
||||
# check if header lengths are equal
|
||||
if not all(len(header_iter) == field_count for header_iter in header[1:]):
|
||||
raise ParserError("Header rows must have an equal number of columns.")
|
||||
|
||||
def extract(r):
|
||||
return tuple(r[i] for i in range(field_count) if i not in sic)
|
||||
|
||||
columns = list(zip(*(extract(r) for r in header), strict=True))
|
||||
names = columns.copy()
|
||||
for single_ic in sorted(ic):
|
||||
names.insert(single_ic, single_ic)
|
||||
|
||||
# Clean the column names (if we have an index_col).
|
||||
if ic:
|
||||
col_names = [
|
||||
r[ic[0]]
|
||||
if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
|
||||
else None
|
||||
for r in header
|
||||
]
|
||||
else:
|
||||
col_names = [None] * len(header)
|
||||
|
||||
passed_names = True
|
||||
|
||||
return names, index_names, col_names, passed_names
|
||||
|
||||
@final
|
||||
def _maybe_make_multi_index_columns(
|
||||
self,
|
||||
columns: SequenceT,
|
||||
col_names: Sequence[Hashable] | None = None,
|
||||
) -> SequenceT | MultiIndex:
|
||||
# possibly create a column mi here
|
||||
if is_potential_multi_index(columns):
|
||||
columns_mi = cast("Sequence[tuple[Hashable, ...]]", columns)
|
||||
return MultiIndex.from_tuples(columns_mi, names=col_names)
|
||||
return columns
|
||||
|
||||
@final
|
||||
def _make_index(
|
||||
self, alldata, columns, indexnamerow: list[Scalar] | None = None
|
||||
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
|
||||
index: Index | None
|
||||
if isinstance(self.index_col, list) and len(self.index_col):
|
||||
to_remove = []
|
||||
indexes = []
|
||||
for idx in self.index_col:
|
||||
if isinstance(idx, str):
|
||||
raise ValueError(f"Index {idx} invalid")
|
||||
to_remove.append(idx)
|
||||
indexes.append(alldata[idx])
|
||||
# remove index items from content and columns, don't pop in
|
||||
# loop
|
||||
for i in sorted(to_remove, reverse=True):
|
||||
alldata.pop(i)
|
||||
if not self._implicit_index:
|
||||
columns.pop(i)
|
||||
index = self._agg_index(indexes)
|
||||
|
||||
# add names for the index
|
||||
if indexnamerow:
|
||||
coffset = len(indexnamerow) - len(columns)
|
||||
index = index.set_names(indexnamerow[:coffset])
|
||||
else:
|
||||
index = None
|
||||
|
||||
# maybe create a mi on the columns
|
||||
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
|
||||
|
||||
return index, columns
|
||||
|
||||
@final
|
||||
def _clean_mapping(self, mapping):
|
||||
"""converts col numbers to names"""
|
||||
if not isinstance(mapping, dict):
|
||||
return mapping
|
||||
clean = {}
|
||||
# for mypy
|
||||
assert self.orig_names is not None
|
||||
|
||||
for col, v in mapping.items():
|
||||
if isinstance(col, int) and col not in self.orig_names:
|
||||
col = self.orig_names[col]
|
||||
clean[col] = v
|
||||
if isinstance(mapping, defaultdict):
|
||||
remaining_cols = set(self.orig_names) - set(clean.keys())
|
||||
clean.update({col: mapping[col] for col in remaining_cols})
|
||||
return clean
|
||||
|
||||
@final
def _agg_index(self, index) -> Index:
    """
    Convert the raw index arrays in ``index`` into an Index or MultiIndex.

    Each array is optionally date-parsed, run through ``_infer_types`` with
    the NA values that apply to it, and wrapped in an Index. A dtype requested
    for the level (via a dict ``dtype``) is applied when building that Index.
    A single level is returned as is; several levels become a MultiIndex.
    """
    converters = self._clean_mapping(self.converters)
    clean_dtypes = self._clean_mapping(self.dtype)

    # Pair every array with its name. Without names, pair with None; the
    # zip can then not be strict because the name source is infinite.
    has_names = self.index_names is not None
    names: Iterable
    if has_names:
        names = self.index_names
    else:
        names = itertools.cycle([None])

    levels = []
    for i, (arr, name) in enumerate(zip(index, names, strict=has_names)):
        if self._should_parse_dates(i):
            arr = date_converter(
                arr,
                col=name,
                dayfirst=self.dayfirst,
                cache_dates=self.cache_dates,
                date_format=self.date_format,
            )

        # Work out which values count as NA for this level; a dict of
        # na_values takes precedence over the na_filter switch.
        if isinstance(self.na_values, dict):
            assert self.index_names is not None
            if name is not None:
                col_na_values, col_na_fvalues = get_na_values(
                    name, self.na_values, self.na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()
        elif self.na_filter:
            col_na_values = self.na_values
            col_na_fvalues = self.na_fvalues
        else:
            col_na_values = set()
            col_na_fvalues = set()

        # Per-level dtype / converter lookups are only possible with names.
        cast_type = None
        has_converter = False
        if self.index_names is not None:
            if isinstance(clean_dtypes, dict):
                cast_type = clean_dtypes.get(name, None)

            if isinstance(converters, dict):
                has_converter = converters.get(name) is not None

        # No numeric/bool inference for string dtypes or converted levels.
        try_num_bool = (
            not (cast_type and is_string_dtype(cast_type)) and not has_converter
        )

        arr, _ = self._infer_types(
            arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
        )
        if cast_type is None:
            level = ensure_index_from_sequences([arr], [name])
        else:
            # Don't perform RangeIndex inference
            level = Index(arr, name=name, dtype=cast_type, copy=False)
        levels.append(level)

    if len(levels) == 1:
        return levels[0]
    return MultiIndex.from_arrays(levels)
|
||||
|
||||
@final
|
||||
def _set_noconvert_dtype_columns(
|
||||
self, col_indices: list[int], names: Sequence[Hashable]
|
||||
) -> set[int]:
|
||||
"""
|
||||
Set the columns that should not undergo dtype conversions.
|
||||
|
||||
Currently, any column that is involved with date parsing will not
|
||||
undergo such conversions. If usecols is specified, the positions of the columns
|
||||
not to cast is relative to the usecols not to all columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col_indices: The indices specifying order and positions of the columns
|
||||
names: The column names which order is corresponding with the order
|
||||
of col_indices
|
||||
|
||||
Returns
|
||||
-------
|
||||
A set of integers containing the positions of the columns not to convert.
|
||||
"""
|
||||
usecols: list[int] | list[str] | None
|
||||
noconvert_columns = set()
|
||||
if self.usecols_dtype == "integer":
|
||||
# A set of integers will be converted to a list in
|
||||
# the correct order every single time.
|
||||
usecols = sorted(self.usecols)
|
||||
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
|
||||
# The names attribute should have the correct columns
|
||||
# in the proper order for indexing with parse_dates.
|
||||
usecols = col_indices
|
||||
else:
|
||||
# Usecols is empty.
|
||||
usecols = None
|
||||
|
||||
def _set(x) -> int:
|
||||
if usecols is not None and is_integer(x):
|
||||
x = usecols[x]
|
||||
|
||||
if not is_integer(x):
|
||||
x = col_indices[names.index(x)]
|
||||
|
||||
return x
|
||||
|
||||
if isinstance(self.parse_dates, list):
|
||||
validate_parse_dates_presence(self.parse_dates, names)
|
||||
for val in self.parse_dates:
|
||||
noconvert_columns.add(_set(val))
|
||||
|
||||
elif self.parse_dates:
|
||||
if isinstance(self.index_col, list):
|
||||
for k in self.index_col:
|
||||
noconvert_columns.add(_set(k))
|
||||
elif self.index_col is not None:
|
||||
noconvert_columns.add(_set(self.index_col))
|
||||
|
||||
return noconvert_columns
|
||||
|
||||
@final
def _infer_types(
    self, values, na_values, no_dtype_specified, try_num_bool: bool = True
) -> tuple[ArrayLike, int]:
    """
    Infer types of values, possibly casting

    Parameters
    ----------
    values : ndarray
    na_values : set
    no_dtype_specified: Specifies if we want to cast explicitly
    try_num_bool : bool, default True
        try to cast values to numeric (first preference) or boolean

    Returns
    -------
    converted : ndarray or ExtensionArray
    na_count : int
    """
    na_count = 0
    if issubclass(values.dtype.type, (np.number, np.bool_)):
        # Fast path: the array is already numeric/bool, so only NA masking
        # is needed and the result is returned directly from this branch.
        # If our array has numeric dtype, we don't have to check for strings in isin
        na_values = np.array([val for val in na_values if not isinstance(val, str)])
        mask = algorithms.isin(values, na_values)
        na_count = mask.astype("uint8", copy=False).sum()
        if na_count > 0:
            if is_integer_dtype(values):
                # Cast integers to float64 before writing NaN into them.
                values = values.astype(np.float64)
            np.putmask(values, mask, np.nan)
        return values, na_count

    dtype_backend = self.dtype_backend
    # True only when no explicit dtype was requested AND a dtype_backend
    # other than ``lib.no_default`` is set; it switches the conversions
    # below to their masked/nullable outputs.
    non_default_dtype_backend = (
        no_dtype_specified and dtype_backend is not lib.no_default
    )
    result: ArrayLike

    if try_num_bool and is_object_dtype(values.dtype):
        # exclude e.g DatetimeIndex here
        try:
            result, result_mask = lib.maybe_convert_numeric(
                values,
                na_values,
                False,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
            )
        except (ValueError, TypeError):
            # e.g. encountering datetime string gets ValueError
            #  TypeError can be raised in floatify
            # Numeric conversion failed: keep the object values and only
            # count/sanitize the NA entries.
            na_count = parsers.sanitize_objects(values, na_values)
            result = values
        else:
            if non_default_dtype_backend:
                if result_mask is None:
                    # No mask returned: treat every element as valid.
                    result_mask = np.zeros(result.shape, dtype=np.bool_)

                if result_mask.all():
                    # Everything is masked: build a fully-masked IntegerArray
                    # over placeholder ones.
                    result = IntegerArray(
                        np.ones(result_mask.shape, dtype=np.int64), result_mask
                    )
                elif is_integer_dtype(result):
                    result = IntegerArray(result, result_mask)
                elif is_bool_dtype(result):
                    result = BooleanArray(result, result_mask)
                elif is_float_dtype(result):
                    result = FloatingArray(result, result_mask)

                na_count = result_mask.sum()
            else:
                na_count = isna(result).sum()
    else:
        # No numeric/bool attempt requested, or values are not object dtype.
        result = values
        if values.dtype == np.object_:
            na_count = parsers.sanitize_objects(values, na_values)

    # Second preference: boolean conversion. Skipped when the first element
    # is a Python int (empty arrays are still attempted).
    if (
        result.dtype == np.object_
        and try_num_bool
        and (len(result) == 0 or not isinstance(result[0], int))
    ):
        result, bool_mask = libops.maybe_convert_bool(
            np.asarray(values),
            true_values=self.true_values,
            false_values=self.false_values,
            convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
        )
        if result.dtype == np.bool_ and non_default_dtype_backend:
            if bool_mask is None:
                bool_mask = np.zeros(result.shape, dtype=np.bool_)
            result = BooleanArray(result, bool_mask)
        elif result.dtype == np.object_ and non_default_dtype_backend:
            # read_excel sends array of datetime objects
            if not lib.is_datetime_array(result, skipna=True):
                # Still object and not datetimes: store as StringDtype.
                dtype = StringDtype()
                cls = dtype.construct_array_type()
                result = cls._from_sequence(values, dtype=dtype)

    if dtype_backend == "pyarrow":
        # Re-wrap whatever was produced above as an ArrowExtensionArray.
        pa = import_optional_dependency("pyarrow")
        if isinstance(result, np.ndarray):
            result = ArrowExtensionArray(pa.array(result, from_pandas=True))
        elif isinstance(result, BaseMaskedArray):
            if result._mask.all():
                # We want an arrow null array here
                result = ArrowExtensionArray(pa.array([None] * len(result)))
            else:
                result = ArrowExtensionArray(
                    pa.array(result._data, mask=result._mask)
                )
        else:
            result = ArrowExtensionArray(
                pa.array(result.to_numpy(), from_pandas=True)
            )

    return result, na_count
|
||||
|
||||
@overload
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Index,
|
||||
data: DataFrame,
|
||||
) -> DataFrame: ...
|
||||
|
||||
@overload
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Sequence[Hashable],
|
||||
data: Mapping[Hashable, ArrayLike],
|
||||
) -> Mapping[Hashable, ArrayLike]: ...
|
||||
|
||||
@final
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Sequence[Hashable] | Index,
|
||||
data: Mapping[Hashable, ArrayLike] | DataFrame,
|
||||
) -> Mapping[Hashable, ArrayLike] | DataFrame:
|
||||
if not isinstance(self.parse_dates, list):
|
||||
return data
|
||||
for colspec in self.parse_dates:
|
||||
if isinstance(colspec, int) and colspec not in data:
|
||||
colspec = names[colspec]
|
||||
if (isinstance(self.index_col, list) and colspec in self.index_col) or (
|
||||
isinstance(self.index_names, list) and colspec in self.index_names
|
||||
):
|
||||
continue
|
||||
result = date_converter(
|
||||
data[colspec],
|
||||
col=colspec,
|
||||
dayfirst=self.dayfirst,
|
||||
cache_dates=self.cache_dates,
|
||||
date_format=self.date_format,
|
||||
)
|
||||
# error: Unsupported target for indexed assignment
|
||||
# ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame")
|
||||
data[colspec] = result # type: ignore[index]
|
||||
|
||||
return data
|
||||
|
||||
@final
|
||||
def _check_data_length(
|
||||
self,
|
||||
columns: Sequence[Hashable],
|
||||
data: Sequence[ArrayLike],
|
||||
) -> None:
|
||||
"""Checks if length of data is equal to length of column names.
|
||||
|
||||
One set of trailing commas is allowed. self.index_col not False
|
||||
results in a ParserError previously when lengths do not match.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns: list of column names
|
||||
data: list of array-likes containing the data column-wise.
|
||||
"""
|
||||
if not self.index_col and len(columns) != len(data) and columns:
|
||||
empty_str = is_object_dtype(data[-1]) and data[-1] == ""
|
||||
# error: No overload variant of "__ror__" of "ndarray" matches
|
||||
# argument type "ExtensionArray"
|
||||
empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator]
|
||||
if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
|
||||
return
|
||||
warnings.warn(
|
||||
"Length of header or names does not match length of data. This leads "
|
||||
"to a loss of data with index_col=False.",
|
||||
ParserWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
@final
|
||||
def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
|
||||
"""
|
||||
Validates that all usecols are present in a given
|
||||
list of names. If not, raise a ValueError that
|
||||
shows what usecols are missing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : iterable of usecols
|
||||
The columns to validate are present in names.
|
||||
names : iterable of names
|
||||
The column names to check against.
|
||||
|
||||
Returns
|
||||
-------
|
||||
usecols : iterable of usecols
|
||||
The `usecols` parameter if the validation succeeds.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : Columns were missing. Error message will list them.
|
||||
"""
|
||||
missing = [c for c in usecols if c not in names]
|
||||
if len(missing) > 0:
|
||||
raise ValueError(
|
||||
f"Usecols do not match columns, columns expected but not found: "
|
||||
f"{missing}"
|
||||
)
|
||||
|
||||
return usecols
|
||||
|
||||
@final
|
||||
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
|
||||
if not is_index_col(index_col):
|
||||
return None, columns, index_col
|
||||
|
||||
columns = list(columns)
|
||||
|
||||
# In case of no rows and multiindex columns we have to set index_names to
|
||||
# list of Nones GH#38292
|
||||
if not columns:
|
||||
return [None] * len(index_col), columns, index_col
|
||||
|
||||
cp_cols = list(columns)
|
||||
index_names: list[str | int | None] = []
|
||||
|
||||
# don't mutate
|
||||
index_col = list(index_col)
|
||||
|
||||
for i, c in enumerate(index_col):
|
||||
if isinstance(c, str):
|
||||
index_names.append(c)
|
||||
for j, name in enumerate(cp_cols):
|
||||
if name == c:
|
||||
index_col[i] = j
|
||||
columns.remove(name)
|
||||
break
|
||||
else:
|
||||
name = cp_cols[c]
|
||||
columns.remove(name)
|
||||
index_names.append(name)
|
||||
|
||||
# Only clean index names that were placeholders.
|
||||
for i, name in enumerate(index_names):
|
||||
if isinstance(name, str) and name in self.unnamed_cols:
|
||||
index_names[i] = None
|
||||
|
||||
return index_names, columns, index_col
|
||||
|
||||
@final
|
||||
def _get_empty_meta(
|
||||
self, columns: Sequence[HashableT], dtype: DtypeArg | None = None
|
||||
) -> tuple[Index, list[HashableT], dict[HashableT, Series]]:
|
||||
columns = list(columns)
|
||||
|
||||
index_col = self.index_col
|
||||
index_names = self.index_names
|
||||
|
||||
# Convert `dtype` to a defaultdict of some kind.
|
||||
# This will enable us to write `dtype[col_name]`
|
||||
# without worrying about KeyError issues later on.
|
||||
dtype_dict: defaultdict[Hashable, Any]
|
||||
if not is_dict_like(dtype):
|
||||
# if dtype == None, default will be object.
|
||||
dtype_dict = defaultdict(lambda: dtype)
|
||||
else:
|
||||
dtype = cast(dict, dtype)
|
||||
dtype_dict = defaultdict(
|
||||
lambda: None,
|
||||
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
|
||||
)
|
||||
|
||||
# Even though we have no data, the "index" of the empty DataFrame
|
||||
# could for example still be an empty MultiIndex. Thus, we need to
|
||||
# check whether we have any index columns specified, via either:
|
||||
#
|
||||
# 1) index_col (column indices)
|
||||
# 2) index_names (column names)
|
||||
#
|
||||
# Both must be non-null to ensure a successful construction. Otherwise,
|
||||
# we have to create a generic empty Index.
|
||||
index: Index
|
||||
if (index_col is None or index_col is False) or index_names is None:
|
||||
index = default_index(0)
|
||||
else:
|
||||
# TODO: We could return default_index(0) if dtype_dict[name] is None
|
||||
data = [
|
||||
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
|
||||
]
|
||||
if len(data) == 1:
|
||||
index = data[0]
|
||||
else:
|
||||
index = MultiIndex.from_arrays(data)
|
||||
index_col.sort()
|
||||
|
||||
for i, n in enumerate(index_col):
|
||||
columns.pop(n - i)
|
||||
|
||||
col_dict = {
|
||||
col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
|
||||
}
|
||||
|
||||
return index, columns, col_dict
|
||||
|
||||
|
||||
def date_converter(
    date_col,
    col: Hashable,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    """
    Try to convert one column of values to datetimes.

    Parameters
    ----------
    date_col : array-like
        The column values; returned untouched when its dtype kind is
        already ``"M"`` or ``"m"``.
    col : Hashable
        Label of the column, used to pick a format out of a dict
        ``date_format``.
    dayfirst : bool, default False
        Forwarded to ``to_datetime``.
    cache_dates : bool, default True
        Forwarded to ``to_datetime`` as ``cache``.
    date_format : dict, str or None
        One format for every column, or a mapping column label -> format.

    Returns
    -------
    array-like
        The converted values, or the values as a string array when
        ``to_datetime`` raises ``ValueError`` / ``TypeError``.
    """
    if date_col.dtype.kind in "Mm":
        return date_col

    if isinstance(date_format, dict):
        fmt = date_format.get(col)
    else:
        fmt = date_format

    as_strings = lib.ensure_string_array(np.asarray(date_col))
    try:
        converted = tools.to_datetime(
            as_strings,
            format=fmt,
            utc=False,
            dayfirst=dayfirst,
            cache=cache_dates,
        )
    except (ValueError, TypeError):
        # test_usecols_with_parse_dates4
        # test_multi_index_parse_dates
        return as_strings

    if not isinstance(converted, DatetimeIndex):
        return converted._values

    out = converted.to_numpy()
    out.flags.writeable = True
    return out
|
||||
|
||||
|
||||
# Mapping of parser keyword option name -> default value.
# The commented-out entries ('engine', 'iterator') are kept from the original
# as placeholders and are deliberately not part of the mapping.
parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "dayfirst": False,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}
|
||||
|
||||
|
||||
def get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
    """
    Look up the NaN markers that apply to one column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        Only consulted when `na_values` is a dict that has no entry for
        `col`: if True the default NaN values are returned, otherwise
        empty sets.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if not isinstance(na_values, dict):
        # One global specification shared by every column.
        return na_values, na_fvalues

    if col in na_values:
        return na_values[col], na_fvalues[col]

    if keep_default_na:
        return STR_NA_VALUES, set()
    return set(), set()
|
||||
|
||||
|
||||
def is_index_col(col) -> bool:
    """Return True unless ``col`` is ``None`` or ``False``."""
    if col is None or col is False:
        return False
    return True
|
||||
|
||||
|
||||
def validate_parse_dates_presence(
    parse_dates: bool | list, columns: Sequence[Hashable]
) -> set:
    """
    Check that the columns requested in ``parse_dates`` exist.

    Only a list ``parse_dates`` is inspected. String entries must be
    present in ``columns``; any other entry is either a member of
    ``columns`` or is used as a position into ``columns``.

    Parameters
    ----------
    parse_dates : bool or list
        The ``parse_dates`` specification.
    columns : list
        List of names of the dataframe.

    Returns
    -------
    set
        The names of the columns which will get parsed later if a list
        is given as specification; an empty set otherwise.

    Raises
    ------
    ValueError
        If a column named in ``parse_dates`` is not in the dataframe.
    """
    if not isinstance(parse_dates, list):
        return set()

    found = set()
    absent = set()
    for spec in parse_dates:
        if spec in columns:
            found.add(spec)
        elif isinstance(spec, str):
            absent.add(spec)
        else:
            # not a label: treat it as a position
            found.add(columns[spec])

    if absent:
        missing_cols = ", ".join(sorted(absent))
        raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'")
    return found
|
||||
|
||||
|
||||
def _validate_usecols_arg(usecols):
|
||||
"""
|
||||
Validate the 'usecols' parameter.
|
||||
|
||||
Checks whether or not the 'usecols' parameter contains all integers
|
||||
(column selection by index), strings (column by name) or is a callable.
|
||||
Raises a ValueError if that is not the case.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : list-like, callable, or None
|
||||
List of columns to use when parsing or a callable that can be used
|
||||
to filter a list of table columns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
usecols_tuple : tuple
|
||||
A tuple of (verified_usecols, usecols_dtype).
|
||||
|
||||
'verified_usecols' is either a set if an array-like is passed in or
|
||||
'usecols' if a callable or None is passed in.
|
||||
|
||||
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
|
||||
is passed in or None if a callable or None is passed in.
|
||||
"""
|
||||
msg = (
|
||||
"'usecols' must either be list-like of all strings, all unicode, "
|
||||
"all integers or a callable."
|
||||
)
|
||||
if usecols is not None:
|
||||
if callable(usecols):
|
||||
return usecols, None
|
||||
|
||||
if not is_list_like(usecols):
|
||||
# see gh-20529
|
||||
#
|
||||
# Ensure it is iterable container but not string.
|
||||
raise ValueError(msg)
|
||||
|
||||
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
|
||||
|
||||
if usecols_dtype not in ("empty", "integer", "string"):
|
||||
raise ValueError(msg)
|
||||
|
||||
usecols = set(usecols)
|
||||
|
||||
return usecols, usecols_dtype
|
||||
return usecols, None
|
||||
|
||||
|
||||
@overload
def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object],
    names: Iterable[Hashable],
) -> set[int]: ...


@overload
def evaluate_callable_usecols(
    usecols: SequenceT, names: Iterable[Hashable]
) -> SequenceT: ...


def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object] | SequenceT,
    names: Iterable[Hashable],
) -> SequenceT | set[int]:
    """
    Resolve a callable 'usecols' into a set of column positions.

    When 'usecols' is callable it is applied to every entry of 'names'
    and the positions for which it returns a truthy value are collected
    into a set. A non-callable 'usecols' is returned as is.
    """
    if not callable(usecols):
        return usecols

    selected = set()
    for position, name in enumerate(names):
        if usecols(name):
            selected.add(position)
    return selected
|
||||
+395
@@ -0,0 +1,395 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
parsers,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import DtypeWarning
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
from pandas.core.dtypes.concat import (
|
||||
concat_compat,
|
||||
union_categoricals,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
from pandas.core.indexes.api import ensure_index_from_sequences
|
||||
|
||||
from pandas.io.common import (
|
||||
dedup_names,
|
||||
is_potential_multi_index,
|
||||
)
|
||||
from pandas.io.parsers.base_parser import (
|
||||
ParserBase,
|
||||
ParserError,
|
||||
date_converter,
|
||||
evaluate_callable_usecols,
|
||||
is_index_col,
|
||||
validate_parse_dates_presence,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
AnyArrayLike,
|
||||
ArrayLike,
|
||||
DtypeArg,
|
||||
DtypeObj,
|
||||
ReadCsvBuffer,
|
||||
SequenceT,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
|
||||
|
||||
class CParserWrapper(ParserBase):
    """
    ParserBase subclass that reads its data through ``parsers.TextReader``.

    The constructor configures the reader and resolves column names,
    usecols and index names; ``read`` pulls data out of the reader and
    returns the ``(index, column_names, data)`` triple.
    """

    # Whether ``read`` goes through ``read_low_memory`` + chunk concatenation.
    low_memory: bool
    # The underlying reader created in ``__init__``.
    _reader: parsers.TextReader

    def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
        """
        Create the underlying ``TextReader`` and derive naming metadata.

        Parameters
        ----------
        src : ReadCsvBuffer[str]
            Source handed straight to ``parsers.TextReader``.
        **kwds
            Parser options. The original dict is stored on ``self.kwds``;
            a modified copy is what gets forwarded to ``TextReader``.
        """
        super().__init__(kwds)
        self.kwds = kwds
        # Work on a copy so ``self.kwds`` keeps the caller's options.
        kwds = kwds.copy()

        self.low_memory = kwds.pop("low_memory", False)

        # #2442
        kwds["allow_leading_cols"] = self.index_col is not False

        # GH20529, validate usecol arg before TextReader
        kwds["usecols"] = self.usecols

        # Have to pass int, would break tests using TextReader directly otherwise :(
        kwds["on_bad_lines"] = self.on_bad_lines.value

        # These options are dropped before the reader is constructed.
        for key in (
            "storage_options",
            "encoding",
            "memory_map",
            "compression",
        ):
            kwds.pop(key, None)

        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
        if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
            kwds["dtype_backend"] = "numpy"
        if kwds["dtype_backend"] == "pyarrow":
            # Fail here loudly instead of in cython after reading
            import_optional_dependency("pyarrow")
        self._reader = parsers.TextReader(src, **kwds)

        self.unnamed_cols = self._reader.unnamed_cols

        # NOTE(review): despite its name this starts out True when NO names
        # were supplied (``self.names is None``); it may be overwritten by
        # ``_extract_multi_indexer_columns`` below.
        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            (
                self.names,
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,
                passed_names,
            )

        if self.names is None:
            # No names available: fall back to positional integers.
            self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = evaluate_callable_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            if len(self.names) > len(usecols):
                # Keep only names selected by position or by label.
                self.names = [
                    n
                    for i, n in enumerate(self.names)
                    if (i in usecols or n in usecols)
                ]

            if len(self.names) < len(usecols):
                self._validate_usecols_names(
                    usecols,
                    self.names,
                )

        validate_parse_dates_presence(self.parse_dates, self.names)
        self._set_noconvert_columns()

        self.orig_names = self.names

        if self._reader.leading_cols == 0 and is_index_col(self.index_col):
            (
                index_names,
                self.names,
                self.index_col,
            ) = self._clean_index_names(
                self.names,
                self.index_col,
            )

            if self.index_names is None:
                self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0

    def close(self) -> None:
        """Close the underlying reader, ignoring a ``ValueError`` from it."""
        # close handles opened by C parser
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self) -> None:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        assert self.orig_names is not None
        # error: Cannot determine type of 'names'

        # much faster than using orig_names.index(x) xref GH#44106
        names_dict = {x: i for i, x in enumerate(self.orig_names)}
        col_indices = [names_dict[x] for x in self.names]
        noconvert_columns = self._set_noconvert_dtype_columns(
            col_indices,
            self.names,
        )
        for col in noconvert_columns:
            self._reader.set_noconvert(col)

    def read(
        self,
        nrows: int | None = None,
    ) -> tuple[
        Index | MultiIndex | None,
        Sequence[Hashable] | MultiIndex,
        Mapping[Hashable, AnyArrayLike],
    ]:
        """
        Read from the underlying reader.

        Parameters
        ----------
        nrows : int, optional
            Forwarded to the reader's ``read`` / ``read_low_memory``.

        Returns
        -------
        tuple
            ``(index, column_names, data)``. When the reader raises
            ``StopIteration`` on the first read, empty metadata built by
            ``_get_empty_meta`` is returned instead.

        Raises
        ------
        StopIteration
            Re-raised (after closing the reader) when the reader is
            exhausted on any read after the first.
        ParserError
            When the number of leading columns does not match the number
            of requested index columns.
        """
        index: Index | MultiIndex | None
        column_names: Sequence[Hashable] | MultiIndex
        try:
            if self.low_memory:
                chunks = self._reader.read_low_memory(nrows)
                # destructive to chunks
                data = _concatenate_chunks(chunks, self.names)
            else:
                data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                # assert for mypy, orig_names is List or None, None would error in
                # list(...) in dedup_names
                assert self.orig_names is not None
                names = dedup_names(
                    self.orig_names,
                    is_potential_multi_index(self.orig_names, self.index_col),
                )
                index, columns, col_dict = self._get_empty_meta(
                    names,
                    dtype=self.dtype,
                )
                # error: Incompatible types in assignment (expression has type
                # "list[Hashable] | MultiIndex", variable has type "list[Hashable]")
                columns = self._maybe_make_multi_index_columns(  # type: ignore[assignment]
                    columns, self.col_names
                )

                columns = _filter_usecols(self.usecols, columns)
                columns_set = set(columns)

                # Drop empty columns that usecols filtered out.
                col_dict = {k: v for k, v in col_dict.items() if k in columns_set}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        names = self.names

        if self._reader.leading_cols:
            # implicit index, no index names
            arrays = []

            if self.index_col and self._reader.leading_cols != len(self.index_col):
                raise ParserError(
                    "Could not construct index. Requested to use "
                    f"{len(self.index_col)} number of columns, but "
                    f"{self._reader.leading_cols} left to parse."
                )

            # Pull the leading columns out of ``data``; they form the index.
            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                if self._should_parse_dates(i):
                    values = date_converter(
                        values,
                        col=(
                            self.index_names[i]
                            if self.index_names is not None
                            else None
                        ),
                        dayfirst=self.dayfirst,
                        cache_dates=self.cache_dates,
                        date_format=self.date_format,
                    )
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            names = _filter_usecols(self.usecols, names)

            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            # rename dict keys
            data_tups = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data_tups, strict=True)}

            date_data = self._do_date_conversions(names, data)

            # maybe create a mi on the columns
            column_names = self._maybe_make_multi_index_columns(names, self.col_names)

        else:
            # rename dict keys
            data_tups = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            names = _filter_usecols(self.usecols, names)

            # columns as list
            alldata = [x[1] for x in data_tups]
            if self.usecols is None:
                self._check_data_length(names, alldata)

            # strict=False: ``names`` and ``data_tups`` may differ in length
            # here (see the ``_check_data_length`` call above).
            data = {k: v for k, (i, v) in zip(names, data_tups, strict=False)}

            date_data = self._do_date_conversions(names, data)
            index, column_names = self._make_index(alldata, names)

        return index, column_names, date_data
|
||||
|
||||
|
||||
def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]:
    """
    Restrict ``names`` to the entries selected by ``usecols``.

    A callable ``usecols`` is first resolved against ``names``. ``names``
    is returned unchanged when ``usecols`` is None or already has the same
    length as ``names``; otherwise a new list is returned with the names
    that are selected either by position or by label.
    """
    # hackish
    selected = evaluate_callable_usecols(usecols, names)
    if selected is None or len(names) == len(selected):
        return names

    kept = []
    for position, label in enumerate(names):
        if position in selected or label in selected:
            kept.append(label)
    return kept
|
||||
|
||||
|
||||
def _concatenate_chunks(
|
||||
chunks: list[dict[int, ArrayLike]], column_names: list[str]
|
||||
) -> dict:
|
||||
"""
|
||||
Concatenate chunks of data read with low_memory=True.
|
||||
|
||||
The tricky part is handling Categoricals, where different chunks
|
||||
may have different inferred categories.
|
||||
"""
|
||||
names = list(chunks[0].keys())
|
||||
warning_columns = []
|
||||
|
||||
result: dict = {}
|
||||
for name in names:
|
||||
arrs = [chunk.pop(name) for chunk in chunks]
|
||||
# Check each arr for consistent types.
|
||||
dtypes = {a.dtype for a in arrs}
|
||||
non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)}
|
||||
|
||||
dtype = dtypes.pop()
|
||||
if isinstance(dtype, CategoricalDtype):
|
||||
result[name] = union_categoricals(arrs, sort_categories=False)
|
||||
else:
|
||||
result[name] = concat_compat(arrs)
|
||||
if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
|
||||
warning_columns.append(column_names[name])
|
||||
|
||||
if warning_columns:
|
||||
warning_names = ", ".join(
|
||||
[f"{index}: {name}" for index, name in enumerate(warning_columns)]
|
||||
)
|
||||
warning_message = " ".join(
|
||||
[
|
||||
f"Columns ({warning_names}) have mixed types. "
|
||||
f"Specify dtype option on import or set low_memory=False."
|
||||
]
|
||||
)
|
||||
warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
|
||||
return result
|
||||
|
||||
|
||||
def ensure_dtype_objs(
    dtype: DtypeArg | dict[Hashable, DtypeArg] | None,
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
    """
    Ensure we have either None, a dtype object, or a dictionary mapping to
    dtype objects.

    Parameters
    ----------
    dtype : dtype-like, dict of dtype-likes, or None
        A ``defaultdict`` with a default factory keeps its default: the
        factory is called once and the converted result becomes the default
        of the returned ``defaultdict``. A ``defaultdict`` without a factory
        (``default_factory is None``) has no default to convert and is
        treated like a plain dict.

    Returns
    -------
    dtype object, dict of dtype objects, or None
        Every value passed through ``pandas_dtype``; ``None`` stays ``None``.
    """
    if isinstance(dtype, defaultdict) and dtype.default_factory is not None:
        default_dtype = pandas_dtype(dtype.default_factory())
        dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
        for key, value in dtype.items():
            dtype_converted[key] = pandas_dtype(value)
        return dtype_converted
    elif isinstance(dtype, dict):
        # Also reached for defaultdict(None, ...), where calling the missing
        # factory would raise TypeError.
        return {key: pandas_dtype(value) for key, value in dtype.items()}
    elif dtype is not None:
        return pandas_dtype(dtype)
    return dtype
|
||||
+1557
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,239 @@
|
||||
"""pickle compat"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pickle
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.compat import pickle_compat
|
||||
from pandas.util._decorators import set_module
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadPickleBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
|
||||
@set_module("pandas")
|
||||
def to_pickle(
|
||||
obj: Any,
|
||||
filepath_or_buffer: FilePath | WriteBuffer[bytes],
|
||||
compression: CompressionOptions = "infer",
|
||||
protocol: int = pickle.HIGHEST_PROTOCOL,
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Pickle (serialize) object to file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : any object
|
||||
Any python object.
|
||||
filepath_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``write()`` function.
|
||||
Also accepts URL. URL has to be of S3 or GCS.
|
||||
compression : str or dict, default 'infer'
|
||||
For on-the-fly compression of the output data. If 'infer' and
|
||||
'filepath_or_buffer' is path-like, then detect compression from the
|
||||
following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
|
||||
'.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
|
||||
Set to ``None`` for no compression.
|
||||
Can also be a dict with key ``'method'`` set
|
||||
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``,
|
||||
``'tar'``} and other key-value pairs are forwarded to
|
||||
``zipfile.ZipFile``, ``gzip.GzipFile``,
|
||||
``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
|
||||
``tarfile.TarFile``, respectively.
|
||||
As an example, the following could be passed for faster compression
|
||||
and to create a reproducible gzip archive:
|
||||
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
|
||||
protocol : int
|
||||
Int which indicates which protocol should be used by the pickler,
|
||||
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
|
||||
values for this parameter depend on the version of Python. For Python
|
||||
2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
|
||||
For Python >= 3.4, 4 is a valid value. A negative value for the
|
||||
protocol parameter is equivalent to setting its value to
|
||||
HIGHEST_PROTOCOL.
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g.
|
||||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
|
||||
are forwarded to ``urllib.request.Request`` as header options. For other
|
||||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
|
||||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
|
||||
details, and for more examples on storage options refer `here
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html?
|
||||
highlight=storage_options#reading-writing-remote-files>`_.
|
||||
|
||||
.. [1] https://docs.python.org/3/library/pickle.html
|
||||
|
||||
See Also
|
||||
--------
|
||||
read_pickle : Load pickled pandas object (or any object) from file.
|
||||
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
|
||||
DataFrame.to_sql : Write DataFrame to a SQL database.
|
||||
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame(
|
||||
... {{"foo": range(5), "bar": range(5, 10)}}
|
||||
... ) # doctest: +SKIP
|
||||
>>> original_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
|
||||
>>> unpickled_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
"""
|
||||
if protocol < 0:
|
||||
protocol = pickle.HIGHEST_PROTOCOL
|
||||
|
||||
with get_handle(
|
||||
filepath_or_buffer,
|
||||
"wb",
|
||||
compression=compression,
|
||||
is_text=False,
|
||||
storage_options=storage_options,
|
||||
) as handles:
|
||||
# letting pickle write directly to the buffer is more memory-efficient
|
||||
pickle.dump(obj, handles.handle, protocol=protocol)
|
||||
|
||||
|
||||
@set_module("pandas")
|
||||
def read_pickle(
|
||||
filepath_or_buffer: FilePath | ReadPickleBuffer,
|
||||
compression: CompressionOptions = "infer",
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> DataFrame | Series:
|
||||
"""
|
||||
Load pickled pandas object (or any object) from file and return unpickled object.
|
||||
|
||||
.. warning::
|
||||
|
||||
Loading pickled data received from untrusted sources can be
|
||||
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``readlines()`` function.
|
||||
Also accepts URL. URL is not limited to S3 and GCS.
|
||||
compression : str or dict, default 'infer'
|
||||
For on-the-fly decompression of on-disk data. If 'infer' and
|
||||
'filepath_or_buffer' is path-like, then detect compression from the
|
||||
following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
|
||||
'.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
|
||||
If using 'zip' or 'tar', the ZIP file must contain only one data file
|
||||
to be read in.
|
||||
Set to ``None`` for no decompression.
|
||||
Can also be a dict with key ``'method'`` set
|
||||
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``,
|
||||
``'tar'``} and other key-value pairs are forwarded to
|
||||
``zipfile.ZipFile``, ``gzip.GzipFile``,
|
||||
``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
|
||||
``tarfile.TarFile``, respectively.
|
||||
As an example, the following could be passed for Zstandard decompression
|
||||
using a custom compression dictionary:
|
||||
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g.
|
||||
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
|
||||
are forwarded to ``urllib.request.Request`` as header options. For other
|
||||
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
|
||||
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
|
||||
details, and for more examples on storage options refer `here
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html?
|
||||
highlight=storage_options#reading-writing-remote-files>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
object
|
||||
The unpickled pandas object (or any object) that was stored in file.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
|
||||
Series.to_pickle : Pickle (serialize) Series object to file.
|
||||
read_hdf : Read HDF5 file into a DataFrame.
|
||||
read_sql : Read SQL query or database table into a DataFrame.
|
||||
read_parquet : Load a parquet object, returning a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
read_pickle is only guaranteed to be backwards compatible to pandas 1.0
|
||||
provided the object was serialized with to_pickle.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame(
|
||||
... {{"foo": range(5), "bar": range(5, 10)}}
|
||||
... ) # doctest: +SKIP
|
||||
>>> original_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
|
||||
>>> unpickled_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
"""
|
||||
# TypeError for Cython complaints about object.__new__ vs Tick.__new__
|
||||
excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
|
||||
with get_handle(
|
||||
filepath_or_buffer,
|
||||
"rb",
|
||||
compression=compression,
|
||||
is_text=False,
|
||||
storage_options=storage_options,
|
||||
) as handles:
|
||||
# 1) try standard library Pickle
|
||||
# 2) try pickle_compat (older pandas version) to handle subclass changes
|
||||
try:
|
||||
with warnings.catch_warnings(record=True):
|
||||
# We want to silence any warnings about, e.g. moved modules.
|
||||
warnings.simplefilter("ignore", Warning)
|
||||
return pickle.load(handles.handle)
|
||||
except excs_to_catch:
|
||||
# e.g.
|
||||
# "No module named 'pandas.core.sparse.series'"
|
||||
# "Can't get attribute '_nat_unpickle' on <module 'pandas._libs.tslib"
|
||||
handles.handle.seek(0)
|
||||
return pickle_compat.Unpickler(handles.handle).load()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,3 @@
|
||||
from pandas.io.sas.sasreader import read_sas
|
||||
|
||||
__all__ = ["read_sas"]
|
||||
@@ -0,0 +1,738 @@
|
||||
"""
|
||||
Read SAS7BDAT files
|
||||
|
||||
Based on code written by Jared Hobbs:
|
||||
https://bitbucket.org/jaredhobbs/sas7bdat
|
||||
|
||||
See also:
|
||||
https://github.com/BioStatMatt/sas7bdat
|
||||
|
||||
Partial documentation of the file format:
|
||||
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
|
||||
|
||||
Reference for binary data compression:
|
||||
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas._libs.byteswap import (
|
||||
read_double_with_byteswap,
|
||||
read_float_with_byteswap,
|
||||
read_uint16_with_byteswap,
|
||||
read_uint32_with_byteswap,
|
||||
read_uint64_with_byteswap,
|
||||
)
|
||||
from pandas._libs.sas import (
|
||||
Parser,
|
||||
get_subheader_index,
|
||||
)
|
||||
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
)
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
import pandas.io.sas.sas_constants as const
|
||||
from pandas.io.sas.sasreader import SASReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
|
||||
|
||||
_unix_origin = Timestamp("1970-01-01")
|
||||
_sas_origin = Timestamp("1960-01-01")
|
||||
|
||||
|
||||
def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
    """
    Translate SAS date or datetime numbers into a datetime64 Series.

    The values are counted from ``_sas_origin``; they are re-based onto
    ``_unix_origin`` by adding the difference between the two origins.

    Parameters
    ----------
    sas_datetimes : Series
        Dates or datetimes in SAS. Its index is reused for the result.
    unit : {'d', 's'}
        "s" if the values are datetimes counted in seconds; any other
        value takes the date branch, where the values are counted in days.

    Returns
    -------
    Series
        Millisecond-resolution values built from the input when ``unit``
        is "s", otherwise a Series constructed with dtype ``M8[s]``.
    """
    # Difference between the SAS origin and the Unix origin, in seconds.
    origin_shift = (_sas_origin - _unix_origin).as_unit("s")

    if unit != "s":
        as_days = np.array(sas_datetimes, dtype="M8[D]")
        return pd.Series(
            as_days + origin_shift,
            dtype="M8[s]",
            index=sas_datetimes.index,
            copy=False,
        )

    # Seconds are widened to integer milliseconds before being reinterpreted
    # as datetime64[ms].
    as_millis = cast_from_unit_vectorized(
        sas_datetimes._values, unit="s", out_unit="ms"
    )
    shifted = as_millis.view("M8[ms]") + origin_shift
    return pd.Series(shifted, index=sas_datetimes.index, copy=False)
|
||||
|
||||
|
||||
class _Column:
|
||||
col_id: int
|
||||
name: str | bytes
|
||||
label: str | bytes
|
||||
format: str | bytes
|
||||
ctype: bytes
|
||||
length: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
col_id: int,
|
||||
# These can be bytes when convert_header_text is False
|
||||
name: str | bytes,
|
||||
label: str | bytes,
|
||||
format: str | bytes,
|
||||
ctype: bytes,
|
||||
length: int,
|
||||
) -> None:
|
||||
self.col_id = col_id
|
||||
self.name = name
|
||||
self.label = label
|
||||
self.format = format
|
||||
self.ctype = ctype
|
||||
self.length = length
|
||||
|
||||
|
||||
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
|
||||
class SAS7BDATReader(SASReader):
|
||||
"""
|
||||
Read SAS files in SAS7BDAT format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : path name or buffer
|
||||
Name of SAS file or file-like object pointing to SAS file
|
||||
contents.
|
||||
index : column identifier, defaults to None
|
||||
Column to use as index.
|
||||
convert_dates : bool, defaults to True
|
||||
Attempt to convert dates to Pandas datetime values. Note that
|
||||
some rarely used SAS date formats may be unsupported.
|
||||
blank_missing : bool, defaults to True
|
||||
Convert empty strings to missing values (SAS uses blanks to
|
||||
indicate missing character variables).
|
||||
chunksize : int, defaults to None
|
||||
Return SAS7BDATReader object for iterations, returns chunks
|
||||
with given number of lines.
|
||||
encoding : str, 'infer', defaults to None
|
||||
String encoding acc. to Python standard encodings,
|
||||
encoding='infer' tries to detect the encoding from the file header,
|
||||
encoding=None will leave the data in binary format.
|
||||
convert_text : bool, defaults to True
|
||||
If False, text variables are left as raw bytes.
|
||||
convert_header_text : bool, defaults to True
|
||||
If False, header text, including column names, are left as raw
|
||||
bytes.
|
||||
"""
|
||||
|
||||
_int_length: int
|
||||
_cached_page: bytes | None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_buf: FilePath | ReadBuffer[bytes],
|
||||
index=None,
|
||||
convert_dates: bool = True,
|
||||
blank_missing: bool = True,
|
||||
chunksize: int | None = None,
|
||||
encoding: str | None = None,
|
||||
convert_text: bool = True,
|
||||
convert_header_text: bool = True,
|
||||
compression: CompressionOptions = "infer",
|
||||
) -> None:
|
||||
self.index = index
|
||||
self.convert_dates = convert_dates
|
||||
self.blank_missing = blank_missing
|
||||
self.chunksize = chunksize
|
||||
self.encoding = encoding
|
||||
self.convert_text = convert_text
|
||||
self.convert_header_text = convert_header_text
|
||||
|
||||
self.default_encoding = "latin-1"
|
||||
self.compression = b""
|
||||
self.column_names_raw: list[bytes] = []
|
||||
self.column_names: list[str | bytes] = []
|
||||
self.column_formats: list[str | bytes] = []
|
||||
self.columns: list[_Column] = []
|
||||
|
||||
self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
|
||||
self._cached_page = None
|
||||
self._column_data_lengths: list[int] = []
|
||||
self._column_data_offsets: list[int] = []
|
||||
self._column_types: list[bytes] = []
|
||||
|
||||
self._current_row_in_file_index = 0
|
||||
self._current_row_on_page_index = 0
|
||||
self._current_row_in_file_index = 0
|
||||
|
||||
self.handles = get_handle(
|
||||
path_or_buf, "rb", is_text=False, compression=compression
|
||||
)
|
||||
|
||||
self._path_or_buf = self.handles.handle
|
||||
|
||||
# Same order as const.SASIndex
|
||||
self._subheader_processors = [
|
||||
self._process_rowsize_subheader,
|
||||
self._process_columnsize_subheader,
|
||||
self._process_subheader_counts,
|
||||
self._process_columntext_subheader,
|
||||
self._process_columnname_subheader,
|
||||
self._process_columnattributes_subheader,
|
||||
self._process_format_subheader,
|
||||
self._process_columnlist_subheader,
|
||||
None, # Data
|
||||
]
|
||||
|
||||
try:
|
||||
self._get_properties()
|
||||
self._parse_metadata()
|
||||
except Exception:
|
||||
self.close()
|
||||
raise
|
||||
|
||||
def column_data_lengths(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column data lengths"""
|
||||
return np.asarray(self._column_data_lengths, dtype=np.int64)
|
||||
|
||||
def column_data_offsets(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column offsets"""
|
||||
return np.asarray(self._column_data_offsets, dtype=np.int64)
|
||||
|
||||
def column_types(self) -> np.ndarray:
|
||||
"""
|
||||
Returns a numpy character array of the column types:
|
||||
s (string) or d (double)
|
||||
"""
|
||||
return np.asarray(self._column_types, dtype=np.dtype("S1"))
|
||||
|
||||
def close(self) -> None:
|
||||
self.handles.close()
|
||||
|
||||
def _get_properties(self) -> None:
|
||||
# Check magic number
|
||||
self._path_or_buf.seek(0)
|
||||
self._cached_page = self._path_or_buf.read(288)
|
||||
if self._cached_page[0 : len(const.magic)] != const.magic:
|
||||
raise ValueError("magic number mismatch (not a SAS file?)")
|
||||
|
||||
# Get alignment information
|
||||
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
|
||||
if buf == const.u64_byte_checker_value:
|
||||
self.U64 = True
|
||||
self._int_length = 8
|
||||
self._page_bit_offset = const.page_bit_offset_x64
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x64
|
||||
else:
|
||||
self.U64 = False
|
||||
self._page_bit_offset = const.page_bit_offset_x86
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x86
|
||||
self._int_length = 4
|
||||
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
|
||||
if buf == const.align_1_checker_value:
|
||||
align1 = const.align_2_value
|
||||
else:
|
||||
align1 = 0
|
||||
|
||||
# Get endianness information
|
||||
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
|
||||
if buf == b"\x01":
|
||||
self.byte_order = "<"
|
||||
self.need_byteswap = sys.byteorder == "big"
|
||||
else:
|
||||
self.byte_order = ">"
|
||||
self.need_byteswap = sys.byteorder == "little"
|
||||
|
||||
# Get encoding information
|
||||
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
|
||||
if buf in const.encoding_names:
|
||||
self.inferred_encoding = const.encoding_names[buf]
|
||||
if self.encoding == "infer":
|
||||
self.encoding = self.inferred_encoding
|
||||
else:
|
||||
self.inferred_encoding = f"unknown (code={buf})"
|
||||
|
||||
# Timestamp is epoch 01/01/1960
|
||||
epoch = datetime(1960, 1, 1)
|
||||
x = self._read_float(
|
||||
const.date_created_offset + align1, const.date_created_length
|
||||
)
|
||||
self.date_created = epoch + pd.to_timedelta(x, unit="s")
|
||||
x = self._read_float(
|
||||
const.date_modified_offset + align1, const.date_modified_length
|
||||
)
|
||||
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
|
||||
|
||||
self.header_length = self._read_uint(
|
||||
const.header_size_offset + align1, const.header_size_length
|
||||
)
|
||||
|
||||
# Read the rest of the header into cached_page.
|
||||
buf = self._path_or_buf.read(self.header_length - 288)
|
||||
self._cached_page += buf
|
||||
if len(self._cached_page) != self.header_length:
|
||||
raise ValueError("The SAS7BDAT file appears to be truncated.")
|
||||
|
||||
self._page_length = self._read_uint(
|
||||
const.page_size_offset + align1, const.page_size_length
|
||||
)
|
||||
|
||||
def __next__(self) -> DataFrame:
|
||||
da = self.read(nrows=self.chunksize or 1)
|
||||
if da.empty:
|
||||
self.close()
|
||||
raise StopIteration
|
||||
return da
|
||||
|
||||
# Read a single float of the given width (4 or 8).
|
||||
def _read_float(self, offset: int, width: int) -> float:
|
||||
assert self._cached_page is not None
|
||||
if width == 4:
|
||||
return read_float_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 8:
|
||||
return read_double_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("invalid float width")
|
||||
|
||||
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
|
||||
def _read_uint(self, offset: int, width: int) -> int:
|
||||
assert self._cached_page is not None
|
||||
if width == 1:
|
||||
return self._read_bytes(offset, 1)[0]
|
||||
elif width == 2:
|
||||
return read_uint16_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 4:
|
||||
return read_uint32_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 8:
|
||||
return read_uint64_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("invalid int width")
|
||||
|
||||
def _read_bytes(self, offset: int, length: int):
|
||||
assert self._cached_page is not None
|
||||
if offset + length > len(self._cached_page):
|
||||
self.close()
|
||||
raise ValueError("The cached page is too small.")
|
||||
return self._cached_page[offset : offset + length]
|
||||
|
||||
def _parse_metadata(self) -> None:
|
||||
done = False
|
||||
while not done:
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
break
|
||||
if len(self._cached_page) != self._page_length:
|
||||
raise ValueError("Failed to read a meta data page from the SAS file.")
|
||||
done = self._process_page_meta()
|
||||
|
||||
def _process_page_meta(self) -> bool:
|
||||
self._read_page_header()
|
||||
pt = [*const.page_meta_types, const.page_amd_type, const.page_mix_type]
|
||||
if self._current_page_type in pt:
|
||||
self._process_page_metadata()
|
||||
is_data_page = self._current_page_type == const.page_data_type
|
||||
is_mix_page = self._current_page_type == const.page_mix_type
|
||||
return bool(
|
||||
is_data_page
|
||||
or is_mix_page
|
||||
or self._current_page_data_subheader_pointers != []
|
||||
)
|
||||
|
||||
def _read_page_header(self) -> None:
|
||||
bit_offset = self._page_bit_offset
|
||||
tx = const.page_type_offset + bit_offset
|
||||
self._current_page_type = (
|
||||
self._read_uint(tx, const.page_type_length) & const.page_type_mask2
|
||||
)
|
||||
tx = const.block_count_offset + bit_offset
|
||||
self._current_page_block_count = self._read_uint(tx, const.block_count_length)
|
||||
tx = const.subheader_count_offset + bit_offset
|
||||
self._current_page_subheaders_count = self._read_uint(
|
||||
tx, const.subheader_count_length
|
||||
)
|
||||
|
||||
def _process_page_metadata(self) -> None:
    """
    Walk the subheader pointers of the cached page and dispatch each one.

    For every pointer the subheader offset, length, compression byte and
    type byte are read. Pointers with zero length or marked as truncated
    are skipped. The subheader signature selects an entry of
    ``_subheader_processors``; a ``None`` entry means the subheader is
    treated as data, and its pointer is recorded instead of processed.

    Raises
    ------
    ValueError
        If a subheader maps to the ``None`` entry but does not qualify as
        a data subheader. The reader is closed first.
    """
    bit_offset = self._page_bit_offset

    for i in range(self._current_page_subheaders_count):
        # Position of the i-th pointer within the page.
        offset = const.subheader_pointers_offset + bit_offset
        total_offset = offset + self._subheader_pointer_length * i

        subheader_offset = self._read_uint(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_uint(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_uint(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_uint(total_offset, 1)

        # Nothing to process for empty or truncated subheaders.
        if (
            subheader_length == 0
            or subheader_compression == const.truncated_subheader_id
        ):
            continue

        # The leading bytes of the subheader identify its kind.
        subheader_signature = self._read_bytes(subheader_offset, self._int_length)
        subheader_index = get_subheader_index(subheader_signature)
        subheader_processor = self._subheader_processors[subheader_index]

        if subheader_processor is None:
            f1 = subheader_compression in (const.compressed_subheader_id, 0)
            f2 = subheader_type == const.compressed_subheader_type
            # Only accepted when a compression literal was detected for the
            # file and both flags match; the pointer is kept for later use.
            if self.compression and f1 and f2:
                self._current_page_data_subheader_pointers.append(
                    (subheader_offset, subheader_length)
                )
            else:
                self.close()
                raise ValueError(
                    f"Unknown subheader signature {subheader_signature}"
                )
        else:
            subheader_processor(subheader_offset, subheader_length)
|
||||
|
||||
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
lcs_offset = offset
|
||||
lcp_offset = offset
|
||||
if self.U64:
|
||||
lcs_offset += 682
|
||||
lcp_offset += 706
|
||||
else:
|
||||
lcs_offset += 354
|
||||
lcp_offset += 378
|
||||
|
||||
self.row_length = self._read_uint(
|
||||
offset + const.row_length_offset_multiplier * int_len,
|
||||
int_len,
|
||||
)
|
||||
self.row_count = self._read_uint(
|
||||
offset + const.row_count_offset_multiplier * int_len,
|
||||
int_len,
|
||||
)
|
||||
self.col_count_p1 = self._read_uint(
|
||||
offset + const.col_count_p1_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p2 = self._read_uint(
|
||||
offset + const.col_count_p2_multiplier * int_len, int_len
|
||||
)
|
||||
mx = const.row_count_on_mix_page_offset_multiplier * int_len
|
||||
self._mix_page_row_count = self._read_uint(offset + mx, int_len)
|
||||
self._lcs = self._read_uint(lcs_offset, 2)
|
||||
self._lcp = self._read_uint(lcp_offset, 2)
|
||||
|
||||
def _process_columnsize_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
self.column_count = self._read_uint(offset, int_len)
|
||||
if self.col_count_p1 + self.col_count_p2 != self.column_count:
|
||||
print(
|
||||
f"Warning: column count mismatch ({self.col_count_p1} + "
|
||||
f"{self.col_count_p2} != {self.column_count})\n"
|
||||
)
|
||||
|
||||
# Unknown purpose
|
||||
def _process_subheader_counts(self, offset: int, length: int) -> None:
|
||||
pass
|
||||
|
||||
def _process_columntext_subheader(self, offset: int, length: int) -> None:
|
||||
offset += self._int_length
|
||||
text_block_size = self._read_uint(offset, const.text_block_size_length)
|
||||
|
||||
buf = self._read_bytes(offset, text_block_size)
|
||||
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
|
||||
self.column_names_raw.append(cname_raw)
|
||||
|
||||
if len(self.column_names_raw) == 1:
|
||||
compression_literal = b""
|
||||
for cl in const.compression_literals:
|
||||
if cl in cname_raw:
|
||||
compression_literal = cl
|
||||
self.compression = compression_literal
|
||||
offset -= self._int_length
|
||||
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
compression_literal = buf.rstrip(b"\x00")
|
||||
if compression_literal == b"":
|
||||
self._lcs = 0
|
||||
offset1 = offset + 32
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif compression_literal == const.rle_compression:
|
||||
offset1 = offset + 40
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif self._lcs > 0:
|
||||
self._lcp = 0
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcs)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
if hasattr(self, "creator_proc"):
|
||||
self.creator_proc = self._convert_header_text(self.creator_proc) # pyright: ignore[reportArgumentType]
|
||||
|
||||
def _process_columnname_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
column_name_pointers_count = (length - 2 * int_len - 12) // 8
|
||||
for i in range(column_name_pointers_count):
|
||||
text_subheader = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_text_subheader_offset
|
||||
)
|
||||
col_name_offset = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_offset_offset
|
||||
)
|
||||
col_name_length = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_length_offset
|
||||
)
|
||||
|
||||
idx = self._read_uint(
|
||||
text_subheader, const.column_name_text_subheader_length
|
||||
)
|
||||
col_offset = self._read_uint(
|
||||
col_name_offset, const.column_name_offset_length
|
||||
)
|
||||
col_len = self._read_uint(col_name_length, const.column_name_length_length)
|
||||
|
||||
name_raw = self.column_names_raw[idx]
|
||||
cname = name_raw[col_offset : col_offset + col_len]
|
||||
self.column_names.append(self._convert_header_text(cname))
|
||||
|
||||
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
|
||||
for i in range(column_attributes_vectors_count):
|
||||
col_data_offset = (
|
||||
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
|
||||
)
|
||||
col_data_len = (
|
||||
offset
|
||||
+ 2 * int_len
|
||||
+ const.column_data_length_offset
|
||||
+ i * (int_len + 8)
|
||||
)
|
||||
col_types = (
|
||||
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
|
||||
)
|
||||
|
||||
x = self._read_uint(col_data_offset, int_len)
|
||||
self._column_data_offsets.append(x)
|
||||
|
||||
x = self._read_uint(col_data_len, const.column_data_length_length)
|
||||
self._column_data_lengths.append(x)
|
||||
|
||||
x = self._read_uint(col_types, const.column_type_length)
|
||||
self._column_types.append(b"d" if x == 1 else b"s")
|
||||
|
||||
def _process_columnlist_subheader(self, offset: int, length: int) -> None:
|
||||
# unknown purpose
|
||||
pass
|
||||
|
||||
def _process_format_subheader(self, offset: int, length: int) -> None:
    """
    Read one column's format and label and register the column.

    The format and the label are each described by a text-block index, a
    start and a length; indexes are clamped to the last available block of
    ``column_names_raw``. A ``_Column`` combining them with the already
    collected name, type and data length is appended to ``columns``, and
    the format is appended to ``column_formats``. ``length`` is unused.
    """
    # All field positions are relative to the same base address.
    base = offset + 3 * self._int_length
    last_block = len(self.column_names_raw) - 1

    raw_format_idx = self._read_uint(
        base + const.column_format_text_subheader_index_offset,
        const.column_format_text_subheader_index_length,
    )
    format_idx = min(raw_format_idx, last_block)
    format_start = self._read_uint(
        base + const.column_format_offset_offset,
        const.column_format_offset_length,
    )
    format_len = self._read_uint(
        base + const.column_format_length_offset,
        const.column_format_length_length,
    )

    raw_label_idx = self._read_uint(
        base + const.column_label_text_subheader_index_offset,
        const.column_label_text_subheader_index_length,
    )
    label_idx = min(raw_label_idx, last_block)
    label_start = self._read_uint(
        base + const.column_label_offset_offset,
        const.column_label_offset_length,
    )
    label_len = self._read_uint(
        base + const.column_label_length_offset,
        const.column_label_length_length,
    )

    label_block = self.column_names_raw[label_idx]
    column_label = self._convert_header_text(
        label_block[label_start : label_start + label_len]
    )
    format_block = self.column_names_raw[format_idx]
    column_format = self._convert_header_text(
        format_block[format_start : format_start + format_len]
    )

    # The column being described is the next one not yet registered.
    position = len(self.columns)
    described = _Column(
        position,
        self.column_names[position],
        column_label,
        column_format,
        self._column_types[position],
        self._column_data_lengths[position],
    )

    self.column_formats.append(column_format)
    self.columns.append(described)
|
||||
|
||||
def read(self, nrows: int | None = None) -> DataFrame:
    """
    Read up to ``nrows`` rows from the current position into a DataFrame.

    Parameters
    ----------
    nrows : int, optional
        Maximum number of rows to read. When omitted, ``self.chunksize`` is
        used if it is set, otherwise ``self.row_count``.

    Returns
    -------
    DataFrame
        The parsed rows, re-indexed by ``self.index`` when that is set. An
        empty DataFrame is returned when rows were requested but the current
        row index is already at or beyond ``self.row_count``.

    Raises
    ------
    EmptyDataError
        If no column types are known; the reader is closed first.
    """
    if nrows is None:
        nrows = self.row_count if self.chunksize is None else self.chunksize

    if len(self._column_types) == 0:
        self.close()
        raise EmptyDataError("No columns to parse from file")

    remaining = self.row_count - self._current_row_in_file_index
    if nrows > 0 and remaining <= 0:
        return DataFrame()
    nrows = min(nrows, remaining)

    numeric_cols = self._column_types.count(b"d")
    string_cols = self._column_types.count(b"s")

    # Scratch buffers that the parser fills: one row per column, with eight
    # bytes reserved per value for the numeric columns.
    self._string_chunk = np.empty((string_cols, nrows), dtype=object)
    self._byte_chunk = np.zeros((numeric_cols, 8 * nrows), dtype=np.uint8)

    self._current_row_in_chunk_index = 0
    Parser(self).read(nrows)

    frame = self._chunk_to_dataframe()
    if self.index is None:
        return frame
    return frame.set_index(self.index)
|
||||
|
||||
def _read_next_page(self):
|
||||
self._current_page_data_subheader_pointers = []
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
return True
|
||||
elif len(self._cached_page) != self._page_length:
|
||||
self.close()
|
||||
msg = (
|
||||
"failed to read complete page from file (read "
|
||||
f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
self._read_page_header()
|
||||
if self._current_page_type in const.page_meta_types:
|
||||
self._process_page_metadata()
|
||||
|
||||
if self._current_page_type not in [
|
||||
*const.page_meta_types,
|
||||
const.page_data_type,
|
||||
const.page_mix_type,
|
||||
]:
|
||||
return self._read_next_page()
|
||||
|
||||
return False
|
||||
|
||||
def _chunk_to_dataframe(self) -> DataFrame:
    """
    Assemble the most recently parsed chunk into a DataFrame.

    Numeric columns (type ``b"d"``) are taken row by row from
    ``self._byte_chunk`` and viewed as doubles in ``self.byte_order``;
    string columns (type ``b"s"``) come from ``self._string_chunk``. The
    index is the range of file row numbers covered by the chunk.

    Raises
    ------
    ValueError
        If a column has a type other than ``b"d"`` or ``b"s"``; the reader
        is closed first.
    """
    chunk_rows = self._current_row_in_chunk_index
    file_row = self._current_row_in_file_index
    ix = range(file_row - chunk_rows, file_row)
    infer_string = using_string_dtype()

    columns = {}
    string_row = 0
    numeric_row = 0
    for j in range(self.column_count):
        name = self.column_names[j]
        col_type = self._column_types[j]

        if col_type == b"d":
            raw = self._byte_chunk[numeric_row, :].view(dtype=self.byte_order + "d")
            columns[name] = pd.Series(raw, dtype=np.float64, index=ix, copy=False)
            if self.convert_dates:
                fmt = self.column_formats[j]
                if fmt in const.sas_date_formats:
                    columns[name] = _convert_datetimes(columns[name], "d")
                elif fmt in const.sas_datetime_formats:
                    columns[name] = _convert_datetimes(columns[name], "s")
            numeric_row += 1
        elif col_type == b"s":
            series = pd.Series(self._string_chunk[string_row, :], index=ix, copy=False)
            if self.convert_text and (self.encoding is not None):
                series = self._decode_string(series.str)
            if infer_string:
                series = series.astype("str")
            columns[name] = series
            string_row += 1
        else:
            self.close()
            raise ValueError(f"unknown column type {col_type!r}")

    return DataFrame(columns, columns=self.column_names, index=ix, copy=False)
|
||||
|
||||
def _decode_string(self, b):
|
||||
return b.decode(self.encoding or self.default_encoding)
|
||||
|
||||
def _convert_header_text(self, b: bytes) -> str | bytes:
|
||||
if self.convert_header_text:
|
||||
return self._decode_string(b)
|
||||
else:
|
||||
return b
|
||||
+310
@@ -0,0 +1,310 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Final
|
||||
|
||||
# 32-byte constant (four 8-byte pieces).
# NOTE(review): presumably the signature expected at the start of a sas7bdat
# file; the comparison itself is outside this view -- confirm.
magic: Final = (
    b"\x00\x00\x00\x00\x00\x00\x00\x00"
    b"\x00\x00\x00\x00\xc2\xea\x81\x60"
    b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
    b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
)

# NOTE(review): the `*_offset` / `*_length` pairs below look like byte
# positions and widths of header fields; their use is outside this view --
# confirm against the header-parsing code before relying on that reading.
align_1_checker_value: Final = b"3"
align_1_offset: Final = 32
align_1_length: Final = 1
align_1_value: Final = 4
u64_byte_checker_value: Final = b"3"
align_2_offset: Final = 35
align_2_length: Final = 1
align_2_value: Final = 4
endianness_offset: Final = 37
endianness_length: Final = 1
platform_offset: Final = 39
platform_length: Final = 1
encoding_offset: Final = 70
encoding_length: Final = 1
dataset_offset: Final = 92
dataset_length: Final = 64
file_type_offset: Final = 156
file_type_length: Final = 8
date_created_offset: Final = 164
date_created_length: Final = 8
date_modified_offset: Final = 172
date_modified_length: Final = 8
header_size_offset: Final = 196
header_size_length: Final = 4
page_size_offset: Final = 200
page_size_length: Final = 4
page_count_offset: Final = 204
page_count_length: Final = 4
sas_release_offset: Final = 216
sas_release_length: Final = 8
sas_server_type_offset: Final = 224
sas_server_type_length: Final = 16
os_version_number_offset: Final = 240
os_version_number_length: Final = 16
os_maker_offset: Final = 256
os_maker_length: Final = 16
os_name_offset: Final = 272
os_name_length: Final = 16
|
||||
# Page-level and subheader-level layout constants.
# NOTE(review): the `_x86` / `_x64` suffixes presumably select between the
# 32-bit and 64-bit file layouts -- confirm against the reader.
page_bit_offset_x86: Final = 16
page_bit_offset_x64: Final = 32
subheader_pointer_length_x86: Final = 12
subheader_pointer_length_x64: Final = 24
page_type_offset: Final = 0
page_type_length: Final = 2
block_count_offset: Final = 2
block_count_length: Final = 2
subheader_count_offset: Final = 4
subheader_count_length: Final = 2
page_type_mask: Final = 0x0F00
# Keep "page_comp_type" bits
page_type_mask2: Final = 0xF000 | page_type_mask
# Page type codes. The reader keeps a page only when its type is one of
# page_meta_types, page_data_type or page_mix_type; other pages are skipped.
page_meta_type: Final = 0x0000
page_data_type: Final = 0x0100
page_mix_type: Final = 0x0200
page_amd_type: Final = 0x0400
page_meta2_type: Final = 0x4000
page_comp_type: Final = 0x9000
# Page types for which the reader processes page metadata.
page_meta_types: Final = [page_meta_type, page_meta2_type]
subheader_pointers_offset: Final = 8
truncated_subheader_id: Final = 1
compressed_subheader_id: Final = 4
compressed_subheader_type: Final = 1
text_block_size_length: Final = 2
row_length_offset_multiplier: Final = 5
row_count_offset_multiplier: Final = 6
col_count_p1_multiplier: Final = 9
col_count_p2_multiplier: Final = 10
row_count_on_mix_page_offset_multiplier: Final = 15
|
||||
# Layout of the per-column subheaders.
# The column_format_* and column_label_* constants are consumed by the
# format-and-label subheader parser: each `*_offset` is added to
# `offset + 3 * int_len` to find a field, and the matching `*_length` is the
# width passed to `_read_uint` for that field.
# NOTE(review): the column_name_*, column_data_* and column_type_* constants
# presumably play the same role for their subheaders -- usage is outside this
# view, confirm.
column_name_pointer_length: Final = 8
column_name_text_subheader_offset: Final = 0
column_name_text_subheader_length: Final = 2
column_name_offset_offset: Final = 2
column_name_offset_length: Final = 2
column_name_length_offset: Final = 4
column_name_length_length: Final = 2
column_data_offset_offset: Final = 8
column_data_length_offset: Final = 8
column_data_length_length: Final = 4
column_type_offset: Final = 14
column_type_length: Final = 1
column_format_text_subheader_index_offset: Final = 22
column_format_text_subheader_index_length: Final = 2
column_format_offset_offset: Final = 24
column_format_offset_length: Final = 2
column_format_length_offset: Final = 26
column_format_length_length: Final = 2
column_label_text_subheader_index_offset: Final = 28
column_label_text_subheader_index_length: Final = 2
column_label_offset_offset: Final = 30
column_label_offset_length: Final = 2
column_label_length_offset: Final = 32
column_label_length_length: Final = 2
|
||||
# 8-byte literals naming the two compression schemes.
# NOTE(review): presumably these are searched for in the file's text block to
# detect which scheme was used -- the lookup is outside this view, confirm.
rle_compression: Final = b"SASYZCRL"
rdc_compression: Final = b"SASYZCR2"

# Both literals, collected in one list.
compression_literals: Final = [rle_compression, rdc_compression]
|
||||
|
||||
# Incomplete mapping from integer encoding code to Python codec name.
# The codes follow SAS nomenclature:
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
# and the names follow the Python documentation of standard encodings:
# https://docs.python.org/3/library/codecs.html#standard-encodings
# Codes marked "not found" have no known Python equivalent and are omitted.
encoding_names: Final = {
    20: "utf-8",
    29: "latin1",
    30: "latin2",
    31: "latin3",
    32: "latin4",
    33: "cyrillic",
    34: "arabic",
    35: "greek",
    36: "hebrew",
    37: "latin5",
    38: "latin6",
    39: "cp874",
    40: "latin9",
    41: "cp437",
    42: "cp850",
    43: "cp852",
    44: "cp857",
    45: "cp858",
    46: "cp862",
    47: "cp864",
    48: "cp865",
    49: "cp866",
    50: "cp869",
    51: "cp874",
    # 52: "",  # not found
    # 53: "",  # not found
    # 54: "",  # not found
    55: "cp720",
    56: "cp737",
    57: "cp775",
    58: "cp860",
    59: "cp863",
    60: "cp1250",
    61: "cp1251",
    62: "cp1252",
    63: "cp1253",
    64: "cp1254",
    65: "cp1255",
    66: "cp1256",
    67: "cp1257",
    68: "cp1258",
    118: "cp950",
    # 119: "",  # not found
    123: "big5",
    125: "gb2312",
    126: "cp936",
    134: "euc_jp",
    136: "cp932",
    138: "shift_jis",
    140: "euc-kr",
    141: "cp949",
    227: "latin8",
    # 228: "",  # not found
    # 229: ""  # not found
}
|
||||
|
||||
|
||||
class SASIndex:
    """
    Integer identifiers for the kinds of subheader.

    These are the values stored in ``subheader_signature_to_index``, which
    maps a subheader's byte signature to one of the identifiers below.
    """

    row_size_index: Final = 0
    column_size_index: Final = 1
    subheader_counts_index: Final = 2
    column_text_index: Final = 3
    column_name_index: Final = 4
    column_attributes_index: Final = 5
    format_and_label_index: Final = 6
    column_list_index: Final = 7
    # No signature in subheader_signature_to_index maps to this identifier.
    data_subheader_index: Final = 8
|
||||
|
||||
|
||||
# Maps a subheader's byte signature to its SASIndex identifier. Each kind of
# subheader is listed under several 4-byte and 8-byte signatures.
# NOTE(review): the variants presumably cover both byte orders and both
# 32-/64-bit layouts -- confirm.
subheader_signature_to_index: Final = {
    b"\xf7\xf7\xf7\xf7": SASIndex.row_size_index,
    b"\x00\x00\x00\x00\xf7\xf7\xf7\xf7": SASIndex.row_size_index,
    b"\xf7\xf7\xf7\xf7\x00\x00\x00\x00": SASIndex.row_size_index,
    b"\xf7\xf7\xf7\xf7\xff\xff\xfb\xfe": SASIndex.row_size_index,
    b"\xf6\xf6\xf6\xf6": SASIndex.column_size_index,
    b"\x00\x00\x00\x00\xf6\xf6\xf6\xf6": SASIndex.column_size_index,
    b"\xf6\xf6\xf6\xf6\x00\x00\x00\x00": SASIndex.column_size_index,
    b"\xf6\xf6\xf6\xf6\xff\xff\xfb\xfe": SASIndex.column_size_index,
    b"\x00\xfc\xff\xff": SASIndex.subheader_counts_index,
    b"\xff\xff\xfc\x00": SASIndex.subheader_counts_index,
    b"\x00\xfc\xff\xff\xff\xff\xff\xff": SASIndex.subheader_counts_index,
    b"\xff\xff\xff\xff\xff\xff\xfc\x00": SASIndex.subheader_counts_index,
    b"\xfd\xff\xff\xff": SASIndex.column_text_index,
    b"\xff\xff\xff\xfd": SASIndex.column_text_index,
    b"\xfd\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_text_index,
    b"\xff\xff\xff\xff\xff\xff\xff\xfd": SASIndex.column_text_index,
    b"\xff\xff\xff\xff": SASIndex.column_name_index,
    b"\xff\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_name_index,
    b"\xfc\xff\xff\xff": SASIndex.column_attributes_index,
    b"\xff\xff\xff\xfc": SASIndex.column_attributes_index,
    b"\xfc\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_attributes_index,
    b"\xff\xff\xff\xff\xff\xff\xff\xfc": SASIndex.column_attributes_index,
    b"\xfe\xfb\xff\xff": SASIndex.format_and_label_index,
    b"\xff\xff\xfb\xfe": SASIndex.format_and_label_index,
    b"\xfe\xfb\xff\xff\xff\xff\xff\xff": SASIndex.format_and_label_index,
    b"\xff\xff\xff\xff\xff\xff\xfb\xfe": SASIndex.format_and_label_index,
    b"\xfe\xff\xff\xff": SASIndex.column_list_index,
    b"\xff\xff\xff\xfe": SASIndex.column_list_index,
    b"\xfe\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_list_index,
    b"\xff\xff\xff\xff\xff\xff\xff\xfe": SASIndex.column_list_index,
}
|
||||
|
||||
|
||||
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
#
# Date formats: when date conversion is enabled, a numeric column whose
# format name is in this tuple is converted with unit "d".
sas_date_formats: Final = (
    "DATE",
    "DAY",
    "DDMMYY",
    "DOWNAME",
    "JULDAY",
    "JULIAN",
    "MMDDYY",
    "MMYY",
    "MMYYC",
    "MMYYD",
    "MMYYP",
    "MMYYS",
    "MMYYN",
    "MONNAME",
    "MONTH",
    "MONYY",
    "QTR",
    "QTRR",
    "NENGO",
    "WEEKDATE",
    "WEEKDATX",
    "WEEKDAY",
    "WEEKV",
    "WORDDATE",
    "WORDDATX",
    "YEAR",
    "YYMM",
    "YYMMC",
    "YYMMD",
    "YYMMP",
    "YYMMS",
    "YYMMN",
    "YYMON",
    "YYMMDD",
    "YYQ",
    "YYQC",
    "YYQD",
    "YYQP",
    "YYQS",
    "YYQN",
    "YYQR",
    "YYQRC",
    "YYQRD",
    "YYQRP",
    "YYQRS",
    "YYQRN",
    "YYMMDDP",
    "YYMMDDC",
    "E8601DA",
    "YYMMDDN",
    "MMDDYYC",
    "MMDDYYS",
    "MMDDYYD",
    "YYMMDDS",
    "B8601DA",
    "DDMMYYN",
    "YYMMDDD",
    "DDMMYYB",
    "DDMMYYP",
    "MMDDYYP",
    "YYMMDDB",
    "MMDDYYN",
    "DDMMYYC",
    "DDMMYYD",
    "DDMMYYS",
    "MINGUO",
)
|
||||
|
||||
# Datetime formats: when date conversion is enabled, a numeric column whose
# format name is in this tuple is converted with unit "s". Each name is
# listed once; the tuple is only used for membership tests.
sas_datetime_formats: Final = (
    "DATETIME",
    "DTWKDATX",
    "B8601DN",
    "B8601DT",
    "B8601DX",
    "B8601DZ",
    "B8601LX",
    "E8601DN",
    "E8601DT",
    "E8601DX",
    "E8601DZ",
    "E8601LX",
    "DATEAMPM",
    "DTDATE",
    "DTMONYY",
    "DTYEAR",
    "TOD",
    "MDYAMPM",
)
|
||||
@@ -0,0 +1,501 @@
|
||||
"""
|
||||
Read a SAS XPort format file into a Pandas DataFrame.
|
||||
|
||||
Based on code from Jack Cushman (github.com/jcushman/xport).
|
||||
|
||||
The file format is defined here:
|
||||
|
||||
https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
import struct
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.sas.sasreader import SASReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
DatetimeNaTType,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
_correct_line1 = (
|
||||
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 "
|
||||
)
|
||||
_correct_header1 = (
|
||||
"HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000"
|
||||
)
|
||||
_correct_header2 = (
|
||||
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 "
|
||||
)
|
||||
_correct_obs_header = (
|
||||
"HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 "
|
||||
)
|
||||
_fieldkeys = [
|
||||
"ntype",
|
||||
"nhfun",
|
||||
"field_length",
|
||||
"nvar0",
|
||||
"name",
|
||||
"label",
|
||||
"nform",
|
||||
"nfl",
|
||||
"num_decimals",
|
||||
"nfj",
|
||||
"nfill",
|
||||
"niform",
|
||||
"nifl",
|
||||
"nifd",
|
||||
"npos",
|
||||
"_",
|
||||
]
|
||||
|
||||
|
||||
_base_params_doc = """\
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str or file-like object
|
||||
Path to SAS file or object implementing binary read method."""
|
||||
|
||||
_params2_doc = """\
|
||||
index : identifier of index column
|
||||
Identifier of column that should be used as index of the DataFrame.
|
||||
encoding : str
|
||||
Encoding for text data.
|
||||
chunksize : int
|
||||
Read file `chunksize` lines at a time, returns iterator."""
|
||||
|
||||
_format_params_doc = """\
|
||||
format : str
|
||||
File format, only `xport` is currently supported."""
|
||||
|
||||
_iterator_doc = """\
|
||||
iterator : bool, default False
|
||||
Return XportReader object for reading file incrementally."""
|
||||
|
||||
|
||||
_read_sas_doc = f"""Read a SAS file into a DataFrame.
|
||||
|
||||
{_base_params_doc}
|
||||
{_format_params_doc}
|
||||
{_params2_doc}
|
||||
{_iterator_doc}
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or XportReader
|
||||
|
||||
Examples
|
||||
--------
|
||||
Read a SAS Xport file:
|
||||
|
||||
>>> df = pd.read_sas('filename.XPT')
|
||||
|
||||
Read a Xport file in 10,000 line chunks:
|
||||
|
||||
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
|
||||
>>> for chunk in itr:
|
||||
>>> do_something(chunk)
|
||||
|
||||
"""
|
||||
|
||||
_xport_reader_doc = f"""\
|
||||
Class for reading SAS Xport files.
|
||||
|
||||
{_base_params_doc}
|
||||
{_params2_doc}
|
||||
|
||||
Attributes
|
||||
----------
|
||||
member_info : list
|
||||
Contains information about the file
|
||||
fields : list
|
||||
Contains information about the variables in the file
|
||||
"""
|
||||
|
||||
|
||||
def _parse_date(datestr: str) -> DatetimeNaTType:
|
||||
"""Given a date in xport format, return Python date."""
|
||||
try:
|
||||
# e.g. "16FEB11:10:07:55"
|
||||
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
|
||||
except ValueError:
|
||||
return pd.NaT
|
||||
|
||||
|
||||
def _split_line(s: str, parts):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
s: str
|
||||
Fixed-length string to split
|
||||
parts: list of (name, length) pairs
|
||||
Used to break up string, name '_' will be filtered from output.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict of name:contents of string at given location.
|
||||
"""
|
||||
out = {}
|
||||
start = 0
|
||||
for name, length in parts:
|
||||
out[name] = s[start : start + length].strip()
|
||||
start += length
|
||||
del out["_"]
|
||||
return out
|
||||
|
||||
|
||||
def _handle_truncated_float_vec(vec, nbytes):
|
||||
# This feature is not well documented, but some SAS XPORT files
|
||||
# have 2-7 byte "truncated" floats. To read these truncated
|
||||
# floats, pad them with zeros on the right to make 8 byte floats.
|
||||
#
|
||||
# References:
|
||||
# https://github.com/jcushman/xport/pull/3
|
||||
# The R "foreign" library
|
||||
|
||||
if nbytes != 8:
|
||||
vec1 = np.zeros(len(vec), np.dtype("S8"))
|
||||
dtype = np.dtype(f"S{nbytes},S{8 - nbytes}")
|
||||
vec2 = vec1.view(dtype=dtype)
|
||||
vec2["f0"] = vec
|
||||
return vec2
|
||||
|
||||
return vec
|
||||
|
||||
|
||||
def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.

    Parameters
    ----------
    vec : numpy.ndarray
        Array of 8-byte items; each item is reinterpreted as two big-endian
        unsigned 32-bit words.

    Returns
    -------
    numpy.ndarray
        Array of native float64 values, one per input item.
    """
    dtype = np.dtype(">u4,>u4")
    vec1 = vec.view(dtype=dtype)
    # High word (sign, exponent, top fraction bits) and low word of each item.
    xport1 = vec1["f0"]
    xport2 = vec1["f1"]

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00FFFFFF

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places. This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    # The assignments run from the lowest to the highest bit, so the highest
    # set bit determines the final shift.
    shift = np.zeros(len(vec), dtype=np.uint8)
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double. All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xFFEFFFFF

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023. Or this into the first half of the
    # ieee number. The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point. (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
        xport1 & 0x80000000
    )

    # Reassemble the two words as big-endian doubles, then convert to the
    # native float64 representation.
    ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
    ieee["f0"] = ieee1
    ieee["f1"] = ieee2
    ieee = ieee.view(dtype=">f8")
    ieee = ieee.astype("f8")

    return ieee
|
||||
|
||||
|
||||
class XportReader(SASReader):
    __doc__ = _xport_reader_doc

    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        index=None,
        encoding: str | None = "ISO-8859-1",
        chunksize: int | None = None,
        compression: CompressionOptions = "infer",
    ) -> None:
        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        self.handles = get_handle(
            filepath_or_buffer,
            "rb",
            encoding=encoding,
            is_text=False,
            compression=compression,
        )
        self.filepath_or_buffer = self.handles.handle

        # Do not leak the open handle when the header turns out to be invalid.
        try:
            self._read_header()
        except Exception:
            self.close()
            raise

    def close(self) -> None:
        """Close the underlying file handles."""
        self.handles.close()

    def _get_row(self):
        """Read the next 80-byte record and return it decoded as text."""
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self) -> None:
        """
        Parse the library, member and field headers from the start of the file.

        Sets ``file_info``, ``member_info``, ``fields``, ``record_length``,
        ``record_start``, ``nobs``, ``columns`` and ``_dtype``.

        Raises
        ------
        ValueError
            If any of the fixed header records does not have the expected
            contents.
        TypeError
            If a numeric field has a width outside 2 to 8 bytes.
        """
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            if "**COMPRESSED**" in line1:
                # this was created with the PROC CPORT method and can't be read
                # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm
                raise ValueError(
                    "Header record indicates a CPORT file, which is not readable."
                )
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
        file_info = _split_line(line2, fif)
        # The prefix is three 8-character, space-padded names; only the
        # padding at the very end is removed by _split_line.
        if file_info["prefix"] != "SAS     SAS     SASLIB":
            raise ValueError("Header record has invalid prefix.")
        file_info["created"] = _parse_date(file_info["created"])
        self.file_info = file_info

        line3 = self._get_row()
        file_info["modified"] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = header2 == _correct_header2
        if not (headflag1 and headflag2):
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [
            ["prefix", 8],
            ["set_name", 8],
            ["sasdata", 8],
            ["version", 8],
            ["OS", 8],
            ["_", 24],
            ["created", 16],
        ]
        member_info = _split_line(self._get_row(), mem)
        mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info["modified"] = _parse_date(member_info["modified"])
        member_info["created"] = _parse_date(member_info["created"])
        self.member_info = member_info

        # read field names
        types = {1: "numeric", 2: "char"}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            fieldbytes, fielddata = (
                fielddata[:fieldnamelength],
                fielddata[fieldnamelength:],
            )

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            fieldbytes = fieldbytes.ljust(140)

            fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
            field = dict(zip(_fieldkeys, fieldstruct, strict=True))
            del field["_"]
            field["ntype"] = types[field["ntype"]]
            fl = field["field_length"]
            if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
                msg = f"Floating field width {fl} is not between 2 and 8."
                raise TypeError(msg)

            # Strip padding from the text-like values; integers are kept.
            for k, v in field.items():
                if isinstance(v, (bytes, str)):
                    field[k] = v.strip()

            obs_length += field["field_length"]
            fields.append(field)

        header = self._get_row()
        if header != _correct_obs_header:
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x["name"].decode() for x in self.fields]

        # Setup the dtype: one fixed-width bytes field per column, named
        # "s0", "s1", ... in column order.
        dtypel = [
            ("s" + str(i), "S" + str(field["field_length"]))
            for i, field in enumerate(self.fields)
        ]
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self) -> pd.DataFrame:
        """Return the next chunk: ``chunksize`` rows, or one row if unset."""
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self) -> int:
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        """
        self.filepath_or_buffer.seek(0, 2)
        total_records_length = self.filepath_or_buffer.tell() - self.record_start

        if total_records_length % 80 != 0:
            warnings.warn(
                "xport file may be corrupted.",
                stacklevel=find_stack_level(),
            )

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        # Inspect the last 80-byte card: every 8-byte group consisting only
        # of blanks is counted as trailing padding rather than data.
        self.filepath_or_buffer.seek(-80, 2)
        last_card_bytes = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)

        # 8 byte blank
        ix = np.flatnonzero(last_card == 2314885530818453536)
        tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length

    def get_chunk(self, size: int | None = None) -> pd.DataFrame:
        """
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, the reader's ``chunksize`` is
            used; if that is also None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        """
        Return a boolean mask of the values in ``vec`` that encode a missing value.

        A value is flagged when its first byte is an upper-case ASCII letter
        (0x41-0x5A), an underscore (0x5F) or a period (0x2E) and all of its
        remaining bytes are zero.
        """
        v = vec.view(dtype="u1,u1,u2,u4")
        miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
        miss1 = (
            ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
            | (v["f0"] == 0x5F)
            | (v["f0"] == 0x2E)
        )
        miss &= miss1
        return miss

    def read(self, nrows: int | None = None) -> pd.DataFrame:
        """Read observations from SAS Xport file, returning as data frame.

        Parameters
        ----------
        nrows : int
            Number of rows to read from data file; if None, read whole
            file.

        Returns
        -------
        A DataFrame.

        Raises
        ------
        StopIteration
            If there is nothing left to read; the reader is closed first.
        """
        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df_data = {}
        for j, x in enumerate(self.columns):
            vec = data["s" + str(j)]
            ntype = self.fields[j]["ntype"]
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
                # The mask is computed on the raw bytes, before conversion.
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif ntype == "char":
                v = [y.rstrip() for y in vec]

                if self._encoding is not None:
                    v = [y.decode(self._encoding) for y in v]

            df_data[x] = v
        df = pd.DataFrame(df_data)

        if self._index is None:
            # Number the rows by their position in the file, across chunks.
            df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
|
||||
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
Read SAS sas7bdat or xport files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import (
|
||||
ABC,
|
||||
abstractmethod,
|
||||
)
|
||||
from collections.abc import Iterator
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Self,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.util._decorators import set_module
|
||||
|
||||
from pandas.io.common import stringify_path
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Hashable
|
||||
from types import TracebackType
|
||||
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@set_module("pandas.api.typing")
|
||||
class SASReader(Iterator["DataFrame"], ABC):
|
||||
"""
|
||||
Abstract class for XportReader and SAS7BDATReader.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def read(self, nrows: int | None = None) -> DataFrame: ...
|
||||
|
||||
@abstractmethod
|
||||
def close(self) -> None: ...
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_value: BaseException | None,
|
||||
traceback: TracebackType | None,
|
||||
) -> None:
|
||||
self.close()
|
||||
|
||||
|
||||
# Typing overload: an integer ``chunksize`` always yields a SASReader.
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: int = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> SASReader: ...
|
||||
|
||||
|
||||
# Typing overload: with ``chunksize`` left as None the result is either a
# DataFrame or a SASReader.
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: None = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> DataFrame | SASReader: ...
|
||||
|
||||
|
||||
@set_module("pandas")
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
) -> DataFrame | SASReader:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be
        a URL. Valid URL schemes include http, ftp, s3, and file. For file
        URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.sas7bdat``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from the file name, which must
        contain ``.xpt`` or ``.sas7bdat`` (matched case-insensitively). If
        'xport' or 'sas7bdat' (in any letter case), uses the corresponding
        format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.
    compression : str or dict, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        'filepath_or_buffer' is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
        Set to ``None`` for no decompression.
        Can also be a dict with key ``'method'`` set to one of {``'zip'``,
        ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other
        key-value pairs are forwarded to ``zipfile.ZipFile``,
        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdCompressor``,
        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
        As an example, the following could be passed for faster compression
        and to create a reproducible gzip archive:
        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.

    Returns
    -------
    DataFrame, SAS7BDATReader, or XportReader
        DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
        or XportReader, file format is inferred from file extension.

    Raises
    ------
    ValueError
        If ``format`` is None and ``filepath_or_buffer`` is not a string or
        path, if the format cannot be inferred from the file name, or if
        ``format`` is neither 'xport' nor 'sas7bdat'.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into a DataFrame.
    read_excel : Read an Excel file into a pandas DataFrame.
    read_spss : Read an SPSS file into a pandas DataFrame.
    read_orc : Load an ORC object into a pandas DataFrame.
    read_feather : Load a feather-format object into a pandas DataFrame.

    Examples
    --------
    >>> df = pd.read_sas("sas_data.sas7bdat")  # doctest: +SKIP
    """
    if format is None:
        # The file name is the only hint available for inference, so a
        # buffer without an explicit format cannot be handled.
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify a format string"
        )
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        # Substring (not suffix) match, so names that carry a further
        # extension such as a compression suffix are still recognised.
        if ".xpt" in fname:
            format = "xport"
        elif ".sas7bdat" in fname:
            format = "sas7bdat"
        else:
            raise ValueError(
                f"unable to infer format of SAS file from filename: {fname!r}"
            )

    # Normalise once; the format is validated before any reader is created.
    normalized_format = format.lower()

    reader: SASReader
    if normalized_format == "xport":
        # Imported lazily, only when this format is actually requested.
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    elif normalized_format == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        # The caller takes ownership of the reader and must close it.
        return reader

    # One-shot read: the context manager closes the reader even if
    # ``read`` raises.
    with reader:
        return reader.read()
|
||||
@@ -0,0 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import set_module
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.dtypes.inference import is_list_like
|
||||
|
||||
from pandas.io.common import stringify_path
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
from pathlib import Path
|
||||
|
||||
from pandas._typing import DtypeBackend
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@set_module("pandas")
def read_spss(
    path: str | Path,
    usecols: Sequence[str] | None = None,
    convert_categoricals: bool = True,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs: Any,
) -> DataFrame:
    """
    Load an SPSS file from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str or Path
        File path.
    usecols : list-like, optional
        Return a subset of the columns. If None, return all columns.
    convert_categoricals : bool, default is True
        Convert categorical columns into pd.Categorical.
    dtype_backend : {'numpy_nullable', 'pyarrow'}
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). If not specified, the default behavior
        is to not use nullable data types. If specified, the behavior
        is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
        * ``"pyarrow"``: returns pyarrow-backed
          nullable :class:`ArrowDtype` :class:`DataFrame`

        .. versionadded:: 2.0
    **kwargs
        Additional keyword arguments that can be passed to :func:`pyreadstat.read_sav`.

        .. versionadded:: 3.0

    Returns
    -------
    DataFrame
        DataFrame based on the SPSS file.

    Raises
    ------
    TypeError
        If ``usecols`` is given but is not list-like.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
    read_excel : Read an Excel file into a pandas DataFrame.
    read_sas : Read an SAS file into a pandas DataFrame.
    read_orc : Load an ORC object into a pandas DataFrame.
    read_feather : Load a feather-format object into a pandas DataFrame.

    Examples
    --------
    >>> df = pd.read_spss("spss_data.sav")  # doctest: +SKIP
    """
    # pyreadstat is an optional dependency, resolved at call time.
    pyreadstat = import_optional_dependency("pyreadstat")
    check_dtype_backend(dtype_backend)

    if usecols is not None:
        if not is_list_like(usecols):
            raise TypeError("usecols must be list-like.")
        usecols = list(usecols)  # pyreadstat requires a list

    df, metadata = pyreadstat.read_sav(
        stringify_path(path),
        usecols=usecols,
        apply_value_formats=convert_categoricals,
        **kwargs,
    )
    # Expose the attributes of the pyreadstat metadata object via ``attrs``.
    df.attrs = metadata.__dict__
    if dtype_backend is not lib.no_default:
        # Conversion is applied only when a backend was explicitly requested.
        df = df.convert_dtypes(dtype_backend=dtype_backend)
    return df
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user