commit 73ce681a55
parent d346bf4b2a
2024-12-04 13:35:57 +05:00
7059 changed files with 1196501 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
# ruff: noqa: TCH004
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# import modules that have public classes/functions
from pandas.io import (
formats,
json,
stata,
)
# mark only those modules as public
__all__ = ["formats", "json", "stata"]

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from typing import Callable
from pandas.compat._optional import import_optional_dependency
import pandas as pd
def _arrow_dtype_mapping() -> dict:
pa = import_optional_dependency("pyarrow")
return {
pa.int8(): pd.Int8Dtype(),
pa.int16(): pd.Int16Dtype(),
pa.int32(): pd.Int32Dtype(),
pa.int64(): pd.Int64Dtype(),
pa.uint8(): pd.UInt8Dtype(),
pa.uint16(): pd.UInt16Dtype(),
pa.uint32(): pd.UInt32Dtype(),
pa.uint64(): pd.UInt64Dtype(),
pa.bool_(): pd.BooleanDtype(),
pa.string(): pd.StringDtype(),
pa.float32(): pd.Float32Dtype(),
pa.float64(): pd.Float64Dtype(),
}
def arrow_string_types_mapper() -> Callable:
pa = import_optional_dependency("pyarrow")
return {
pa.string(): pd.StringDtype(storage="pyarrow_numpy"),
pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"),
}.get
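
A hedged usage sketch for _arrow_dtype_mapping: pyarrow's Table.to_pandas accepts a types_mapper callable, so passing the mapping's .get method lands Arrow columns in pandas' nullable extension dtypes (assumes pyarrow is installed):

import pyarrow as pa

from pandas.io._util import _arrow_dtype_mapping

table = pa.table({"x": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
print(df.dtypes)  # x is Int64 (nullable), not float64 with NaN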

View File

@@ -0,0 +1,65 @@
"""
Data IO API
"""
from pandas.io.clipboards import read_clipboard
from pandas.io.excel import (
ExcelFile,
ExcelWriter,
read_excel,
)
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.orc import read_orc
from pandas.io.parquet import read_parquet
from pandas.io.parsers import (
read_csv,
read_fwf,
read_table,
)
from pandas.io.pickle import (
read_pickle,
to_pickle,
)
from pandas.io.pytables import (
HDFStore,
read_hdf,
)
from pandas.io.sas import read_sas
from pandas.io.spss import read_spss
from pandas.io.sql import (
read_sql,
read_sql_query,
read_sql_table,
)
from pandas.io.stata import read_stata
from pandas.io.xml import read_xml
__all__ = [
"ExcelFile",
"ExcelWriter",
"HDFStore",
"read_clipboard",
"read_csv",
"read_excel",
"read_feather",
"read_fwf",
"read_gbq",
"read_hdf",
"read_html",
"read_json",
"read_orc",
"read_parquet",
"read_pickle",
"read_sas",
"read_spss",
"read_sql",
"read_sql_query",
"read_sql_table",
"read_stata",
"read_table",
"read_xml",
"to_pickle",
]
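
These re-exports back the top-level pandas namespace; a quick sanity check of the aliasing (assumes only pandas itself):

import pandas as pd
from pandas.io.api import read_csv

assert pd.read_csv is read_csv  # the top-level reader is this same object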

View File

@@ -0,0 +1,747 @@
"""
Pyperclip
A cross-platform clipboard module for Python,
with copy & paste functions for plain text.
By Al Sweigart al@inventwithpython.com
Licence at LICENSES/PYPERCLIP_LICENSE
Usage:
import pyperclip
pyperclip.copy('The text to be copied to the clipboard.')
spam = pyperclip.paste()
if not pyperclip.is_available():
print("Copy functionality unavailable!")
On Windows, no additional modules are needed.
On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli
commands. (These commands should come with OS X.).
On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via
package manager.
For example, in Debian:
sudo apt-get install xclip
sudo apt-get install xsel
sudo apt-get install wl-clipboard
Otherwise on Linux, you will need the PyQt5 modules installed.
This module does not work with PyGObject yet.
Cygwin is currently not supported.
Security Note: This module runs programs with these names:
- pbcopy
- pbpaste
- xclip
- xsel
- wl-copy/wl-paste
- klipper
- qdbus
A malicious user could rename or add programs with these names, tricking
Pyperclip into running them with whatever permissions the Python process has.
"""
__version__ = "1.8.2"
import contextlib
import ctypes
from ctypes import (
c_size_t,
c_wchar,
c_wchar_p,
get_errno,
sizeof,
)
import os
import platform
from shutil import which as _executable_exists
import subprocess
import time
import warnings
from pandas.errors import (
PyperclipException,
PyperclipWindowsException,
)
from pandas.util._exceptions import find_stack_level
# `import PyQt4` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load PyQt4 if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY")
EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit
https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error
"""
ENCODING = "utf-8"
class PyperclipTimeoutException(PyperclipException):
pass
def _stringifyText(text) -> str:
acceptedTypes = (str, int, float, bool)
if not isinstance(text, acceptedTypes):
raise PyperclipException(
f"only str, int, float, and bool values "
f"can be copied to the clipboard, not {type(text).__name__}"
)
return str(text)
def init_osx_pbcopy_clipboard():
def copy_osx_pbcopy(text):
text = _stringifyText(text) # Converts non-str values to str.
with subprocess.Popen(
["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True
) as p:
p.communicate(input=text.encode(ENCODING))
def paste_osx_pbcopy():
with subprocess.Popen(
["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True
) as p:
stdout = p.communicate()[0]
return stdout.decode(ENCODING)
return copy_osx_pbcopy, paste_osx_pbcopy
def init_osx_pyobjc_clipboard():
def copy_osx_pyobjc(text):
"""Copy string argument to clipboard"""
text = _stringifyText(text) # Converts non-str values to str.
newStr = Foundation.NSString.stringWithString_(text).nsstring()
newData = newStr.dataUsingEncoding_(Foundation.NSUTF8StringEncoding)
board = AppKit.NSPasteboard.generalPasteboard()
board.declareTypes_owner_([AppKit.NSStringPboardType], None)
board.setData_forType_(newData, AppKit.NSStringPboardType)
def paste_osx_pyobjc():
"""Returns contents of clipboard"""
board = AppKit.NSPasteboard.generalPasteboard()
content = board.stringForType_(AppKit.NSStringPboardType)
return content
return copy_osx_pyobjc, paste_osx_pyobjc
def init_qt_clipboard():
global QApplication
# $DISPLAY should exist
# Try to import from qtpy, but if that fails try PyQt5 then PyQt4
try:
from qtpy.QtWidgets import QApplication
except ImportError:
try:
from PyQt5.QtWidgets import QApplication
except ImportError:
from PyQt4.QtGui import QApplication
app = QApplication.instance()
if app is None:
app = QApplication([])
def copy_qt(text):
text = _stringifyText(text) # Converts non-str values to str.
cb = app.clipboard()
cb.setText(text)
def paste_qt() -> str:
cb = app.clipboard()
return str(cb.text())
return copy_qt, paste_qt
def init_xclip_clipboard():
DEFAULT_SELECTION = "c"
PRIMARY_SELECTION = "p"
def copy_xclip(text, primary=False):
text = _stringifyText(text) # Converts non-str values to str.
selection = DEFAULT_SELECTION
if primary:
selection = PRIMARY_SELECTION
with subprocess.Popen(
["xclip", "-selection", selection], stdin=subprocess.PIPE, close_fds=True
) as p:
p.communicate(input=text.encode(ENCODING))
def paste_xclip(primary=False):
selection = DEFAULT_SELECTION
if primary:
selection = PRIMARY_SELECTION
with subprocess.Popen(
["xclip", "-selection", selection, "-o"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True,
) as p:
stdout = p.communicate()[0]
# Intentionally ignore extraneous output on stderr when clipboard is empty
return stdout.decode(ENCODING)
return copy_xclip, paste_xclip
def init_xsel_clipboard():
DEFAULT_SELECTION = "-b"
PRIMARY_SELECTION = "-p"
def copy_xsel(text, primary=False):
text = _stringifyText(text) # Converts non-str values to str.
selection_flag = DEFAULT_SELECTION
if primary:
selection_flag = PRIMARY_SELECTION
with subprocess.Popen(
["xsel", selection_flag, "-i"], stdin=subprocess.PIPE, close_fds=True
) as p:
p.communicate(input=text.encode(ENCODING))
def paste_xsel(primary=False):
selection_flag = DEFAULT_SELECTION
if primary:
selection_flag = PRIMARY_SELECTION
with subprocess.Popen(
["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True
) as p:
stdout = p.communicate()[0]
return stdout.decode(ENCODING)
return copy_xsel, paste_xsel
def init_wl_clipboard():
PRIMARY_SELECTION = "-p"
def copy_wl(text, primary=False):
text = _stringifyText(text) # Converts non-str values to str.
args = ["wl-copy"]
if primary:
args.append(PRIMARY_SELECTION)
if not text:
args.append("--clear")
subprocess.check_call(args, close_fds=True)
else:
p = subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode(ENCODING))
def paste_wl(primary=False):
args = ["wl-paste", "-n"]
if primary:
args.append(PRIMARY_SELECTION)
p = subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True)
stdout, _stderr = p.communicate()
return stdout.decode(ENCODING)
return copy_wl, paste_wl
def init_klipper_clipboard():
def copy_klipper(text):
text = _stringifyText(text) # Converts non-str values to str.
with subprocess.Popen(
[
"qdbus",
"org.kde.klipper",
"/klipper",
"setClipboardContents",
text.encode(ENCODING),
],
stdin=subprocess.PIPE,
close_fds=True,
) as p:
p.communicate(input=None)
def paste_klipper():
with subprocess.Popen(
["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
stdout=subprocess.PIPE,
close_fds=True,
) as p:
stdout = p.communicate()[0]
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
# TODO: https://github.com/asweigart/pyperclip/issues/43
clipboardContents = stdout.decode(ENCODING)
# even if blank, Klipper will append a newline at the end
assert len(clipboardContents) > 0
# make sure that newline is there
assert clipboardContents.endswith("\n")
if clipboardContents.endswith("\n"):
clipboardContents = clipboardContents[:-1]
return clipboardContents
return copy_klipper, paste_klipper
def init_dev_clipboard_clipboard():
def copy_dev_clipboard(text):
text = _stringifyText(text) # Converts non-str values to str.
if text == "":
warnings.warn(
"Pyperclip cannot copy a blank string to the clipboard on Cygwin. "
"This is effectively a no-op.",
stacklevel=find_stack_level(),
)
if "\r" in text:
warnings.warn(
"Pyperclip cannot handle \\r characters on Cygwin.",
stacklevel=find_stack_level(),
)
with open("/dev/clipboard", "w", encoding="utf-8") as fd:
fd.write(text)
def paste_dev_clipboard() -> str:
with open("/dev/clipboard", encoding="utf-8") as fd:
content = fd.read()
return content
return copy_dev_clipboard, paste_dev_clipboard
def init_no_clipboard():
class ClipboardUnavailable:
def __call__(self, *args, **kwargs):
raise PyperclipException(EXCEPT_MSG)
def __bool__(self) -> bool:
return False
return ClipboardUnavailable(), ClipboardUnavailable()
# Windows-related clipboard functions:
class CheckedCall:
    # Wrap a ctypes function: raise PyperclipWindowsException when a call
    # returns a falsy value while get_errno() reports an error.
def __init__(self, f) -> None:
super().__setattr__("f", f)
def __call__(self, *args):
ret = self.f(*args)
if not ret and get_errno():
raise PyperclipWindowsException("Error calling " + self.f.__name__)
return ret
def __setattr__(self, key, value):
setattr(self.f, key, value)
def init_windows_clipboard():
global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
from ctypes.wintypes import (
BOOL,
DWORD,
HANDLE,
HGLOBAL,
HINSTANCE,
HMENU,
HWND,
INT,
LPCSTR,
LPVOID,
UINT,
)
windll = ctypes.windll
msvcrt = ctypes.CDLL("msvcrt")
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
safeCreateWindowExA.argtypes = [
DWORD,
LPCSTR,
LPCSTR,
DWORD,
INT,
INT,
INT,
INT,
HWND,
HMENU,
HINSTANCE,
LPVOID,
]
safeCreateWindowExA.restype = HWND
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
safeDestroyWindow.argtypes = [HWND]
safeDestroyWindow.restype = BOOL
OpenClipboard = windll.user32.OpenClipboard
OpenClipboard.argtypes = [HWND]
OpenClipboard.restype = BOOL
safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
safeCloseClipboard.argtypes = []
safeCloseClipboard.restype = BOOL
safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
safeEmptyClipboard.argtypes = []
safeEmptyClipboard.restype = BOOL
safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
safeGetClipboardData.argtypes = [UINT]
safeGetClipboardData.restype = HANDLE
safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
safeSetClipboardData.argtypes = [UINT, HANDLE]
safeSetClipboardData.restype = HANDLE
safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
safeGlobalAlloc.argtypes = [UINT, c_size_t]
safeGlobalAlloc.restype = HGLOBAL
safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
safeGlobalLock.argtypes = [HGLOBAL]
safeGlobalLock.restype = LPVOID
safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
safeGlobalUnlock.argtypes = [HGLOBAL]
safeGlobalUnlock.restype = BOOL
wcslen = CheckedCall(msvcrt.wcslen)
wcslen.argtypes = [c_wchar_p]
wcslen.restype = UINT
GMEM_MOVEABLE = 0x0002
CF_UNICODETEXT = 13
@contextlib.contextmanager
def window():
"""
Context that provides a valid Windows hwnd.
"""
# we really just need the hwnd, so setting "STATIC"
# as predefined lpClass is just fine.
hwnd = safeCreateWindowExA(
0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
)
try:
yield hwnd
finally:
safeDestroyWindow(hwnd)
@contextlib.contextmanager
def clipboard(hwnd):
"""
Context manager that opens the clipboard and prevents
other applications from modifying the clipboard content.
"""
# We may not get the clipboard handle immediately because
# some other application is accessing it (?)
# We try for at least 500ms to get the clipboard.
t = time.time() + 0.5
success = False
while time.time() < t:
success = OpenClipboard(hwnd)
if success:
break
time.sleep(0.01)
if not success:
raise PyperclipWindowsException("Error calling OpenClipboard")
try:
yield
finally:
safeCloseClipboard()
def copy_windows(text):
# This function is heavily based on
# http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
text = _stringifyText(text) # Converts non-str values to str.
with window() as hwnd:
# http://msdn.com/ms649048
# If an application calls OpenClipboard with hwnd set to NULL,
# EmptyClipboard sets the clipboard owner to NULL;
# this causes SetClipboardData to fail.
# => We need a valid hwnd to copy something.
with clipboard(hwnd):
safeEmptyClipboard()
if text:
# http://msdn.com/ms649051
# If the hMem parameter identifies a memory object,
# the object must have been allocated using the
# function with the GMEM_MOVEABLE flag.
count = wcslen(text) + 1
handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
locked_handle = safeGlobalLock(handle)
ctypes.memmove(
c_wchar_p(locked_handle),
c_wchar_p(text),
count * sizeof(c_wchar),
)
safeGlobalUnlock(handle)
safeSetClipboardData(CF_UNICODETEXT, handle)
def paste_windows():
with clipboard(None):
handle = safeGetClipboardData(CF_UNICODETEXT)
if not handle:
# GetClipboardData may return NULL with errno == NO_ERROR
# if the clipboard is empty.
# (Also, it may return a handle to an empty buffer,
# but technically that's not empty)
return ""
return c_wchar_p(handle).value
return copy_windows, paste_windows
def init_wsl_clipboard():
def copy_wsl(text):
text = _stringifyText(text) # Converts non-str values to str.
with subprocess.Popen(["clip.exe"], stdin=subprocess.PIPE, close_fds=True) as p:
p.communicate(input=text.encode(ENCODING))
def paste_wsl():
with subprocess.Popen(
["powershell.exe", "-command", "Get-Clipboard"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True,
) as p:
stdout = p.communicate()[0]
# WSL appends "\r\n" to the contents.
return stdout[:-2].decode(ENCODING)
return copy_wsl, paste_wsl
# Automatic detection of clipboard mechanisms
# and importing is done in determine_clipboard():
def determine_clipboard():
"""
Determine the OS/platform and set the copy() and paste() functions
accordingly.
"""
global Foundation, AppKit, qtpy, PyQt4, PyQt5
# Setup for the CYGWIN platform:
if (
"cygwin" in platform.system().lower()
): # Cygwin has a variety of values returned by platform.system(),
# such as 'CYGWIN_NT-6.1'
# FIXME(pyperclip#55): pyperclip currently does not support Cygwin,
# see https://github.com/asweigart/pyperclip/issues/55
if os.path.exists("/dev/clipboard"):
warnings.warn(
"Pyperclip's support for Cygwin is not perfect, "
"see https://github.com/asweigart/pyperclip/issues/55",
stacklevel=find_stack_level(),
)
return init_dev_clipboard_clipboard()
# Setup for the WINDOWS platform:
elif os.name == "nt" or platform.system() == "Windows":
return init_windows_clipboard()
if platform.system() == "Linux":
if _executable_exists("wslconfig.exe"):
return init_wsl_clipboard()
# Setup for the macOS platform:
if os.name == "mac" or platform.system() == "Darwin":
try:
import AppKit
import Foundation # check if pyobjc is installed
except ImportError:
return init_osx_pbcopy_clipboard()
else:
return init_osx_pyobjc_clipboard()
# Setup for the LINUX platform:
if HAS_DISPLAY:
if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"):
return init_wl_clipboard()
if _executable_exists("xsel"):
return init_xsel_clipboard()
if _executable_exists("xclip"):
return init_xclip_clipboard()
if _executable_exists("klipper") and _executable_exists("qdbus"):
return init_klipper_clipboard()
try:
# qtpy is a small abstraction layer that lets you write applications
# using a single api call to either PyQt or PySide.
# https://pypi.python.org/project/QtPy
import qtpy # check if qtpy is installed
except ImportError:
        # If qtpy isn't installed, fall back on importing PyQt5, then PyQt4.
try:
import PyQt5 # check if PyQt5 is installed
except ImportError:
try:
import PyQt4 # check if PyQt4 is installed
except ImportError:
pass # We want to fail fast for all non-ImportError exceptions.
else:
return init_qt_clipboard()
else:
return init_qt_clipboard()
else:
return init_qt_clipboard()
return init_no_clipboard()
def set_clipboard(clipboard):
"""
Explicitly sets the clipboard mechanism. The "clipboard mechanism" is how
the copy() and paste() functions interact with the operating system to
implement the copy/paste feature. The clipboard parameter must be one of:
- pbcopy
- pyobjc (default on macOS)
- qt
- xclip
- xsel
- klipper
- windows (default on Windows)
- no (this is what is set when no clipboard mechanism can be found)
"""
global copy, paste
clipboard_types = {
"pbcopy": init_osx_pbcopy_clipboard,
"pyobjc": init_osx_pyobjc_clipboard,
"qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5'
"xclip": init_xclip_clipboard,
"xsel": init_xsel_clipboard,
"wl-clipboard": init_wl_clipboard,
"klipper": init_klipper_clipboard,
"windows": init_windows_clipboard,
"no": init_no_clipboard,
}
if clipboard not in clipboard_types:
allowed_clipboard_types = [repr(_) for _ in clipboard_types]
raise ValueError(
f"Argument must be one of {', '.join(allowed_clipboard_types)}"
)
# Sets pyperclip's copy() and paste() functions:
copy, paste = clipboard_types[clipboard]()
def lazy_load_stub_copy(text):
"""
A stub function for copy(), which will load the real copy() function when
called so that the real copy() function is used for later calls.
    This allows users to import pyperclip without having determine_clipboard()
    run automatically and select a clipboard mechanism.
This could be a problem if it selects, say, the memory-heavy PyQt4 module
but the user was just going to immediately call set_clipboard() to use a
different clipboard mechanism.
The lazy loading this stub function implements gives the user a chance to
call set_clipboard() to pick another clipboard mechanism. Or, if the user
simply calls copy() or paste() without calling set_clipboard() first,
will fall back on whatever clipboard mechanism that determine_clipboard()
automatically chooses.
"""
global copy, paste
copy, paste = determine_clipboard()
return copy(text)
def lazy_load_stub_paste():
"""
A stub function for paste(), which will load the real paste() function when
called so that the real paste() function is used for later calls.
    This allows users to import pyperclip without having determine_clipboard()
    run automatically and select a clipboard mechanism.
This could be a problem if it selects, say, the memory-heavy PyQt4 module
but the user was just going to immediately call set_clipboard() to use a
different clipboard mechanism.
The lazy loading this stub function implements gives the user a chance to
call set_clipboard() to pick another clipboard mechanism. Or, if the user
simply calls copy() or paste() without calling set_clipboard() first,
will fall back on whatever clipboard mechanism that determine_clipboard()
automatically chooses.
"""
global copy, paste
copy, paste = determine_clipboard()
return paste()
def is_available() -> bool:
return copy != lazy_load_stub_copy and paste != lazy_load_stub_paste
# Initially, copy() and paste() are set to lazy loading wrappers which will
# set `copy` and `paste` to real functions the first time they're used, unless
# set_clipboard() or determine_clipboard() is called first.
copy, paste = lazy_load_stub_copy, lazy_load_stub_paste
def waitForPaste(timeout=None):
"""This function call blocks until a non-empty text string exists on the
clipboard. It returns this text.
    This function raises PyperclipTimeoutException if ``timeout`` seconds
    elapse without non-empty text being put on the clipboard."""
startTime = time.time()
while True:
clipboardText = paste()
if clipboardText != "":
return clipboardText
time.sleep(0.01)
if timeout is not None and time.time() > startTime + timeout:
raise PyperclipTimeoutException(
"waitForPaste() timed out after " + str(timeout) + " seconds."
)
def waitForNewPaste(timeout=None):
"""This function call blocks until a new text string exists on the
clipboard that is different from the text that was there when the function
was first called. It returns this text.
    This function raises PyperclipTimeoutException if ``timeout`` seconds
    elapse without the clipboard text changing."""
startTime = time.time()
originalText = paste()
while True:
currentText = paste()
if currentText != originalText:
return currentText
time.sleep(0.01)
if timeout is not None and time.time() > startTime + timeout:
raise PyperclipTimeoutException(
"waitForNewPaste() timed out after " + str(timeout) + " seconds."
)
__all__ = [
"copy",
"paste",
"waitForPaste",
"waitForNewPaste",
"set_clipboard",
"determine_clipboard",
]
# pandas aliases
clipboard_get = paste
clipboard_set = copy
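
A short usage sketch of the public surface defined above; the first copy() call swaps the lazy stubs for a real platform mechanism, and set_clipboard() can pin one explicitly:

import pandas.io.clipboard as clipboard

clipboard.copy("The text to be copied.")  # first use resolves a mechanism
spam = clipboard.paste()

# Or pin a mechanism up front instead of relying on determine_clipboard():
clipboard.set_clipboard("xsel")  # any key of clipboard_types is accepted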

View File

@@ -0,0 +1,197 @@
""" io on the clipboard """
from __future__ import annotations
from io import StringIO
from typing import TYPE_CHECKING
import warnings
from pandas._libs import lib
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend
from pandas.core.dtypes.generic import ABCDataFrame
from pandas import (
get_option,
option_context,
)
if TYPE_CHECKING:
from pandas._typing import DtypeBackend
def read_clipboard(
sep: str = r"\s+",
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
**kwargs,
): # pragma: no cover
r"""
Read text from clipboard and pass to :func:`~pandas.read_csv`.
Parses clipboard contents similar to how CSV files are parsed
using :func:`~pandas.read_csv`.
Parameters
----------
sep : str, default '\\s+'
A string or regex delimiter. The default of ``'\\s+'`` denotes
one or more whitespace characters.
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
(default).
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
DataFrame.
.. versionadded:: 2.0
**kwargs
See :func:`~pandas.read_csv` for the full argument list.
Returns
-------
DataFrame
A parsed :class:`~pandas.DataFrame` object.
See Also
--------
DataFrame.to_clipboard : Copy object to the system clipboard.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
Examples
--------
>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
>>> df.to_clipboard() # doctest: +SKIP
>>> pd.read_clipboard() # doctest: +SKIP
A B C
0 1 2 3
1 4 5 6
"""
encoding = kwargs.pop("encoding", "utf-8")
# only utf-8 is valid for passed value because that's what clipboard
# supports
if encoding is not None and encoding.lower().replace("-", "") != "utf8":
raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
check_dtype_backend(dtype_backend)
from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_csv
text = clipboard_get()
# Try to decode (if needed, as "text" might already be a string here).
try:
text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
except AttributeError:
pass
# Excel copies into clipboard with \t separation
    # inspect no more than the first 10 lines; if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
lines = text[:10000].split("\n")[:-1][:10]
# Need to remove leading white space, since read_csv
# accepts:
# a b
# 0 1 2
# 1 3 4
counts = {x.lstrip(" ").count("\t") for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
sep = "\t"
# check the number of leading tabs in the first line
# to account for index columns
index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
if index_length != 0:
kwargs.setdefault("index_col", list(range(index_length)))
# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get("delim_whitespace") is None:
sep = r"\s+"
# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
if len(sep) > 1 and kwargs.get("engine") is None:
kwargs["engine"] = "python"
elif len(sep) > 1 and kwargs.get("engine") == "c":
warnings.warn(
"read_clipboard with regex separator does not work properly with c engine.",
stacklevel=find_stack_level(),
)
return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs)
def to_clipboard(
obj, excel: bool | None = True, sep: str | None = None, **kwargs
) -> None: # pragma: no cover
"""
    Attempt to write a text representation of the object to the system
    clipboard, which can then be pasted into Excel, for example.
Parameters
----------
obj : the object to write to the clipboard
excel : bool, defaults to True
if True, use the provided separator, writing in a csv
format for allowing easy pasting into excel.
if False, write a string representation of the object
to the clipboard
sep : optional, defaults to tab
other keywords are passed to to_csv
Notes
-----
Requirements for your platform
      - Linux: xclip, xsel, or wl-clipboard (or the PyQt5/qtpy modules)
      - Windows: none
      - OS X: none
"""
encoding = kwargs.pop("encoding", "utf-8")
# testing if an invalid encoding is passed to clipboard
if encoding is not None and encoding.lower().replace("-", "") != "utf8":
raise ValueError("clipboard only supports utf-8 encoding")
from pandas.io.clipboard import clipboard_set
if excel is None:
excel = True
if excel:
try:
if sep is None:
sep = "\t"
buf = StringIO()
# clipboard_set (pyperclip) expects unicode
obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
text = buf.getvalue()
clipboard_set(text)
return
except TypeError:
warnings.warn(
"to_clipboard in excel mode requires a single character separator.",
stacklevel=find_stack_level(),
)
elif sep is not None:
warnings.warn(
"to_clipboard with excel=False ignores the sep argument.",
stacklevel=find_stack_level(),
)
if isinstance(obj, ABCDataFrame):
# str(df) has various unhelpful defaults, like truncation
with option_context("display.max_colwidth", None):
objstr = obj.to_string(**kwargs)
else:
objstr = str(obj)
clipboard_set(objstr)
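
The Excel-detection heuristic inside read_clipboard above can be illustrated standalone: when at least two sampled lines all contain the same nonzero number of tabs, the text is treated as tab-separated:

text = "A\tB\n0\t1\n2\t3\n"
lines = text[:10000].split("\n")[:-1][:10]
counts = {x.lstrip(" ").count("\t") for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
    sep = "\t"  # inferred: this clipboard text came from a spreadsheet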

File diff suppressed because it is too large

View File

@@ -0,0 +1,19 @@
from pandas.io.excel._base import (
ExcelFile,
ExcelWriter,
read_excel,
)
from pandas.io.excel._odswriter import ODSWriter as _ODSWriter
from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter
from pandas.io.excel._util import register_writer
from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
register_writer(_OpenpyxlWriter)
register_writer(_XlsxWriter)
register_writer(_ODSWriter)
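
With the three writer classes registered above, pd.ExcelWriter can resolve an engine by name; a hedged usage sketch (the output path is illustrative):

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
with pd.ExcelWriter("out.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="Sheet1")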

File diff suppressed because it is too large

View File

@@ -0,0 +1,121 @@
from __future__ import annotations
from datetime import (
date,
datetime,
time,
timedelta,
)
from typing import (
TYPE_CHECKING,
Any,
Union,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
import pandas as pd
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import BaseExcelReader
if TYPE_CHECKING:
from python_calamine import (
CalamineSheet,
CalamineWorkbook,
)
from pandas._typing import (
FilePath,
NaTType,
ReadBuffer,
Scalar,
StorageOptions,
)
_CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
class CalamineReader(BaseExcelReader["CalamineWorkbook"]):
@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using calamine engine (xlsx/xls/xlsb/ods).
Parameters
----------
filepath_or_buffer : str, path to be parsed or
an open readable stream.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
import_optional_dependency("python_calamine")
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
@property
def _workbook_class(self) -> type[CalamineWorkbook]:
from python_calamine import CalamineWorkbook
return CalamineWorkbook
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any
) -> CalamineWorkbook:
from python_calamine import load_workbook
return load_workbook(filepath_or_buffer, **engine_kwargs)
@property
def sheet_names(self) -> list[str]:
from python_calamine import SheetTypeEnum
return [
sheet.name
for sheet in self.book.sheets_metadata
if sheet.typ == SheetTypeEnum.WorkSheet
]
def get_sheet_by_name(self, name: str) -> CalamineSheet:
self.raise_if_bad_sheet_by_name(name)
return self.book.get_sheet_by_name(name)
def get_sheet_by_index(self, index: int) -> CalamineSheet:
self.raise_if_bad_sheet_by_index(index)
return self.book.get_sheet_by_index(index)
def get_sheet_data(
self, sheet: CalamineSheet, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType | time]]:
def _convert_cell(value: _CellValue) -> Scalar | NaTType | time:
if isinstance(value, float):
val = int(value)
if val == value:
return val
else:
return value
elif isinstance(value, date):
return pd.Timestamp(value)
elif isinstance(value, timedelta):
return pd.Timedelta(value)
elif isinstance(value, time):
return value
return value
rows: list[list[_CellValue]] = sheet.to_python(
skip_empty_area=False, nrows=file_rows_needed
)
data = [[_convert_cell(cell) for cell in row] for row in rows]
return data
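
A hedged usage sketch for the reader above: request the calamine engine explicitly (requires the optional python-calamine dependency; the filename is illustrative):

import pandas as pd

df = pd.read_excel("book.xlsx", engine="calamine")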

View File

@@ -0,0 +1,253 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
cast,
)
import numpy as np
from pandas._typing import (
FilePath,
ReadBuffer,
Scalar,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
import pandas as pd
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import BaseExcelReader
if TYPE_CHECKING:
from odf.opendocument import OpenDocument
from pandas._libs.tslibs.nattype import NaTType
@doc(storage_options=_shared_docs["storage_options"])
class ODFReader(BaseExcelReader["OpenDocument"]):
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Read tables out of OpenDocument formatted files.
Parameters
----------
filepath_or_buffer : str, path to be parsed or
an open readable stream.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
import_optional_dependency("odf")
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
@property
def _workbook_class(self) -> type[OpenDocument]:
from odf.opendocument import OpenDocument
return OpenDocument
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
) -> OpenDocument:
from odf.opendocument import load
return load(filepath_or_buffer, **engine_kwargs)
@property
def empty_value(self) -> str:
"""Property for compat with other readers."""
return ""
@property
def sheet_names(self) -> list[str]:
"""Return a list of sheet names present in the document"""
from odf.table import Table
tables = self.book.getElementsByType(Table)
return [t.getAttribute("name") for t in tables]
def get_sheet_by_index(self, index: int):
from odf.table import Table
self.raise_if_bad_sheet_by_index(index)
tables = self.book.getElementsByType(Table)
return tables[index]
def get_sheet_by_name(self, name: str):
from odf.table import Table
self.raise_if_bad_sheet_by_name(name)
tables = self.book.getElementsByType(Table)
for table in tables:
if table.getAttribute("name") == name:
return table
self.close()
raise ValueError(f"sheet {name} not found")
def get_sheet_data(
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
"""
from odf.table import (
CoveredTableCell,
TableCell,
TableRow,
)
covered_cell_name = CoveredTableCell().qname
table_cell_name = TableCell().qname
cell_names = {covered_cell_name, table_cell_name}
sheet_rows = sheet.getElementsByType(TableRow)
empty_rows = 0
max_row_len = 0
table: list[list[Scalar | NaTType]] = []
for sheet_row in sheet_rows:
sheet_cells = [
x
for x in sheet_row.childNodes
if hasattr(x, "qname") and x.qname in cell_names
]
empty_cells = 0
table_row: list[Scalar | NaTType] = []
for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value
column_repeat = self._get_column_repeat(sheet_cell)
# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
if max_row_len < len(table_row):
max_row_len = len(table_row)
row_repeat = self._get_row_repeat(sheet_row)
if len(table_row) == 0:
empty_rows += row_repeat
else:
# add blank rows to our table
table.extend([[self.empty_value]] * empty_rows)
empty_rows = 0
table.extend(table_row for _ in range(row_repeat))
if file_rows_needed is not None and len(table) >= file_rows_needed:
break
# Make our table square
for row in table:
if len(row) < max_row_len:
row.extend([self.empty_value] * (max_row_len - len(row)))
return table
def _get_row_repeat(self, row) -> int:
"""
Return number of times this row was repeated
Repeating an empty row appeared to be a common way
of representing sparse rows in the table.
"""
from odf.namespaces import TABLENS
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
def _get_column_repeat(self, cell) -> int:
from odf.namespaces import TABLENS
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
def _get_cell_value(self, cell) -> Scalar | NaTType:
from odf.namespaces import OFFICENS
if str(cell) == "#N/A":
return np.nan
cell_type = cell.attributes.get((OFFICENS, "value-type"))
if cell_type == "boolean":
if str(cell) == "TRUE":
return True
return False
if cell_type is None:
return self.empty_value
elif cell_type == "float":
# GH5394
cell_value = float(cell.attributes.get((OFFICENS, "value")))
val = int(cell_value)
if val == cell_value:
return val
return cell_value
elif cell_type == "percentage":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "string":
return self._get_cell_string_value(cell)
elif cell_type == "currency":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "date":
cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.Timestamp(cell_value)
elif cell_type == "time":
stamp = pd.Timestamp(str(cell))
# cast needed here because Scalar doesn't include datetime.time
return cast(Scalar, stamp.time())
else:
self.close()
raise ValueError(f"Unrecognized type {cell_type}")
def _get_cell_string_value(self, cell) -> str:
"""
Find and decode OpenDocument text:s tags that represent
a run length encoded sequence of space characters.
"""
from odf.element import Element
from odf.namespaces import TEXTNS
from odf.office import Annotation
from odf.text import S
office_annotation = Annotation().qname
text_s = S().qname
value = []
for fragment in cell.childNodes:
if isinstance(fragment, Element):
if fragment.qname == text_s:
spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
value.append(" " * spaces)
elif fragment.qname == office_annotation:
continue
else:
# recursive impl needed in case of nested fragments
# with multiple spaces
# https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
value.append(self._get_cell_string_value(fragment))
else:
value.append(str(fragment).strip("\n"))
return "".join(value)

View File

@@ -0,0 +1,357 @@
from __future__ import annotations
from collections import defaultdict
import datetime
import json
from typing import (
TYPE_CHECKING,
Any,
DefaultDict,
cast,
overload,
)
from pandas.io.excel._base import ExcelWriter
from pandas.io.excel._util import (
combine_kwargs,
validate_freeze_panes,
)
if TYPE_CHECKING:
from pandas._typing import (
ExcelWriterIfSheetExists,
FilePath,
StorageOptions,
WriteExcelBuffer,
)
from pandas.io.formats.excel import ExcelCell
class ODSWriter(ExcelWriter):
_engine = "odf"
_supported_extensions = (".ods",)
def __init__(
self,
path: FilePath | WriteExcelBuffer | ExcelWriter,
engine: str | None = None,
date_format: str | None = None,
datetime_format=None,
mode: str = "w",
storage_options: StorageOptions | None = None,
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
engine_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> None:
from odf.opendocument import OpenDocumentSpreadsheet
if mode == "a":
raise ValueError("Append mode is not supported with odf!")
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
self._book = OpenDocumentSpreadsheet(**engine_kwargs)
super().__init__(
path,
mode=mode,
storage_options=storage_options,
if_sheet_exists=if_sheet_exists,
engine_kwargs=engine_kwargs,
)
self._style_dict: dict[str, str] = {}
@property
def book(self):
"""
Book instance of class odf.opendocument.OpenDocumentSpreadsheet.
This attribute can be used to access engine-specific features.
"""
return self._book
@property
def sheets(self) -> dict[str, Any]:
"""Mapping of sheet names to sheet objects."""
from odf.table import Table
result = {
sheet.getAttribute("name"): sheet
for sheet in self.book.getElementsByType(Table)
}
return result
def _save(self) -> None:
"""
Save workbook to disk.
"""
for sheet in self.sheets.values():
self.book.spreadsheet.addElement(sheet)
self.book.save(self._handles.handle)
def _write_cells(
self,
cells: list[ExcelCell],
sheet_name: str | None = None,
startrow: int = 0,
startcol: int = 0,
freeze_panes: tuple[int, int] | None = None,
) -> None:
"""
Write the frame cells using odf
"""
from odf.table import (
Table,
TableCell,
TableRow,
)
from odf.text import P
sheet_name = self._get_sheet_name(sheet_name)
assert sheet_name is not None
if sheet_name in self.sheets:
wks = self.sheets[sheet_name]
else:
wks = Table(name=sheet_name)
self.book.spreadsheet.addElement(wks)
if validate_freeze_panes(freeze_panes):
freeze_panes = cast(tuple[int, int], freeze_panes)
self._create_freeze_panes(sheet_name, freeze_panes)
for _ in range(startrow):
wks.addElement(TableRow())
rows: DefaultDict = defaultdict(TableRow)
col_count: DefaultDict = defaultdict(int)
for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)):
# only add empty cells if the row is still empty
if not col_count[cell.row]:
for _ in range(startcol):
rows[cell.row].addElement(TableCell())
# fill with empty cells if needed
for _ in range(cell.col - col_count[cell.row]):
rows[cell.row].addElement(TableCell())
col_count[cell.row] += 1
pvalue, tc = self._make_table_cell(cell)
rows[cell.row].addElement(tc)
col_count[cell.row] += 1
p = P(text=pvalue)
tc.addElement(p)
# add all rows to the sheet
if len(rows) > 0:
for row_nr in range(max(rows.keys()) + 1):
wks.addElement(rows[row_nr])
def _make_table_cell_attributes(self, cell) -> dict[str, int | str]:
"""Convert cell attributes to OpenDocument attributes
Parameters
----------
cell : ExcelCell
Spreadsheet cell data
Returns
-------
attributes : Dict[str, Union[int, str]]
Dictionary with attributes and attribute values
"""
attributes: dict[str, int | str] = {}
style_name = self._process_style(cell.style)
if style_name is not None:
attributes["stylename"] = style_name
if cell.mergestart is not None and cell.mergeend is not None:
attributes["numberrowsspanned"] = max(1, cell.mergestart)
attributes["numbercolumnsspanned"] = cell.mergeend
return attributes
def _make_table_cell(self, cell) -> tuple[object, Any]:
"""Convert cell data to an OpenDocument spreadsheet cell
Parameters
----------
cell : ExcelCell
Spreadsheet cell data
Returns
-------
pvalue, cell : Tuple[str, TableCell]
Display value, Cell value
"""
from odf.table import TableCell
attributes = self._make_table_cell_attributes(cell)
val, fmt = self._value_with_fmt(cell.val)
pvalue = value = val
if isinstance(val, bool):
value = str(val).lower()
pvalue = str(val).upper()
return (
pvalue,
TableCell(
valuetype="boolean",
booleanvalue=value,
attributes=attributes,
),
)
elif isinstance(val, datetime.datetime):
# Fast formatting
value = val.isoformat()
# Slow but locale-dependent
pvalue = val.strftime("%c")
return (
pvalue,
TableCell(valuetype="date", datevalue=value, attributes=attributes),
)
elif isinstance(val, datetime.date):
# Fast formatting
value = f"{val.year}-{val.month:02d}-{val.day:02d}"
# Slow but locale-dependent
pvalue = val.strftime("%x")
return (
pvalue,
TableCell(valuetype="date", datevalue=value, attributes=attributes),
)
elif isinstance(val, str):
return (
pvalue,
TableCell(
valuetype="string",
stringvalue=value,
attributes=attributes,
),
)
else:
return (
pvalue,
TableCell(
valuetype="float",
value=value,
attributes=attributes,
),
)
@overload
def _process_style(self, style: dict[str, Any]) -> str:
...
@overload
def _process_style(self, style: None) -> None:
...
def _process_style(self, style: dict[str, Any] | None) -> str | None:
"""Convert a style dictionary to a OpenDocument style sheet
Parameters
----------
style : Dict
Style dictionary
Returns
-------
style_key : str
Unique style key for later reference in sheet
"""
from odf.style import (
ParagraphProperties,
Style,
TableCellProperties,
TextProperties,
)
if style is None:
return None
style_key = json.dumps(style)
if style_key in self._style_dict:
return self._style_dict[style_key]
name = f"pd{len(self._style_dict)+1}"
self._style_dict[style_key] = name
odf_style = Style(name=name, family="table-cell")
if "font" in style:
font = style["font"]
if font.get("bold", False):
odf_style.addElement(TextProperties(fontweight="bold"))
if "borders" in style:
borders = style["borders"]
for side, thickness in borders.items():
thickness_translation = {"thin": "0.75pt solid #000000"}
odf_style.addElement(
TableCellProperties(
attributes={f"border{side}": thickness_translation[thickness]}
)
)
if "alignment" in style:
alignment = style["alignment"]
horizontal = alignment.get("horizontal")
if horizontal:
odf_style.addElement(ParagraphProperties(textalign=horizontal))
vertical = alignment.get("vertical")
if vertical:
odf_style.addElement(TableCellProperties(verticalalign=vertical))
self.book.styles.addElement(odf_style)
return name
def _create_freeze_panes(
self, sheet_name: str, freeze_panes: tuple[int, int]
) -> None:
"""
Create freeze panes in the sheet.
Parameters
----------
sheet_name : str
Name of the spreadsheet
freeze_panes : tuple of (int, int)
Freeze pane location x and y
"""
from odf.config import (
ConfigItem,
ConfigItemMapEntry,
ConfigItemMapIndexed,
ConfigItemMapNamed,
ConfigItemSet,
)
config_item_set = ConfigItemSet(name="ooo:view-settings")
self.book.settings.addElement(config_item_set)
config_item_map_indexed = ConfigItemMapIndexed(name="Views")
config_item_set.addElement(config_item_map_indexed)
config_item_map_entry = ConfigItemMapEntry()
config_item_map_indexed.addElement(config_item_map_entry)
config_item_map_named = ConfigItemMapNamed(name="Tables")
config_item_map_entry.addElement(config_item_map_named)
config_item_map_entry = ConfigItemMapEntry(name=sheet_name)
config_item_map_named.addElement(config_item_map_entry)
config_item_map_entry.addElement(
ConfigItem(name="HorizontalSplitMode", type="short", text="2")
)
config_item_map_entry.addElement(
ConfigItem(name="VerticalSplitMode", type="short", text="2")
)
config_item_map_entry.addElement(
ConfigItem(
name="HorizontalSplitPosition", type="int", text=str(freeze_panes[0])
)
)
config_item_map_entry.addElement(
ConfigItem(
name="VerticalSplitPosition", type="int", text=str(freeze_panes[1])
)
)
config_item_map_entry.addElement(
ConfigItem(name="PositionRight", type="int", text=str(freeze_panes[0]))
)
config_item_map_entry.addElement(
ConfigItem(name="PositionBottom", type="int", text=str(freeze_panes[1]))
)
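
A hedged usage sketch exercising _create_freeze_panes through the public API (requires the optional odfpy dependency; the path is illustrative):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_excel("out.ods", engine="odf", freeze_panes=(1, 0))  # freeze the header row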

View File

@@ -0,0 +1,639 @@
from __future__ import annotations
import mmap
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import (
BaseExcelReader,
ExcelWriter,
)
from pandas.io.excel._util import (
combine_kwargs,
validate_freeze_panes,
)
if TYPE_CHECKING:
from openpyxl import Workbook
from openpyxl.descriptors.serialisable import Serialisable
from pandas._typing import (
ExcelWriterIfSheetExists,
FilePath,
ReadBuffer,
Scalar,
StorageOptions,
WriteExcelBuffer,
)
class OpenpyxlWriter(ExcelWriter):
_engine = "openpyxl"
_supported_extensions = (".xlsx", ".xlsm")
def __init__(
self,
path: FilePath | WriteExcelBuffer | ExcelWriter,
engine: str | None = None,
date_format: str | None = None,
datetime_format: str | None = None,
mode: str = "w",
storage_options: StorageOptions | None = None,
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
engine_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> None:
# Use the openpyxl module as the Excel writer.
from openpyxl.workbook import Workbook
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
super().__init__(
path,
mode=mode,
storage_options=storage_options,
if_sheet_exists=if_sheet_exists,
engine_kwargs=engine_kwargs,
)
# ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
# the file and later write to it
if "r+" in self._mode: # Load from existing workbook
from openpyxl import load_workbook
try:
self._book = load_workbook(self._handles.handle, **engine_kwargs)
except TypeError:
self._handles.handle.close()
raise
self._handles.handle.seek(0)
else:
# Create workbook object with default optimized_write=True.
try:
self._book = Workbook(**engine_kwargs)
except TypeError:
self._handles.handle.close()
raise
if self.book.worksheets:
self.book.remove(self.book.worksheets[0])
@property
def book(self) -> Workbook:
"""
Book instance of class openpyxl.workbook.Workbook.
This attribute can be used to access engine-specific features.
"""
return self._book
@property
def sheets(self) -> dict[str, Any]:
"""Mapping of sheet names to sheet objects."""
result = {name: self.book[name] for name in self.book.sheetnames}
return result
def _save(self) -> None:
"""
Save workbook to disk.
"""
self.book.save(self._handles.handle)
if "r+" in self._mode and not isinstance(self._handles.handle, mmap.mmap):
# truncate file to the written content
self._handles.handle.truncate()
@classmethod
def _convert_to_style_kwargs(cls, style_dict: dict) -> dict[str, Serialisable]:
"""
Convert a style_dict to a set of kwargs suitable for initializing
or updating-on-copy an openpyxl v2 style object.
Parameters
----------
style_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'font'
'fill'
'border' ('borders')
'alignment'
'number_format'
'protection'
Returns
-------
style_kwargs : dict
A dict with the same, normalized keys as ``style_dict`` but each
value has been replaced with a native openpyxl style object of the
appropriate class.
"""
_style_key_map = {"borders": "border"}
style_kwargs: dict[str, Serialisable] = {}
for k, v in style_dict.items():
k = _style_key_map.get(k, k)
_conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
new_v = _conv_to_x(v)
if new_v:
style_kwargs[k] = new_v
return style_kwargs
@classmethod
def _convert_to_color(cls, color_spec):
"""
Convert ``color_spec`` to an openpyxl v2 Color object.
Parameters
----------
color_spec : str, dict
A 32-bit ARGB hex string, or a dict with zero or more of the
following keys.
'rgb'
'indexed'
'auto'
'theme'
'tint'
'index'
'type'
Returns
-------
color : openpyxl.styles.Color
"""
from openpyxl.styles import Color
if isinstance(color_spec, str):
return Color(color_spec)
else:
return Color(**color_spec)
@classmethod
def _convert_to_font(cls, font_dict):
"""
Convert ``font_dict`` to an openpyxl v2 Font object.
Parameters
----------
font_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'name'
'size' ('sz')
'bold' ('b')
'italic' ('i')
'underline' ('u')
'strikethrough' ('strike')
'color'
'vertAlign' ('vertalign')
'charset'
'scheme'
'family'
'outline'
'shadow'
'condense'
Returns
-------
font : openpyxl.styles.Font
"""
from openpyxl.styles import Font
_font_key_map = {
"sz": "size",
"b": "bold",
"i": "italic",
"u": "underline",
"strike": "strikethrough",
"vertalign": "vertAlign",
}
font_kwargs = {}
for k, v in font_dict.items():
k = _font_key_map.get(k, k)
if k == "color":
v = cls._convert_to_color(v)
font_kwargs[k] = v
return Font(**font_kwargs)
@classmethod
def _convert_to_stop(cls, stop_seq):
"""
        Convert ``stop_seq`` to an iterator of openpyxl v2 Color objects,
suitable for initializing the ``GradientFill`` ``stop`` parameter.
Parameters
----------
stop_seq : iterable
An iterable that yields objects suitable for consumption by
``_convert_to_color``.
Returns
-------
        stop : iterator of openpyxl.styles.Color
"""
return map(cls._convert_to_color, stop_seq)
@classmethod
def _convert_to_fill(cls, fill_dict: dict[str, Any]):
"""
Convert ``fill_dict`` to an openpyxl v2 Fill object.
Parameters
----------
fill_dict : dict
A dict with one or more of the following keys (or their synonyms),
'fill_type' ('patternType', 'patterntype')
'start_color' ('fgColor', 'fgcolor')
'end_color' ('bgColor', 'bgcolor')
or one or more of the following keys (or their synonyms).
'type' ('fill_type')
'degree'
'left'
'right'
'top'
'bottom'
'stop'
Returns
-------
fill : openpyxl.styles.Fill
"""
from openpyxl.styles import (
GradientFill,
PatternFill,
)
_pattern_fill_key_map = {
"patternType": "fill_type",
"patterntype": "fill_type",
"fgColor": "start_color",
"fgcolor": "start_color",
"bgColor": "end_color",
"bgcolor": "end_color",
}
_gradient_fill_key_map = {"fill_type": "type"}
pfill_kwargs = {}
gfill_kwargs = {}
for k, v in fill_dict.items():
pk = _pattern_fill_key_map.get(k)
gk = _gradient_fill_key_map.get(k)
if pk in ["start_color", "end_color"]:
v = cls._convert_to_color(v)
if gk == "stop":
v = cls._convert_to_stop(v)
if pk:
pfill_kwargs[pk] = v
elif gk:
gfill_kwargs[gk] = v
else:
pfill_kwargs[k] = v
gfill_kwargs[k] = v
try:
return PatternFill(**pfill_kwargs)
except TypeError:
return GradientFill(**gfill_kwargs)
@classmethod
def _convert_to_side(cls, side_spec):
"""
Convert ``side_spec`` to an openpyxl v2 Side object.
Parameters
----------
side_spec : str, dict
A string specifying the border style, or a dict with zero or more
of the following keys (or their synonyms).
'style' ('border_style')
'color'
Returns
-------
side : openpyxl.styles.Side
"""
from openpyxl.styles import Side
_side_key_map = {"border_style": "style"}
if isinstance(side_spec, str):
return Side(style=side_spec)
side_kwargs = {}
for k, v in side_spec.items():
k = _side_key_map.get(k, k)
if k == "color":
v = cls._convert_to_color(v)
side_kwargs[k] = v
return Side(**side_kwargs)
@classmethod
def _convert_to_border(cls, border_dict):
"""
Convert ``border_dict`` to an openpyxl v2 Border object.
Parameters
----------
border_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'left'
'right'
'top'
'bottom'
'diagonal'
'diagonal_direction'
'vertical'
'horizontal'
'diagonalUp' ('diagonalup')
'diagonalDown' ('diagonaldown')
'outline'
Returns
-------
border : openpyxl.styles.Border
"""
from openpyxl.styles import Border
_border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
border_kwargs = {}
for k, v in border_dict.items():
k = _border_key_map.get(k, k)
if k == "color":
v = cls._convert_to_color(v)
if k in ["left", "right", "top", "bottom", "diagonal"]:
v = cls._convert_to_side(v)
border_kwargs[k] = v
return Border(**border_kwargs)
@classmethod
def _convert_to_alignment(cls, alignment_dict):
"""
Convert ``alignment_dict`` to an openpyxl v2 Alignment object.
Parameters
----------
alignment_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'horizontal'
'vertical'
'text_rotation'
'wrap_text'
'shrink_to_fit'
'indent'
Returns
-------
alignment : openpyxl.styles.Alignment
"""
from openpyxl.styles import Alignment
return Alignment(**alignment_dict)
@classmethod
def _convert_to_number_format(cls, number_format_dict):
"""
Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
initializer.
Parameters
----------
number_format_dict : dict
A dict with zero or more of the following keys.
'format_code' : str
Returns
-------
number_format : str
"""
return number_format_dict["format_code"]
@classmethod
def _convert_to_protection(cls, protection_dict):
"""
Convert ``protection_dict`` to an openpyxl v2 Protection object.
Parameters
----------
protection_dict : dict
A dict with zero or more of the following keys.
'locked'
'hidden'
        Returns
        -------
        protection : openpyxl.styles.Protection
        """
from openpyxl.styles import Protection
return Protection(**protection_dict)
def _write_cells(
self,
cells,
sheet_name: str | None = None,
startrow: int = 0,
startcol: int = 0,
freeze_panes: tuple[int, int] | None = None,
) -> None:
# Write the frame cells using openpyxl.
sheet_name = self._get_sheet_name(sheet_name)
_style_cache: dict[str, dict[str, Serialisable]] = {}
if sheet_name in self.sheets and self._if_sheet_exists != "new":
if "r+" in self._mode:
if self._if_sheet_exists == "replace":
old_wks = self.sheets[sheet_name]
target_index = self.book.index(old_wks)
del self.book[sheet_name]
wks = self.book.create_sheet(sheet_name, target_index)
elif self._if_sheet_exists == "error":
raise ValueError(
f"Sheet '{sheet_name}' already exists and "
f"if_sheet_exists is set to 'error'."
)
elif self._if_sheet_exists == "overlay":
wks = self.sheets[sheet_name]
else:
raise ValueError(
f"'{self._if_sheet_exists}' is not valid for if_sheet_exists. "
"Valid options are 'error', 'new', 'replace' and 'overlay'."
)
else:
wks = self.sheets[sheet_name]
else:
wks = self.book.create_sheet()
wks.title = sheet_name
if validate_freeze_panes(freeze_panes):
freeze_panes = cast(tuple[int, int], freeze_panes)
wks.freeze_panes = wks.cell(
row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
)
for cell in cells:
xcell = wks.cell(
row=startrow + cell.row + 1, column=startcol + cell.col + 1
)
xcell.value, fmt = self._value_with_fmt(cell.val)
if fmt:
xcell.number_format = fmt
style_kwargs: dict[str, Serialisable] | None = {}
if cell.style:
key = str(cell.style)
style_kwargs = _style_cache.get(key)
if style_kwargs is None:
style_kwargs = self._convert_to_style_kwargs(cell.style)
_style_cache[key] = style_kwargs
if style_kwargs:
for k, v in style_kwargs.items():
setattr(xcell, k, v)
if cell.mergestart is not None and cell.mergeend is not None:
wks.merge_cells(
start_row=startrow + cell.row + 1,
start_column=startcol + cell.col + 1,
end_column=startcol + cell.mergeend + 1,
end_row=startrow + cell.mergestart + 1,
)
# When cells are merged only the top-left cell is preserved
# The behaviour of the other cells in a merged range is
# undefined
if style_kwargs:
first_row = startrow + cell.row + 1
last_row = startrow + cell.mergestart + 1
first_col = startcol + cell.col + 1
last_col = startcol + cell.mergeend + 1
for row in range(first_row, last_row + 1):
for col in range(first_col, last_col + 1):
if row == first_row and col == first_col:
# Ignore first cell. It is already handled.
continue
xcell = wks.cell(column=col, row=row)
for k, v in style_kwargs.items():
setattr(xcell, k, v)
class OpenpyxlReader(BaseExcelReader["Workbook"]):
@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using openpyxl engine.
Parameters
----------
filepath_or_buffer : str, path object or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
import_optional_dependency("openpyxl")
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
@property
def _workbook_class(self) -> type[Workbook]:
from openpyxl import Workbook
return Workbook
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
) -> Workbook:
from openpyxl import load_workbook
default_kwargs = {"read_only": True, "data_only": True, "keep_links": False}
return load_workbook(
filepath_or_buffer,
**(default_kwargs | engine_kwargs),
)
@property
def sheet_names(self) -> list[str]:
return [sheet.title for sheet in self.book.worksheets]
def get_sheet_by_name(self, name: str):
self.raise_if_bad_sheet_by_name(name)
return self.book[name]
def get_sheet_by_index(self, index: int):
self.raise_if_bad_sheet_by_index(index)
return self.book.worksheets[index]
def _convert_cell(self, cell) -> Scalar:
from openpyxl.cell.cell import (
TYPE_ERROR,
TYPE_NUMERIC,
)
if cell.value is None:
return "" # compat with xlrd
elif cell.data_type == TYPE_ERROR:
return np.nan
elif cell.data_type == TYPE_NUMERIC:
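# whole-valued floats are narrowed to int (e.g. 3.0 -> 3); others stay float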
val = int(cell.value)
if val == cell.value:
return val
return float(cell.value)
return cell.value
def get_sheet_data(
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
if self.book.read_only:
sheet.reset_dimensions()
data: list[list[Scalar]] = []
last_row_with_data = -1
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
# Trim trailing empty rows
data = data[: last_row_with_data + 1]
if len(data) > 0:
# extend rows to max width
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
empty_cell: list[Scalar] = [""]
data = [
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
]
return data

View File

@ -0,0 +1,127 @@
# pyright: reportMissingImports=false
from __future__ import annotations
from typing import TYPE_CHECKING
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import BaseExcelReader
if TYPE_CHECKING:
from pyxlsb import Workbook
from pandas._typing import (
FilePath,
ReadBuffer,
Scalar,
StorageOptions,
)
class PyxlsbReader(BaseExcelReader["Workbook"]):
@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using pyxlsb engine.
Parameters
----------
filepath_or_buffer : str, path object, or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
import_optional_dependency("pyxlsb")
# This will call load_workbook on the filepath or buffer
# and set the result to the book attribute
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
@property
def _workbook_class(self) -> type[Workbook]:
from pyxlsb import Workbook
return Workbook
def load_workbook(
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
) -> Workbook:
from pyxlsb import open_workbook
# TODO: hack in buffer capability
# This might need some modifications to the Pyxlsb library
# Actual work for opening it is in xlsbpackage.py, line 20-ish
return open_workbook(filepath_or_buffer, **engine_kwargs)
@property
def sheet_names(self) -> list[str]:
return self.book.sheets
def get_sheet_by_name(self, name: str):
self.raise_if_bad_sheet_by_name(name)
return self.book.get_sheet(name)
def get_sheet_by_index(self, index: int):
self.raise_if_bad_sheet_by_index(index)
# pyxlsb sheets are indexed from 1 onwards
# There's a fix for this in the source, but the pypi package doesn't have it
return self.book.get_sheet(index + 1)
def _convert_cell(self, cell) -> Scalar:
# TODO: there is no way to distinguish between floats and datetimes in pyxlsb
# This means that there is no way to read datetime types from an xlsb file yet
if cell.v is None:
return "" # Prevents non-named columns from not showing up as Unnamed: i
if isinstance(cell.v, float):
val = int(cell.v)
if val == cell.v:
return val
else:
return float(cell.v)
return cell.v
def get_sheet_data(
self,
sheet,
file_rows_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
previous_row_number = -1
# When sparse=True the rows can have different lengths and empty rows are
# not returned. The cells are namedtuples of row, col, value (r, c, v).
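# e.g. data rows seen at indices 0 and 3 yield two [] placeholders for the
# skipped empty rows 1 and 2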
for row in sheet.rows(sparse=True):
row_number = row[0].r
converted_row = [self._convert_cell(cell) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
if converted_row:
data.extend([[]] * (row_number - previous_row_number - 1))
data.append(converted_row)
previous_row_number = row_number
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
if data:
# extend rows to max_width
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
empty_cell: list[Scalar] = [""]
data = [
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
]
return data

View File

@ -0,0 +1,334 @@
from __future__ import annotations
from collections.abc import (
Hashable,
Iterable,
MutableMapping,
Sequence,
)
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
TypeVar,
overload,
)
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.common import (
is_integer,
is_list_like,
)
if TYPE_CHECKING:
from pandas.io.excel._base import ExcelWriter
ExcelWriter_t = type[ExcelWriter]
usecols_func = TypeVar("usecols_func", bound=Callable[[Hashable], object])
_writers: MutableMapping[str, ExcelWriter_t] = {}
def register_writer(klass: ExcelWriter_t) -> None:
"""
Add engine to the excel writer registry.
You must use this method to integrate with ``to_excel``.
Parameters
----------
klass : ExcelWriter
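Examples
--------
A minimal sketch with a hypothetical engine class (a real engine must
implement the full ``ExcelWriter`` interface):
>>> class MyWriter(ExcelWriter):  # doctest: +SKIP
...     _engine = "mywriter"
...     _supported_extensions = (".xlsx",)
>>> register_writer(MyWriter)  # doctest: +SKIP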
"""
if not callable(klass):
raise ValueError("Can only register callables as engines")
engine_name = klass._engine
_writers[engine_name] = klass
def get_default_engine(ext: str, mode: Literal["reader", "writer"] = "reader") -> str:
"""
Return the default reader/writer for the given extension.
Parameters
----------
ext : str
The excel file extension for which to get the default engine.
mode : str {'reader', 'writer'}
Whether to get the default engine for reading or writing.
Returns
-------
str
The default engine for the extension.
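Examples
--------
With the default ``mode="reader"`` this is a plain table lookup; the writer
default for ``"xlsx"`` additionally depends on whether xlsxwriter is installed:
>>> get_default_engine('xls')
'xlrd'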
"""
_default_readers = {
"xlsx": "openpyxl",
"xlsm": "openpyxl",
"xlsb": "pyxlsb",
"xls": "xlrd",
"ods": "odf",
}
_default_writers = {
"xlsx": "openpyxl",
"xlsm": "openpyxl",
"xlsb": "pyxlsb",
"ods": "odf",
}
assert mode in ["reader", "writer"]
if mode == "writer":
# Prefer xlsxwriter over openpyxl if installed
xlsxwriter = import_optional_dependency("xlsxwriter", errors="warn")
if xlsxwriter:
_default_writers["xlsx"] = "xlsxwriter"
return _default_writers[ext]
else:
return _default_readers[ext]
def get_writer(engine_name: str) -> ExcelWriter_t:
try:
return _writers[engine_name]
except KeyError as err:
raise ValueError(f"No Excel writer '{engine_name}'") from err
def _excel2num(x: str) -> int:
"""
Convert Excel column name like 'AB' to 0-based column index.
Parameters
----------
x : str
The Excel column name to convert to a 0-based column index.
Returns
-------
num : int
The column index corresponding to the name.
Raises
------
ValueError
Part of the Excel column name was invalid.
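Examples
--------
The mapping is 0-based, so ``'A'`` maps to 0:
>>> _excel2num('A')
0
>>> _excel2num('AB')
27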
"""
index = 0
for c in x.upper().strip():
cp = ord(c)
if cp < ord("A") or cp > ord("Z"):
raise ValueError(f"Invalid column name: {x}")
index = index * 26 + cp - ord("A") + 1
return index - 1
def _range2cols(areas: str) -> list[int]:
"""
Convert comma separated list of column names and ranges to indices.
Parameters
----------
areas : str
A string containing a sequence of column ranges (or areas).
Returns
-------
cols : list
A list of 0-based column indices.
Examples
--------
>>> _range2cols('A:E')
[0, 1, 2, 3, 4]
>>> _range2cols('A,C,Z:AB')
[0, 2, 25, 26, 27]
"""
cols: list[int] = []
for rng in areas.split(","):
if ":" in rng:
rngs = rng.split(":")
cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1))
else:
cols.append(_excel2num(rng))
return cols
@overload
def maybe_convert_usecols(usecols: str | list[int]) -> list[int]:
...
@overload
def maybe_convert_usecols(usecols: list[str]) -> list[str]:
...
@overload
def maybe_convert_usecols(usecols: usecols_func) -> usecols_func:
...
@overload
def maybe_convert_usecols(usecols: None) -> None:
...
def maybe_convert_usecols(
usecols: str | list[int] | list[str] | usecols_func | None,
) -> None | list[int] | list[str] | usecols_func:
"""
Convert `usecols` into a compatible format for parsing in `parsers.py`.
Parameters
----------
usecols : object
The use-columns object to potentially convert.
Returns
-------
converted : object
The compatible format of `usecols`.
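Examples
--------
Strings are resolved via ``_range2cols``; lists and callables pass through
unchanged:
>>> maybe_convert_usecols('A:C')
[0, 1, 2]
>>> maybe_convert_usecols([0, 2])
[0, 2]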
"""
if usecols is None:
return usecols
if is_integer(usecols):
raise ValueError(
"Passing an integer for `usecols` is no longer supported. "
"Please pass in a list of int from 0 to `usecols` inclusive instead."
)
if isinstance(usecols, str):
return _range2cols(usecols)
return usecols
@overload
def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]:
...
@overload
def validate_freeze_panes(freeze_panes: None) -> Literal[False]:
...
def validate_freeze_panes(freeze_panes: tuple[int, int] | None) -> bool:
if freeze_panes is not None:
if len(freeze_panes) == 2 and all(
isinstance(item, int) for item in freeze_panes
):
return True
raise ValueError(
"freeze_panes must be of form (row, column) "
"where row and column are integers"
)
# freeze_panes wasn't specified, return False so it won't be applied
# to output sheet
return False
def fill_mi_header(
row: list[Hashable], control_row: list[bool]
) -> tuple[list[Hashable], list[bool]]:
"""
Forward fill blank entries in row but only inside the same parent index.
Used for creating headers in Multiindex.
Parameters
----------
row : list
List of items in a single row.
control_row : list of bool
Helps to determine if a particular column is in the same parent index
as the previous value. Used to stop propagation of empty cells between
different indexes.
Returns
-------
Returns changed row and control_row
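Examples
--------
A blank cell is forward filled only while ``control_row`` marks it as part
of the same parent index:
>>> fill_mi_header(["a", "", "b", ""], [True, True, True, True])
(['a', 'a', 'b', 'b'], [True, True, False, False])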
"""
last = row[0]
for i in range(1, len(row)):
if not control_row[i]:
last = row[i]
if row[i] == "" or row[i] is None:
row[i] = last
else:
control_row[i] = False
last = row[i]
return row, control_row
def pop_header_name(
row: list[Hashable], index_col: int | Sequence[int]
) -> tuple[Hashable | None, list[Hashable]]:
"""
Pop the header name for MultiIndex parsing.
Parameters
----------
row : list
The data row to parse for the header name.
index_col : int, list
The index columns for our data. Assumed to be non-null.
Returns
-------
header_name : str
The extracted header name.
trimmed_row : list
The original data row with the header name removed.
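Examples
--------
The name in the last index column is popped and replaced with a blank:
>>> pop_header_name(["idx", "a", "b"], index_col=0)
('idx', ['', 'a', 'b'])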
"""
# Pop out header name and fill w/blank.
if is_list_like(index_col):
assert isinstance(index_col, Iterable)
i = max(index_col)
else:
assert not isinstance(index_col, Iterable)
i = index_col
header_name = row[i]
header_name = None if header_name == "" else header_name
return header_name, row[:i] + [""] + row[i + 1 :]
def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict:
"""
Used to combine two sources of kwargs for the backend engine.
Use of kwargs is deprecated; this function is solely for use in 1.3 and should
be removed in 1.4/2.0. Also, ``_base.ExcelWriter.__new__`` ensures that
``engine_kwargs`` and ``kwargs`` are not both supplied.
Parameters
----------
engine_kwargs : dict
kwargs to be passed through to the engine.
kwargs : dict
kwargs to be passed through to the engine (deprecated).
Returns
-------
engine_kwargs combined with kwargs
"""
if engine_kwargs is None:
result = {}
else:
result = engine_kwargs.copy()
result.update(kwargs)
return result

View File

@ -0,0 +1,143 @@
from __future__ import annotations
from datetime import time
import math
from typing import TYPE_CHECKING
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import BaseExcelReader
if TYPE_CHECKING:
from xlrd import Book
from pandas._typing import (
Scalar,
StorageOptions,
)
class XlrdReader(BaseExcelReader["Book"]):
@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self,
filepath_or_buffer,
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using xlrd engine.
Parameters
----------
filepath_or_buffer : str, path object or Workbook
Object to be parsed.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
import_optional_dependency("xlrd", extra=err_msg)
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
@property
def _workbook_class(self) -> type[Book]:
from xlrd import Book
return Book
def load_workbook(self, filepath_or_buffer, engine_kwargs) -> Book:
from xlrd import open_workbook
if hasattr(filepath_or_buffer, "read"):
data = filepath_or_buffer.read()
return open_workbook(file_contents=data, **engine_kwargs)
else:
return open_workbook(filepath_or_buffer, **engine_kwargs)
@property
def sheet_names(self):
return self.book.sheet_names()
def get_sheet_by_name(self, name):
self.raise_if_bad_sheet_by_name(name)
return self.book.sheet_by_name(name)
def get_sheet_by_index(self, index):
self.raise_if_bad_sheet_by_index(index)
return self.book.sheet_by_index(index)
def get_sheet_data(
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
XL_CELL_DATE,
XL_CELL_ERROR,
XL_CELL_NUMBER,
xldate,
)
epoch1904 = self.book.datemode
def _parse_cell(cell_contents, cell_typ):
"""
Convert the contents of the cell into a pandas-appropriate object.
"""
if cell_typ == XL_CELL_DATE:
# Use the newer xlrd datetime handling.
try:
cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
except OverflowError:
return cell_contents
# Excel doesn't distinguish between dates and times,
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (cell_contents.timetuple())[0:3]
if (not epoch1904 and year == (1899, 12, 31)) or (
epoch1904 and year == (1904, 1, 1)
):
cell_contents = time(
cell_contents.hour,
cell_contents.minute,
cell_contents.second,
cell_contents.microsecond,
)
elif cell_typ == XL_CELL_ERROR:
cell_contents = np.nan
elif cell_typ == XL_CELL_BOOLEAN:
cell_contents = bool(cell_contents)
elif cell_typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
if math.isfinite(cell_contents):
# GH54564 - don't attempt to convert NaN/Inf
val = int(cell_contents)
if val == cell_contents:
cell_contents = val
return cell_contents
data = []
nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)
return data

View File

@ -0,0 +1,284 @@
from __future__ import annotations
import json
from typing import (
TYPE_CHECKING,
Any,
)
from pandas.io.excel._base import ExcelWriter
from pandas.io.excel._util import (
combine_kwargs,
validate_freeze_panes,
)
if TYPE_CHECKING:
from pandas._typing import (
ExcelWriterIfSheetExists,
FilePath,
StorageOptions,
WriteExcelBuffer,
)
class _XlsxStyler:
# Map from openpyxl-oriented styles to flatter xlsxwriter representation
# Ordering necessary for both determinism and because some are keyed by
# prefixes of others.
STYLE_MAPPING: dict[str, list[tuple[tuple[str, ...], str]]] = {
"font": [
(("name",), "font_name"),
(("sz",), "font_size"),
(("size",), "font_size"),
(("color", "rgb"), "font_color"),
(("color",), "font_color"),
(("b",), "bold"),
(("bold",), "bold"),
(("i",), "italic"),
(("italic",), "italic"),
(("u",), "underline"),
(("underline",), "underline"),
(("strike",), "font_strikeout"),
(("vertAlign",), "font_script"),
(("vertalign",), "font_script"),
],
"number_format": [(("format_code",), "num_format"), ((), "num_format")],
"protection": [(("locked",), "locked"), (("hidden",), "hidden")],
"alignment": [
(("horizontal",), "align"),
(("vertical",), "valign"),
(("text_rotation",), "rotation"),
(("wrap_text",), "text_wrap"),
(("indent",), "indent"),
(("shrink_to_fit",), "shrink"),
],
"fill": [
(("patternType",), "pattern"),
(("patterntype",), "pattern"),
(("fill_type",), "pattern"),
(("start_color", "rgb"), "fg_color"),
(("fgColor", "rgb"), "fg_color"),
(("fgcolor", "rgb"), "fg_color"),
(("start_color",), "fg_color"),
(("fgColor",), "fg_color"),
(("fgcolor",), "fg_color"),
(("end_color", "rgb"), "bg_color"),
(("bgColor", "rgb"), "bg_color"),
(("bgcolor", "rgb"), "bg_color"),
(("end_color",), "bg_color"),
(("bgColor",), "bg_color"),
(("bgcolor",), "bg_color"),
],
"border": [
(("color", "rgb"), "border_color"),
(("color",), "border_color"),
(("style",), "border"),
(("top", "color", "rgb"), "top_color"),
(("top", "color"), "top_color"),
(("top", "style"), "top"),
(("top",), "top"),
(("right", "color", "rgb"), "right_color"),
(("right", "color"), "right_color"),
(("right", "style"), "right"),
(("right",), "right"),
(("bottom", "color", "rgb"), "bottom_color"),
(("bottom", "color"), "bottom_color"),
(("bottom", "style"), "bottom"),
(("bottom",), "bottom"),
(("left", "color", "rgb"), "left_color"),
(("left", "color"), "left_color"),
(("left", "style"), "left"),
(("left",), "left"),
],
}
@classmethod
def convert(cls, style_dict, num_format_str=None):
"""
Convert a ``style_dict`` to an xlsxwriter format dict.
Parameters
----------
style_dict : dict
Style dictionary to convert.
num_format_str : str, optional
Number format string.
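Examples
--------
Nested openpyxl-style keys are flattened to xlsxwriter keywords:
>>> _XlsxStyler.convert({"font": {"bold": True, "color": "FF0000"}})
{'font_color': 'FF0000', 'bold': True}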
"""
# Create a XlsxWriter format object.
props = {}
if num_format_str is not None:
props["num_format"] = num_format_str
if style_dict is None:
return props
if "borders" in style_dict:
style_dict = style_dict.copy()
style_dict["border"] = style_dict.pop("borders")
for style_group_key, style_group in style_dict.items():
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
# src is a sequence of keys into a nested dict
# dst is a flat key
if dst in props:
continue
v = style_group
for k in src:
try:
v = v[k]
except (KeyError, TypeError):
break
else:
props[dst] = v
if isinstance(props.get("pattern"), str):
# TODO: support other fill patterns
props["pattern"] = 0 if props["pattern"] == "none" else 1
for k in ["border", "top", "right", "bottom", "left"]:
if isinstance(props.get(k), str):
try:
props[k] = [
"none",
"thin",
"medium",
"dashed",
"dotted",
"thick",
"double",
"hair",
"mediumDashed",
"dashDot",
"mediumDashDot",
"dashDotDot",
"mediumDashDotDot",
"slantDashDot",
].index(props[k])
except ValueError:
props[k] = 2
if isinstance(props.get("font_script"), str):
props["font_script"] = ["baseline", "superscript", "subscript"].index(
props["font_script"]
)
if isinstance(props.get("underline"), str):
props["underline"] = {
"none": 0,
"single": 1,
"double": 2,
"singleAccounting": 33,
"doubleAccounting": 34,
}[props["underline"]]
# GH 30107 - xlsxwriter uses different name
if props.get("valign") == "center":
props["valign"] = "vcenter"
return props
class XlsxWriter(ExcelWriter):
_engine = "xlsxwriter"
_supported_extensions = (".xlsx",)
def __init__(
self,
path: FilePath | WriteExcelBuffer | ExcelWriter,
engine: str | None = None,
date_format: str | None = None,
datetime_format: str | None = None,
mode: str = "w",
storage_options: StorageOptions | None = None,
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
engine_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> None:
# Use the xlsxwriter module as the Excel writer.
from xlsxwriter import Workbook
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
if mode == "a":
raise ValueError("Append mode is not supported with xlsxwriter!")
super().__init__(
path,
engine=engine,
date_format=date_format,
datetime_format=datetime_format,
mode=mode,
storage_options=storage_options,
if_sheet_exists=if_sheet_exists,
engine_kwargs=engine_kwargs,
)
try:
self._book = Workbook(self._handles.handle, **engine_kwargs)
except TypeError:
self._handles.handle.close()
raise
@property
def book(self):
"""
Book instance of class xlsxwriter.Workbook.
This attribute can be used to access engine-specific features.
"""
return self._book
@property
def sheets(self) -> dict[str, Any]:
result = self.book.sheetnames
return result
def _save(self) -> None:
"""
Save workbook to disk.
"""
self.book.close()
def _write_cells(
self,
cells,
sheet_name: str | None = None,
startrow: int = 0,
startcol: int = 0,
freeze_panes: tuple[int, int] | None = None,
) -> None:
# Write the frame cells using xlsxwriter.
sheet_name = self._get_sheet_name(sheet_name)
wks = self.book.get_worksheet_by_name(sheet_name)
if wks is None:
wks = self.book.add_worksheet(sheet_name)
style_dict = {"null": None}
if validate_freeze_panes(freeze_panes):
wks.freeze_panes(*(freeze_panes))
for cell in cells:
val, fmt = self._value_with_fmt(cell.val)
stylekey = json.dumps(cell.style)
if fmt:
stylekey += fmt
if stylekey in style_dict:
style = style_dict[stylekey]
else:
style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
style_dict[stylekey] = style
if cell.mergestart is not None and cell.mergeend is not None:
wks.merge_range(
startrow + cell.row,
startcol + cell.col,
startrow + cell.mergestart,
startcol + cell.mergeend,
val,
style,
)
else:
wks.write(startrow + cell.row, startcol + cell.col, val, style)

View File

@ -0,0 +1,143 @@
""" feather-format compat """
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
from pandas._config import using_pyarrow_string_dtype
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.util._validators import check_dtype_backend
import pandas as pd
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs
from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import get_handle
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Sequence,
)
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
@doc(storage_options=_shared_docs["storage_options"])
def to_feather(
df: DataFrame,
path: FilePath | WriteBuffer[bytes],
storage_options: StorageOptions | None = None,
**kwargs: Any,
) -> None:
"""
Write a DataFrame to the binary Feather format.
Parameters
----------
df : DataFrame
path : str, path object, or file-like object
{storage_options}
**kwargs :
Additional keywords passed to `pyarrow.feather.write_feather`.
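Examples
--------
Writing a frame to a local path (requires pyarrow):
>>> df = pd.DataFrame({"a": [1, 2]})  # doctest: +SKIP
>>> to_feather(df, "out.feather")  # doctest: +SKIP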
"""
import_optional_dependency("pyarrow")
from pyarrow import feather
if not isinstance(df, DataFrame):
raise ValueError("feather only support IO with DataFrames")
with get_handle(
path, "wb", storage_options=storage_options, is_text=False
) as handles:
feather.write_feather(df, handles.handle, **kwargs)
@doc(storage_options=_shared_docs["storage_options"])
def read_feather(
path: FilePath | ReadBuffer[bytes],
columns: Sequence[Hashable] | None = None,
use_threads: bool = True,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
"""
Load a feather-format object from the file path.
Parameters
----------
path : str, path object, or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``read()`` function. The string could be a URL.
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be: ``file://localhost/path/to/table.feather``.
columns : sequence, default None
If not provided, all columns are read.
use_threads : bool, default True
Whether to parallelize reading using multiple threads.
{storage_options}
dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
(default).
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
DataFrame.
.. versionadded:: 2.0
Returns
-------
DataFrame
The object stored in the file.
Examples
--------
>>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP
"""
import_optional_dependency("pyarrow")
from pyarrow import feather
# import utils to register the pyarrow extension types
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401
check_dtype_backend(dtype_backend)
with get_handle(
path, "rb", storage_options=storage_options, is_text=False
) as handles:
if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
return feather.read_feather(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
pa_table = feather.read_table(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
if dtype_backend == "numpy_nullable":
from pandas.io._util import _arrow_dtype_mapping
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
elif dtype_backend == "pyarrow":
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
elif using_pyarrow_string_dtype():
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
raise NotImplementedError

View File

@ -0,0 +1,9 @@
# ruff: noqa: TCH004
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# import modules that have public classes/functions
from pandas.io.formats import style
# and mark only those modules as public
__all__ = ["style"]

View File

@ -0,0 +1,157 @@
# GH37967: Enable the use of CSS named colors, as defined in
# matplotlib.colors.CSS4_COLORS, when exporting to Excel.
# This data has been copied here, instead of being imported from matplotlib,
# not to have ``to_excel`` methods require matplotlib.
# source: matplotlib._color_data (3.3.3)
from __future__ import annotations
CSS4_COLORS = {
"aliceblue": "F0F8FF",
"antiquewhite": "FAEBD7",
"aqua": "00FFFF",
"aquamarine": "7FFFD4",
"azure": "F0FFFF",
"beige": "F5F5DC",
"bisque": "FFE4C4",
"black": "000000",
"blanchedalmond": "FFEBCD",
"blue": "0000FF",
"blueviolet": "8A2BE2",
"brown": "A52A2A",
"burlywood": "DEB887",
"cadetblue": "5F9EA0",
"chartreuse": "7FFF00",
"chocolate": "D2691E",
"coral": "FF7F50",
"cornflowerblue": "6495ED",
"cornsilk": "FFF8DC",
"crimson": "DC143C",
"cyan": "00FFFF",
"darkblue": "00008B",
"darkcyan": "008B8B",
"darkgoldenrod": "B8860B",
"darkgray": "A9A9A9",
"darkgreen": "006400",
"darkgrey": "A9A9A9",
"darkkhaki": "BDB76B",
"darkmagenta": "8B008B",
"darkolivegreen": "556B2F",
"darkorange": "FF8C00",
"darkorchid": "9932CC",
"darkred": "8B0000",
"darksalmon": "E9967A",
"darkseagreen": "8FBC8F",
"darkslateblue": "483D8B",
"darkslategray": "2F4F4F",
"darkslategrey": "2F4F4F",
"darkturquoise": "00CED1",
"darkviolet": "9400D3",
"deeppink": "FF1493",
"deepskyblue": "00BFFF",
"dimgray": "696969",
"dimgrey": "696969",
"dodgerblue": "1E90FF",
"firebrick": "B22222",
"floralwhite": "FFFAF0",
"forestgreen": "228B22",
"fuchsia": "FF00FF",
"gainsboro": "DCDCDC",
"ghostwhite": "F8F8FF",
"gold": "FFD700",
"goldenrod": "DAA520",
"gray": "808080",
"green": "008000",
"greenyellow": "ADFF2F",
"grey": "808080",
"honeydew": "F0FFF0",
"hotpink": "FF69B4",
"indianred": "CD5C5C",
"indigo": "4B0082",
"ivory": "FFFFF0",
"khaki": "F0E68C",
"lavender": "E6E6FA",
"lavenderblush": "FFF0F5",
"lawngreen": "7CFC00",
"lemonchiffon": "FFFACD",
"lightblue": "ADD8E6",
"lightcoral": "F08080",
"lightcyan": "E0FFFF",
"lightgoldenrodyellow": "FAFAD2",
"lightgray": "D3D3D3",
"lightgreen": "90EE90",
"lightgrey": "D3D3D3",
"lightpink": "FFB6C1",
"lightsalmon": "FFA07A",
"lightseagreen": "20B2AA",
"lightskyblue": "87CEFA",
"lightslategray": "778899",
"lightslategrey": "778899",
"lightsteelblue": "B0C4DE",
"lightyellow": "FFFFE0",
"lime": "00FF00",
"limegreen": "32CD32",
"linen": "FAF0E6",
"magenta": "FF00FF",
"maroon": "800000",
"mediumaquamarine": "66CDAA",
"mediumblue": "0000CD",
"mediumorchid": "BA55D3",
"mediumpurple": "9370DB",
"mediumseagreen": "3CB371",
"mediumslateblue": "7B68EE",
"mediumspringgreen": "00FA9A",
"mediumturquoise": "48D1CC",
"mediumvioletred": "C71585",
"midnightblue": "191970",
"mintcream": "F5FFFA",
"mistyrose": "FFE4E1",
"moccasin": "FFE4B5",
"navajowhite": "FFDEAD",
"navy": "000080",
"oldlace": "FDF5E6",
"olive": "808000",
"olivedrab": "6B8E23",
"orange": "FFA500",
"orangered": "FF4500",
"orchid": "DA70D6",
"palegoldenrod": "EEE8AA",
"palegreen": "98FB98",
"paleturquoise": "AFEEEE",
"palevioletred": "DB7093",
"papayawhip": "FFEFD5",
"peachpuff": "FFDAB9",
"peru": "CD853F",
"pink": "FFC0CB",
"plum": "DDA0DD",
"powderblue": "B0E0E6",
"purple": "800080",
"rebeccapurple": "663399",
"red": "FF0000",
"rosybrown": "BC8F8F",
"royalblue": "4169E1",
"saddlebrown": "8B4513",
"salmon": "FA8072",
"sandybrown": "F4A460",
"seagreen": "2E8B57",
"seashell": "FFF5EE",
"sienna": "A0522D",
"silver": "C0C0C0",
"skyblue": "87CEEB",
"slateblue": "6A5ACD",
"slategray": "708090",
"slategrey": "708090",
"snow": "FFFAFA",
"springgreen": "00FF7F",
"steelblue": "4682B4",
"tan": "D2B48C",
"teal": "008080",
"thistle": "D8BFD8",
"tomato": "FF6347",
"turquoise": "40E0D0",
"violet": "EE82EE",
"wheat": "F5DEB3",
"white": "FFFFFF",
"whitesmoke": "F5F5F5",
"yellow": "FFFF00",
"yellowgreen": "9ACD32",
}

View File

@ -0,0 +1,94 @@
"""
Internal module for console introspection
"""
from __future__ import annotations
from shutil import get_terminal_size
def get_console_size() -> tuple[int | None, int | None]:
"""
Return console size as tuple = (width, height).
Returns (None, None) in a non-interactive session.
"""
from pandas import get_option
display_width = get_option("display.width")
display_height = get_option("display.max_rows")
# Consider
# interactive shell terminal, can detect term size
# interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term size
# non-interactive script, should disregard term size
# in addition
# width,height have default values, but setting to 'None' signals
# should use auto-detection, but only in an interactive shell terminal.
# Simple. yeah.
if in_interactive_session():
if in_ipython_frontend():
# sane defaults for interactive non-shell terminal
# match default for width,height in config_init
from pandas._config.config import get_default_val
terminal_width = get_default_val("display.width")
terminal_height = get_default_val("display.max_rows")
else:
# pure terminal
terminal_width, terminal_height = get_terminal_size()
else:
terminal_width, terminal_height = None, None
# Note: if the user sets width/height to None (auto-detection)
# and we're in a script (non-interactive), this will return (None, None);
# the caller needs to handle that.
return display_width or terminal_width, display_height or terminal_height
# ----------------------------------------------------------------------
# Detect our environment
def in_interactive_session() -> bool:
"""
Check if we're running in an interactive shell.
Returns
-------
bool
True if running under python/ipython interactive shell.
"""
from pandas import get_option
def check_main():
try:
import __main__ as main
except ModuleNotFoundError:
return get_option("mode.sim_interactive")
return not hasattr(main, "__file__") or get_option("mode.sim_interactive")
try:
# error: Name '__IPYTHON__' is not defined
return __IPYTHON__ or check_main() # type: ignore[name-defined]
except NameError:
return check_main()
def in_ipython_frontend() -> bool:
"""
Check if we're inside an IPython zmq frontend.
Returns
-------
bool
"""
try:
# error: Name 'get_ipython' is not defined
ip = get_ipython() # type: ignore[name-defined]
return "zmq" in str(type(ip)).lower()
except NameError:
pass
return False

View File

@ -0,0 +1,421 @@
"""
Utilities for interpreting CSS from Stylers for formatting non-HTML outputs.
"""
from __future__ import annotations
import re
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
from pandas.errors import CSSWarning
from pandas.util._exceptions import find_stack_level
if TYPE_CHECKING:
from collections.abc import (
Generator,
Iterable,
Iterator,
)
def _side_expander(prop_fmt: str) -> Callable:
"""
Wrapper to expand shorthand property into top, right, bottom, left properties
Parameters
----------
prop_fmt : str
Format string for the expanded property names, e.g. ``"margin-{:s}"``.
Returns
-------
function : callable
Function to call when a shorthand property string is encountered.
"""
def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
"""
Expand shorthand property into side-specific property (top, right, bottom, left)
Parameters
----------
prop : str
CSS property name.
value : str
String token for the property.
Yields
------
Tuple (str, str): Expanded property, value
"""
tokens = value.split()
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn(
f'Could not expand "{prop}: {value}"',
CSSWarning,
stacklevel=find_stack_level(),
)
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
return expand
def _border_expander(side: str = "") -> Callable:
"""
Wrapper to expand 'border' property into border color, style, and width properties
Parameters
----------
side : str
The border side to expand into properties
Returns
-------
function : callable
Function to call when a ``'border(-{side}): {value}'`` string is encountered.
"""
if side != "":
side = f"-{side}"
def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
"""
Expand border into color, style, and width tuples
Parameters
----------
prop : str
CSS property name passed to styler
value : str
Value passed to styler for property
Yields
------
Tuple (str, str): Expanded property, value
"""
tokens = value.split()
if len(tokens) == 0 or len(tokens) > 3:
warnings.warn(
f'Too many tokens provided to "{prop}" (expected 1-3)',
CSSWarning,
stacklevel=find_stack_level(),
)
# TODO: Can we use current color as initial value to comply with CSS standards?
border_declarations = {
f"border{side}-color": "black",
f"border{side}-style": "none",
f"border{side}-width": "medium",
}
for token in tokens:
if token.lower() in self.BORDER_STYLES:
border_declarations[f"border{side}-style"] = token
elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS):
border_declarations[f"border{side}-width"] = token
else:
border_declarations[f"border{side}-color"] = token
# TODO: Warn user if item entered more than once (e.g. "border: red green")
# Per CSS, "border" will reset previous "border-*" definitions
yield from self.atomize(border_declarations.items())
return expand
class CSSResolver:
"""
A callable for parsing and resolving CSS to atomic properties.
"""
UNIT_RATIOS = {
"pt": ("pt", 1),
"em": ("em", 1),
"rem": ("pt", 12),
"ex": ("em", 0.5),
# 'ch':
"px": ("pt", 0.75),
"pc": ("pt", 12),
"in": ("pt", 72),
"cm": ("in", 1 / 2.54),
"mm": ("in", 1 / 25.4),
"q": ("mm", 0.25),
"!!default": ("em", 0),
}
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
FONT_SIZE_RATIOS.update(
{
"%": ("em", 0.01),
"xx-small": ("rem", 0.5),
"x-small": ("rem", 0.625),
"small": ("rem", 0.8),
"medium": ("rem", 1),
"large": ("rem", 1.125),
"x-large": ("rem", 1.5),
"xx-large": ("rem", 2),
"smaller": ("em", 1 / 1.2),
"larger": ("em", 1.2),
"!!default": ("em", 1),
}
)
MARGIN_RATIOS = UNIT_RATIOS.copy()
MARGIN_RATIOS.update({"none": ("pt", 0)})
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
BORDER_WIDTH_RATIOS.update(
{
"none": ("pt", 0),
"thick": ("px", 4),
"medium": ("px", 2),
"thin": ("px", 1),
# Default: medium only if solid
}
)
BORDER_STYLES = [
"none",
"hidden",
"dotted",
"dashed",
"solid",
"double",
"groove",
"ridge",
"inset",
"outset",
"mediumdashdot",
"dashdotdot",
"hair",
"mediumdashdotdot",
"dashdot",
"slantdashdot",
"mediumdashed",
]
SIDE_SHORTHANDS = {
1: [0, 0, 0, 0],
2: [0, 1, 0, 1],
3: [0, 1, 2, 1],
4: [0, 1, 2, 3],
}
SIDES = ("top", "right", "bottom", "left")
CSS_EXPANSIONS = {
**{
(f"border-{prop}" if prop else "border"): _border_expander(prop)
for prop in ["", "top", "right", "bottom", "left"]
},
**{
f"border-{prop}": _side_expander(f"border-{{:s}}-{prop}")
for prop in ["color", "style", "width"]
},
"margin": _side_expander("margin-{:s}"),
"padding": _side_expander("padding-{:s}"),
}
def __call__(
self,
declarations: str | Iterable[tuple[str, str]],
inherited: dict[str, str] | None = None,
) -> dict[str, str]:
"""
Resolve the given declarations to atomic properties.
Parameters
----------
declarations : str | Iterable[tuple[str, str]]
A CSS string or set of CSS declaration tuples
e.g. "font-weight: bold; background: blue" or
{("font-weight", "bold"), ("background", "blue")}
inherited : dict, optional
Atomic properties indicating the inherited style context in which
``declarations`` is to be resolved. ``inherited`` should already
be resolved, i.e. valid output of this method.
Returns
-------
dict
Atomic CSS 2.2 properties.
Examples
--------
>>> resolve = CSSResolver()
>>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
>>> out = resolve('''
... border-color: BLUE RED;
... font-size: 1em;
... font-size: 2em;
... font-weight: normal;
... font-weight: inherit;
... ''', inherited)
>>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
[('border-bottom-color', 'blue'),
('border-left-color', 'red'),
('border-right-color', 'red'),
('border-top-color', 'blue'),
('font-family', 'serif'),
('font-size', '24pt'),
('font-weight', 'bold')]
"""
if isinstance(declarations, str):
declarations = self.parse(declarations)
props = dict(self.atomize(declarations))
if inherited is None:
inherited = {}
props = self._update_initial(props, inherited)
props = self._update_font_size(props, inherited)
return self._update_other_units(props)
def _update_initial(
self,
props: dict[str, str],
inherited: dict[str, str],
) -> dict[str, str]:
# 1. resolve inherited, initial
for prop, val in inherited.items():
if prop not in props:
props[prop] = val
new_props = props.copy()
for prop, val in props.items():
if val == "inherit":
val = inherited.get(prop, "initial")
if val in ("initial", None):
# we do not define a complete initial stylesheet
del new_props[prop]
else:
new_props[prop] = val
return new_props
def _update_font_size(
self,
props: dict[str, str],
inherited: dict[str, str],
) -> dict[str, str]:
# 2. resolve relative font size
if props.get("font-size"):
props["font-size"] = self.size_to_pt(
props["font-size"],
self._get_font_size(inherited),
conversions=self.FONT_SIZE_RATIOS,
)
return props
def _get_font_size(self, props: dict[str, str]) -> float | None:
if props.get("font-size"):
font_size_string = props["font-size"]
return self._get_float_font_size_from_pt(font_size_string)
return None
def _get_float_font_size_from_pt(self, font_size_string: str) -> float:
assert font_size_string.endswith("pt")
return float(font_size_string.rstrip("pt"))
def _update_other_units(self, props: dict[str, str]) -> dict[str, str]:
font_size = self._get_font_size(props)
# 3. TODO: resolve other font-relative units
for side in self.SIDES:
prop = f"border-{side}-width"
if prop in props:
props[prop] = self.size_to_pt(
props[prop],
em_pt=font_size,
conversions=self.BORDER_WIDTH_RATIOS,
)
for prop in [f"margin-{side}", f"padding-{side}"]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
props[prop],
em_pt=font_size,
conversions=self.MARGIN_RATIOS,
)
return props
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS) -> str:
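# Convert a CSS size string to points, e.g. "16px" -> 16 * 0.75 -> "12pt"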
def _error():
warnings.warn(
f"Unhandled size: {repr(in_val)}",
CSSWarning,
stacklevel=find_stack_level(),
)
return self.size_to_pt("1!!default", conversions=conversions)
match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val)
if match is None:
return _error()
val, unit = match.groups()
if val == "":
# hack for 'large' etc.
val = 1
else:
try:
val = float(val)
except ValueError:
return _error()
while unit != "pt":
if unit == "em":
if em_pt is None:
unit = "rem"
else:
val *= em_pt
unit = "pt"
continue
try:
unit, mul = conversions[unit]
except KeyError:
return _error()
val *= mul
val = round(val, 5)
if int(val) == val:
size_fmt = f"{int(val):d}pt"
else:
size_fmt = f"{val:f}pt"
return size_fmt
def atomize(self, declarations: Iterable) -> Generator[tuple[str, str], None, None]:
for prop, value in declarations:
prop = prop.lower()
value = value.lower()
if prop in self.CSS_EXPANSIONS:
expand = self.CSS_EXPANSIONS[prop]
yield from expand(self, prop, value)
else:
yield prop, value
def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]:
"""
Generates (prop, value) pairs from declarations.
In a future version, this may generate parsed tokens from tinycss/tinycss2.
Parameters
----------
declarations_str : str
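Examples
--------
>>> list(CSSResolver().parse("color: red; width: 10px"))
[('color', 'red'), ('width', '10px')]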
"""
for decl in declarations_str.split(";"):
if not decl.strip():
continue
prop, sep, val = decl.partition(":")
prop = prop.strip().lower()
# TODO: don't lowercase case sensitive parts of values (strings)
val = val.strip().lower()
if sep:
yield prop, val
else:
warnings.warn(
f"Ill-formatted attribute: expected a colon in {repr(decl)}",
CSSWarning,
stacklevel=find_stack_level(),
)

View File

@ -0,0 +1,330 @@
"""
Module for formatting output data into CSV files.
"""
from __future__ import annotations
from collections.abc import (
Hashable,
Iterable,
Iterator,
Sequence,
)
import csv as csvlib
import os
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import numpy as np
from pandas._libs import writers as libwriters
from pandas._typing import SequenceNotStr
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
ABCIndex,
ABCMultiIndex,
ABCPeriodIndex,
)
from pandas.core.dtypes.missing import notna
from pandas.core.indexes.api import Index
from pandas.io.common import get_handle
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
FilePath,
FloatFormatType,
IndexLabel,
StorageOptions,
WriteBuffer,
npt,
)
from pandas.io.formats.format import DataFrameFormatter
_DEFAULT_CHUNKSIZE_CELLS = 100_000
class CSVFormatter:
cols: npt.NDArray[np.object_]
def __init__(
self,
formatter: DataFrameFormatter,
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
sep: str = ",",
cols: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
errors: str = "strict",
compression: CompressionOptions = "infer",
quoting: int | None = None,
lineterminator: str | None = "\n",
chunksize: int | None = None,
quotechar: str | None = '"',
date_format: str | None = None,
doublequote: bool = True,
escapechar: str | None = None,
storage_options: StorageOptions | None = None,
) -> None:
self.fmt = formatter
self.obj = self.fmt.frame
self.filepath_or_buffer = path_or_buf
self.encoding = encoding
self.compression: CompressionOptions = compression
self.mode = mode
self.storage_options = storage_options
self.sep = sep
self.index_label = self._initialize_index_label(index_label)
self.errors = errors
self.quoting = quoting or csvlib.QUOTE_MINIMAL
self.quotechar = self._initialize_quotechar(quotechar)
self.doublequote = doublequote
self.escapechar = escapechar
self.lineterminator = lineterminator or os.linesep
self.date_format = date_format
self.cols = self._initialize_columns(cols)
self.chunksize = self._initialize_chunksize(chunksize)
@property
def na_rep(self) -> str:
return self.fmt.na_rep
@property
def float_format(self) -> FloatFormatType | None:
return self.fmt.float_format
@property
def decimal(self) -> str:
return self.fmt.decimal
@property
def header(self) -> bool | SequenceNotStr[str]:
return self.fmt.header
@property
def index(self) -> bool:
return self.fmt.index
def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
if index_label is not False:
if index_label is None:
return self._get_index_label_from_obj()
elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
# given a string for a DF with Index
return [index_label]
return index_label
def _get_index_label_from_obj(self) -> Sequence[Hashable]:
if isinstance(self.obj.index, ABCMultiIndex):
return self._get_index_label_multiindex()
else:
return self._get_index_label_flat()
def _get_index_label_multiindex(self) -> Sequence[Hashable]:
return [name or "" for name in self.obj.index.names]
def _get_index_label_flat(self) -> Sequence[Hashable]:
index_label = self.obj.index.name
return [""] if index_label is None else [index_label]
def _initialize_quotechar(self, quotechar: str | None) -> str | None:
if self.quoting != csvlib.QUOTE_NONE:
# prevents crash in _csv
return quotechar
return None
@property
def has_mi_columns(self) -> bool:
return bool(isinstance(self.obj.columns, ABCMultiIndex))
def _initialize_columns(
self, cols: Iterable[Hashable] | None
) -> npt.NDArray[np.object_]:
# validate mi options
if self.has_mi_columns:
if cols is not None:
msg = "cannot specify cols with a MultiIndex on the columns"
raise TypeError(msg)
if cols is not None:
if isinstance(cols, ABCIndex):
cols = cols._get_values_for_csv(**self._number_format)
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
# and make sure cols is just a list of labels
new_cols = self.obj.columns
return new_cols._get_values_for_csv(**self._number_format)
def _initialize_chunksize(self, chunksize: int | None) -> int:
if chunksize is None:
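# target roughly 100_000 cells per chunk, e.g. 10 columns -> 10_000 rows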
return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
return int(chunksize)
@property
def _number_format(self) -> dict[str, Any]:
"""Dictionary used for storing number formatting settings."""
return {
"na_rep": self.na_rep,
"float_format": self.float_format,
"date_format": self.date_format,
"quoting": self.quoting,
"decimal": self.decimal,
}
@cache_readonly
def data_index(self) -> Index:
data_index = self.obj.index
if (
isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
and self.date_format is not None
):
data_index = Index(
[x.strftime(self.date_format) if notna(x) else "" for x in data_index]
)
elif isinstance(data_index, ABCMultiIndex):
data_index = data_index.remove_unused_levels()
return data_index
@property
def nlevels(self) -> int:
if self.index:
return getattr(self.data_index, "nlevels", 1)
else:
return 0
@property
def _has_aliases(self) -> bool:
return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
@property
def _need_to_save_header(self) -> bool:
return bool(self._has_aliases or self.header)
@property
def write_cols(self) -> SequenceNotStr[Hashable]:
if self._has_aliases:
assert not isinstance(self.header, bool)
if len(self.header) != len(self.cols):
raise ValueError(
f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
)
return self.header
else:
# self.cols is an ndarray derived from Index._get_values_for_csv,
# so its entries are strings, i.e. hashable
return cast(SequenceNotStr[Hashable], self.cols)
@property
def encoded_labels(self) -> list[Hashable]:
encoded_labels: list[Hashable] = []
if self.index and self.index_label:
assert isinstance(self.index_label, Sequence)
encoded_labels = list(self.index_label)
if not self.has_mi_columns or self._has_aliases:
encoded_labels += list(self.write_cols)
return encoded_labels
def save(self) -> None:
"""
Create the writer & save.
"""
# apply compression and byte/text conversion
with get_handle(
self.filepath_or_buffer,
self.mode,
encoding=self.encoding,
errors=self.errors,
compression=self.compression,
storage_options=self.storage_options,
) as handles:
# Note: self.encoding is irrelevant here
self.writer = csvlib.writer(
handles.handle,
lineterminator=self.lineterminator,
delimiter=self.sep,
quoting=self.quoting,
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar,
)
self._save()
def _save(self) -> None:
if self._need_to_save_header:
self._save_header()
self._save_body()
def _save_header(self) -> None:
if not self.has_mi_columns or self._has_aliases:
self.writer.writerow(self.encoded_labels)
else:
for row in self._generate_multiindex_header_rows():
self.writer.writerow(row)
def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
columns = self.obj.columns
for i in range(columns.nlevels):
# we need at least 1 index column to write our col names
col_line = []
if self.index:
# name is the first column
col_line.append(columns.names[i])
if isinstance(self.index_label, list) and len(self.index_label) > 1:
col_line.extend([""] * (len(self.index_label) - 1))
col_line.extend(columns._get_level_values(i))
yield col_line
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
if self.encoded_labels and set(self.encoded_labels) != {""}:
yield self.encoded_labels + [""] * len(columns)
def _save_body(self) -> None:
nrows = len(self.data_index)
chunks = (nrows // self.chunksize) + 1
for i in range(chunks):
start_i = i * self.chunksize
end_i = min(start_i + self.chunksize, nrows)
if start_i >= end_i:
break
self._save_chunk(start_i, end_i)
def _save_chunk(self, start_i: int, end_i: int) -> None:
# create the data for a chunk
slicer = slice(start_i, end_i)
df = self.obj.iloc[slicer]
res = df._get_values_for_csv(**self._number_format)
data = list(res._iter_column_arrays())
ix = self.data_index[slicer]._get_values_for_csv(**self._number_format)
libwriters.write_csv_rows(
data,
ix,
self.nlevels,
self.cols,
self.writer,
)

View File

@ -0,0 +1,962 @@
"""
Utilities for conversion to writer-agnostic Excel representation.
"""
from __future__ import annotations
from collections.abc import (
Hashable,
Iterable,
Mapping,
Sequence,
)
import functools
import itertools
import re
from typing import (
TYPE_CHECKING,
Any,
Callable,
cast,
)
import warnings
import numpy as np
from pandas._libs.lib import is_list_like
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes import missing
from pandas.core.dtypes.common import (
is_float,
is_scalar,
)
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
)
import pandas.core.common as com
from pandas.core.shared_docs import _shared_docs
from pandas.io.formats._color_data import CSS4_COLORS
from pandas.io.formats.css import (
CSSResolver,
CSSWarning,
)
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from pandas._typing import (
FilePath,
IndexLabel,
StorageOptions,
WriteExcelBuffer,
)
from pandas import ExcelWriter
class ExcelCell:
__fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
__slots__ = __fields__
def __init__(
self,
row: int,
col: int,
val,
style=None,
mergestart: int | None = None,
mergeend: int | None = None,
) -> None:
self.row = row
self.col = col
self.val = val
self.style = style
self.mergestart = mergestart
self.mergeend = mergeend
class CssExcelCell(ExcelCell):
def __init__(
self,
row: int,
col: int,
val,
style: dict | None,
css_styles: dict[tuple[int, int], list[tuple[str, Any]]] | None,
css_row: int,
css_col: int,
css_converter: Callable | None,
**kwargs,
) -> None:
if css_styles and css_converter:
# Use dict to get only one (case-insensitive) declaration per property
declaration_dict = {
prop.lower(): val for prop, val in css_styles[css_row, css_col]
}
# Convert to frozenset for order-invariant caching
unique_declarations = frozenset(declaration_dict.items())
style = css_converter(unique_declarations)
super().__init__(row=row, col=col, val=val, style=style, **kwargs)
class CSSToExcelConverter:
"""
A callable for converting CSS declarations to ExcelWriter styles
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
focusing on font styling, backgrounds, borders and alignment.
Operates by first computing CSS styles in a fairly generic
way (see :meth:`compute_css`) then determining Excel style
properties from CSS properties (see :meth:`build_xlstyle`).
Parameters
----------
inherited : str, optional
CSS declarations understood to be the containing scope for the
CSS processed by :meth:`__call__`.
"""
NAMED_COLORS = CSS4_COLORS
VERTICAL_MAP = {
"top": "top",
"text-top": "top",
"middle": "center",
"baseline": "bottom",
"bottom": "bottom",
"text-bottom": "bottom",
# OpenXML also has 'justify', 'distributed'
}
BOLD_MAP = {
"bold": True,
"bolder": True,
"600": True,
"700": True,
"800": True,
"900": True,
"normal": False,
"lighter": False,
"100": False,
"200": False,
"300": False,
"400": False,
"500": False,
}
ITALIC_MAP = {
"normal": False,
"italic": True,
"oblique": True,
}
FAMILY_MAP = {
"serif": 1, # roman
"sans-serif": 2, # swiss
"cursive": 4, # script
"fantasy": 5, # decorative
}
BORDER_STYLE_MAP = {
style.lower(): style
for style in [
"dashed",
"mediumDashDot",
"dashDotDot",
"hair",
"dotted",
"mediumDashDotDot",
"double",
"dashDot",
"slantDashDot",
"mediumDashed",
]
}
# NB: Most of the methods here could be classmethods, as only __init__
# and __call__ make use of instance attributes. We leave them as
# instancemethods so that users can easily experiment with extensions
# without monkey-patching.
inherited: dict[str, str] | None
def __init__(self, inherited: str | None = None) -> None:
if inherited is not None:
self.inherited = self.compute_css(inherited)
else:
self.inherited = None
# We should avoid caching on the __call__ method directly.
# Otherwise, once __call__ has been called, garbage collection would
# no longer delete the instance.
self._call_cached = functools.cache(self._call_uncached)
compute_css = CSSResolver()
def __call__(
self, declarations: str | frozenset[tuple[str, str]]
) -> dict[str, dict[str, str]]:
"""
Convert CSS declarations to ExcelWriter style.
Parameters
----------
declarations : str | frozenset[tuple[str, str]]
CSS string or set of CSS declaration tuples.
e.g. "font-weight: bold; background: blue" or
{("font-weight", "bold"), ("background", "blue")}
Returns
-------
xlstyle : dict
A style as interpreted by ExcelWriter when found in
ExcelCell.style.
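Examples
--------
A single declaration resolves to one nested style group:
>>> CSSToExcelConverter()("font-weight: bold")
{'font': {'bold': True}}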
"""
return self._call_cached(declarations)
def _call_uncached(
self, declarations: str | frozenset[tuple[str, str]]
) -> dict[str, dict[str, str]]:
properties = self.compute_css(declarations, self.inherited)
return self.build_xlstyle(properties)
def build_xlstyle(self, props: Mapping[str, str]) -> dict[str, dict[str, str]]:
out = {
"alignment": self.build_alignment(props),
"border": self.build_border(props),
"fill": self.build_fill(props),
"font": self.build_font(props),
"number_format": self.build_number_format(props),
}
# TODO: handle cell width and height: needs support in pandas.io.excel
def remove_none(d: dict[str, str | None]) -> None:
"""Remove key where value is None, through nested dicts"""
for k, v in list(d.items()):
if v is None:
del d[k]
elif isinstance(v, dict):
remove_none(v)
if not v:
del d[k]
remove_none(out)
return out
def build_alignment(self, props: Mapping[str, str]) -> dict[str, bool | str | None]:
# TODO: text-indent, padding-left -> alignment.indent
return {
"horizontal": props.get("text-align"),
"vertical": self._get_vertical_alignment(props),
"wrap_text": self._get_is_wrap_text(props),
}
def _get_vertical_alignment(self, props: Mapping[str, str]) -> str | None:
vertical_align = props.get("vertical-align")
if vertical_align:
return self.VERTICAL_MAP.get(vertical_align)
return None
def _get_is_wrap_text(self, props: Mapping[str, str]) -> bool | None:
if props.get("white-space") is None:
return None
return bool(props["white-space"] not in ("nowrap", "pre", "pre-line"))
def build_border(
self, props: Mapping[str, str]
) -> dict[str, dict[str, str | None]]:
return {
side: {
"style": self._border_style(
props.get(f"border-{side}-style"),
props.get(f"border-{side}-width"),
self.color_to_excel(props.get(f"border-{side}-color")),
),
"color": self.color_to_excel(props.get(f"border-{side}-color")),
}
for side in ["top", "right", "bottom", "left"]
}
def _border_style(self, style: str | None, width: str | None, color: str | None):
# convert styles and widths to openxml, one of:
# 'dashDot'
# 'dashDotDot'
# 'dashed'
# 'dotted'
# 'double'
# 'hair'
# 'medium'
# 'mediumDashDot'
# 'mediumDashDotDot'
# 'mediumDashed'
# 'slantDashDot'
# 'thick'
# 'thin'
if width is None and style is None and color is None:
            # Returning None will remove "border" from the style dictionary
return None
if width is None and style is None:
# Return "none" will keep "border" in style dictionary
return "none"
if style in ("none", "hidden"):
return "none"
width_name = self._get_width_name(width)
if width_name is None:
return "none"
if style in (None, "groove", "ridge", "inset", "outset", "solid"):
# not handled
return width_name
if style == "double":
return "double"
if style == "dotted":
if width_name in ("hair", "thin"):
return "dotted"
return "mediumDashDotDot"
if style == "dashed":
if width_name in ("hair", "thin"):
return "dashed"
return "mediumDashed"
elif style in self.BORDER_STYLE_MAP:
# Excel-specific styles
return self.BORDER_STYLE_MAP[style]
else:
warnings.warn(
f"Unhandled border style format: {repr(style)}",
CSSWarning,
stacklevel=find_stack_level(),
)
return "none"
def _get_width_name(self, width_input: str | None) -> str | None:
width = self._width_to_float(width_input)
if width < 1e-5:
return None
elif width < 1.3:
return "thin"
elif width < 2.8:
return "medium"
return "thick"
def _width_to_float(self, width: str | None) -> float:
if width is None:
width = "2pt"
return self._pt_to_float(width)
def _pt_to_float(self, pt_string: str) -> float:
assert pt_string.endswith("pt")
return float(pt_string.rstrip("pt"))
def build_fill(self, props: Mapping[str, str]):
# TODO: perhaps allow for special properties
# -excel-pattern-bgcolor and -excel-pattern-type
fill_color = props.get("background-color")
if fill_color not in (None, "transparent", "none"):
return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]:
fc = props.get("number-format")
fc = fc.replace("§", ";") if isinstance(fc, str) else fc
return {"format_code": fc}
def build_font(
self, props: Mapping[str, str]
) -> dict[str, bool | float | str | None]:
font_names = self._get_font_names(props)
decoration = self._get_decoration(props)
return {
"name": font_names[0] if font_names else None,
"family": self._select_font_family(font_names),
"size": self._get_font_size(props),
"bold": self._get_is_bold(props),
"italic": self._get_is_italic(props),
"underline": ("single" if "underline" in decoration else None),
"strike": ("line-through" in decoration) or None,
"color": self.color_to_excel(props.get("color")),
# shadow if nonzero digit before shadow color
"shadow": self._get_shadow(props),
}
def _get_is_bold(self, props: Mapping[str, str]) -> bool | None:
weight = props.get("font-weight")
if weight:
return self.BOLD_MAP.get(weight)
return None
def _get_is_italic(self, props: Mapping[str, str]) -> bool | None:
font_style = props.get("font-style")
if font_style:
return self.ITALIC_MAP.get(font_style)
return None
def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]:
decoration = props.get("text-decoration")
if decoration is not None:
return decoration.split()
else:
return ()
def _get_underline(self, decoration: Sequence[str]) -> str | None:
if "underline" in decoration:
return "single"
return None
def _get_shadow(self, props: Mapping[str, str]) -> bool | None:
if "text-shadow" in props:
return bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
return None
def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]:
font_names_tmp = re.findall(
r"""(?x)
(
"(?:[^"]|\\")+"
|
'(?:[^']|\\')+'
|
[^'",]+
)(?=,|\s*$)
""",
props.get("font-family", ""),
)
font_names = []
for name in font_names_tmp:
if name[:1] == '"':
name = name[1:-1].replace('\\"', '"')
elif name[:1] == "'":
name = name[1:-1].replace("\\'", "'")
else:
name = name.strip()
if name:
font_names.append(name)
return font_names
def _get_font_size(self, props: Mapping[str, str]) -> float | None:
size = props.get("font-size")
if size is None:
return size
return self._pt_to_float(size)
def _select_font_family(self, font_names: Sequence[str]) -> int | None:
family = None
for name in font_names:
family = self.FAMILY_MAP.get(name)
if family:
break
return family
def color_to_excel(self, val: str | None) -> str | None:
if val is None:
return None
if self._is_hex_color(val):
return self._convert_hex_to_excel(val)
try:
return self.NAMED_COLORS[val]
except KeyError:
warnings.warn(
f"Unhandled color format: {repr(val)}",
CSSWarning,
stacklevel=find_stack_level(),
)
return None
def _is_hex_color(self, color_string: str) -> bool:
return bool(color_string.startswith("#"))
def _convert_hex_to_excel(self, color_string: str) -> str:
code = color_string.lstrip("#")
if self._is_shorthand_color(color_string):
return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper()
else:
return code.upper()
def _is_shorthand_color(self, color_string: str) -> bool:
"""Check if color code is shorthand.
#FFF is a shorthand as opposed to full #FFFFFF.
"""
code = color_string.lstrip("#")
if len(code) == 3:
return True
elif len(code) == 6:
return False
else:
raise ValueError(f"Unexpected color {color_string}")
class ExcelFormatter:
"""
    Class for formatting a DataFrame to a list of ExcelCells.
Parameters
----------
df : DataFrame or Styler
    na_rep : str
        na representation
float_format : str, default None
Format string for floating point numbers
cols : sequence, optional
Columns to write
header : bool or sequence of str, default True
        Write out column names. If a list of strings is given, it is
        assumed to be aliases for the column names.
index : bool, default True
output row names (index)
index_label : str or sequence, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
merge_cells : bool, default False
Format MultiIndex and Hierarchical Rows as merged cells.
inf_rep : str, default `'inf'`
        representation for np.inf values (which aren't representable in Excel).
        A `'-'` sign will be added in front of -inf.
style_converter : callable, optional
This translates Styler styles (CSS) into ExcelWriter styles.
Defaults to ``CSSToExcelConverter()``.
It should have signature css_declarations string -> excel style.
This is only called for body cells.
"""
max_rows = 2**20
max_cols = 2**14
def __init__(
self,
df,
na_rep: str = "",
float_format: str | None = None,
cols: Sequence[Hashable] | None = None,
header: Sequence[Hashable] | bool = True,
index: bool = True,
index_label: IndexLabel | None = None,
merge_cells: bool = False,
inf_rep: str = "inf",
style_converter: Callable | None = None,
) -> None:
self.rowcounter = 0
self.na_rep = na_rep
if not isinstance(df, DataFrame):
self.styler = df
self.styler._compute() # calculate applied styles
df = df.data
if style_converter is None:
style_converter = CSSToExcelConverter()
self.style_converter: Callable | None = style_converter
else:
self.styler = None
self.style_converter = None
self.df = df
if cols is not None:
# all missing, raise
if not len(Index(cols).intersection(df.columns)):
raise KeyError("passes columns are not ALL present dataframe")
if len(Index(cols).intersection(df.columns)) != len(set(cols)):
# Deprecated in GH#17295, enforced in 1.0.0
raise KeyError("Not all names specified in 'columns' are found")
self.df = df.reindex(columns=cols)
self.columns = self.df.columns
self.float_format = float_format
self.index = index
self.index_label = index_label
self.header = header
self.merge_cells = merge_cells
self.inf_rep = inf_rep
@property
def header_style(self) -> dict[str, dict[str, str | bool]]:
return {
"font": {"bold": True},
"borders": {
"top": "thin",
"right": "thin",
"bottom": "thin",
"left": "thin",
},
"alignment": {"horizontal": "center", "vertical": "top"},
}
def _format_value(self, val):
if is_scalar(val) and missing.isna(val):
val = self.na_rep
elif is_float(val):
if missing.isposinf_scalar(val):
val = self.inf_rep
elif missing.isneginf_scalar(val):
val = f"-{self.inf_rep}"
elif self.float_format is not None:
val = float(self.float_format % val)
if getattr(val, "tzinfo", None) is not None:
raise ValueError(
"Excel does not support datetimes with "
"timezones. Please ensure that datetimes "
"are timezone unaware before writing to Excel."
)
return val
def _format_header_mi(self) -> Iterable[ExcelCell]:
if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError(
"Writing to Excel with MultiIndex columns and no "
"index ('index'=False) is not yet implemented."
)
if not (self._has_aliases or self.header):
return
columns = self.columns
level_strs = columns._format_multi(
sparsify=self.merge_cells, include_names=False
)
level_lengths = get_level_lengths(level_strs)
coloffset = 0
lnum = 0
if self.index and isinstance(self.df.index, MultiIndex):
coloffset = len(self.df.index[0]) - 1
if self.merge_cells:
            # Format multi-index as merged cells.
for lnum, name in enumerate(columns.names):
yield ExcelCell(
row=lnum,
col=coloffset,
val=name,
style=self.header_style,
)
for lnum, (spans, levels, level_codes) in enumerate(
zip(level_lengths, columns.levels, columns.codes)
):
values = levels.take(level_codes)
for i, span_val in spans.items():
mergestart, mergeend = None, None
if span_val > 1:
mergestart, mergeend = lnum, coloffset + i + span_val
yield CssExcelCell(
row=lnum,
col=coloffset + i + 1,
val=values[i],
style=self.header_style,
css_styles=getattr(self.styler, "ctx_columns", None),
css_row=lnum,
css_col=i,
css_converter=self.style_converter,
mergestart=mergestart,
mergeend=mergeend,
)
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
v = ".".join(map(pprint_thing, values))
yield CssExcelCell(
row=lnum,
col=coloffset + i + 1,
val=v,
style=self.header_style,
css_styles=getattr(self.styler, "ctx_columns", None),
css_row=lnum,
css_col=i,
css_converter=self.style_converter,
)
self.rowcounter = lnum
def _format_header_regular(self) -> Iterable[ExcelCell]:
if self._has_aliases or self.header:
coloffset = 0
if self.index:
coloffset = 1
if isinstance(self.df.index, MultiIndex):
coloffset = len(self.df.index.names)
colnames = self.columns
if self._has_aliases:
self.header = cast(Sequence, self.header)
if len(self.header) != len(self.columns):
raise ValueError(
f"Writing {len(self.columns)} cols "
f"but got {len(self.header)} aliases"
)
colnames = self.header
for colindex, colname in enumerate(colnames):
yield CssExcelCell(
row=self.rowcounter,
col=colindex + coloffset,
val=colname,
style=self.header_style,
css_styles=getattr(self.styler, "ctx_columns", None),
css_row=0,
css_col=colindex,
css_converter=self.style_converter,
)
def _format_header(self) -> Iterable[ExcelCell]:
gen: Iterable[ExcelCell]
if isinstance(self.columns, MultiIndex):
gen = self._format_header_mi()
else:
gen = self._format_header_regular()
gen2: Iterable[ExcelCell] = ()
if self.df.index.names:
row = [x if x is not None else "" for x in self.df.index.names] + [
""
] * len(self.columns)
if functools.reduce(lambda x, y: x and y, (x != "" for x in row)):
gen2 = (
ExcelCell(self.rowcounter, colindex, val, self.header_style)
for colindex, val in enumerate(row)
)
self.rowcounter += 1
return itertools.chain(gen, gen2)
def _format_body(self) -> Iterable[ExcelCell]:
if isinstance(self.df.index, MultiIndex):
return self._format_hierarchical_rows()
else:
return self._format_regular_rows()
def _format_regular_rows(self) -> Iterable[ExcelCell]:
if self._has_aliases or self.header:
self.rowcounter += 1
# output index and index_label?
if self.index:
# check aliases
# if list only take first as this is not a MultiIndex
if self.index_label and isinstance(
self.index_label, (list, tuple, np.ndarray, Index)
):
index_label = self.index_label[0]
# if string good to go
elif self.index_label and isinstance(self.index_label, str):
index_label = self.index_label
else:
index_label = self.df.index.names[0]
if isinstance(self.columns, MultiIndex):
self.rowcounter += 1
if index_label and self.header is not False:
yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
# write index_values
index_values = self.df.index
if isinstance(self.df.index, PeriodIndex):
index_values = self.df.index.to_timestamp()
for idx, idxval in enumerate(index_values):
yield CssExcelCell(
row=self.rowcounter + idx,
col=0,
val=idxval,
style=self.header_style,
css_styles=getattr(self.styler, "ctx_index", None),
css_row=idx,
css_col=0,
css_converter=self.style_converter,
)
coloffset = 1
else:
coloffset = 0
yield from self._generate_body(coloffset)
def _format_hierarchical_rows(self) -> Iterable[ExcelCell]:
if self._has_aliases or self.header:
self.rowcounter += 1
gcolidx = 0
if self.index:
index_labels = self.df.index.names
# check for aliases
if self.index_label and isinstance(
self.index_label, (list, tuple, np.ndarray, Index)
):
index_labels = self.index_label
# MultiIndex columns require an extra row
# with index names (blank if None) for
# unambiguous round-trip, unless not merging,
# in which case the names all go on one row Issue #11328
if isinstance(self.columns, MultiIndex) and self.merge_cells:
self.rowcounter += 1
# if index labels are not empty go ahead and dump
if com.any_not_none(*index_labels) and self.header is not False:
for cidx, name in enumerate(index_labels):
yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
if self.merge_cells:
# Format hierarchical rows as merged cells.
level_strs = self.df.index._format_multi(
sparsify=True, include_names=False
)
level_lengths = get_level_lengths(level_strs)
for spans, levels, level_codes in zip(
level_lengths, self.df.index.levels, self.df.index.codes
):
values = levels.take(
level_codes,
allow_fill=levels._can_hold_na,
fill_value=levels._na_value,
)
for i, span_val in spans.items():
mergestart, mergeend = None, None
if span_val > 1:
mergestart = self.rowcounter + i + span_val - 1
mergeend = gcolidx
yield CssExcelCell(
row=self.rowcounter + i,
col=gcolidx,
val=values[i],
style=self.header_style,
css_styles=getattr(self.styler, "ctx_index", None),
css_row=i,
css_col=gcolidx,
css_converter=self.style_converter,
mergestart=mergestart,
mergeend=mergeend,
)
gcolidx += 1
else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield CssExcelCell(
row=self.rowcounter + idx,
col=gcolidx,
val=indexcolval,
style=self.header_style,
css_styles=getattr(self.styler, "ctx_index", None),
css_row=idx,
css_col=gcolidx,
css_converter=self.style_converter,
)
gcolidx += 1
yield from self._generate_body(gcolidx)
@property
def _has_aliases(self) -> bool:
"""Whether the aliases for column names are present."""
return is_list_like(self.header)
def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]:
# Write the body of the frame data series by series.
for colidx in range(len(self.columns)):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
yield CssExcelCell(
row=self.rowcounter + i,
col=colidx + coloffset,
val=val,
style=None,
css_styles=getattr(self.styler, "ctx", None),
css_row=i,
css_col=colidx,
css_converter=self.style_converter,
)
def get_formatted_cells(self) -> Iterable[ExcelCell]:
for cell in itertools.chain(self._format_header(), self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
@doc(storage_options=_shared_docs["storage_options"])
def write(
self,
writer: FilePath | WriteExcelBuffer | ExcelWriter,
sheet_name: str = "Sheet1",
startrow: int = 0,
startcol: int = 0,
freeze_panes: tuple[int, int] | None = None,
engine: str | None = None,
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
        Parameters
        ----------
        writer : path-like, file-like, or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : str, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
freeze_panes : tuple of integer (length 2), default None
Specifies the one-based bottommost row and rightmost column that
is to be frozen
engine : string, default None
write engine to use if writer is a path - you can also set this
via the options ``io.excel.xlsx.writer``,
or ``io.excel.xlsm.writer``.
{storage_options}
engine_kwargs: dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
from pandas.io.excel import ExcelWriter
num_rows, num_cols = self.df.shape
if num_rows > self.max_rows or num_cols > self.max_cols:
raise ValueError(
f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
f"Max sheet size is: {self.max_rows}, {self.max_cols}"
)
if engine_kwargs is None:
engine_kwargs = {}
formatted_cells = self.get_formatted_cells()
if isinstance(writer, ExcelWriter):
need_save = False
else:
writer = ExcelWriter(
writer,
engine=engine,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
need_save = True
try:
writer._write_cells(
formatted_cells,
sheet_name,
startrow=startrow,
startcol=startcol,
freeze_panes=freeze_panes,
)
finally:
# make sure to close opened file handles
if need_save:
writer.close()
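The public entry point for the formatter above is DataFrame.to_excel, which
constructs an ExcelFormatter internally. A minimal sketch (the output file
name is arbitrary, and an engine such as openpyxl is assumed installed):

import pandas as pd

df = pd.DataFrame({"a": [1.0, float("inf")], "b": ["x", "y"]})
# float_format and inf_rep are forwarded to _format_value above: inf is
# rendered as "inf" and floats pass through the % template.
df.to_excel("out.xlsx", sheet_name="Sheet1", float_format="%.2f", inf_rep="inf")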

File diff suppressed because it is too large

View File

@ -0,0 +1,646 @@
"""
Module for formatting output data in HTML.
"""
from __future__ import annotations
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Any,
Final,
cast,
)
from pandas._config import get_option
from pandas._libs import lib
from pandas import (
MultiIndex,
option_context,
)
from pandas.io.common import is_url
from pandas.io.formats.format import (
DataFrameFormatter,
get_level_lengths,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterable,
Mapping,
)
class HTMLFormatter:
"""
Internal class for formatting output data in html.
This class is intended for shared functionality between
DataFrame.to_html() and DataFrame._repr_html_().
    Any logic in common with other output formatting methods
    should ideally be inherited from classes in format.py, with
    this class responsible only for producing html markup.
"""
indent_delta: Final = 2
def __init__(
self,
formatter: DataFrameFormatter,
classes: str | list[str] | tuple[str, ...] | None = None,
border: int | bool | None = None,
table_id: str | None = None,
render_links: bool = False,
) -> None:
self.fmt = formatter
self.classes = classes
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements: list[str] = []
self.bold_rows = self.fmt.bold_rows
self.escape = self.fmt.escape
self.show_dimensions = self.fmt.show_dimensions
if border is None or border is True:
border = cast(int, get_option("display.html.border"))
elif not border:
border = None
self.border = border
self.table_id = table_id
self.render_links = render_links
self.col_space = {}
is_multi_index = isinstance(self.columns, MultiIndex)
for column, value in self.fmt.col_space.items():
col_space_value = f"{value}px" if isinstance(value, int) else value
self.col_space[column] = col_space_value
# GH 53885: Handling case where column is index
# Flatten the data in the multi index and add in the map
if is_multi_index and isinstance(column, tuple):
for column_index in column:
self.col_space[str(column_index)] = col_space_value
def to_string(self) -> str:
lines = self.render()
if any(isinstance(x, str) for x in lines):
lines = [str(x) for x in lines]
return "\n".join(lines)
def render(self) -> list[str]:
self._write_table()
if self.should_show_dimensions:
by = chr(215) # × # noqa: RUF003
self.write(
f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
)
return self.elements
@property
def should_show_dimensions(self) -> bool:
return self.fmt.should_show_dimensions
@property
def show_row_idx_names(self) -> bool:
return self.fmt.show_row_idx_names
@property
def show_col_idx_names(self) -> bool:
return self.fmt.show_col_idx_names
@property
def row_levels(self) -> int:
if self.fmt.index:
# showing (row) index
return self.frame.index.nlevels
elif self.show_col_idx_names:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# If the row index is not displayed a column of
# blank cells need to be included before the DataFrame values.
return 1
# not showing (row) index
return 0
def _get_columns_formatted_values(self) -> Iterable:
return self.columns
@property
def is_truncated(self) -> bool:
return self.fmt.is_truncated
@property
def ncols(self) -> int:
return len(self.fmt.tr_frame.columns)
def write(self, s: Any, indent: int = 0) -> None:
rs = pprint_thing(s)
self.elements.append(" " * indent + rs)
def write_th(
self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
) -> None:
"""
Method for writing a formatted <th> cell.
If col_space is set on the formatter then that is used for
the value of min-width.
Parameters
----------
s : object
The data to be written inside the cell.
header : bool, default False
Set to True if the <th> is for use inside <thead>. This will
cause min-width to be set if there is one.
indent : int, default 0
The indentation level of the cell.
tags : str, default None
Tags to include in the cell.
Returns
-------
        None. The formatted <th> cell is appended to ``self.elements``.
"""
col_space = self.col_space.get(s, None)
if header and col_space is not None:
tags = tags or ""
tags += f'style="min-width: {col_space};"'
self._write_cell(s, kind="th", indent=indent, tags=tags)
def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
self._write_cell(s, kind="td", indent=indent, tags=tags)
def _write_cell(
self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
) -> None:
if tags is not None:
start_tag = f"<{kind} {tags}>"
else:
start_tag = f"<{kind}>"
if self.escape:
# escape & first to prevent double escaping of &
esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
else:
esc = {}
rs = pprint_thing(s, escape_chars=esc).strip()
if self.render_links and is_url(rs):
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
start_tag += f'<a href="{rs_unescaped}" target="_blank">'
end_a = "</a>"
else:
end_a = ""
self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
def write_tr(
self,
line: Iterable,
indent: int = 0,
indent_delta: int = 0,
header: bool = False,
align: str | None = None,
tags: dict[int, str] | None = None,
nindex_levels: int = 0,
) -> None:
if tags is None:
tags = {}
if align is None:
self.write("<tr>", indent)
else:
self.write(f'<tr style="text-align: {align};">', indent)
indent += indent_delta
for i, s in enumerate(line):
val_tag = tags.get(i, None)
if header or (self.bold_rows and i < nindex_levels):
self.write_th(s, indent=indent, header=header, tags=val_tag)
else:
self.write_td(s, indent, tags=val_tag)
indent -= indent_delta
self.write("</tr>", indent)
def _write_table(self, indent: int = 0) -> None:
_classes = ["dataframe"] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
_classes.append("tex2jax_ignore")
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
raise TypeError(
"classes must be a string, list, "
f"or tuple, not {type(self.classes)}"
)
_classes.extend(self.classes)
if self.table_id is None:
id_section = ""
else:
id_section = f' id="{self.table_id}"'
if self.border is None:
border_attr = ""
else:
border_attr = f' border="{self.border}"'
self.write(
f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
indent,
)
if self.fmt.header or self.show_row_idx_names:
self._write_header(indent + self.indent_delta)
self._write_body(indent + self.indent_delta)
self.write("</table>", indent)
def _write_col_header(self, indent: int) -> None:
row: list[Hashable]
is_truncated_horizontally = self.fmt.is_truncated_horizontally
if isinstance(self.columns, MultiIndex):
template = 'colspan="{span:d}" halign="left"'
sentinel: lib.NoDefault | bool
if self.fmt.sparsify:
# GH3547
sentinel = lib.no_default
else:
sentinel = False
levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
if is_truncated_horizontally:
# modify the header lines
ins_col = self.fmt.tr_col_num
if self.fmt.sparsify:
recs_new = {}
# Increment tags after ... col.
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
elif tag + span > ins_col:
recs_new[tag] = span + 1
if lnum == inner_lvl:
values = (
values[:ins_col] + ("...",) + values[ins_col:]
)
else:
# sparse col headers do not receive a ...
values = (
values[:ins_col]
+ (values[ins_col - 1],)
+ values[ins_col:]
)
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
values = values[:ins_col] + ("...",) + values[ins_col:]
records = recs_new
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
records[ins_col] = 1
else:
recs_new = {}
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
else:
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
values = values[:ins_col] + ["..."] + values[ins_col:]
# see gh-22579
# Column Offset Bug with to_html(index=False) with
# MultiIndex Columns and Index.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code
# block below for standard columns index.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class and create a
# _get_formatted_column_labels function for code
# parity with DataFrameFormatter class.
if self.fmt.show_index_names:
name = self.columns.names[lnum]
row.append(pprint_thing(name or ""))
else:
row.append("")
tags = {}
j = len(row)
for i, v in enumerate(values):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
continue
j += 1
row.append(v)
self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
else:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code block
# above for columns MultiIndex.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class.
if self.fmt.show_index_names:
row.append(self.columns.name or "")
else:
row.append("")
row.extend(self._get_columns_formatted_values())
align = self.fmt.justify
if is_truncated_horizontally:
ins_col = self.row_levels + self.fmt.tr_col_num
row.insert(ins_col, "...")
self.write_tr(row, indent, self.indent_delta, header=True, align=align)
def _write_row_header(self, indent: int) -> None:
is_truncated_horizontally = self.fmt.is_truncated_horizontally
row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
self.ncols + (1 if is_truncated_horizontally else 0)
)
self.write_tr(row, indent, self.indent_delta, header=True)
def _write_header(self, indent: int) -> None:
self.write("<thead>", indent)
if self.fmt.header:
self._write_col_header(indent + self.indent_delta)
if self.show_row_idx_names:
self._write_row_header(indent + self.indent_delta)
self.write("</thead>", indent)
def _get_formatted_values(self) -> dict[int, list[str]]:
with option_context("display.max_colwidth", None):
fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
return fmt_values
def _write_body(self, indent: int) -> None:
self.write("<tbody>", indent)
fmt_values = self._get_formatted_values()
# write values
if self.fmt.index and isinstance(self.frame.index, MultiIndex):
self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
else:
self._write_regular_rows(fmt_values, indent + self.indent_delta)
self.write("</tbody>", indent)
def _write_regular_rows(
self, fmt_values: Mapping[int, list[str]], indent: int
) -> None:
is_truncated_horizontally = self.fmt.is_truncated_horizontally
is_truncated_vertically = self.fmt.is_truncated_vertically
nrows = len(self.fmt.tr_frame)
if self.fmt.index:
fmt = self.fmt._get_formatter("__index__")
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
# only reached with non-Multi index
index_values = self.fmt.tr_frame.index._format_flat(include_name=False)
row: list[str] = []
for i in range(nrows):
if is_truncated_vertically and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
row = []
if self.fmt.index:
row.append(index_values[i])
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Add blank cell before data cells.
elif self.show_col_idx_names:
row.append("")
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
dot_col_ix = self.fmt.tr_col_num + self.row_levels
row.insert(dot_col_ix, "...")
self.write_tr(
row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
)
def _write_hierarchical_rows(
self, fmt_values: Mapping[int, list[str]], indent: int
) -> None:
template = 'rowspan="{span}" valign="top"'
is_truncated_horizontally = self.fmt.is_truncated_horizontally
is_truncated_vertically = self.fmt.is_truncated_vertically
frame = self.fmt.tr_frame
nrows = len(frame)
assert isinstance(frame.index, MultiIndex)
idx_values = frame.index._format_multi(sparsify=False, include_names=False)
idx_values = list(zip(*idx_values))
if self.fmt.sparsify:
# GH3547
sentinel = lib.no_default
levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
if is_truncated_vertically:
# Insert ... row and adjust idx_values and
# level_lengths to take this into account.
ins_row = self.fmt.tr_row_num
inserted = False
for lnum, records in enumerate(level_lengths):
rec_new = {}
for tag, span in list(records.items()):
if tag >= ins_row:
rec_new[tag + 1] = span
elif tag + span > ins_row:
rec_new[tag] = span + 1
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
dot_row[-1] = "..."
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
# If ins_row lies between tags, all cols idx cols
# receive ...
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
idx_values.insert(
ins_row, tuple(["..."] * len(level_lengths))
)
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in fmt_values:
fmt_values[ix_col].insert(ins_row, "...")
nrows += 1
for i in range(nrows):
row = []
tags = {}
sparse_offset = 0
j = 0
for records, v in zip(level_lengths, idx_values[i]):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
sparse_offset += 1
continue
j += 1
row.append(v)
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
row.insert(
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
)
self.write_tr(
row,
indent,
self.indent_delta,
tags=tags,
nindex_levels=len(levels) - sparse_offset,
)
else:
row = []
for i in range(len(frame)):
if is_truncated_vertically and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
idx_values = list(
zip(*frame.index._format_multi(sparsify=False, include_names=False))
)
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
self.write_tr(
row,
indent,
self.indent_delta,
tags=None,
nindex_levels=frame.index.nlevels,
)
class NotebookFormatter(HTMLFormatter):
"""
Internal class for formatting output data in html for display in Jupyter
Notebooks. This class is intended for functionality specific to
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
"""
def _get_formatted_values(self) -> dict[int, list[str]]:
return {i: self.fmt.format_col(i) for i in range(self.ncols)}
def _get_columns_formatted_values(self) -> list[str]:
# only reached with non-Multi Index
return self.columns._format_flat(include_name=False)
def write_style(self) -> None:
# We use the "scoped" attribute here so that the desired
# style properties for the data frame are not then applied
# throughout the entire notebook.
template_first = """\
<style scoped>"""
template_last = """\
</style>"""
template_select = """\
.dataframe %s {
%s: %s;
}"""
element_props = [
("tbody tr th:only-of-type", "vertical-align", "middle"),
("tbody tr th", "vertical-align", "top"),
]
if isinstance(self.columns, MultiIndex):
element_props.append(("thead tr th", "text-align", "left"))
if self.show_row_idx_names:
element_props.append(
("thead tr:last-of-type th", "text-align", "right")
)
else:
element_props.append(("thead th", "text-align", "right"))
template_mid = "\n\n".join(template_select % t for t in element_props)
template = dedent(f"{template_first}\n{template_mid}\n{template_last}")
self.write(template)
def render(self) -> list[str]:
self.write("<div>")
self.write_style()
super().render()
self.write("</div>")
return self.elements
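For reference, a hedged sketch of how the formatters above are reached from
the public API (all keywords below are real DataFrame.to_html parameters):

import pandas as pd

df = pd.DataFrame({"site": ["https://pandas.pydata.org"]})
# render_links=True exercises the is_url() branch in _write_cell;
# classes, table_id and border feed _write_table's attribute assembly.
html = df.to_html(classes="my-table", table_id="t1", border=1, render_links=True)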

File diff suppressed because it is too large

View File

@ -0,0 +1,572 @@
"""
Printing tools.
"""
from __future__ import annotations
from collections.abc import (
Iterable,
Mapping,
Sequence,
)
import sys
from typing import (
Any,
Callable,
TypeVar,
Union,
)
from unicodedata import east_asian_width
from pandas._config import get_option
from pandas.core.dtypes.inference import is_sequence
from pandas.io.formats.console import get_console_size
EscapeChars = Union[Mapping[str, str], Iterable[str]]
_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
def adjoin(space: int, *lists: list[str], **kwargs) -> str:
"""
    Glues together sets of strings using the amount of space requested.
    The idea is to prettify.

    Parameters
    ----------
    space : int
        number of spaces for padding
    lists : list of str
        lists of str which are being joined
strlen : callable
function used to calculate the length of each str. Needed for unicode
handling.
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
strlen = kwargs.pop("strlen", len)
justfunc = kwargs.pop("justfunc", _adj_justify)
newLists = []
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
    # the last list gets no trailing space appended
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = justfunc(lst, lengths[i], mode="left")
nl = ([" " * lengths[i]] * (maxLen - len(lst))) + nl
newLists.append(nl)
toJoin = zip(*newLists)
return "\n".join("".join(lines) for lines in toJoin)
def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]:
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == "left":
return [x.ljust(max_len) for x in texts]
elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or
# bytes(3.x)/str(2.x) representations of objects.
# Try to use these as much as possible rather than rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
#    It will always return unicode text which can be handled by other
# parts of the package without breakage.
#
# 2) if you need to write something out to file, use
# pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
# working with straight ascii.
def _pprint_seq(
seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
) -> str:
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather than calling this directly.
bounds length of printed sequence, depending on options
"""
if isinstance(seq, set):
fmt = "{{{body}}}"
else:
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
s = iter(seq)
# handle sets, no slicing
r = [
pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
for i in range(min(nitems, len(seq)))
]
body = ", ".join(r)
if nitems < len(seq):
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
body += ","
return fmt.format(body=body)
def _pprint_dict(
seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
) -> str:
"""
    internal. pprinter for dicts. you should probably use pprint_thing()
rather than calling this directly.
"""
fmt = "{{{things}}}"
pairs = []
pfmt = "{key}: {val}"
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
)
)
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
else:
return fmt.format(things=", ".join(pairs))
def pprint_thing(
thing: Any,
_nest_lvl: int = 0,
escape_chars: EscapeChars | None = None,
default_escapes: bool = False,
quote_strings: bool = False,
max_seq_items: int | None = None,
) -> str:
"""
This function is the sanctioned way of converting objects
to a string representation and properly handles nested sequences.
Parameters
----------
thing : anything to be formatted
    _nest_lvl : internal use only. pprint_thing() is mutually-recursive
        with _pprint_seq, this argument is used to keep track of the
        current nesting level, and limit it.
escape_chars : list or dict, optional
Characters to escape. If a dict is passed the values are the
replacements
default_escapes : bool, default False
Whether the input escape characters replaces or adds to the defaults
max_seq_items : int or None, default None
Pass through to other pretty printers to limit sequence printing
Returns
-------
str
"""
def as_escaped_string(
thing: Any, escape_chars: EscapeChars | None = escape_chars
) -> str:
translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
if isinstance(escape_chars, dict):
if default_escapes:
translate.update(escape_chars)
else:
translate = escape_chars
escape_chars = list(escape_chars.keys())
else:
escape_chars = escape_chars or ()
result = str(thing)
for c in escape_chars:
result = result.replace(c, translate[c])
return result
if hasattr(thing, "__next__"):
return str(thing)
elif isinstance(thing, dict) and _nest_lvl < get_option(
"display.pprint_nest_depth"
):
result = _pprint_dict(
thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
)
elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
result = _pprint_seq(
thing,
_nest_lvl,
escape_chars=escape_chars,
quote_strings=quote_strings,
max_seq_items=max_seq_items,
)
elif isinstance(thing, str) and quote_strings:
result = f"'{as_escaped_string(thing)}'"
else:
result = as_escaped_string(thing)
return result
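Two hedged examples of the behavior documented above (outputs approximate):

pprint_thing("a\tb", escape_chars=("\t",))       # -> 'a\\tb' (tab escaped)
pprint_thing({"k": [1, 2, 3]}, max_seq_items=2)  # -> "{'k': [1, 2, ...]}"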
def pprint_thing_encoded(
object, encoding: str = "utf-8", errors: str = "replace"
) -> bytes:
value = pprint_thing(object) # get unicode representation of object
return value.encode(encoding, errors)
def enable_data_resource_formatter(enable: bool) -> None:
if "IPython" not in sys.modules:
# definitely not in IPython
return
from IPython import get_ipython
ip = get_ipython()
if ip is None:
# still not in IPython
return
formatters = ip.display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
if enable:
if mimetype not in formatters:
# define tableschema formatter
from IPython.core.formatters import BaseFormatter
from traitlets import ObjectName
class TableSchemaFormatter(BaseFormatter):
print_method = ObjectName("_repr_data_resource_")
_return_type = (dict,)
# register it:
formatters[mimetype] = TableSchemaFormatter()
# enable it if it's been disabled:
formatters[mimetype].enabled = True
# unregister tableschema mime-type
elif mimetype in formatters:
formatters[mimetype].enabled = False
def default_pprint(thing: Any, max_seq_items: int | None = None) -> str:
return pprint_thing(
thing,
escape_chars=("\t", "\r", "\n"),
quote_strings=True,
max_seq_items=max_seq_items,
)
def format_object_summary(
obj,
formatter: Callable,
is_justify: bool = True,
name: str | None = None,
indent_for_name: bool = True,
line_break_each_value: bool = False,
) -> str:
"""
Return the formatted obj as a unicode string
Parameters
----------
obj : object
must be iterable and support __getitem__
formatter : callable
string formatter for an element
is_justify : bool
should justify the display
    name : str, optional
defaults to the class name of the obj
indent_for_name : bool, default True
Whether subsequent lines should be indented to
align with the name.
line_break_each_value : bool, default False
If True, inserts a line break for each value of ``obj``.
        If False, only break lines when a line of values gets wider
than the display width.
Returns
-------
summary string
"""
display_width, _ = get_console_size()
if display_width is None:
display_width = get_option("display.width") or 80
if name is None:
name = type(obj).__name__
if indent_for_name:
name_len = len(name)
space1 = f'\n{(" " * (name_len + 1))}'
space2 = f'\n{(" " * (name_len + 2))}'
else:
space1 = "\n"
space2 = "\n " # space for the opening '['
n = len(obj)
if line_break_each_value:
# If we want to vertically align on each value of obj, we need to
# separate values by a line break and indent the values
sep = ",\n " + " " * len(name)
else:
sep = ","
max_seq_items = get_option("display.max_seq_items") or n
# are we a truncated display
is_truncated = n > max_seq_items
# adj can optionally handle unicode eastern asian width
adj = get_adjustment()
def _extend_line(
s: str, line: str, value: str, display_width: int, next_line_prefix: str
) -> tuple[str, str]:
if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
s += line.rstrip()
line = next_line_prefix
line += value
return s, line
def best_len(values: list[str]) -> int:
if values:
return max(adj.len(x) for x in values)
else:
return 0
close = ", "
if n == 0:
summary = f"[]{close}"
elif n == 1 and not line_break_each_value:
first = formatter(obj[0])
summary = f"[{first}]{close}"
elif n == 2 and not line_break_each_value:
first = formatter(obj[0])
last = formatter(obj[-1])
summary = f"[{first}, {last}]{close}"
else:
if max_seq_items == 1:
# If max_seq_items=1 show only last element
head = []
tail = [formatter(x) for x in obj[-1:]]
elif n > max_seq_items:
n = min(max_seq_items // 2, 10)
head = [formatter(x) for x in obj[:n]]
tail = [formatter(x) for x in obj[-n:]]
else:
head = []
tail = [formatter(x) for x in obj]
# adjust all values to max length if needed
if is_justify:
if line_break_each_value:
# Justify each string in the values of head and tail, so the
# strings will right align when head and tail are stacked
# vertically.
head, tail = _justify(head, tail)
elif is_truncated or not (
len(", ".join(head)) < display_width
and len(", ".join(tail)) < display_width
):
# Each string in head and tail should align with each other
max_length = max(best_len(head), best_len(tail))
head = [x.rjust(max_length) for x in head]
tail = [x.rjust(max_length) for x in tail]
# If we are not truncated and we are only a single
# line, then don't justify
if line_break_each_value:
# Now head and tail are of type List[Tuple[str]]. Below we
# convert them into List[str], so there will be one string per
# value. Also truncate items horizontally if wider than
# max_space
max_space = display_width - len(space2)
value = tail[0]
max_items = 1
for num_items in reversed(range(1, len(value) + 1)):
pprinted_seq = _pprint_seq(value, max_seq_items=num_items)
if len(pprinted_seq) < max_space:
max_items = num_items
break
head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
summary = ""
line = space2
for head_value in head:
word = head_value + sep + " "
summary, line = _extend_line(summary, line, word, display_width, space2)
if is_truncated:
# remove trailing space of last line
summary += line.rstrip() + space2 + "..."
line = space2
for tail_item in tail[:-1]:
word = tail_item + sep + " "
summary, line = _extend_line(summary, line, word, display_width, space2)
# last value: no sep added + 1 space of width used for trailing ','
summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
summary += line
# right now close is either '' or ', '
# Now we want to include the ']', but not the maybe space.
close = "]" + close.rstrip(" ")
summary += close
if len(summary) > (display_width) or line_break_each_value:
summary += space1
else: # one row
summary += " "
# remove initial space
summary = "[" + summary[len(space2) :]
return summary
def _justify(
head: list[Sequence[str]], tail: list[Sequence[str]]
) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]:
"""
Justify items in head and tail, so they are right-aligned when stacked.
Parameters
----------
head : list-like of list-likes of strings
tail : list-like of list-likes of strings
Returns
-------
tuple of list of tuples of strings
Same as head and tail, but items are right aligned when stacked
vertically.
Examples
--------
>>> _justify([['a', 'b']], [['abc', 'abcd']])
([(' a', ' b')], [('abc', 'abcd')])
"""
combined = head + tail
# For each position for the sequences in ``combined``,
# find the length of the largest string.
max_length = [0] * len(combined[0])
for inner_seq in combined:
length = [len(item) for item in inner_seq]
max_length = [max(x, y) for x, y in zip(max_length, length)]
# justify each item in each list-like in head and tail using max_length
head_tuples = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
]
tail_tuples = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
]
return head_tuples, tail_tuples
class PrettyDict(dict[_KT, _VT]):
"""Dict extension to support abbreviated __repr__"""
def __repr__(self) -> str:
return pprint_thing(self)
class _TextAdjustment:
def __init__(self) -> None:
self.encoding = get_option("display.encoding")
def len(self, text: str) -> int:
return len(text)
def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == "left":
return [x.ljust(max_len) for x in texts]
elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
def adjoin(self, space: int, *lists, **kwargs) -> str:
return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs)
class _EastAsianTextAdjustment(_TextAdjustment):
def __init__(self) -> None:
super().__init__()
if get_option("display.unicode.ambiguous_as_wide"):
self.ambiguous_width = 2
else:
self.ambiguous_width = 1
# Definition of East Asian Width
# https://unicode.org/reports/tr11/
# Ambiguous width can be changed by option
self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
def len(self, text: str) -> int:
"""
Calculate display width considering unicode East Asian Width
"""
if not isinstance(text, str):
return len(text)
return sum(
self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
)
def justify(
self, texts: Iterable[str], max_len: int, mode: str = "right"
) -> list[str]:
# re-calculate padding space per str considering East Asian Width
def _get_pad(t):
return max_len - self.len(t) + len(t)
if mode == "left":
return [x.ljust(_get_pad(x)) for x in texts]
elif mode == "center":
return [x.center(_get_pad(x)) for x in texts]
else:
return [x.rjust(_get_pad(x)) for x in texts]
def get_adjustment() -> _TextAdjustment:
use_east_asian_width = get_option("display.unicode.east_asian_width")
if use_east_asian_width:
return _EastAsianTextAdjustment()
else:
return _TextAdjustment()
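The adjustment switch can be exercised directly; a short sketch (the option
name is as registered in pandas' config, width semantics per UAX #11):

import pandas as pd
from pandas.io.formats.printing import get_adjustment

with pd.option_context("display.unicode.east_asian_width", True):
    adj = get_adjustment()
    # each wide ("W") character counts as two terminal columns
    assert adj.len("パンダ") == 6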

View File

@ -0,0 +1,206 @@
"""
Module for formatting output data in console (to string).
"""
from __future__ import annotations
from shutil import get_terminal_size
from typing import TYPE_CHECKING
import numpy as np
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import Iterable
from pandas.io.formats.format import DataFrameFormatter
class StringFormatter:
"""Formatter for string representation of a dataframe."""
def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None:
self.fmt = fmt
self.adj = fmt.adj
self.frame = fmt.frame
self.line_width = line_width
def to_string(self) -> str:
text = self._get_string_representation()
if self.fmt.should_show_dimensions:
text = f"{text}{self.fmt.dimensions_info}"
return text
def _get_strcols(self) -> list[list[str]]:
strcols = self.fmt.get_strcols()
if self.fmt.is_truncated:
strcols = self._insert_dot_separators(strcols)
return strcols
def _get_string_representation(self) -> str:
if self.fmt.frame.empty:
return self._empty_info_line
strcols = self._get_strcols()
if self.line_width is None:
# no need to wrap around just print the whole frame
return self.adj.adjoin(1, *strcols)
if self._need_to_wrap_around:
return self._join_multiline(strcols)
return self._fit_strcols_to_terminal_width(strcols)
@property
def _empty_info_line(self) -> str:
return (
f"Empty {type(self.frame).__name__}\n"
f"Columns: {pprint_thing(self.frame.columns)}\n"
f"Index: {pprint_thing(self.frame.index)}"
)
@property
def _need_to_wrap_around(self) -> bool:
return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0)
def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]:
str_index = self.fmt._get_formatted_index(self.fmt.tr_frame)
index_length = len(str_index)
if self.fmt.is_truncated_horizontally:
strcols = self._insert_dot_separator_horizontal(strcols, index_length)
if self.fmt.is_truncated_vertically:
strcols = self._insert_dot_separator_vertical(strcols, index_length)
return strcols
@property
def _adjusted_tr_col_num(self) -> int:
return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num
def _insert_dot_separator_horizontal(
self, strcols: list[list[str]], index_length: int
) -> list[list[str]]:
strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length)
return strcols
def _insert_dot_separator_vertical(
self, strcols: list[list[str]], index_length: int
) -> list[list[str]]:
n_header_rows = index_length - len(self.fmt.tr_frame)
row_num = self.fmt.tr_row_num
for ix, col in enumerate(strcols):
cwidth = self.adj.len(col[row_num])
if self.fmt.is_truncated_horizontally:
is_dot_col = ix == self._adjusted_tr_col_num
else:
is_dot_col = False
if cwidth > 3 or is_dot_col:
dots = "..."
else:
dots = ".."
if ix == 0 and self.fmt.index:
dot_mode = "left"
elif is_dot_col:
cwidth = 4
dot_mode = "right"
else:
dot_mode = "right"
dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0]
col.insert(row_num + n_header_rows, dot_str)
return strcols
def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
lwidth = self.line_width
adjoin_width = 1
strcols = list(strcols_input)
if self.fmt.index:
idx = strcols.pop(0)
lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
col_widths = [
np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
for col in strcols
]
assert lwidth is not None
col_bins = _binify(col_widths, lwidth)
nbins = len(col_bins)
str_lst = []
start = 0
for i, end in enumerate(col_bins):
row = strcols[start:end]
if self.fmt.index:
row.insert(0, idx)
if nbins > 1:
nrows = len(row[-1])
if end <= len(strcols) and i < nbins - 1:
row.append([" \\"] + [" "] * (nrows - 1))
else:
row.append([" "] * nrows)
str_lst.append(self.adj.adjoin(adjoin_width, *row))
start = end
return "\n\n".join(str_lst)
def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
from pandas import Series
lines = self.adj.adjoin(1, *strcols).split("\n")
max_len = Series(lines).str.len().max()
# plus truncate dot col
width, _ = get_terminal_size()
dif = max_len - width
# '+ 1' to avoid too wide repr (GH PR #17023)
adj_dif = dif + 1
col_lens = Series([Series(ele).str.len().max() for ele in strcols])
n_cols = len(col_lens)
counter = 0
while adj_dif > 0 and n_cols > 1:
counter += 1
mid = round(n_cols / 2)
mid_ix = col_lens.index[mid]
col_len = col_lens[mid_ix]
# adjoin adds one
adj_dif -= col_len + 1
col_lens = col_lens.drop(mid_ix)
n_cols = len(col_lens)
# subtract index column
max_cols_fitted = n_cols - self.fmt.index
# GH-21180. Ensure that we print at least two.
max_cols_fitted = max(max_cols_fitted, 2)
self.fmt.max_cols_fitted = max_cols_fitted
# Call again _truncate to cut frame appropriately
# and then generate string representation
self.fmt.truncate()
strcols = self._get_strcols()
return self.adj.adjoin(1, *strcols)
def _binify(cols: list[int], line_width: int) -> list[int]:
adjoin_width = 1
bins = []
curr_width = 0
i_last_column = len(cols) - 1
for i, w in enumerate(cols):
w_adjoined = w + adjoin_width
curr_width += w_adjoined
if i_last_column == i:
wrap = curr_width + 1 > line_width and i > 0
else:
wrap = curr_width + 2 > line_width and i > 0
if wrap:
bins.append(i)
curr_width = w_adjoined
bins.append(len(cols))
return bins
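_binify is a pure function, so its wrapping decision is easy to check; a
small sketch:

# four 10-char columns against a 25-char line: the first block takes
# columns 0-1, the second takes columns 2-3.
assert _binify([10, 10, 10, 10], line_width=25) == [2, 4]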

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,16 @@
{# Update the html_style/table_structure.html documentation too #}
{% if doctype_html %}
<!DOCTYPE html>
<html>
<head>
<meta charset="{{encoding}}">
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
</head>
<body>
{% include html_table_tpl %}
</body>
</html>
{% elif not doctype_html %}
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
{% include html_table_tpl %}
{% endif %}
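This template is rendered via the Styler; a hedged sketch of the branch
selection (doctype_html is a real Styler.to_html keyword, jinja2 required):

import pandas as pd

df = pd.DataFrame({"a": [1]})
# doctype_html=True takes the full-document branch above
# (<html>/<head>/<body>); the default emits only the style block and table.
html = df.style.to_html(doctype_html=True)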

View File

@ -0,0 +1,26 @@
{%- block before_style -%}{%- endblock before_style -%}
{% block style %}
<style type="text/css">
{% block table_styles %}
{% for s in table_styles %}
#T_{{uuid}} {{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{% endfor %}
{% endblock table_styles %}
{% block before_cellstyle %}{% endblock before_cellstyle %}
{% block cellstyle %}
{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %}
{% for s in cs %}
{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}_{{selector}}{% endfor %} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{% endfor %}
{% endfor %}
{% endblock cellstyle %}
</style>
{% endblock style %}

View File

@ -0,0 +1,63 @@
{% block before_table %}{% endblock before_table %}
{% block table %}
{% if exclude_styles %}
<table>
{% else %}
<table id="T_{{uuid}}"{% if table_attributes %} {{table_attributes}}{% endif %}>
{% endif %}
{% block caption %}
{% if caption and caption is string %}
<caption>{{caption}}</caption>
{% elif caption and caption is sequence %}
<caption>{{caption[0]}}</caption>
{% endif %}
{% endblock caption %}
{% block thead %}
<thead>
{% block before_head_rows %}{% endblock %}
{% for r in head %}
{% block head_tr scoped %}
<tr>
{% if exclude_styles %}
{% for c in r %}
{% if c.is_visible != False %}
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}
{% endfor %}
{% else %}
{% for c in r %}
{% if c.is_visible != False %}
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}
{% endfor %}
{% endif %}
</tr>
{% endblock head_tr %}
{% endfor %}
{% block after_head_rows %}{% endblock %}
</thead>
{% endblock thead %}
{% block tbody %}
<tbody>
{% block before_rows %}{% endblock before_rows %}
{% for r in body %}
{% block tr scoped %}
<tr>
{% if exclude_styles %}
{% for c in r %}{% if c.is_visible != False %}
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}{% endfor %}
{% else %}
{% for c in r %}{% if c.is_visible != False %}
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}{% endfor %}
{% endif %}
</tr>
{% endblock tr %}
{% endfor %}
{% block after_rows %}{% endblock after_rows %}
</tbody>
{% endblock tbody %}
</table>
{% endblock table %}
{% block after_table %}{% endblock after_table %}
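{# The named blocks above exist so this template can be subclassed; a minimal
   sketch using the public hook for that (search path and file name are
   hypothetical):
     >>> from pandas.io.formats.style import Styler
     >>> MyStyler = Styler.from_custom_template(
     ...     "my_templates", html_table="my_table.tpl"
     ... )  # doctest: +SKIP
   my_table.tpl would extend html_table.tpl and override only the blocks it
   needs. #}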


@@ -0,0 +1,5 @@
{% if environment == "longtable" %}
{% include "latex_longtable.tpl" %}
{% else %}
{% include "latex_table.tpl" %}
{% endif %}
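{# A minimal sketch of selecting the longtable branch from the public API:
     >>> df.style.to_latex(environment="longtable")  # doctest: +SKIP
   Any other environment value (or none) falls through to latex_table.tpl. #}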


@@ -0,0 +1,82 @@
\begin{longtable}
{%- set position = parse_table(table_styles, 'position') %}
{%- if position is not none %}
[{{position}}]
{%- endif %}
{%- set column_format = parse_table(table_styles, 'column_format') %}
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
{% for style in table_styles %}
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format', 'label'] %}
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
{% endif %}
{% endfor %}
{% if caption and caption is string %}
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}}
{%- endif %} \\
{% elif caption and caption is sequence %}
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}}
{%- endif %} \\
{% else %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}} \\
{% endif %}
{% endif %}
{% set toprule = parse_table(table_styles, 'toprule') %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
{% endfor %}
{% set midrule = parse_table(table_styles, 'midrule') %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endfirsthead
{% if caption and caption is string %}
\caption[]{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %} \\
{% elif caption and caption is sequence %}
\caption[]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %} \\
{% endif %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
{% endfor %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endhead
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\multicolumn{% raw %}{{% endraw %}{{body[0]|length}}{% raw %}}{% endraw %}{r}{Continued on next page} \\
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endfoot
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
{% if bottomrule is not none %}
\{{bottomrule}}
{% endif %}
\endlastfoot
{% for row in body %}
{% for c in row %}{% if not loop.first %} & {% endif %}
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
{%- endfor %} \\
{% if clines and clines[loop.index] | length > 0 %}
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
{% endif %}
{% endfor %}
\end{longtable}
{% raw %}{% endraw %}


@@ -0,0 +1,57 @@
{% if environment or parse_wrap(table_styles, caption) %}
\begin{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
{%- set position = parse_table(table_styles, 'position') %}
{%- if position is not none %}
[{{position}}]
{%- endif %}
{% set position_float = parse_table(table_styles, 'position_float') %}
{% if position_float is not none %}
\{{position_float}}
{% endif %}
{% if caption and caption is string %}
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
{% elif caption and caption is sequence %}
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
{% endif %}
{% for style in table_styles %}
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format'] %}
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
{% endif %}
{% endfor %}
{% endif %}
\begin{tabular}
{%- set column_format = parse_table(table_styles, 'column_format') %}
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
{% set toprule = parse_table(table_styles, 'toprule') %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx, convert_css)}}{% endfor %} \\
{% endfor %}
{% set midrule = parse_table(table_styles, 'midrule') %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
{% for row in body %}
{% for c in row %}{% if not loop.first %} & {% endif %}
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align, False, convert_css)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
{%- endfor %} \\
{% if clines and clines[loop.index] | length > 0 %}
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
{% endif %}
{% endfor %}
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
{% if bottomrule is not none %}
\{{bottomrule}}
{% endif %}
\end{tabular}
{% if environment or parse_wrap(table_styles, caption) %}
\end{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
{% endif %}
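{# The two caption branches above correspond to the string and (full, short)
   tuple forms accepted by the public API; a minimal sketch:
     >>> df.style.to_latex(caption=("Full caption", "Short"))  # doctest: +SKIP
   The tuple form renders as \caption[Short]{Full caption}. #}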


@@ -0,0 +1,12 @@
{% for r in head %}
{% for c in r %}{% if c["is_visible"] %}
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
{% endif %}{% endfor %}
{% endfor %}
{% for r in body %}
{% for c in r %}{% if c["is_visible"] %}
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
{% endif %}{% endfor %}
{% endfor %}
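{# A minimal sketch of rendering this text template via the public API
   (delimiter defaults to a single space):
     >>> df.style.to_string(delimiter=" ")  # doctest: +SKIP
#}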


@@ -0,0 +1,560 @@
"""
:mod:`pandas.io.formats.xml` is a module for formatting data in XML.
"""
from __future__ import annotations
import codecs
import io
from typing import (
TYPE_CHECKING,
Any,
final,
)
import warnings
from pandas.errors import AbstractMethodError
from pandas.util._decorators import (
cache_readonly,
doc,
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.missing import isna
from pandas.core.shared_docs import _shared_docs
from pandas.io.common import get_handle
from pandas.io.xml import (
get_data_from_filepath,
preprocess_data,
)
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
from pandas import DataFrame
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buffer",
)
class _BaseXMLFormatter:
"""
    Base class for formatting data in XML.
Parameters
----------
path_or_buffer : str or file-like
This can be either a string of raw XML, a valid URL,
file or file-like object.
index : bool
Whether to include index in xml document.
    root_name : str
        Name for root of xml document. Default is 'data'.
    row_name : str
        Name for row elements of xml document. Default is 'row'.
na_rep : str
Missing data representation.
attrs_cols : list
List of columns to write as attributes in row element.
elem_cols : list
List of columns to write as children in row element.
namespaces : dict
The namespaces to define in XML document as dicts with key
being namespace and value the URI.
prefix : str
The prefix for each element in XML document including root.
encoding : str
Encoding of xml object or document.
xml_declaration : bool
Whether to include xml declaration at top line item in xml.
pretty_print : bool
Whether to write xml document with line breaks and indentation.
stylesheet : str or file-like
A URL, file, file-like object, or a raw string containing XSLT.
{compression_options}
.. versionchanged:: 1.4.0 Zstandard support.
{storage_options}
See also
--------
pandas.io.formats.xml.EtreeXMLFormatter
pandas.io.formats.xml.LxmlXMLFormatter
"""
def __init__(
self,
frame: DataFrame,
path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
index: bool = True,
root_name: str | None = "data",
row_name: str | None = "row",
na_rep: str | None = None,
attr_cols: list[str] | None = None,
elem_cols: list[str] | None = None,
namespaces: dict[str | None, str] | None = None,
prefix: str | None = None,
encoding: str = "utf-8",
xml_declaration: bool | None = True,
pretty_print: bool | None = True,
stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions | None = None,
) -> None:
self.frame = frame
self.path_or_buffer = path_or_buffer
self.index = index
self.root_name = root_name
self.row_name = row_name
self.na_rep = na_rep
self.attr_cols = attr_cols
self.elem_cols = elem_cols
self.namespaces = namespaces
self.prefix = prefix
self.encoding = encoding
self.xml_declaration = xml_declaration
self.pretty_print = pretty_print
self.stylesheet = stylesheet
self.compression: CompressionOptions = compression
self.storage_options = storage_options
self.orig_cols = self.frame.columns.tolist()
self.frame_dicts = self._process_dataframe()
self._validate_columns()
self._validate_encoding()
self.prefix_uri = self._get_prefix_uri()
self._handle_indexes()
def _build_tree(self) -> bytes:
"""
Build tree from data.
This method initializes the root and builds attributes and elements
with optional namespaces.
"""
raise AbstractMethodError(self)
@final
def _validate_columns(self) -> None:
"""
        Validate elem_cols and attr_cols.
        This method will check if each is list-like.
        Raises
        ------
        TypeError
            * If attr_cols or elem_cols is not list-like.
"""
if self.attr_cols and not is_list_like(self.attr_cols):
raise TypeError(
f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
)
if self.elem_cols and not is_list_like(self.elem_cols):
raise TypeError(
f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
)
@final
def _validate_encoding(self) -> None:
"""
Validate encoding.
This method will check if encoding is among listed under codecs.
Raises
------
LookupError
* If encoding is not available in codecs.
"""
codecs.lookup(self.encoding)
@final
def _process_dataframe(self) -> dict[int | str, dict[str, Any]]:
"""
        Adjust DataFrame to fit xml output.
        This method will adjust the underlying DataFrame for xml output,
including optionally replacing missing values and including indexes.
"""
df = self.frame
if self.index:
df = df.reset_index()
if self.na_rep is not None:
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Downcasting object dtype arrays",
category=FutureWarning,
)
df = df.fillna(self.na_rep)
return df.to_dict(orient="index")
@final
def _handle_indexes(self) -> None:
"""
Handle indexes.
This method will add indexes into attr_cols or elem_cols.
"""
if not self.index:
return
first_key = next(iter(self.frame_dicts))
indexes: list[str] = [
x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
]
if self.attr_cols:
self.attr_cols = indexes + self.attr_cols
if self.elem_cols:
self.elem_cols = indexes + self.elem_cols
def _get_prefix_uri(self) -> str:
"""
Get uri of namespace prefix.
This method retrieves corresponding URI to prefix in namespaces.
Raises
------
KeyError
            * If prefix is not included in namespace dict.
"""
raise AbstractMethodError(self)
@final
def _other_namespaces(self) -> dict:
"""
Define other namespaces.
This method will build dictionary of namespaces attributes
for root element, conditionally with optional namespaces and
prefix.
"""
nmsp_dict: dict[str, str] = {}
if self.namespaces:
nmsp_dict = {
f"xmlns{p if p=='' else f':{p}'}": n
for p, n in self.namespaces.items()
if n != self.prefix_uri[1:-1]
}
return nmsp_dict
@final
def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
"""
Create attributes of row.
This method adds attributes using attr_cols to row element and
works with tuples for multindex or hierarchical columns.
"""
if not self.attr_cols:
return elem_row
for col in self.attr_cols:
attr_name = self._get_flat_col_name(col)
try:
if not isna(d[col]):
elem_row.attrib[attr_name] = str(d[col])
except KeyError:
raise KeyError(f"no valid column, {col}")
return elem_row
@final
def _get_flat_col_name(self, col: str | tuple) -> str:
flat_col = col
if isinstance(col, tuple):
flat_col = (
"".join([str(c) for c in col]).strip()
if "" in col
else "_".join([str(c) for c in col]).strip()
)
return f"{self.prefix_uri}{flat_col}"
@cache_readonly
def _sub_element_cls(self):
raise AbstractMethodError(self)
@final
def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
"""
Create child elements of row.
This method adds child elements using elem_cols to row element and
works with tuples for multindex or hierarchical columns.
"""
sub_element_cls = self._sub_element_cls
if not self.elem_cols:
return
for col in self.elem_cols:
elem_name = self._get_flat_col_name(col)
try:
val = None if isna(d[col]) or d[col] == "" else str(d[col])
sub_element_cls(elem_row, elem_name).text = val
except KeyError:
raise KeyError(f"no valid column, {col}")
@final
def write_output(self) -> str | None:
xml_doc = self._build_tree()
if self.path_or_buffer is not None:
with get_handle(
self.path_or_buffer,
"wb",
compression=self.compression,
storage_options=self.storage_options,
is_text=False,
) as handles:
handles.handle.write(xml_doc)
return None
else:
return xml_doc.decode(self.encoding).rstrip()
class EtreeXMLFormatter(_BaseXMLFormatter):
"""
Class for formatting data in xml using Python standard library
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
"""
def _build_tree(self) -> bytes:
from xml.etree.ElementTree import (
Element,
SubElement,
tostring,
)
self.root = Element(
f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces()
)
for d in self.frame_dicts.values():
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
self.elem_cols = list(d.keys())
self._build_elems(d, elem_row)
else:
elem_row = self._build_attribs(d, elem_row)
self._build_elems(d, elem_row)
self.out_xml = tostring(
self.root,
method="xml",
encoding=self.encoding,
xml_declaration=self.xml_declaration,
)
if self.pretty_print:
self.out_xml = self._prettify_tree()
if self.stylesheet is not None:
raise ValueError(
"To use stylesheet, you need lxml installed and selected as parser."
)
return self.out_xml
def _get_prefix_uri(self) -> str:
from xml.etree.ElementTree import register_namespace
uri = ""
if self.namespaces:
for p, n in self.namespaces.items():
if isinstance(p, str) and isinstance(n, str):
register_namespace(p, n)
if self.prefix:
try:
uri = f"{{{self.namespaces[self.prefix]}}}"
except KeyError:
raise KeyError(f"{self.prefix} is not included in namespaces")
elif "" in self.namespaces:
uri = f'{{{self.namespaces[""]}}}'
else:
uri = ""
return uri
@cache_readonly
def _sub_element_cls(self):
from xml.etree.ElementTree import SubElement
return SubElement
def _prettify_tree(self) -> bytes:
"""
Output tree for pretty print format.
This method will pretty print xml with line breaks and indentation.
"""
from xml.dom.minidom import parseString
dom = parseString(self.out_xml)
return dom.toprettyxml(indent=" ", encoding=self.encoding)
class LxmlXMLFormatter(_BaseXMLFormatter):
"""
    Class for formatting data in xml using the third-party
    module: `lxml.etree`.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._convert_empty_str_key()
def _build_tree(self) -> bytes:
"""
Build tree from data.
This method initializes the root and builds attributes and elements
with optional namespaces.
"""
from lxml.etree import (
Element,
SubElement,
tostring,
)
self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
for d in self.frame_dicts.values():
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
self.elem_cols = list(d.keys())
self._build_elems(d, elem_row)
else:
elem_row = self._build_attribs(d, elem_row)
self._build_elems(d, elem_row)
self.out_xml = tostring(
self.root,
pretty_print=self.pretty_print,
method="xml",
encoding=self.encoding,
xml_declaration=self.xml_declaration,
)
if self.stylesheet is not None:
self.out_xml = self._transform_doc()
return self.out_xml
def _convert_empty_str_key(self) -> None:
"""
Replace zero-length string in `namespaces`.
This method will replace '' with None to align to `lxml`
requirement that empty string prefixes are not allowed.
"""
if self.namespaces and "" in self.namespaces.keys():
self.namespaces[None] = self.namespaces.pop("", "default")
def _get_prefix_uri(self) -> str:
uri = ""
if self.namespaces:
if self.prefix:
try:
uri = f"{{{self.namespaces[self.prefix]}}}"
except KeyError:
raise KeyError(f"{self.prefix} is not included in namespaces")
elif "" in self.namespaces:
uri = f'{{{self.namespaces[""]}}}'
else:
uri = ""
return uri
@cache_readonly
def _sub_element_cls(self):
from lxml.etree import SubElement
return SubElement
def _transform_doc(self) -> bytes:
"""
Parse stylesheet from file or buffer and run it.
This method will parse stylesheet object into tree for parsing
conditionally by its specific object type, then transforms
original tree with XSLT script.
"""
from lxml.etree import (
XSLT,
XMLParser,
fromstring,
parse,
)
style_doc = self.stylesheet
assert style_doc is not None # is ensured by caller
handle_data = get_data_from_filepath(
filepath_or_buffer=style_doc,
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
)
with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
if isinstance(xml_data, io.StringIO):
xsl_doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
else:
xsl_doc = parse(xml_data, parser=curr_parser)
transformer = XSLT(xsl_doc)
new_doc = transformer(self.root)
return bytes(new_doc)
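# These formatters back DataFrame.to_xml; a minimal sketch exercising both
# attribute and element columns (column names are hypothetical):
#
#   >>> df = pd.DataFrame({"shape": ["square"], "sides": [4]})
#   >>> print(df.to_xml(attr_cols=["shape"], elem_cols=["sides"], index=False))
#   ... # doctest: +SKIP
#
# With lxml installed, LxmlXMLFormatter handles the call (and stylesheet
# becomes available); otherwise EtreeXMLFormatter produces the same output
# without XSLT support.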


@@ -0,0 +1,255 @@
""" Google BigQuery support """
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import warnings
from pandas.compat._optional import import_optional_dependency
from pandas.util._exceptions import find_stack_level
if TYPE_CHECKING:
from google.auth.credentials import Credentials
from pandas import DataFrame
def _try_import():
# since pandas is a dependency of pandas-gbq
# we need to import on first use
msg = (
"pandas-gbq is required to load data from Google BigQuery. "
"See the docs: https://pandas-gbq.readthedocs.io."
)
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
return pandas_gbq
def read_gbq(
query: str,
project_id: str | None = None,
index_col: str | None = None,
col_order: list[str] | None = None,
reauth: bool = False,
auth_local_webserver: bool = True,
dialect: str | None = None,
location: str | None = None,
configuration: dict[str, Any] | None = None,
credentials: Credentials | None = None,
use_bqstorage_api: bool | None = None,
max_results: int | None = None,
progress_bar_type: str | None = None,
) -> DataFrame:
"""
Load data from Google BigQuery.
.. deprecated:: 2.2.0
Please use ``pandas_gbq.read_gbq`` instead.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
See the `How to authenticate with Google BigQuery
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
guide for authentication instructions.
Parameters
----------
query : str
SQL-Like Query to return data values.
project_id : str, optional
Google BigQuery Account project ID. Optional when available from
the environment.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
List of BigQuery column names in the desired order for results
DataFrame.
reauth : bool, default False
Force Google BigQuery to re-authenticate the user. This is useful
if multiple accounts are used.
auth_local_webserver : bool, default True
Use the `local webserver flow`_ instead of the `console flow`_
when getting user credentials.
.. _local webserver flow:
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
.. _console flow:
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
*New in version 0.2.0 of pandas-gbq*.
.. versionchanged:: 1.5.0
Default value is changed to ``True``. Google has deprecated the
``auth_local_webserver = False`` `"out of band" (copy-paste)
flow
<https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
dialect : str, default 'legacy'
Note: The default value is changing to 'standard' in a future version.
SQL syntax dialect to use. Value can be one of:
``'legacy'``
Use BigQuery's legacy SQL dialect. For more information see
`BigQuery Legacy SQL Reference
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
``'standard'``
Use BigQuery's standard SQL, which is
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
location : str, optional
Location where the query job should run. See the `BigQuery locations
documentation
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
list of available locations. The location must match that of any
datasets used in the query.
*New in version 0.5.0 of pandas-gbq*.
configuration : dict, optional
Query config parameters for job processing.
For example:
configuration = {'query': {'useQueryCache': False}}
For more information see `BigQuery REST API Reference
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
:class:`google.auth.compute_engine.Credentials` or Service Account
:class:`google.oauth2.service_account.Credentials` directly.
*New in version 0.8.0 of pandas-gbq*.
use_bqstorage_api : bool, default False
Use the `BigQuery Storage API
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
download query results quickly, but at an increased cost. To use this
API, first `enable it in the Cloud Console
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
You must also have the `bigquery.readsessions.create
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
permission on the project you are billing queries to.
This feature requires version 0.10.0 or later of the ``pandas-gbq``
package. It also requires the ``google-cloud-bigquery-storage`` and
``fastavro`` packages.
max_results : int, optional
If set, limit the maximum number of rows to fetch from the query
results.
    progress_bar_type : str, optional
If set, use the `tqdm <https://tqdm.github.io/>`__ library to
display a progress bar while the data downloads. Install the
``tqdm`` package to use this feature.
Possible values of ``progress_bar_type`` include:
``None``
No progress bar.
``'tqdm'``
Use the :func:`tqdm.tqdm` function to print a progress bar
to :data:`sys.stderr`.
``'tqdm_notebook'``
Use the :func:`tqdm.tqdm_notebook` function to display a
progress bar as a Jupyter notebook widget.
``'tqdm_gui'``
Use the :func:`tqdm.tqdm_gui` function to display a
progress bar as a graphical dialog box.
Returns
-------
df: DataFrame
DataFrame representing results of query.
See Also
--------
pandas_gbq.read_gbq : This function in the pandas-gbq library.
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
Examples
--------
Example taken from `Google BigQuery documentation
<https://cloud.google.com/bigquery/docs/pandas-gbq-migration>`_
>>> sql = "SELECT name FROM table_name WHERE state = 'TX' LIMIT 100;"
>>> df = pd.read_gbq(sql, dialect="standard") # doctest: +SKIP
>>> project_id = "your-project-id" # doctest: +SKIP
>>> df = pd.read_gbq(sql,
... project_id=project_id,
... dialect="standard"
... ) # doctest: +SKIP
"""
warnings.warn(
"read_gbq is deprecated and will be removed in a future version. "
"Please use pandas_gbq.read_gbq instead: "
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq",
FutureWarning,
stacklevel=find_stack_level(),
)
pandas_gbq = _try_import()
kwargs: dict[str, str | bool | int | None] = {}
# START: new kwargs. Don't populate unless explicitly set.
if use_bqstorage_api is not None:
kwargs["use_bqstorage_api"] = use_bqstorage_api
if max_results is not None:
kwargs["max_results"] = max_results
kwargs["progress_bar_type"] = progress_bar_type
# END: new kwargs
return pandas_gbq.read_gbq(
query,
project_id=project_id,
index_col=index_col,
col_order=col_order,
reauth=reauth,
auth_local_webserver=auth_local_webserver,
dialect=dialect,
location=location,
configuration=configuration,
credentials=credentials,
**kwargs,
)
def to_gbq(
dataframe: DataFrame,
destination_table: str,
project_id: str | None = None,
chunksize: int | None = None,
reauth: bool = False,
if_exists: str = "fail",
auth_local_webserver: bool = True,
table_schema: list[dict[str, str]] | None = None,
location: str | None = None,
progress_bar: bool = True,
credentials: Credentials | None = None,
) -> None:
warnings.warn(
"to_gbq is deprecated and will be removed in a future version. "
"Please use pandas_gbq.to_gbq instead: "
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq",
FutureWarning,
stacklevel=find_stack_level(),
)
pandas_gbq = _try_import()
pandas_gbq.to_gbq(
dataframe,
destination_table,
project_id=project_id,
chunksize=chunksize,
reauth=reauth,
if_exists=if_exists,
auth_local_webserver=auth_local_webserver,
table_schema=table_schema,
location=location,
progress_bar=progress_bar,
credentials=credentials,
)
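# Both wrappers above only warn and delegate; a minimal sketch of the
# migration they recommend (the project ID is hypothetical):
#
#   >>> import pandas_gbq  # doctest: +SKIP
#   >>> df = pandas_gbq.read_gbq(
#   ...     "SELECT 1 AS x", project_id="your-project-id"
#   ... )  # doctest: +SKIP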

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
from pandas.io.json._json import (
read_json,
to_json,
ujson_dumps,
ujson_loads,
)
from pandas.io.json._table_schema import build_table_schema
__all__ = [
"ujson_dumps",
"ujson_loads",
"read_json",
"to_json",
"build_table_schema",
]

File diff suppressed because it is too large


@@ -0,0 +1,544 @@
# ---------------------------------------------------------------------
# JSON normalization routines
from __future__ import annotations
from collections import (
abc,
defaultdict,
)
import copy
from typing import (
TYPE_CHECKING,
Any,
DefaultDict,
)
import numpy as np
from pandas._libs.writers import convert_json_to_lines
import pandas as pd
from pandas import DataFrame
if TYPE_CHECKING:
from collections.abc import Iterable
from pandas._typing import (
IgnoreRaise,
Scalar,
)
def convert_to_line_delimits(s: str) -> str:
"""
Helper function that converts JSON lists to line delimited JSON.
"""
    # Determine whether we have a JSON list to turn into lines; otherwise just
    # return the json object unchanged, since only lists can be line-delimited.
    if not (s[0] == "[" and s[-1] == "]"):
return s
s = s[1:-1]
return convert_json_to_lines(s)
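# A minimal sketch, assuming a JSON array of objects as input:
#
#   >>> convert_to_line_delimits('[{"a":1},{"a":2}]')  # doctest: +SKIP
#   '{"a":1}\n{"a":2}'
#
# Non-list input is returned unchanged.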
def nested_to_record(
ds,
prefix: str = "",
sep: str = ".",
level: int = 0,
max_level: int | None = None,
):
"""
A simplified json_normalize
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
it does not attempt to extract a subset of the data.
Parameters
----------
ds : dict or list of dicts
prefix: the prefix, optional, default: ""
sep : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
level: int, optional, default: 0
The number of levels in the json string.
max_level: int, optional, default: None
The max depth to normalize.
Returns
-------
d - dict or list of dicts, matching `ds`
Examples
--------
>>> nested_to_record(
... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
... )
{\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
"""
singleton = False
if isinstance(ds, dict):
ds = [ds]
singleton = True
new_ds = []
for d in ds:
new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
if not isinstance(k, str):
k = str(k)
if level == 0:
newkey = k
else:
newkey = prefix + sep + k
# flatten if type is dict and
# current dict level < maximum level provided and
# only dicts gets recurse-flattened
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict) or (
max_level is not None and level >= max_level
):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
new_ds.append(new_d)
if singleton:
return new_ds[0]
return new_ds
def _normalise_json(
data: Any,
key_string: str,
normalized_dict: dict[str, Any],
separator: str,
) -> dict[str, Any]:
"""
Main recursive function
Designed for the most basic use case of pd.json_normalize(data)
intended as a performance improvement, see #15621
Parameters
----------
data : Any
Type dependent on types contained within nested Json
key_string : str
New key (with separator(s) in) for data
normalized_dict : dict
The new normalized/flattened Json dict
separator : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
"""
if isinstance(data, dict):
for key, value in data.items():
new_key = f"{key_string}{separator}{key}"
if not key_string:
new_key = new_key.removeprefix(separator)
_normalise_json(
data=value,
key_string=new_key,
normalized_dict=normalized_dict,
separator=separator,
)
else:
normalized_dict[key_string] = data
return normalized_dict
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
"""
Order the top level keys and then recursively go to depth
Parameters
----------
data : dict or list of dicts
separator : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
Returns
-------
dict or list of dicts, matching `normalised_json_object`
"""
top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
nested_dict_ = _normalise_json(
data={k: v for k, v in data.items() if isinstance(v, dict)},
key_string="",
normalized_dict={},
separator=separator,
)
return {**top_dict_, **nested_dict_}
def _simple_json_normalize(
ds: dict | list[dict],
sep: str = ".",
) -> dict | list[dict] | Any:
"""
    An optimized basic json_normalize
Converts a nested dict into a flat dict ("record"), unlike
json_normalize and nested_to_record it doesn't do anything clever.
But for the most basic use cases it enhances performance.
E.g. pd.json_normalize(data)
Parameters
----------
ds : dict or list of dicts
sep : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
Returns
-------
    d : dict or list of dicts, matching `ds`
Examples
--------
>>> _simple_json_normalize(
... {
... "flat1": 1,
... "dict1": {"c": 1, "d": 2},
... "nested": {"e": {"c": 1, "d": 2}, "d": 2},
... }
... )
{\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
"""
normalised_json_object = {}
# expect a dictionary, as most jsons are. However, lists are perfectly valid
if isinstance(ds, dict):
normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
elif isinstance(ds, list):
normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
return normalised_json_list
return normalised_json_object
def json_normalize(
data: dict | list[dict],
record_path: str | list | None = None,
meta: str | list[str | list[str]] | None = None,
meta_prefix: str | None = None,
record_prefix: str | None = None,
errors: IgnoreRaise = "raise",
sep: str = ".",
max_level: int | None = None,
) -> DataFrame:
"""
Normalize semi-structured JSON data into a flat table.
Parameters
----------
data : dict or list of dicts
Unserialized JSON objects.
record_path : str or list of str, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records.
meta : list of paths (str or list of str), default None
Fields to use as metadata for each record in resulting table.
meta_prefix : str, default None
        If not None, prefix metadata columns with the dotted path, e.g.
        foo.bar.field if meta is ['foo', 'bar'].
    record_prefix : str, default None
        If not None, prefix record columns with the dotted path, e.g.
        foo.bar.field if path to records is ['foo', 'bar'].
errors : {'raise', 'ignore'}, default 'raise'
Configures error handling.
* 'ignore' : will ignore KeyError if keys listed in meta are not
always present.
* 'raise' : will raise KeyError if keys listed in meta are not
always present.
sep : str, default '.'
Nested records will generate names separated by sep.
e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
max_level : int, default None
Max number of levels(depth of dict) to normalize.
if None, normalizes all levels.
Returns
-------
frame : DataFrame
Normalize semi-structured JSON data into a flat table.
Examples
--------
>>> data = [
... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
... {"name": {"given": "Mark", "family": "Regner"}},
... {"id": 2, "name": "Faye Raker"},
... ]
>>> pd.json_normalize(data)
id name.first name.last name.given name.family name
0 1.0 Coleen Volk NaN NaN NaN
1 NaN NaN NaN Mark Regner NaN
2 2.0 NaN NaN NaN NaN Faye Raker
>>> data = [
... {
... "id": 1,
... "name": "Cole Volk",
... "fitness": {"height": 130, "weight": 60},
... },
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
... {
... "id": 2,
... "name": "Faye Raker",
... "fitness": {"height": 130, "weight": 60},
... },
... ]
>>> pd.json_normalize(data, max_level=0)
id name fitness
0 1.0 Cole Volk {'height': 130, 'weight': 60}
1 NaN Mark Reg {'height': 130, 'weight': 60}
2 2.0 Faye Raker {'height': 130, 'weight': 60}
Normalizes nested data up to level 1.
>>> data = [
... {
... "id": 1,
... "name": "Cole Volk",
... "fitness": {"height": 130, "weight": 60},
... },
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
... {
... "id": 2,
... "name": "Faye Raker",
... "fitness": {"height": 130, "weight": 60},
... },
... ]
>>> pd.json_normalize(data, max_level=1)
id name fitness.height fitness.weight
0 1.0 Cole Volk 130 60
1 NaN Mark Reg 130 60
2 2.0 Faye Raker 130 60
>>> data = [
... {
... "state": "Florida",
... "shortname": "FL",
... "info": {"governor": "Rick Scott"},
... "counties": [
... {"name": "Dade", "population": 12345},
... {"name": "Broward", "population": 40000},
... {"name": "Palm Beach", "population": 60000},
... ],
... },
... {
... "state": "Ohio",
... "shortname": "OH",
... "info": {"governor": "John Kasich"},
... "counties": [
... {"name": "Summit", "population": 1234},
... {"name": "Cuyahoga", "population": 1337},
... ],
... },
... ]
>>> result = pd.json_normalize(
... data, "counties", ["state", "shortname", ["info", "governor"]]
... )
>>> result
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich
>>> data = {"A": [1, 2]}
>>> pd.json_normalize(data, "A", record_prefix="Prefix.")
Prefix.0
0 1
1 2
Returns normalized data with columns prefixed with the given string.
"""
def _pull_field(
js: dict[str, Any], spec: list | str, extract_record: bool = False
) -> Scalar | Iterable:
"""Internal function to pull field"""
result = js
try:
if isinstance(spec, list):
for field in spec:
if result is None:
raise KeyError(field)
result = result[field]
else:
result = result[spec]
except KeyError as e:
if extract_record:
raise KeyError(
f"Key {e} not found. If specifying a record_path, all elements of "
f"data should have the path."
) from e
if errors == "ignore":
return np.nan
else:
raise KeyError(
f"Key {e} not found. To replace missing values of {e} with "
f"np.nan, pass in errors='ignore'"
) from e
return result
def _pull_records(js: dict[str, Any], spec: list | str) -> list:
"""
        Internal function to pull the field for records; similar to
        _pull_field, but required to return a list, raising an error
        if the value is not iterable.
"""
result = _pull_field(js, spec, extract_record=True)
        # GH 31507, GH 30145, GH 26284: if result is not a list, raise TypeError
        # if it is not null, otherwise return an empty list
if not isinstance(result, list):
if pd.isnull(result):
result = []
else:
raise TypeError(
f"{js} has non list value {result} for path {spec}. "
"Must be list or null."
)
return result
if isinstance(data, list) and not data:
return DataFrame()
elif isinstance(data, dict):
# A bit of a hackjob
data = [data]
elif isinstance(data, abc.Iterable) and not isinstance(data, str):
# GH35923 Fix pd.json_normalize to not skip the first element of a
# generator input
data = list(data)
else:
raise NotImplementedError
# check to see if a simple recursive function is possible to
# improve performance (see #15621) but only for cases such
    # as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
if (
record_path is None
and meta is None
and meta_prefix is None
and record_prefix is None
and max_level is None
):
return DataFrame(_simple_json_normalize(data, sep=sep))
if record_path is None:
if any([isinstance(x, dict) for x in y.values()] for y in data):
# naive normalization, this is idempotent for flat records
# and potentially will inflate the data considerably for
# deeply nested structures:
        # {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
#
# TODO: handle record value which are lists, at least error
# reasonably
data = nested_to_record(data, sep=sep, max_level=max_level)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
if meta is None:
meta = []
elif not isinstance(meta, list):
meta = [meta]
_meta = [m if isinstance(m, list) else [m] for m in meta]
# Disastrously inefficient for now
records: list = []
lengths = []
meta_vals: DefaultDict = defaultdict(list)
meta_keys = [sep.join(val) for val in _meta]
def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
if isinstance(data, dict):
data = [data]
if len(path) > 1:
for obj in data:
for val, key in zip(_meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_records(obj, path[0])
recs = [
nested_to_record(r, sep=sep, max_level=max_level)
if isinstance(r, dict)
else r
for r in recs
]
# For repeating the metadata later
lengths.append(len(recs))
for val, key in zip(_meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
meta_val = _pull_field(obj, val[level:])
meta_vals[key].append(meta_val)
records.extend(recs)
_recursive_extract(data, record_path, {}, level=0)
result = DataFrame(records)
if record_prefix is not None:
result = result.rename(columns=lambda x: f"{record_prefix}{x}")
# Data types, a problem
for k, v in meta_vals.items():
if meta_prefix is not None:
k = meta_prefix + k
if k in result:
raise ValueError(
f"Conflicting metadata name {k}, need distinguishing prefix "
)
# GH 37782
values = np.array(v, dtype=object)
if values.ndim > 1:
# GH 37782
values = np.empty((len(v),), dtype=object)
for i, v in enumerate(v):
values[i] = v
result[k] = values.repeat(lengths)
return result
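# A minimal sketch of the fast path above: with only data and sep supplied,
# json_normalize dispatches to _simple_json_normalize:
#
#   >>> pd.json_normalize({"a": {"b": 1}}, sep="_")
#      a_b
#   0    1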


@@ -0,0 +1,389 @@
"""
Table Schema builders
https://specs.frictionlessdata.io/table-schema/
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import warnings
from pandas._libs import lib
from pandas._libs.json import ujson_loads
from pandas._libs.tslibs import timezones
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.base import _registry as registry
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
is_numeric_dtype,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas import DataFrame
import pandas.core.common as com
from pandas.tseries.frequencies import to_offset
if TYPE_CHECKING:
from pandas._typing import (
DtypeObj,
JSONSerializable,
)
from pandas import Series
from pandas.core.indexes.multi import MultiIndex
TABLE_SCHEMA_VERSION = "1.4.0"
def as_json_table_type(x: DtypeObj) -> str:
"""
Convert a NumPy / pandas type to its corresponding json_table.
Parameters
----------
x : np.dtype or ExtensionDtype
Returns
-------
str
the Table Schema data types
Notes
-----
This table shows the relationship between NumPy / pandas dtypes,
and Table Schema dtypes.
    =============== =================
    Pandas type     Table Schema type
    =============== =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    =============== =================
"""
if is_integer_dtype(x):
return "integer"
elif is_bool_dtype(x):
return "boolean"
elif is_numeric_dtype(x):
return "number"
elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
return "datetime"
elif lib.is_np_dtype(x, "m"):
return "duration"
elif isinstance(x, ExtensionDtype):
return "any"
elif is_string_dtype(x):
return "string"
else:
return "any"
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if com.all_not_none(*data.index.names):
nms = data.index.names
if len(nms) == 1 and data.index.name == "index":
warnings.warn(
"Index name of 'index' is not round-trippable.",
stacklevel=find_stack_level(),
)
elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
warnings.warn(
"Index names beginning with 'level_' are not round-trippable.",
stacklevel=find_stack_level(),
)
return data
data = data.copy()
if data.index.nlevels > 1:
data.index.names = com.fill_missing_names(data.index.names)
else:
data.index.name = data.index.name or "index"
return data
def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
dtype = arr.dtype
name: JSONSerializable
if arr.name is None:
name = "values"
else:
name = arr.name
field: dict[str, JSONSerializable] = {
"name": name,
"type": as_json_table_type(dtype),
}
if isinstance(dtype, CategoricalDtype):
cats = dtype.categories
ordered = dtype.ordered
field["constraints"] = {"enum": list(cats)}
field["ordered"] = ordered
elif isinstance(dtype, PeriodDtype):
field["freq"] = dtype.freq.freqstr
elif isinstance(dtype, DatetimeTZDtype):
if timezones.is_utc(dtype.tz):
# timezone.utc has no "zone" attr
field["tz"] = "UTC"
else:
# error: "tzinfo" has no attribute "zone"
field["tz"] = dtype.tz.zone # type: ignore[attr-defined]
elif isinstance(dtype, ExtensionDtype):
field["extDtype"] = dtype.name
return field
def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
"""
Converts a JSON field descriptor into its corresponding NumPy / pandas type
Parameters
----------
field
A JSON field descriptor
Returns
-------
dtype
Raises
------
ValueError
If the type of the provided field is unknown or currently unsupported
Examples
--------
>>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
'int64'
>>> convert_json_field_to_pandas_type(
... {
... "name": "a_categorical",
... "type": "any",
... "constraints": {"enum": ["a", "b", "c"]},
... "ordered": True,
... }
... )
CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)
>>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
'datetime64[ns]'
>>> convert_json_field_to_pandas_type(
... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
... )
'datetime64[ns, US/Central]'
"""
typ = field["type"]
if typ == "string":
return "object"
elif typ == "integer":
return field.get("extDtype", "int64")
elif typ == "number":
return field.get("extDtype", "float64")
elif typ == "boolean":
return field.get("extDtype", "bool")
elif typ == "duration":
return "timedelta64"
elif typ == "datetime":
if field.get("tz"):
return f"datetime64[ns, {field['tz']}]"
elif field.get("freq"):
# GH#9586 rename frequency M to ME for offsets
offset = to_offset(field["freq"])
freq_n, freq_name = offset.n, offset.name
freq = freq_to_period_freqstr(freq_n, freq_name)
# GH#47747 using datetime over period to minimize the change surface
return f"period[{freq}]"
else:
return "datetime64[ns]"
elif typ == "any":
if "constraints" in field and "ordered" in field:
return CategoricalDtype(
categories=field["constraints"]["enum"], ordered=field["ordered"]
)
elif "extDtype" in field:
return registry.find(field["extDtype"])
else:
return "object"
raise ValueError(f"Unsupported or invalid field type: {typ}")
def build_table_schema(
data: DataFrame | Series,
index: bool = True,
primary_key: bool | None = None,
version: bool = True,
) -> dict[str, JSONSerializable]:
"""
Create a Table schema from ``data``.
Parameters
----------
data : Series, DataFrame
index : bool, default True
Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
Column names to designate as the primary key.
The default `None` will set `'primaryKey'` to the index
level or levels if the index is unique.
version : bool, default True
Whether to include a field `pandas_version` with the version
of pandas that last revised the table schema. This version
can be different from the installed pandas version.
Returns
-------
dict
Notes
-----
See `Table Schema
<https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
conversion types.
    Timedeltas are converted to ISO 8601 duration format with
9 decimal places after the seconds field for nanosecond precision.
Categoricals are converted to the `any` dtype, and use the `enum` field
constraint to list the allowed values. The `ordered` attribute is included
in an `ordered` field.
Examples
--------
>>> from pandas.io.json._table_schema import build_table_schema
>>> df = pd.DataFrame(
... {'A': [1, 2, 3],
... 'B': ['a', 'b', 'c'],
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
... }, index=pd.Index(range(3), name='idx'))
>>> build_table_schema(df)
{'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
"""
if index is True:
data = set_default_names(data)
schema: dict[str, Any] = {}
fields = []
if index:
if data.index.nlevels > 1:
data.index = cast("MultiIndex", data.index)
for level, name in zip(data.index.levels, data.index.names):
new_field = convert_pandas_type_to_json_field(level)
new_field["name"] = name
fields.append(new_field)
else:
fields.append(convert_pandas_type_to_json_field(data.index))
if data.ndim > 1:
for column, s in data.items():
fields.append(convert_pandas_type_to_json_field(s))
else:
fields.append(convert_pandas_type_to_json_field(data))
schema["fields"] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
schema["primaryKey"] = [data.index.name]
else:
schema["primaryKey"] = data.index.names
elif primary_key is not None:
schema["primaryKey"] = primary_key
if version:
schema["pandas_version"] = TABLE_SCHEMA_VERSION
return schema
def parse_table_schema(json, precise_float: bool) -> DataFrame:
"""
Builds a DataFrame from a given schema
Parameters
----------
json :
A JSON table schema
precise_float : bool
Flag controlling precision when decoding string to double values, as
dictated by ``read_json``
Returns
-------
df : DataFrame
Raises
------
NotImplementedError
If the JSON table schema contains either timezone or timedelta data
Notes
-----
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
name-less :class:`Index`, this function sets the name of the returned
:class:`DataFrame` to ``None`` when said string is encountered with a
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
applies to any strings beginning with 'level_'. Therefore, an
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
with 'level_' are not supported.
See Also
--------
build_table_schema : Inverse function.
pandas.read_json
"""
table = ujson_loads(json, precise_float=precise_float)
col_order = [field["name"] for field in table["schema"]["fields"]]
df = DataFrame(table["data"], columns=col_order)[col_order]
dtypes = {
field["name"]: convert_json_field_to_pandas_type(field)
for field in table["schema"]["fields"]
}
# No ISO constructor for Timedelta as of yet, so need to raise
if "timedelta64" in dtypes.values():
raise NotImplementedError(
'table="orient" can not yet read ISO-formatted Timedelta data'
)
df = df.astype(dtypes)
if "primaryKey" in table["schema"]:
df = df.set_index(table["schema"]["primaryKey"])
if len(df.index.names) == 1:
if df.index.name == "index":
df.index.name = None
else:
df.index.names = [
None if x.startswith("level_") else x for x in df.index.names
]
return df
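# These helpers power the orient="table" round trip; a minimal sketch:
#
#   >>> import io
#   >>> df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
#   >>> s = df.to_json(orient="table")
#   >>> pd.read_json(io.StringIO(s), orient="table")  # doctest: +SKIP
#
# build_table_schema writes the schema on the way out; parse_table_schema
# rebuilds the dtypes and index on the way back.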


@@ -0,0 +1,245 @@
""" orc compat """
from __future__ import annotations
import io
from types import ModuleType
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
from pandas._config import using_pyarrow_string_dtype
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._validators import check_dtype_backend
import pandas as pd
from pandas.core.indexes.api import default_index
from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
get_handle,
is_fsspec_url,
)
if TYPE_CHECKING:
import fsspec
import pyarrow.fs
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
WriteBuffer,
)
from pandas.core.frame import DataFrame
def read_orc(
path: FilePath | ReadBuffer[bytes],
columns: list[str] | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
**kwargs: Any,
) -> DataFrame:
"""
Load an ORC object from the file path, returning a DataFrame.
Parameters
----------
path : str, path object, or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``read()`` function. The string could be a URL.
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.orc``.
columns : list, default None
If not None, only these columns will be read from the file.
Output always follows the ordering of the file and not the columns list.
This mirrors the original behaviour of
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
(default).
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
DataFrame.
.. versionadded:: 2.0
filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the orc file.
.. versionadded:: 2.1.0
**kwargs
Any additional kwargs are passed to pyarrow.
Returns
-------
DataFrame
Notes
-----
Before using this function you should read the :ref:`user guide about ORC <io.orc>`
and :ref:`install optional dependencies <install.warn_orc>`.
    If ``path`` is a URI (e.g. one starting with "s3://") pointing to a local or
    remote file, a ``pyarrow.fs`` filesystem will be used to read the file. You can
    also pass a pyarrow or fsspec filesystem object into the ``filesystem`` keyword
    to override this behavior.
Examples
--------
>>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP
"""
# we require a newer version of pyarrow than we support for parquet
orc = import_optional_dependency("pyarrow.orc")
check_dtype_backend(dtype_backend)
with get_handle(path, "rb", is_text=False) as handles:
source = handles.handle
if is_fsspec_url(path) and filesystem is None:
pa = import_optional_dependency("pyarrow")
pa_fs = import_optional_dependency("pyarrow.fs")
try:
filesystem, source = pa_fs.FileSystem.from_uri(path)
except (TypeError, pa.ArrowInvalid):
pass
pa_table = orc.read_table(
source=source, columns=columns, filesystem=filesystem, **kwargs
)
if dtype_backend is not lib.no_default:
if dtype_backend == "pyarrow":
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
else:
from pandas.io._util import _arrow_dtype_mapping
mapping = _arrow_dtype_mapping()
df = pa_table.to_pandas(types_mapper=mapping.get)
return df
else:
if using_pyarrow_string_dtype():
types_mapper = arrow_string_types_mapper()
else:
types_mapper = None
return pa_table.to_pandas(types_mapper=types_mapper)
def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
.. versionadded:: 1.5.0
Parameters
----------
df : DataFrame
The dataframe to be written to ORC. Raises NotImplementedError
if dtype of one or more columns is category, unsigned integers,
intervals, periods or sparse.
path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
a bytes object is returned.
engine : str, default 'pyarrow'
ORC library to use.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
engine_kwargs : dict[str, Any] or None, default None
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
-------
bytes if no path argument is provided else None
Raises
------
NotImplementedError
Dtype of one or more columns is category, unsigned integers, interval,
period or sparse.
ValueError
engine is not pyarrow.
Notes
-----
* Before using this function you should read the
:ref:`user guide about ORC <io.orc>` and
:ref:`install optional dependencies <install.warn_orc>`.
* This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
library.
* For supported dtypes please refer to `supported ORC features in Arrow
<https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
"""
if index is None:
index = df.index.names[0] is not None
if engine_kwargs is None:
engine_kwargs = {}
# validate index
# --------------
# validate that we have only a default index
# raise on anything else as we don't serialize the index
if not df.index.equals(default_index(len(df))):
raise ValueError(
"orc does not support serializing a non-default index for the index; "
"you can .reset_index() to make the index into column(s)"
)
if df.index.name is not None:
raise ValueError("orc does not serialize index meta-data on a default index")
if engine != "pyarrow":
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="10.0.1")
pa = import_optional_dependency("pyarrow")
orc = import_optional_dependency("pyarrow.orc")
was_none = path is None
if was_none:
path = io.BytesIO()
assert path is not None # For mypy
with get_handle(path, "wb", is_text=False) as handles:
assert isinstance(engine, ModuleType) # For mypy
try:
orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
handles.handle,
**engine_kwargs,
)
except (TypeError, pa.ArrowNotImplementedError) as e:
raise NotImplementedError(
"The dtype of one or more columns is not supported yet."
) from e
if was_none:
assert isinstance(path, io.BytesIO) # For mypy
return path.getvalue()
return None
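# A minimal sketch of the round trip provided by the two functions above
# (requires pyarrow):
#
#   >>> import io
#   >>> df = pd.DataFrame({"a": [1, 2]})
#   >>> data = df.to_orc()  # path=None, so raw bytes come back  # doctest: +SKIP
#   >>> pd.read_orc(io.BytesIO(data))  # doctest: +SKIP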


@@ -0,0 +1,676 @@
""" parquet compat """
from __future__ import annotations
import io
import json
import os
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
import warnings
from warnings import catch_warnings
from pandas._config import using_pyarrow_string_dtype
from pandas._config.config import _get_option
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend
import pandas as pd
from pandas import (
DataFrame,
get_option,
)
from pandas.core.shared_docs import _shared_docs
from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
IOHandles,
get_handle,
is_fsspec_url,
is_url,
stringify_path,
)
if TYPE_CHECKING:
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
def get_engine(engine: str) -> BaseImpl:
"""return our implementation"""
if engine == "auto":
engine = get_option("io.parquet.engine")
if engine == "auto":
# try engines in this order
engine_classes = [PyArrowImpl, FastParquetImpl]
error_msgs = ""
for engine_class in engine_classes:
try:
return engine_class()
except ImportError as err:
error_msgs += "\n - " + str(err)
raise ImportError(
"Unable to find a usable engine; "
"tried using: 'pyarrow', 'fastparquet'.\n"
"A suitable version of "
"pyarrow or fastparquet is required for parquet "
"support.\n"
"Trying to import the above resulted in these errors:"
f"{error_msgs}"
)
if engine == "pyarrow":
return PyArrowImpl()
elif engine == "fastparquet":
return FastParquetImpl()
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
def _get_path_or_handle(
path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
fs: Any,
storage_options: StorageOptions | None = None,
mode: str = "rb",
is_dir: bool = False,
) -> tuple[
FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
]:
"""File handling for PyArrow."""
path_or_handle = stringify_path(path)
if fs is not None:
pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
fsspec = import_optional_dependency("fsspec", errors="ignore")
if pa_fs is not None and isinstance(fs, pa_fs.FileSystem):
if storage_options:
raise NotImplementedError(
"storage_options not supported with a pyarrow FileSystem."
)
elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem):
pass
else:
raise ValueError(
f"filesystem must be a pyarrow or fsspec FileSystem, "
f"not a {type(fs).__name__}"
)
if is_fsspec_url(path_or_handle) and fs is None:
if storage_options is None:
pa = import_optional_dependency("pyarrow")
pa_fs = import_optional_dependency("pyarrow.fs")
try:
fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
except (TypeError, pa.ArrowInvalid):
pass
if fs is None:
fsspec = import_optional_dependency("fsspec")
fs, path_or_handle = fsspec.core.url_to_fs(
path_or_handle, **(storage_options or {})
)
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
# can't write to a remote url
# without making use of fsspec at the moment
raise ValueError("storage_options passed with buffer, or non-supported URL")
handles = None
if (
not fs
and not is_dir
and isinstance(path_or_handle, str)
and not os.path.isdir(path_or_handle)
):
# use get_handle only when we are very certain that it is not a directory
# fsspec resources can also point to directories
# this branch is used for example when reading from non-fsspec URLs
handles = get_handle(
path_or_handle, mode, is_text=False, storage_options=storage_options
)
fs = None
path_or_handle = handles.handle
return path_or_handle, handles, fs
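# Editor's sketch (illustrative, not part of the original file): for a plain
# local file path the helper above falls through to get_handle and returns an
# open binary handle plus the IOHandles wrapper the caller must close.
# "example.parquet" is a hypothetical existing local file.
def _sketch_path_dispatch() -> None:
    path_or_handle, handles, fs = _get_path_or_handle(
        "example.parquet", fs=None, mode="rb"
    )
    try:
        assert fs is None  # no remote filesystem is involved for a local path
        assert path_or_handle is handles.handle  # an open binary reader
    finally:
        if handles is not None:
            handles.close()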
class BaseImpl:
@staticmethod
def validate_dataframe(df: DataFrame) -> None:
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only supports IO with DataFrames")
def write(self, df: DataFrame, path, compression, **kwargs):
raise AbstractMethodError(self)
def read(self, path, columns=None, **kwargs) -> DataFrame:
raise AbstractMethodError(self)
class PyArrowImpl(BaseImpl):
def __init__(self) -> None:
import_optional_dependency(
"pyarrow", extra="pyarrow is required for parquet support."
)
import pyarrow.parquet
# import utils to register the pyarrow extension types
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401
self.api = pyarrow
def write(
self,
df: DataFrame,
path: FilePath | WriteBuffer[bytes],
compression: str | None = "snappy",
index: bool | None = None,
storage_options: StorageOptions | None = None,
partition_cols: list[str] | None = None,
filesystem=None,
**kwargs,
) -> None:
self.validate_dataframe(df)
from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
if index is not None:
from_pandas_kwargs["preserve_index"] = index
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if df.attrs:
df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
existing_metadata = table.schema.metadata
merged_metadata = {**existing_metadata, **df_metadata}
table = table.replace_schema_metadata(merged_metadata)
path_or_handle, handles, filesystem = _get_path_or_handle(
path,
filesystem,
storage_options=storage_options,
mode="wb",
is_dir=partition_cols is not None,
)
if (
isinstance(path_or_handle, io.BufferedWriter)
and hasattr(path_or_handle, "name")
and isinstance(path_or_handle.name, (str, bytes))
):
if isinstance(path_or_handle.name, bytes):
path_or_handle = path_or_handle.name.decode()
else:
path_or_handle = path_or_handle.name
try:
if partition_cols is not None:
# writes to multiple files under the given path
self.api.parquet.write_to_dataset(
table,
path_or_handle,
compression=compression,
partition_cols=partition_cols,
filesystem=filesystem,
**kwargs,
)
else:
# write to single output file
self.api.parquet.write_table(
table,
path_or_handle,
compression=compression,
filesystem=filesystem,
**kwargs,
)
finally:
if handles is not None:
handles.close()
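    # Editor's sketch (illustrative, not part of the original file): the
    # "PANDAS_ATTRS" schema-metadata hook in write() above lets DataFrame.attrs
    # survive a pyarrow round-trip; "source" is a hypothetical attrs key.
    @staticmethod
    def _sketch_attrs_roundtrip() -> None:
        import io

        df = pd.DataFrame({"a": [1, 2]})
        df.attrs = {"source": "sensor-1"}
        buf = io.BytesIO(df.to_parquet(engine="pyarrow"))
        assert pd.read_parquet(buf).attrs == {"source": "sensor-1"}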
def read(
self,
path,
columns=None,
filters=None,
use_nullable_dtypes: bool = False,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
storage_options: StorageOptions | None = None,
filesystem=None,
**kwargs,
) -> DataFrame:
kwargs["use_pandas_metadata"] = True
to_pandas_kwargs = {}
if dtype_backend == "numpy_nullable":
from pandas.io._util import _arrow_dtype_mapping
mapping = _arrow_dtype_mapping()
to_pandas_kwargs["types_mapper"] = mapping.get
elif dtype_backend == "pyarrow":
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment]
elif using_pyarrow_string_dtype():
to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()
manager = _get_option("mode.data_manager", silent=True)
if manager == "array":
to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
path_or_handle, handles, filesystem = _get_path_or_handle(
path,
filesystem,
storage_options=storage_options,
mode="rb",
)
try:
pa_table = self.api.parquet.read_table(
path_or_handle,
columns=columns,
filesystem=filesystem,
filters=filters,
**kwargs,
)
result = pa_table.to_pandas(**to_pandas_kwargs)
if manager == "array":
result = result._as_manager("array", copy=False)
if pa_table.schema.metadata:
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
result.attrs = json.loads(df_metadata)
return result
finally:
if handles is not None:
handles.close()
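    # Editor's sketch (illustrative, not part of the original file): the
    # dtype_backend branches above choose a types_mapper for Table.to_pandas;
    # "numpy_nullable" maps Arrow int64 with nulls to the nullable Int64 dtype.
    @staticmethod
    def _sketch_dtype_backend() -> None:
        import io

        df = pd.DataFrame({"a": pd.array([1, None], dtype="Int64")})
        buf = io.BytesIO(df.to_parquet(engine="pyarrow"))
        out = pd.read_parquet(buf, dtype_backend="numpy_nullable")
        assert str(out["a"].dtype) == "Int64"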
class FastParquetImpl(BaseImpl):
def __init__(self) -> None:
# since pandas is a dependency of fastparquet
# we need to import on first use
fastparquet = import_optional_dependency(
"fastparquet", extra="fastparquet is required for parquet support."
)
self.api = fastparquet
def write(
self,
df: DataFrame,
path,
compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
index=None,
partition_cols=None,
storage_options: StorageOptions | None = None,
filesystem=None,
**kwargs,
) -> None:
self.validate_dataframe(df)
if "partition_on" in kwargs and partition_cols is not None:
raise ValueError(
"Cannot use both partition_on and "
"partition_cols. Use partition_cols for partitioning data"
)
if "partition_on" in kwargs:
partition_cols = kwargs.pop("partition_on")
if partition_cols is not None:
kwargs["file_scheme"] = "hive"
if filesystem is not None:
raise NotImplementedError(
"filesystem is not implemented for the fastparquet engine."
)
# cannot use get_handle as write() does not accept file buffers
path = stringify_path(path)
if is_fsspec_url(path):
fsspec = import_optional_dependency("fsspec")
# if filesystem is provided by fsspec, file must be opened in 'wb' mode.
kwargs["open_with"] = lambda path, _: fsspec.open(
path, "wb", **(storage_options or {})
).open()
elif storage_options:
raise ValueError(
"storage_options passed with file object or non-fsspec file path"
)
with catch_warnings(record=True):
self.api.write(
path,
df,
compression=compression,
write_index=index,
partition_on=partition_cols,
**kwargs,
)
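    # Editor's sketch (illustrative, not part of the original file):
    # fastparquet's native keyword is "partition_on"; write() above aliases
    # partition_cols to it and switches to the hive file scheme, so a directory
    # path is required. "dataset_dir" is hypothetical; assumes fastparquet.
    @staticmethod
    def _sketch_partitioned_write() -> None:
        df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})
        df.to_parquet("dataset_dir", engine="fastparquet", partition_cols=["key"])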
def read(
self,
path,
columns=None,
filters=None,
storage_options: StorageOptions | None = None,
filesystem=None,
**kwargs,
) -> DataFrame:
parquet_kwargs: dict[str, Any] = {}
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
# We are disabling nullable dtypes for fastparquet pending discussion
parquet_kwargs["pandas_nulls"] = False
if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine"
)
if dtype_backend is not lib.no_default:
raise ValueError(
"The 'dtype_backend' argument is not supported for the "
"fastparquet engine"
)
if filesystem is not None:
raise NotImplementedError(
"filesystem is not implemented for the fastparquet engine."
)
path = stringify_path(path)
handles = None
if is_fsspec_url(path):
fsspec = import_optional_dependency("fsspec")
parquet_kwargs["fs"] = fsspec.open(path, "rb", **(storage_options or {})).fs
elif isinstance(path, str) and not os.path.isdir(path):
# use get_handle only when we are very certain that it is not a directory
# fsspec resources can also point to directories
# this branch is used for example when reading from non-fsspec URLs
handles = get_handle(
path, "rb", is_text=False, storage_options=storage_options
)
path = handles.handle
try:
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs)
finally:
if handles is not None:
handles.close()
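    # Editor's sketch (illustrative, not part of the original file): the guards
    # above reject dtype_backend before any file access, so the hypothetical
    # "data.parquet" path never needs to exist. Assumes fastparquet installed.
    @staticmethod
    def _sketch_rejects_dtype_backend() -> None:
        try:
            pd.read_parquet(
                "data.parquet", engine="fastparquet", dtype_backend="numpy_nullable"
            )
        except ValueError as err:
            assert "dtype_backend" in str(err)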
@doc(storage_options=_shared_docs["storage_options"])
def to_parquet(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
engine: str = "auto",
compression: str | None = "snappy",
index: bool | None = None,
storage_options: StorageOptions | None = None,
partition_cols: list[str] | None = None,
filesystem: Any = None,
**kwargs,
) -> bytes | None:
"""
Write a DataFrame to the parquet format.
Parameters
----------
df : DataFrame
path : str, path object, file-like object, or None, default None
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``write()`` function. If None, the result is
returned as bytes. If a string, it will be used as Root Directory path
when writing a partitioned dataset. The engine fastparquet does not
accept file-like objects.
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
When using the ``'pyarrow'`` engine and no storage options are provided
and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
(e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
Use the filesystem keyword with an instantiated fsspec filesystem
if you wish to use its implementation.
    compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
        If ``None``, similar to ``True``, the dataframe's index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
partition_cols : str or list, optional, default None
Column names by which to partition the dataset.
Columns are partitioned in the order they are given.
Must be None if path is not a string.
{storage_options}
filesystem : fsspec or pyarrow filesystem, default None
Filesystem object to use when reading the parquet file. Only implemented
for ``engine="pyarrow"``.
.. versionadded:: 2.1.0
kwargs
Additional keyword arguments passed to the engine
Returns
-------
bytes if no path argument is provided else None
"""
if isinstance(partition_cols, str):
partition_cols = [partition_cols]
impl = get_engine(engine)
path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
impl.write(
df,
path_or_buf,
compression=compression,
index=index,
partition_cols=partition_cols,
storage_options=storage_options,
filesystem=filesystem,
**kwargs,
)
if path is None:
assert isinstance(path_or_buf, io.BytesIO)
return path_or_buf.getvalue()
else:
return None
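# Editor's usage sketch (illustrative, not part of the original file):
# partition_cols requires a directory path, so partitioning and the
# bytes-returning path=None branch above are mutually exclusive in practice.
# "dataset_dir" is a hypothetical output directory.
def _sketch_partition_cols() -> None:
    import pandas as pd

    df = pd.DataFrame({"year": [2023, 2023, 2024], "v": [1, 2, 3]})
    df.to_parquet("dataset_dir", partition_cols=["year"])  # one subdir per year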
@doc(storage_options=_shared_docs["storage_options"])
def read_parquet(
path: FilePath | ReadBuffer[bytes],
engine: str = "auto",
columns: list[str] | None = None,
storage_options: StorageOptions | None = None,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
filesystem: Any = None,
filters: list[tuple] | list[list[tuple]] | None = None,
**kwargs,
) -> DataFrame:
"""
Load a parquet object from the file path, returning a DataFrame.
Parameters
----------
path : str, path object or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``read()`` function.
The string could be a URL. Valid URL schemes include http, ftp, s3,
gs, and file. For file URLs, a host is expected. A local file could be:
``file://localhost/path/to/table.parquet``.
A file URL can also be a path to a directory that contains multiple
partitioned parquet files. Both pyarrow and fastparquet support
paths to directories as well as file URLs. A directory path could be:
``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
When using the ``'pyarrow'`` engine and no storage options are provided
and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
(e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
Use the filesystem keyword with an instantiated fsspec filesystem
if you wish to use its implementation.
    columns : list, default None
If not None, only these columns will be read from the file.
{storage_options}
.. versionadded:: 1.3.0
use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame. (only applicable for the ``pyarrow``
engine)
As new dtypes are added that support ``pd.NA`` in the future, the
output with this option will change to use those dtypes.
Note: this is an experimental option, and behaviour (e.g. additional
        supported dtypes) may change without notice.
.. deprecated:: 2.0
dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
(default).
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
DataFrame.
.. versionadded:: 2.0
filesystem : fsspec or pyarrow filesystem, default None
Filesystem object to use when reading the parquet file. Only implemented
for ``engine="pyarrow"``.
.. versionadded:: 2.1.0
filters : List[Tuple] or List[List[Tuple]], default None
To filter out data.
Filter syntax: [[(column, op, val), ...],...]
where op is [==, =, >, >=, <, <=, !=, in, not in]
The innermost tuples are transposed into a set of filters applied
through an `AND` operation.
The outer list combines these sets of filters through an `OR`
operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between sets of filters is to be conducted.
Using this argument will NOT result in row-wise filtering of the final
partitions unless ``engine="pyarrow"`` is also specified. For
other engines, filtering is only performed at the partition level, that is,
to prevent the loading of some row-groups and/or files.
.. versionadded:: 2.1.0
**kwargs
Any additional kwargs are passed to the engine.
Returns
-------
DataFrame
See Also
--------
DataFrame.to_parquet : Create a parquet object that serializes a DataFrame.
Examples
--------
>>> original_df = pd.DataFrame(
... {{"foo": range(5), "bar": range(5, 10)}}
... )
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> df_parquet_bytes = original_df.to_parquet()
>>> from io import BytesIO
>>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
>>> restored_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> restored_df.equals(original_df)
True
>>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"])
>>> restored_bar
bar
0 5
1 6
2 7
3 8
4 9
>>> restored_bar.equals(original_df[['bar']])
True
The function uses `kwargs` that are passed directly to the engine.
In the following example, we use the `filters` argument of the pyarrow
engine to filter the rows of the DataFrame.
Since `pyarrow` is the default engine, we can omit the `engine` argument.
Note that the `filters` argument is implemented by the `pyarrow` engine,
which can benefit from multithreading and also potentially be more
economical in terms of memory.
>>> sel = [("foo", ">", 2)]
>>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel)
>>> restored_part
foo bar
0 3 8
1 4 9
"""
impl = get_engine(engine)
if use_nullable_dtypes is not lib.no_default:
msg = (
"The argument 'use_nullable_dtypes' is deprecated and will be removed "
"in a future version."
)
if use_nullable_dtypes is True:
msg += (
"Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True."
)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
else:
use_nullable_dtypes = False
check_dtype_backend(dtype_backend)
return impl.read(
path,
columns=columns,
filters=filters,
storage_options=storage_options,
use_nullable_dtypes=use_nullable_dtypes,
dtype_backend=dtype_backend,
filesystem=filesystem,
**kwargs,
)
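# Editor's sketch (illustrative, not part of the original file): the outer list
# of `filters` is OR'ed and each inner list AND'ed, so this keeps rows where
# foo < 1 or foo > 3. Assumes pyarrow is installed.
def _sketch_or_filters() -> None:
    import io

    import pandas as pd

    buf = io.BytesIO(pd.DataFrame({"foo": range(5)}).to_parquet())
    out = pd.read_parquet(buf, filters=[[("foo", "<", 1)], [("foo", ">", 3)]])
    assert out["foo"].tolist() == [0, 4]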

View File

@ -0,0 +1,9 @@
from pandas.io.parsers.readers import (
TextFileReader,
TextParser,
read_csv,
read_fwf,
read_table,
)
__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]

View File

@ -0,0 +1,303 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import warnings
from pandas._config import using_pyarrow_string_dtype
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
ParserError,
ParserWarning,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_integer
import pandas as pd
from pandas import DataFrame
from pandas.io._util import (
_arrow_dtype_mapping,
arrow_string_types_mapper,
)
from pandas.io.parsers.base_parser import ParserBase
if TYPE_CHECKING:
from pandas._typing import ReadBuffer
class ArrowParserWrapper(ParserBase):
"""
Wrapper for the pyarrow engine for read_csv()
"""
def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
super().__init__(kwds)
self.kwds = kwds
self.src = src
self._parse_kwds()
def _parse_kwds(self) -> None:
"""
Validates keywords before passing to pyarrow.
"""
encoding: str | None = self.kwds.get("encoding")
self.encoding = "utf-8" if encoding is None else encoding
na_values = self.kwds["na_values"]
if isinstance(na_values, dict):
raise ValueError(
"The pyarrow engine doesn't support passing a dict for na_values"
)
self.na_values = list(self.kwds["na_values"])
def _get_pyarrow_options(self) -> None:
"""
Rename some arguments to pass to pyarrow
"""
mapping = {
"usecols": "include_columns",
"na_values": "null_values",
"escapechar": "escape_char",
"skip_blank_lines": "ignore_empty_lines",
"decimal": "decimal_point",
"quotechar": "quote_char",
}
for pandas_name, pyarrow_name in mapping.items():
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
# Date format handling
# If we get a string, we need to convert it into a list for pyarrow
# If we get a dict, we want to parse those separately
date_format = self.date_format
if isinstance(date_format, str):
date_format = [date_format]
else:
            # In the case of a dict, we don't want to propagate it through, so
            # just set the pyarrow default of None.
            # Ideally, in the future we would disable pyarrow dtype inference
            # (read in as string) to prevent misreads.
date_format = None
self.kwds["timestamp_parsers"] = date_format
self.parse_options = {
option_name: option_value
for option_name, option_value in self.kwds.items()
if option_value is not None
and option_name
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
}
on_bad_lines = self.kwds.get("on_bad_lines")
if on_bad_lines is not None:
if callable(on_bad_lines):
self.parse_options["invalid_row_handler"] = on_bad_lines
elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
self.parse_options[
"invalid_row_handler"
] = None # PyArrow raises an exception by default
elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:
def handle_warning(invalid_row) -> str:
warnings.warn(
f"Expected {invalid_row.expected_columns} columns, but found "
f"{invalid_row.actual_columns}: {invalid_row.text}",
ParserWarning,
stacklevel=find_stack_level(),
)
return "skip"
self.parse_options["invalid_row_handler"] = handle_warning
elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
self.parse_options["invalid_row_handler"] = lambda _: "skip"
self.convert_options = {
option_name: option_value
for option_name, option_value in self.kwds.items()
if option_value is not None
and option_name
in (
"include_columns",
"null_values",
"true_values",
"false_values",
"decimal_point",
"timestamp_parsers",
)
}
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
# autogenerated column names are prefixed with 'f' in pyarrow.csv
if self.header is None and "include_columns" in self.convert_options:
self.convert_options["include_columns"] = [
f"f{n}" for n in self.convert_options["include_columns"]
]
self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
if self.header is not None
else self.kwds["skiprows"],
"encoding": self.encoding,
}
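    # Editor's sketch (illustrative, not part of the original file): the renames
    # above mean pandas keywords reach pyarrow.csv under its own names, e.g.
    # decimal becomes decimal_point, so a European-style "1,5" parses as 1.5.
    @staticmethod
    def _sketch_option_renames() -> None:
        import io

        import pandas as pd

        buf = io.BytesIO(b"a;b\n1,5;2\n")
        out = pd.read_csv(buf, sep=";", decimal=",", engine="pyarrow")
        assert out["a"].tolist() == [1.5]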
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
"""
Processes data read in based on kwargs.
Parameters
----------
frame: DataFrame
The DataFrame to process.
Returns
-------
DataFrame
The processed DataFrame.
"""
num_cols = len(frame.columns)
multi_index_named = True
        if self.header is None:
            if self.names is None:
                self.names = range(num_cols)
if len(self.names) != num_cols:
                # usecols is passed through to pyarrow; we only handle index_col here.
                # The only way self.names is not the same length as the number of
                # columns is if we have an int index_col. We just pad the names
                # (they will get removed anyway) to the expected length.
self.names = list(range(num_cols - len(self.names))) + self.names
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
_, frame = self._do_date_conversions(frame.columns, frame)
if self.index_col is not None:
index_to_set = self.index_col.copy()
for i, item in enumerate(self.index_col):
if is_integer(item):
index_to_set[i] = frame.columns[item]
# String case
elif item not in frame.columns:
raise ValueError(f"Index {item} invalid")
# Process dtype for index_col and drop from dtypes
if self.dtype is not None:
key, new_dtype = (
(item, self.dtype.get(item))
if self.dtype.get(item) is not None
else (frame.columns[item], self.dtype.get(frame.columns[item]))
)
if new_dtype is not None:
frame[key] = frame[key].astype(new_dtype)
del self.dtype[key]
frame.set_index(index_to_set, drop=True, inplace=True)
# Clear names if headerless and no name given
if self.header is None and not multi_index_named:
frame.index.names = [None] * len(frame.index.names)
if self.dtype is not None:
# Ignore non-existent columns from dtype mapping
# like other parsers do
if isinstance(self.dtype, dict):
self.dtype = {
k: pandas_dtype(v)
for k, v in self.dtype.items()
if k in frame.columns
}
else:
self.dtype = pandas_dtype(self.dtype)
try:
frame = frame.astype(self.dtype)
except TypeError as e:
# GH#44901 reraise to keep api consistent
raise ValueError(e)
return frame
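    # Editor's sketch (illustrative, not part of the original file): the
    # index_col handling above resolves integer positions to column labels
    # before set_index, so positional and label forms agree.
    @staticmethod
    def _sketch_index_col() -> None:
        import io

        import pandas as pd

        data = b"k,v\nx,1\ny,2\n"
        by_pos = pd.read_csv(io.BytesIO(data), engine="pyarrow", index_col=0)
        by_name = pd.read_csv(io.BytesIO(data), engine="pyarrow", index_col="k")
        assert by_pos.equals(by_name)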
def _validate_usecols(self, usecols) -> None:
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
raise ValueError(
"The pyarrow engine does not allow 'usecols' to be integer "
"column positions. Pass a list of string column names instead."
)
elif callable(usecols):
raise ValueError(
"The pyarrow engine does not allow 'usecols' to be a callable."
)
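    # Editor's sketch (illustrative, not part of the original file): integer
    # usecols trip pyarrow's ConvertOptions TypeError, which read() below turns
    # into the ValueError raised by the validator above.
    @staticmethod
    def _sketch_usecols_error() -> None:
        import io

        import pandas as pd

        try:
            pd.read_csv(io.BytesIO(b"a,b\n1,2\n"), engine="pyarrow", usecols=[0])
        except ValueError as err:
            assert "usecols" in str(err)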
def read(self) -> DataFrame:
"""
Reads the contents of a CSV file into a DataFrame and
processes it according to the kwargs passed in the
constructor.
Returns
-------
DataFrame
The DataFrame created from the CSV file.
"""
pa = import_optional_dependency("pyarrow")
pyarrow_csv = import_optional_dependency("pyarrow.csv")
self._get_pyarrow_options()
try:
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
except TypeError:
include = self.convert_options.get("include_columns", None)
if include is not None:
self._validate_usecols(include)
nulls = self.convert_options.get("null_values", set())
if not lib.is_list_like(nulls) or not all(
isinstance(x, str) for x in nulls
):
raise TypeError(
"The 'pyarrow' engine requires all na_values to be strings"
)
raise
try:
table = pyarrow_csv.read_csv(
self.src,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=convert_options,
)
except pa.ArrowInvalid as e:
raise ParserError(e) from e
dtype_backend = self.kwds["dtype_backend"]
# Convert all pa.null() cols -> float64 (non nullable)
# else Int64 (nullable case, see below)
if dtype_backend is lib.no_default:
new_schema = table.schema
new_type = pa.float64()
for i, arrow_type in enumerate(table.schema.types):
if pa.types.is_null(arrow_type):
new_schema = new_schema.set(
i, new_schema.field(i).with_type(new_type)
)
table = table.cast(new_schema)
if dtype_backend == "pyarrow":
frame = table.to_pandas(types_mapper=pd.ArrowDtype)
elif dtype_backend == "numpy_nullable":
# Modify the default mapping to also
# map null to Int64 (to match other engines)
dtype_mapping = _arrow_dtype_mapping()
dtype_mapping[pa.null()] = pd.Int64Dtype()
frame = table.to_pandas(types_mapper=dtype_mapping.get)
elif using_pyarrow_string_dtype():
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
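# Editor's sketch (illustrative, not part of the original file): the pa.null()
# handling at the end of read() means an all-empty CSV column comes back as
# non-nullable float64 by default, but as nullable Int64 under the
# "numpy_nullable" backend, matching the comments above.
def _sketch_null_column() -> None:
    import io

    import pandas as pd

    data = b"a,b\n1,\n2,\n"
    default = pd.read_csv(io.BytesIO(data), engine="pyarrow")
    assert str(default["b"].dtype) == "float64"
    nullable = pd.read_csv(
        io.BytesIO(data), engine="pyarrow", dtype_backend="numpy_nullable"
    )
    assert str(nullable["b"].dtype) == "Int64"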

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff