From eaacfeb6c45453979df757960d77c66bf0df023d Mon Sep 17 00:00:00 2001 From: Musgrove Date: Thu, 24 Jul 2025 12:52:21 -0700 Subject: [PATCH] modified pytables.py put() and append() docstrings to ensure more accurate documentation for the min_itemsize variable --- pandas/io/pytables.py | 523 +++++++++++++++++++++++------------------- 1 file changed, 287 insertions(+), 236 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 64a05c87e0f80..0cf8d78f28fb9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2,7 +2,6 @@ High level interface to PyTables for reading and writing pandas data structures to disk """ - from __future__ import annotations from contextlib import suppress @@ -18,9 +17,9 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Final, Literal, - TypeAlias, cast, overload, ) @@ -31,6 +30,7 @@ from pandas._config import ( config, get_option, + using_copy_on_write, using_string_dtype, ) @@ -87,7 +87,6 @@ DatetimeArray, PeriodArray, ) -from pandas.core.arrays.datetimes import tz_to_dtype from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( @@ -99,6 +98,10 @@ extract_array, ) from pandas.core.indexes.api import ensure_index +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) from pandas.io.common import stringify_path from pandas.io.formats.printing import ( @@ -108,7 +111,6 @@ if TYPE_CHECKING: from collections.abc import ( - Callable, Hashable, Iterator, Sequence, @@ -141,6 +143,13 @@ _default_encoding = "UTF-8" +def _ensure_decoded(s): + """if we have bytes, decode them to unicode""" + if isinstance(s, np.bytes_): + s = s.decode("UTF-8") + return s + + def _ensure_encoding(encoding: str | None) -> str: # set the encoding if we need if encoding is None: @@ -161,7 +170,7 @@ def _ensure_str(name): return name -Term: TypeAlias = PyTablesExpr +Term = PyTablesExpr def _ensure_term(where, scope_level: int): @@ -300,14 +309,14 @@ def to_hdf( dropna=dropna, ) - if isinstance(path_or_buf, HDFStore): - f(path_or_buf) - else: - path_or_buf = stringify_path(path_or_buf) + path_or_buf = stringify_path(path_or_buf) + if isinstance(path_or_buf, str): with HDFStore( path_or_buf, mode=mode, complevel=complevel, complib=complib ) as store: f(store) + else: + f(path_or_buf) def read_hdf( @@ -362,7 +371,7 @@ def read_hdf( A list of Term (or convertible) objects. start : int, optional Row number to start selection. - stop : int, optional + stop : int, optional Row number to stop selection. columns : list, optional A list of columns names to return. 
@@ -392,9 +401,9 @@ def read_hdf( Examples -------- - >>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP - >>> df.to_hdf("./store.h5", "data") # doctest: +SKIP - >>> reread = pd.read_hdf("./store.h5") # doctest: +SKIP + >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP + >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP + >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP """ if mode not in ["r", "r+", "a"]: raise ValueError( @@ -535,9 +544,9 @@ class HDFStore: Examples -------- >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore("test.h5") - >>> store["foo"] = bar # write to HDF5 - >>> bar = store["foo"] # retrieve + >>> store = pd.HDFStore('test.h5') + >>> store['foo'] = bar # write to HDF5 + >>> bar = store['foo'] # retrieve >>> store.close() **Create or load HDF5 file in-memory** @@ -547,9 +556,9 @@ class HDFStore: written when closed: >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore("test.h5", driver="H5FD_CORE") - >>> store["foo"] = bar - >>> store.close() # only now, data is written to disk + >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') + >>> store['foo'] = bar + >>> store.close() # only now, data is written to disk """ _handle: File | None @@ -608,7 +617,7 @@ def __getitem__(self, key: str): def __setitem__(self, key: str, value) -> None: self.put(key, value) - def __delitem__(self, key: str) -> int | None: + def __delitem__(self, key: str) -> None: return self.remove(key) def __getattr__(self, name: str): @@ -671,18 +680,12 @@ def keys(self, include: str = "pandas") -> list[str]: ------ raises ValueError if kind has an illegal value - See Also - -------- - HDFStore.info : Prints detailed information on the store. - HDFStore.get_node : Returns the node with the key. - HDFStore.get_storer : Returns the storer object for a key. - Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df) # doctest: +SKIP - >>> store.get("data") # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df) # doctest: +SKIP + >>> store.get('data') # doctest: +SKIP >>> print(store.keys()) # doctest: +SKIP ['/data1', '/data2'] >>> store.close() # doctest: +SKIP @@ -800,24 +803,18 @@ def get(self, key: str): Parameters ---------- key : str - Object to retrieve from file. Raises KeyError if not found. Returns ------- object Same type as object stored in file. - See Also - -------- - HDFStore.get_node : Returns the node with the key. - HDFStore.get_storer : Returns the storer object for a key. - Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df) # doctest: +SKIP - >>> store.get("data") # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df) # doctest: +SKIP + >>> store.get('data') # doctest: +SKIP >>> store.close() # doctest: +SKIP """ with patch_pickle(): @@ -874,25 +871,19 @@ def select( object Retrieved object from file. - See Also - -------- - HDFStore.select_as_coordinates : Returns the selection as an index. - HDFStore.select_column : Returns a single column from the table. 
- HDFStore.select_as_multiple : Retrieves pandas objects from multiple tables.
-
 Examples
 --------
- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
- >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
- >>> store.put("data", df) # doctest: +SKIP
- >>> store.get("data") # doctest: +SKIP
+ >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> store.put('data', df) # doctest: +SKIP
+ >>> store.get('data') # doctest: +SKIP
 >>> print(store.keys()) # doctest: +SKIP
 ['/data1', '/data2']
- >>> store.select("/data1") # doctest: +SKIP
+ >>> store.select('/data1') # doctest: +SKIP
 A B
 0 1 2
 1 3 4
- >>> store.select("/data1", where="columns == A") # doctest: +SKIP
+ >>> store.select('/data1', where='columns == A') # doctest: +SKIP
 A
 0 1
 1 3
@@ -1140,18 +1131,10 @@ def put(
 """
 Store object in HDFStore.

- This method writes a pandas DataFrame or Series into an HDF5 file using
- either the fixed or table format. The `table` format allows additional
- operations like incremental appends and queries but may have performance
- trade-offs. The `fixed` format provides faster read/write operations but
- does not support appends or queries.
-
 Parameters
 ----------
 key : str
- Key of object to store in file.
 value : {Series, DataFrame}
- Value of object to store in file.
 format : 'fixed(f)|table(t)', default is 'fixed'
 Format to use when storing object in HDFStore. Value can be one of:

 'fixed'
 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
 'table'
 Table format. Write as a PyTables Table structure which may perform
 worse but allow more flexible operations like searching / selecting
 subsets of the data.
 index : bool, default True
 Write DataFrame index as a column.
 append : bool, default False
 This will force Table format, append the input data to the existing.
- complib : default None
- This parameter is currently not accepted.
- complevel : int, 0-9, default None
- Specifies a compression level for data.
- A value of 0 or None disables compression.
- min_itemsize : int, dict, or None
- Dict of columns that specify minimum str sizes.
- nan_rep : str
- Str to use as str nan representation.
+ min_itemsize : int, dict of {str: int}, or None, default None
+ Minimum size in bytes for string columns. This parameter is only used
+ when format='table'. Can be:
+
+ - int: Apply the same minimum size to all string columns
+ - dict: Map column names to their minimum sizes
+ - None: Use default sizing
+
+ **Important**: The size refers to the number of bytes after encoding,
+ not the number of characters. For multi-byte characters (e.g., Chinese,
+ Arabic), you need to account for the encoding. For example, the
+ character '香' is 1 character but 3 bytes when encoded as UTF-8.
+
+ See examples below for proper usage with encoded strings.
 data_columns : list of columns or True, default None
 List of columns to create as data columns, or True to use all columns.
 See `here `__.
 encoding : str, default None
 Provide an encoding for strings.
- errors : str, default 'strict'
- The error handling scheme to use for encoding errors.
- The default is 'strict' meaning that encoding errors raise a
- UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
- 'xmlcharrefreplace' as well as any other name registered with
- codecs.register_error that can handle UnicodeEncodeErrors.
 track_times : bool, default True
 Parameter is propagated to 'create_table' method of 'PyTables'.
 If set to False it enables to have the same h5 files (same hashes)
@@ -1193,16 +1175,33 @@ def put(
 dropna : bool, default False, optional
 Remove missing values.

- See Also
- --------
- HDFStore.info : Prints detailed information on the store.
- HDFStore.get_storer : Returns the storer object for a key. - Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df) # doctest: +SKIP + + Basic usage with ASCII strings: + + >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP + + Usage with multi-byte characters: + + >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP + >>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes + >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP + ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP + + Determining the correct size for encoded strings: + + >>> text = '香港' # doctest: +SKIP + >>> len(text) # Character length # doctest: +SKIP + 2 + >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP + 6 + >>> # Use the byte length for min_itemsize """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1224,7 +1223,7 @@ def put( dropna=dropna, ) - def remove(self, key: str, where=None, start=None, stop=None) -> int | None: + def remove(self, key: str, where=None, start=None, stop=None) -> None: """ Remove pandas object partially by specifying the where condition @@ -1272,12 +1271,14 @@ def remove(self, key: str, where=None, start=None, stop=None) -> int | None: # remove the node if com.all_none(where, start, stop): s.group._f_remove(recursive=True) - return None # delete from the table - if not s.is_table: - raise ValueError("can only remove with where on objects written as tables") - return s.delete(where=where, start=start, stop=stop) + else: + if not s.is_table: + raise ValueError( + "can only remove with where on objects written as tables" + ) + return s.delete(where=where, start=start, stop=stop) def append( self, @@ -1307,9 +1308,7 @@ def append( Parameters ---------- key : str - Key of object to append. value : {Series, DataFrame} - Value of object to append. format : 'table' is the default Format to use when storing object in HDFStore. Value can be one of: @@ -1317,66 +1316,92 @@ def append( Table format. Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data. - axes : default None - This parameter is currently not accepted. index : bool, default True Write DataFrame index as a column. - append : bool, default True + append : bool, default True Append the input data to the existing. - complib : default None - This parameter is currently not accepted. - complevel : int, 0-9, default None - Specifies a compression level for data. - A value of 0 or None disables compression. - columns : default None - This parameter is currently not accepted, try data_columns. - min_itemsize : int, dict, or None - Dict of columns that specify minimum str sizes. - nan_rep : str - Str to use as str nan representation. - chunksize : int or None - Size to chunk the writing. - expectedrows : int - Expected TOTAL row size of this table. - dropna : bool, default False, optional - Do not write an ALL nan row to the store settable - by the option 'io.hdf.dropna_table'. 
 data_columns : list of columns, or True, default None
 List of columns to create as indexed data columns for on-disk
 queries, or True to use all columns. By default only the axes
 of the object are indexed. See `here `__.
- encoding : default None
- Provide an encoding for str.
- errors : str, default 'strict'
- The error handling scheme to use for encoding errors.
- The default is 'strict' meaning that encoding errors raise a
- UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
- 'xmlcharrefreplace' as well as any other name registered with
- codecs.register_error that can handle UnicodeEncodeErrors.
-
- See Also
- --------
- HDFStore.append_to_multiple : Append to multiple tables.
+ min_itemsize : int, dict of {str: int}, or None, default None
+ Minimum size in bytes for string columns. Can be:
+
+ - int: Apply the same minimum size to all string columns
+ - dict: Map column names to their minimum sizes
+ - None: Use the existing table's column sizes
+
+ **Important**: This parameter is only effective when creating a new table.
+ If the table already exists, the column sizes are fixed and cannot be
+ changed. The size refers to the number of bytes after encoding, not
+ the number of characters.
+
+ For multi-byte characters, calculate the size using the encoded byte
+ length. For example, len('香'.encode('utf-8')) returns 3, whereas
+ len('香') returns 1. A sizing sketch is given at the end of the Notes
+ section below.
+ nan_rep : str
+ Str to use as str nan representation.
+ chunksize : int or None
+ Size to chunk the writing.
+ expectedrows : int
+ Expected TOTAL row size of this table.
+ encoding : str, default None
+ Provide an encoding for str.
+ dropna : bool, default False, optional
+ Do not write an ALL nan row to the store settable
+ by the option 'io.hdf.dropna_table'.

 Notes
 -----
 Does *not* check if data being appended overlaps with existing data in
 the table, so be careful
+ When appending to an existing table, the min_itemsize parameter has no
+ effect, as column sizes are already fixed. Set min_itemsize when initially
+ creating the table with put() or the first append() call.
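+
+ A minimal sizing sketch for these notes (illustrative only: the helper
+ name byte_len is made up here, not a pandas API); it derives
+ min_itemsize from the encoded byte lengths of the data:
+
+ >>> def byte_len(s):
+ ...     return len(str(s).encode('utf-8'))
+ >>> df = pd.DataFrame([['香港', 'hello']], columns=['a', 'b'])
+ >>> sizes = {col: int(df[col].map(byte_len).max()) for col in df.columns}
+ >>> sizes
+ {'a': 6, 'b': 5}
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> store.put('data', df, format='table', min_itemsize=sizes) # doctest: +SKIP
+ >>> store.append('data', df) # doctest: +SKIP
+ >>> store.close() # doctest: +SKIP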
+ Examples
 --------
- >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
- >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
- >>> store.put("data", df1, format="table") # doctest: +SKIP
- >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"])
- >>> store.append("data", df2) # doctest: +SKIP
+ >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> store.put('data', df1, format='table') # doctest: +SKIP
+ >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
+ >>> store.append('data', df2) # doctest: +SKIP
 >>> store.close() # doctest: +SKIP
 A B
 0 1 2
 1 3 4
 0 5 6
 1 7 8
+
+ Creating a table and appending data:
+
+ >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> # Set min_itemsize when creating the table
+ >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
+ >>>
+ >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
+ >>> store.append('data', df2) # doctest: +SKIP
+ >>> store.close() # doctest: +SKIP
+
+ Handling multi-byte characters:
+
+ >>> df_en = pd.DataFrame([['hello']], columns=['text'])
+ >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
+ >>> store.put('messages', df_en, format='table',
+ ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
+ >>> store.append('messages', df_zh) # doctest: +SKIP
+ >>> store.close() # doctest: +SKIP
+
+ Common error when a string exceeds the reserved column size:
+
+ >>> df1 = pd.DataFrame([['a']], columns=['char']) # 1 byte in UTF-8
+ >>> df2 = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> store.put('test', df1, format='table') # doctest: +SKIP
+ >>> # Appending df2 now raises ValueError: the column was sized to 1 byte
+ >>> # store.append('test', df2)
+ >>> # Correct usage: reserve the encoded byte length when creating the table
+ >>> store.put('test', df1, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
+ >>> store.append('test', df2) # doctest: +SKIP
+ >>> store.close() # doctest: +SKIP
 """
 if columns is not None:
 raise TypeError(
@@ -1555,15 +1580,11 @@ def groups(self) -> list:
 list
 List of objects.

- See Also
- --------
- HDFStore.get_node : Returns the node with the key.
-
 Examples
 --------
- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
- >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
- >>> store.put("data", df) # doctest: +SKIP
+ >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+ >>> store.put('data', df) # doctest: +SKIP
 >>> print(store.groups()) # doctest: +SKIP
 >>> store.close() # doctest: +SKIP
 [/data (Group) ''
@@ -1614,17 +1635,13 @@ def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
 leaves : list
 Names (strings) of the pandas objects contained in `path`.

- See Also
- --------
- HDFStore.info : Prints detailed information on the store.
- Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df1, format="table") # doctest: +SKIP - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"]) - >>> store.append("data", df2) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df1, format='table') # doctest: +SKIP + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) + >>> store.append('data', df2) # doctest: +SKIP >>> store.close() # doctest: +SKIP >>> for group in store.walk(): # doctest: +SKIP ... print(group) # doctest: +SKIP @@ -1743,33 +1760,24 @@ def info(self) -> str: Returns ------- str - A String containing the python pandas class name, filepath to the HDF5 - file and all the object keys along with their respective dataframe shapes. - - See Also - -------- - HDFStore.get_storer : Returns the storer object for a key. Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["C", "D"]) - >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data1", df1) # doctest: +SKIP - >>> store.put("data2", df2) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df) # doctest: +SKIP >>> print(store.info()) # doctest: +SKIP >>> store.close() # doctest: +SKIP File path: store.h5 - /data1 frame (shape->[2,2]) - /data2 frame (shape->[2,2]) + /data frame (shape->[2,2]) """ path = pprint_thing(self._path) output = f"{type(self)}\nFile path: {path}\n" if self.is_open: lkeys = sorted(self.keys()) - if lkeys: + if len(lkeys): keys = [] values = [] @@ -1826,8 +1834,8 @@ def _create_storer( if value is not None and not isinstance(value, (Series, DataFrame)): raise TypeError("value must be None, Series, or DataFrame") - pt = getattr(group._v_attrs, "pandas_type", None) - tt = getattr(group._v_attrs, "table_type", None) + pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) + tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) # infer the pt from the passed value if pt is None: @@ -1894,7 +1902,7 @@ def _create_storer( "worm": WORMTable, } try: - cls = _TABLE_MAP[tt] # type: ignore[index] + cls = _TABLE_MAP[tt] except KeyError as err: raise TypeError( f"cannot properly create the storer for: [_TABLE_MAP] [group->" @@ -2241,13 +2249,13 @@ def convert( # preventing the original recarry from being free'ed values = values[self.cname].copy() - val_kind = self.kind + val_kind = _ensure_decoded(self.kind) values = _maybe_convert(values, val_kind, encoding, errors) kwargs = {} - kwargs["name"] = self.index_name + kwargs["name"] = _ensure_decoded(self.index_name) if self.freq is not None: - kwargs["freq"] = self.freq + kwargs["freq"] = _ensure_decoded(self.freq) factory: type[Index | DatetimeIndex] = Index if lib.is_np_dtype(values.dtype, "M") or isinstance( @@ -2261,7 +2269,9 @@ def convert( # "Union[Type[Index], Type[DatetimeIndex]]") factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] x, freq=kwds.get("freq", None) - )._rename(kwds["name"]) + )._rename( + kwds["name"] + ) # making an Index instance could throw a number of different errors try: @@ -2286,12 +2296,7 @@ def convert( if "freq" in kwargs: kwargs["freq"] = None new_pd_index = factory(values, **kwargs) 
- - final_pd_index: Index - if self.tz is not None and isinstance(new_pd_index, DatetimeIndex): - final_pd_index = new_pd_index.tz_localize("UTC").tz_convert(self.tz) - else: - final_pd_index = new_pd_index + final_pd_index = _set_tz(new_pd_index, self.tz) return final_pd_index, final_pd_index def take_data(self): @@ -2325,7 +2330,7 @@ def maybe_set_size(self, min_itemsize=None) -> None: min_itemsize can be an integer or a dict with this columns name with an integer size """ - if self.kind == "string": + if _ensure_decoded(self.kind) == "string": if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) @@ -2346,7 +2351,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None: def validate_col(self, itemsize=None): """validate this column: return the compared against itemsize""" # validate this column for string truncation (or reset to the max size) - if self.kind == "string": + if _ensure_decoded(self.kind) == "string": c = self.col if c is not None: if itemsize is None: @@ -2676,19 +2681,19 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): assert isinstance(converted, np.ndarray) # for mypy # use the meta if needed - meta = self.meta + meta = _ensure_decoded(self.meta) metadata = self.metadata ordered = self.ordered tz = self.tz assert dtype_name is not None # convert to the correct dtype - dtype = dtype_name + dtype = _ensure_decoded(dtype_name) # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz, dtype) + converted = _set_tz(converted, tz, coerce=True) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -2733,7 +2738,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): converted = converted.astype("O", copy=False) # convert nans / decode - if kind == "string": + if _ensure_decoded(kind) == "string": converted = _unconvert_string_array( converted, nan_rep=nan_rep, encoding=encoding, errors=errors ) @@ -2821,19 +2826,18 @@ def is_old_version(self) -> bool: @property def version(self) -> tuple[int, int, int]: """compute and set our version""" - version = getattr(self.group._v_attrs, "pandas_version", None) - if isinstance(version, str): - version_tup = tuple(int(x) for x in version.split(".")) - if len(version_tup) == 2: - version_tup = version_tup + (0,) - assert len(version_tup) == 3 # needed for mypy - return version_tup - else: - return (0, 0, 0) + version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) + try: + version = tuple(int(x) for x in version.split(".")) + if len(version) == 2: + version = version + (0,) + except AttributeError: + version = (0, 0, 0) + return version @property def pandas_type(self): - return getattr(self.group._v_attrs, "pandas_type", None) + return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) def __repr__(self) -> str: """return a pretty representation of myself""" @@ -2928,7 +2932,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ) -> Series | DataFrame: + ): raise NotImplementedError( "cannot read on an abstract storer: subclasses should implement" ) @@ -2940,7 +2944,7 @@ def write(self, obj, **kwargs) -> None: def delete( self, where=None, start: int | None = None, stop: int | None = None - ) -> int | None: + ) -> None: """ support fully deleting the node in its entirety (only) - where specification must be None @@ -2970,7 +2974,9 @@ def _alias_to_class(self, alias): return 
self._reverse_index_map.get(alias, Index) def _get_index_factory(self, attrs): - index_class = self._alias_to_class(getattr(attrs, "index_class", "")) + index_class = self._alias_to_class( + _ensure_decoded(getattr(attrs, "index_class", "")) + ) factory: Callable @@ -3006,7 +3012,12 @@ def f(values, freq=None, tz=None): factory = TimedeltaIndex if "tz" in attrs: - kwargs["tz"] = attrs["tz"] + if isinstance(attrs["tz"], bytes): + # created by python2 + kwargs["tz"] = attrs["tz"].decode("utf-8") + else: + # created by python3 + kwargs["tz"] = attrs["tz"] assert index_class is DatetimeIndex # just checking return factory, kwargs @@ -3038,9 +3049,9 @@ def set_attrs(self) -> None: def get_attrs(self) -> None: """retrieve our attributes""" self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) - self.errors = getattr(self.attrs, "errors", "strict") + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) for n in self.attributes: - setattr(self, n, getattr(self.attrs, n, None)) + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) def write(self, obj, **kwargs) -> None: self.set_attrs() @@ -3060,7 +3071,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype is not None: ret = pd_array(ret, dtype=dtype) else: - dtype = getattr(attrs, "value_type", None) + dtype = _ensure_decoded(getattr(attrs, "value_type", None)) shape = getattr(attrs, "shape", None) if shape is not None: @@ -3072,7 +3083,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz, dtype) + ret = _set_tz(ret, tz, coerce=True) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -3085,7 +3096,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None def read_index( self, key: str, start: int | None = None, stop: int | None = None ) -> Index: - variety = getattr(self.attrs, f"{key}_variety") + variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) if variety == "multi": return self.read_multi_index(key, start=start, stop=stop) @@ -3175,11 +3186,12 @@ def read_index_node( # have written a sentinel. Here we replace it with the original. 
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) - kind = node._v_attrs.kind + kind = _ensure_decoded(node._v_attrs.kind) name = None if "name" in node._v_attrs: name = _ensure_str(node._v_attrs.name) + name = _ensure_decoded(name) attrs = node._v_attrs factory, kwargs = self._get_index_factory(attrs) @@ -3285,7 +3297,7 @@ def write_array( pass elif inferred_type == "string": pass - elif get_option("performance_warnings"): + else: ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) @@ -3302,9 +3314,7 @@ def write_array( # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "asi8" self._handle.create_array( - self.group, - key, - value.asi8, # type: ignore[union-attr] + self.group, key, value.asi8 # type: ignore[union-attr] ) node = getattr(self.group, key) @@ -3335,7 +3345,7 @@ class SeriesFixed(GenericFixed): name: Hashable @property - def shape(self) -> tuple[int] | None: + def shape(self): try: return (len(self.group.values),) except (TypeError, AttributeError): @@ -3445,14 +3455,23 @@ def read( dfs.append(df) if len(dfs) > 0: - out = concat(dfs, axis=1).copy() - return out.reindex(columns=items) + out = concat(dfs, axis=1, copy=True) + if using_copy_on_write(): + # with CoW, concat ignores the copy keyword. Here, we still want + # to copy to enforce optimized column-major layout + out = out.copy() + out = out.reindex(columns=items, copy=False) + return out return DataFrame(columns=axes[0], index=axes[1]) def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) + # TODO(ArrayManager) HDFStore relies on accessing the blocks + if isinstance(obj._mgr, ArrayManager): + obj = obj._as_manager("block") + data = obj._mgr if not data.is_consolidated(): data = data.consolidate() @@ -3684,7 +3703,7 @@ def queryables(self) -> dict[str, Any]: return dict(d1 + d2 + d3) - def index_cols(self) -> list[tuple[Any, Any]]: + def index_cols(self): """return a list of my index cols""" # Note: each `i.cname` below is assured to be a str. 
return [(i.axis, i.cname) for i in self.index_axes] @@ -3742,7 +3761,7 @@ def get_attrs(self) -> None: self.info = getattr(self.attrs, "info", None) or {} self.nan_rep = getattr(self.attrs, "nan_rep", None) self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) - self.errors = getattr(self.attrs, "errors", "strict") + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or [] self.index_axes = [a for a in self.indexables if a.is_an_indexable] self.values_axes = [a for a in self.indexables if not a.is_an_indexable] @@ -3814,7 +3833,7 @@ def indexables(self): dc = set(self.data_columns) base_pos = len(_indexables) - def f(i, c: str) -> DataCol: + def f(i, c): assert isinstance(c, str) klass = DataCol if c in dc: @@ -3980,7 +3999,7 @@ def get_object(cls, obj, transposed: bool): """return the data for this obj""" return obj - def validate_data_columns(self, data_columns, min_itemsize, non_index_axes) -> list: + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): """ take the input data_columns and min_itemize and create a data columns spec @@ -4269,10 +4288,16 @@ def _get_blocks_and_items( data_columns, ): # Helper to clarify non-state-altering parts of _create_axes + + # TODO(ArrayManager) HDFStore relies on accessing the blocks + if isinstance(frame._mgr, ArrayManager): + frame = frame._as_manager("block") + def get_blk_items(mgr): return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] mgr = frame._mgr + mgr = cast(BlockManager, mgr) blocks: list[Block] = list(mgr.blocks) blk_items: list[Index] = get_blk_items(mgr) @@ -4284,6 +4309,7 @@ def get_blk_items(mgr): axis, axis_labels = new_non_index_axes[0] new_labels = Index(axis_labels).difference(Index(data_columns)) mgr = frame.reindex(new_labels, axis=axis)._mgr + mgr = cast(BlockManager, mgr) blocks = list(mgr.blocks) blk_items = get_blk_items(mgr) @@ -4292,6 +4318,7 @@ def get_blk_items(mgr): # index, so we can infer that (as long as axis==1) we # get a single column back, so a single block. 
mgr = frame.reindex([c], axis=axis)._mgr + mgr = cast(BlockManager, mgr) blocks.extend(mgr.blocks) blk_items.extend(get_blk_items(mgr)) @@ -4471,7 +4498,7 @@ def read_column( encoding=self.encoding, errors=self.errors, ) - cvs = col_values[1] + cvs = _set_tz(col_values[1], a.tz) dtype = getattr(self.table.attrs, f"{column}_meta", None) return Series(cvs, name=column, copy=False, dtype=dtype) @@ -4591,7 +4618,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None: masks.append(mask.astype("u1", copy=False)) # consolidate masks - if masks: + if len(masks): mask = masks[0] for m in masks[1:]: mask = mask & m @@ -4676,9 +4703,7 @@ def write_data_chunk( self.table.append(rows) self.table.flush() - def delete( - self, where=None, start: int | None = None, stop: int | None = None - ) -> int | None: + def delete(self, where=None, start: int | None = None, stop: int | None = None): # delete all rows (and return the nrows) if where is None or not len(where): if start is None and stop is None: @@ -4711,7 +4736,7 @@ def delete( groups = list(diff[diff > 1].index) # 1 group - if not groups: + if not len(groups): groups = [0] # final element @@ -4814,7 +4839,7 @@ def read( if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) - if isinstance(values, (np.ndarray, DatetimeArray)): + if isinstance(values, np.ndarray): try: df = DataFrame(values.T, columns=cols_, index=index_, copy=False) except UnicodeEncodeError as err: @@ -5024,7 +5049,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ) -> DataFrame: + ): df = super().read(where=where, columns=columns, start=start, stop=stop) df = df.set_index(self.levels) @@ -5068,25 +5093,55 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone +@overload +def _set_tz( + values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False +) -> DatetimeIndex: + ... + + +@overload +def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray: + ... + + def _set_tz( - values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str -) -> DatetimeArray: + values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False +) -> np.ndarray | DatetimeIndex: """ - Coerce the values to a DatetimeArray with appropriate tz. + coerce the values to a DatetimeIndex if tz is set + preserve the input shape if possible Parameters ---------- - values : ndarray[int64] - tz : str, tzinfo, or None - datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" + values : ndarray or Index + tz : str or tzinfo + coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ - assert values.dtype == "i8", values.dtype - # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; - # expected "tzinfo" - unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count - dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] - dta = DatetimeArray._from_sequence(values, dtype=dtype) - return dta + if isinstance(values, DatetimeIndex): + # If values is tzaware, the tz gets dropped in the values.ravel() + # call below (which returns an ndarray). So we are only non-lossy + # if `tz` matches `values.tz`. 
+ assert values.tz is None or values.tz == tz + if values.tz is not None: + return values + + if tz is not None: + if isinstance(values, DatetimeIndex): + name = values.name + else: + name = None + values = values.ravel() + + tz = _ensure_decoded(tz) + values = DatetimeIndex(values, name=name) + values = values.tz_localize("UTC").tz_convert(tz) + elif coerce: + values = np.asarray(values, dtype="M8[ns]") + + # error: Incompatible return value type (got "Union[ndarray, Index]", + # expected "Union[ndarray, DatetimeIndex]") + return values # type: ignore[return-value] def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: @@ -5390,6 +5445,8 @@ def _dtype_to_kind(dtype_str: str) -> str: """ Find the "kind" string describing the given dtype name. """ + dtype_str = _ensure_decoded(dtype_str) + if dtype_str.startswith(("string", "bytes")): kind = "string" elif dtype_str.startswith("float"): @@ -5501,13 +5558,7 @@ def __init__( if self.terms is not None: self.condition, self.filter = self.terms.evaluate() - @overload - def generate(self, where: dict | list | tuple | str) -> PyTablesExpr: ... - - @overload - def generate(self, where: None) -> None: ... - - def generate(self, where: dict | list | tuple | str | None) -> PyTablesExpr | None: + def generate(self, where): """where can be a : dict,list,tuple,string""" if where is None: return None
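For reference, a self-contained sketch of the sizing behavior the revised put()/append() docstrings describe. This is an illustration, not part of the patch: it assumes pandas with the PyTables package installed, and the file name store.h5 and column name text are invented for the example.

import pandas as pd

short = pd.DataFrame({"text": ["hi"]})           # 2 bytes in UTF-8
long_utf8 = pd.DataFrame({"text": ["你好世界"]})   # 4 characters, 12 bytes in UTF-8

with pd.HDFStore("store.h5", mode="w") as store:
    # Without min_itemsize, the first write fixes the column width at the
    # longest encoded string seen so far (2 bytes here), so appending a
    # 12-byte string raises ValueError.
    store.put("data", short, format="table")
    try:
        store.append("data", long_utf8)
    except ValueError as err:
        print(err)  # column width exceeded

    # Reserve the encoded byte length up front and the same append succeeds.
    needed = int(long_utf8["text"].map(lambda s: len(s.encode("utf-8"))).max())
    store.put("data", short, format="table", min_itemsize={"text": needed})
    store.append("data", long_utf8)
    print(store.select("data"))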