diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index ad4a5db441b89..67fb075110f0d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -390,7 +390,9 @@ def from_dummies( The default category is the implied category when a value has none of the listed categories specified with a one, i.e. if all dummies in a row are zero. Can be a single value for all variables or a dict directly mapping - the default categories to a prefix of a variable. + the default categories to a prefix of a variable. The default category + will be coerced to the dtype of ``data.columns`` if such coercion is + lossless, and will raise otherwise. Returns ------- diff --git a/pandas/io/html.py b/pandas/io/html.py index 183af3a03221b..ffd05daf7465c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1,9 +1,10 @@ """ -:mod:`pandas.io.html` is a module containing functionality for dealing with -HTML IO. +:mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. +Provides utilities for reading and parsing HTML tables into pandas DataFrames. """ + from __future__ import annotations from collections import abc @@ -387,7 +388,7 @@ def _parse_tables(self, document, match, attrs): def _equals_tag(self, obj, tag) -> bool: """ - Return whether an individual DOM node matches a tag + Return whether an individual DOM node matches a tag. Parameters ---------- @@ -399,8 +400,8 @@ def _equals_tag(self, obj, tag) -> bool: Returns ------- - boolean - Whether `obj`'s tag name is `tag` + bool + Whether `obj`'s tag name is `tag`. """ raise AbstractMethodError(self) @@ -562,7 +563,7 @@ def _expand_colspan_rowspan( def _handle_hidden_tables(self, tbl_list, attr_name: str): """ - Return list of tables, potentially removing hidden elements + Return list of tables, potentially removing hidden elements. 
Parameters ---------- @@ -679,8 +680,9 @@ def _build_doc(self): def _build_xpath_expr(attrs) -> str: """ - Build an xpath expression to simulate bs4's ability to pass in kwargs to - search for attributes when using the lxml parser. + Build an XPath expression that checks for the given HTML attributes. + + This simulates bs4's ability to search by kwargs when using the lxml parser. Parameters ---------- @@ -689,10 +691,11 @@ def _build_xpath_expr(attrs) -> str: Returns ------- - expr : unicode + str An XPath expression that checks for the given HTML attributes. """ # give class attribute as class_ because class is a python keyword + if "class_" in attrs: attrs["class"] = attrs.pop("class_") @@ -768,6 +771,8 @@ def _equals_tag(self, obj, tag) -> bool: def _build_doc(self): """ + Build and parse the HTML document into a DOM tree. + Raises ------ ValueError diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index c7b7992a78232..dfb691c785404 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -333,9 +333,7 @@ def test_no_prefix_string_cats_default_category( ): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) - expected = DataFrame(expected) - if using_infer_string: - expected[""] = expected[""].astype("str") + expected = DataFrame(expected, dtype=dummies.columns.dtype) tm.assert_frame_equal(result, expected) @@ -449,3 +447,31 @@ def test_maintain_original_index(): result = from_dummies(df) expected = DataFrame({"": list("abca")}, index=list("abcd")) tm.assert_frame_equal(result, expected) + + +def test_int_columns_with_float_default(): + # https://github.com/pandas-dev/pandas/pull/60694 + df = DataFrame( + { + 3: [1, 0, 0], + 4: [0, 1, 0], + }, + ) + with pytest.raises(ValueError, match="Trying to coerce float values to integers"): + from_dummies(df, default_category=0.5) + + +def test_object_dtype_preserved(): + # 
https://github.com/pandas-dev/pandas/pull/60694 + # When the input has object dtype, the result should as + # well even when infer_string is True. + df = DataFrame( + { + "x": [1, 0, 0], + "y": [0, 1, 0], + }, + ) + df.columns = df.columns.astype("object") + result = from_dummies(df, default_category="z") + expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") + tm.assert_frame_equal(result, expected)