From d0e7d869fa35348b85c5499d7948bdd1b7f80453 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 25 Jun 2023 13:16:21 -0700 Subject: [PATCH 01/22] ENH: Add arrow engine to to_csv --- pandas/core/generic.py | 11 ++ pandas/io/formats/csvs.py | 55 +++++- pandas/io/formats/format.py | 2 + pandas/tests/io/formats/test_to_csv.py | 246 ++++++++++++++----------- 4 files changed, 199 insertions(+), 115 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9084395871675..e4c16aedb2430 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3658,6 +3658,7 @@ def to_csv( path_or_buf: None = ..., sep: str = ..., na_rep: str = ..., + engine: str = "python", float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., header: bool_t | list[str] = ..., @@ -3685,6 +3686,7 @@ def to_csv( path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str], sep: str = ..., na_rep: str = ..., + engine: str = "python", float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., header: bool_t | list[str] = ..., @@ -3716,6 +3718,7 @@ def to_csv( path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, sep: str = ",", na_rep: str = "", + engine: str = "python", float_format: str | Callable | None = None, columns: Sequence[Hashable] | None = None, header: bool_t | list[str] = True, @@ -3755,6 +3758,13 @@ def to_csv( String of length 1. Field delimiter for the output file. na_rep : str, default '' Missing data representation. + engine : str, default 'python' + The engine to use. Available options are "pyarrow" or "python". + The pyarrow engine requires the pyarrow library to be installed + and is generally faster than the python engine. + + However, the python engine may be more feature complete than the + pyarrow engine. float_format : str, Callable, default None Format string for floating point numbers. 
If a Callable is given, it takes precedence over other numeric formatting parameters, like decimal. @@ -3890,6 +3900,7 @@ def to_csv( return DataFrameRenderer(formatter).to_csv( path_or_buf, + engine=engine, lineterminator=lineterminator, sep=sep, encoding=encoding, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 39abb0bf127d9..db9fd8783d9eb 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -20,6 +20,7 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( @@ -57,6 +58,7 @@ def __init__( self, formatter: DataFrameFormatter, path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "", + engine: str = "python", sep: str = ",", cols: Sequence[Hashable] | None = None, index_label: IndexLabel | None = None, @@ -78,6 +80,7 @@ def __init__( self.obj = self.fmt.frame self.filepath_or_buffer = path_or_buf + self.engine = engine self.encoding = encoding self.compression: CompressionOptions = compression self.mode = mode @@ -252,8 +255,48 @@ def save(self) -> None: storage_options=self.storage_options, ) as handles: # Note: self.encoding is irrelevant here + self._save(handles.handle) + + def _save_pyarrow(self, handle) -> None: + pa = import_optional_dependency("pyarrow") + pa_csv = import_optional_dependency("pyarrow.csv") + # Convert index to column and rename name to empty string + # since we serialize the index as basically a column with no name + # TODO: this won't work for multi-indexes + obj = self.obj.reset_index(names=[""]) + + table = pa.Table.from_pandas(obj) + + # Map quoting arg to pyarrow equivalents + pa_quoting = None + if self.quoting == csvlib.QUOTE_MINIMAL: + pa_quoting = "needed" + elif self.quoting == csvlib.QUOTE_ALL: + # TODO: Is this a 1-1 mapping? 
+ # This doesn't quote nulls, check if Python does this + pa_quoting = "all_valid" + elif self.quoting == csvlib.QUOTE_NONE: + pa_quoting = "none" + else: + raise ValueError( + f"Quoting option {self.quoting} is not supported with engine='pyarrow'" + ) + + write_options = pa_csv.WriteOptions( + include_header=self._need_to_save_header, + batch_size=self.chunksize, + delimiter=self.sep, + quoting_style=pa_quoting, + ) + # pa_csv.write_csv(table, handle, write_options) + pa_csv.write_csv(table, self.filepath_or_buffer, write_options) + + def _save(self, handle) -> None: + if self.engine == "pyarrow": + self._save_pyarrow(handle) + else: self.writer = csvlib.writer( - handles.handle, + handle, lineterminator=self.lineterminator, delimiter=self.sep, quoting=self.quoting, @@ -261,13 +304,9 @@ def save(self) -> None: escapechar=self.escapechar, quotechar=self.quotechar, ) - - self._save() - - def _save(self) -> None: - if self._need_to_save_header: - self._save_header() - self._save_body() + if self._need_to_save_header: + self._save_header() + self._save_body() def _save_header(self) -> None: if not self.has_mi_columns or self._has_aliases: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index a7a6f481ebdde..b89f3400675db 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1103,6 +1103,7 @@ def to_string( def to_csv( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + engine: str = "python", encoding: str | None = None, sep: str = ",", columns: Sequence[Hashable] | None = None, @@ -1132,6 +1133,7 @@ def to_csv( csv_formatter = CSVFormatter( path_or_buf=path_or_buf, + engine=engine, lineterminator=lineterminator, sep=sep, encoding=encoding, diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 32509a799fa69..eeb2a1b8a2c56 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -15,8 +15,14 @@ import 
pandas._testing as tm +@pytest.fixture(params=["python", "pyarrow"]) +def engine(request): + # TODO: Skip if pyarrow not found + return request.param + + class TestToCSV: - def test_to_csv_with_single_column(self): + def test_to_csv_with_single_column(self, engine): # see gh-18676, https://bugs.python.org/issue32255 # # Python's CSV library adds an extraneous '""' @@ -30,7 +36,7 @@ def test_to_csv_with_single_column(self): 1.0 """ with tm.ensure_clean("test.csv") as path: - df1.to_csv(path, header=None, index=None) + df1.to_csv(path, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected1 @@ -40,20 +46,20 @@ def test_to_csv_with_single_column(self): "" """ with tm.ensure_clean("test.csv") as path: - df2.to_csv(path, header=None, index=None) + df2.to_csv(path, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected2 - def test_to_csv_default_encoding(self): + def test_to_csv_default_encoding(self, engine): # GH17097 df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) with tm.ensure_clean("test.csv") as path: - # the default to_csv encoding is uft-8. - df.to_csv(path) + # the default to_csv encoding is utf-8. 
+ df.to_csv(path, engine=engine) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) - def test_to_csv_quotechar(self): + def test_to_csv_quotechar(self, engine): df = DataFrame({"col": [1, 2]}) expected = """\ "","col" @@ -62,7 +68,7 @@ def test_to_csv_quotechar(self): """ with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1) # 1=QUOTE_ALL + df.to_csv(path, quoting=1, engine=engine) # 1=QUOTE_ALL with open(path, encoding="utf-8") as f: assert f.read() == expected @@ -73,13 +79,13 @@ def test_to_csv_quotechar(self): """ with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1, quotechar="$") + df.to_csv(path, quoting=1, quotechar="$", engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected with tm.ensure_clean("test.csv") as path: with pytest.raises(TypeError, match="quotechar"): - df.to_csv(path, quoting=1, quotechar=None) + df.to_csv(path, quoting=1, quotechar=None, engine=engine) def test_to_csv_doublequote(self): df = DataFrame({"col": ['a"a', '"bb"']}) @@ -90,15 +96,15 @@ def test_to_csv_doublequote(self): ''' with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=True, engine=engine) # QUOTE_ALL with open(path, encoding="utf-8") as f: assert f.read() == expected with tm.ensure_clean("test.csv") as path: with pytest.raises(Error, match="escapechar"): - df.to_csv(path, doublequote=False) # no escapechar set + df.to_csv(path, doublequote=False, engine=engine) # no escapechar set - def test_to_csv_escapechar(self): + def test_to_csv_escapechar(self, engine=engine): df = DataFrame({"col": ['a"a', '"bb"']}) expected = """\ "","col" @@ -107,7 +113,9 @@ def test_to_csv_escapechar(self): """ with tm.ensure_clean("test.csv") as path: # QUOTE_ALL - df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + df.to_csv( + path, quoting=1, doublequote=False, escapechar="\\", engine=engine + ) with open(path, 
encoding="utf-8") as f: assert f.read() == expected @@ -119,36 +127,39 @@ def test_to_csv_escapechar(self): """ with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + df.to_csv(path, quoting=3, escapechar="\\", engine=engine) # QUOTE_NONE with open(path, encoding="utf-8") as f: assert f.read() == expected - def test_csv_to_string(self): + def test_csv_to_string(self, engine): df = DataFrame({"col": [1, 2]}) expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv() == expected + assert df.to_csv(engine=engine) == expected - def test_to_csv_decimal(self): + def test_to_csv_decimal(self, engine): # see gh-781 df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv() == expected_default + assert df.to_csv(engine=engine) == expected_default expected_rows = [";col1;col2;col3", "0;1;a;10,1"] expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(decimal=",", sep=";") == expected_european_excel + assert df.to_csv(engine=engine, decimal=",", sep=";") == expected_european_excel expected_rows = [",col1,col2,col3", "0,1,a,10.10"] expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(float_format="%.2f") == expected_float_format_default + assert ( + df.to_csv(engine=engine, float_format="%.2f") + == expected_float_format_default + ) expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) assert ( - df.to_csv(decimal=",", sep=";", float_format="%.2f") + df.to_csv(engine=engine, decimal=",", sep=";", float_format="%.2f") == expected_float_format ) @@ -157,13 +168,13 @@ def test_to_csv_decimal(self): expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = 
tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False, decimal="^") == expected + assert df.to_csv(engine=engine, index=False, decimal="^") == expected # same but for an index - assert df.set_index("a").to_csv(decimal="^") == expected + assert df.set_index("a").to_csv(engine=engine, decimal="^") == expected # same for a multi-index - assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected + assert df.set_index(["a", "b"]).to_csv(engine=engine, decimal="^") == expected def test_to_csv_float_format(self): # testing if float_format is taken into account for the index @@ -172,10 +183,13 @@ def test_to_csv_float_format(self): expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(float_format="%.2f") == expected + assert df.set_index("a").to_csv(engine=engine, float_format="%.2f") == expected # same for a multi-index - assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, float_format="%.2f") + == expected + ) def test_to_csv_na_rep(self): # see gh-11553 @@ -185,7 +199,7 @@ def test_to_csv_na_rep(self): expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs @@ -193,31 +207,31 @@ def test_to_csv_na_rep(self): expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected # check if na_rep 
parameter does not break anything when no NaN df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected - csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + csv = pd.Series(["a", pd.NA, "c"]).to_csv(engine=engine, na_rep="ZZZZZ") expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) assert expected == csv - def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): + def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype, engine): # GH 29975 # Make sure full na_rep shows up when a dtype is provided expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( - na_rep="ZZZZZ" + engine=engine, na_rep="ZZZZZ" ) assert expected == csv - def test_to_csv_date_format(self): + def test_to_csv_date_format(self, engine): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) @@ -231,7 +245,7 @@ def test_to_csv_date_format(self): "4,2013-01-01 00:00:04", ] expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv() == expected_default_sec + assert df_sec.to_csv(engine=engine) == expected_default_sec expected_rows = [ ",A", @@ -242,7 +256,10 @@ def test_to_csv_date_format(self): "4,2013-01-05 00:00:00", ] expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + assert ( + df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S", engine=engine) + == 
expected_ymdhms_day + ) expected_rows = [ ",A", @@ -253,7 +270,7 @@ def test_to_csv_date_format(self): "4,2013-01-01", ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + assert df_sec.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_ymd_sec expected_rows = [ ",A", @@ -264,8 +281,10 @@ def test_to_csv_date_format(self): "4,2013-01-05", ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_day.to_csv() == expected_default_day - assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day + assert df_day.to_csv(engine=engine) == expected_default_day + assert ( + df_day.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_default_day + ) # see gh-7791 # @@ -278,9 +297,12 @@ def test_to_csv_date_format(self): expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) - assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + assert ( + df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d", engine=engine) + == expected_ymd_sec + ) - def test_to_csv_different_datetime_formats(self): + def test_to_csv_different_datetime_formats(self, engine): # GH#21734 df = DataFrame( { @@ -294,14 +316,14 @@ def test_to_csv_different_datetime_formats(self): "1970-01-01,1970-01-01 01:00:00", ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False) == expected + assert df.to_csv(index=False, engine=engine) == expected - def test_to_csv_date_format_in_categorical(self): + def test_to_csv_date_format_in_categorical(self, engine): # GH#40754 ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) ser = ser.astype("category") expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) - assert ser.to_csv(index=False) == expected + assert ser.to_csv(index=False, engine=engine) == expected ser = 
pd.Series( pd.date_range( @@ -309,39 +331,41 @@ def test_to_csv_date_format_in_categorical(self): ).append(pd.DatetimeIndex([pd.NaT])) ) ser = ser.astype("category") - assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected + assert ( + ser.to_csv(index=False, engine=engine, date_format="%Y-%m-%d") == expected + ) - def test_to_csv_float_ea_float_format(self): + def test_to_csv_float_ea_float_format(self, engine): # GH#45991 df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) df["a"] = df["a"].astype("Float64") - result = df.to_csv(index=False, float_format="%.5f") + result = df.to_csv(index=False, engine=engine, float_format="%.5f") expected = tm.convert_rows_list_to_csv_str( ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"] ) assert result == expected - def test_to_csv_float_ea_no_float_format(self): + def test_to_csv_float_ea_no_float_format(self, engine): # GH#45991 df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) df["a"] = df["a"].astype("Float64") - result = df.to_csv(index=False) + result = df.to_csv(index=False, engine=engine) expected = tm.convert_rows_list_to_csv_str( ["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"] ) assert result == expected - def test_to_csv_multi_index(self): + def test_to_csv_multi_index(self, engine): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) exp_rows = [",1", ",2", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp df = DataFrame( [1], @@ -351,21 +375,21 @@ def test_to_csv_multi_index(self): exp_rows = [",,1", ",,2", "1,2,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert 
df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) exp_rows = [",foo", ",bar", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["foo", "bar", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp @pytest.mark.parametrize( "ind,expected", @@ -382,14 +406,16 @@ def test_to_csv_multi_index(self): ), ], ) - def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series): + def test_to_csv_single_level_multi_index( + self, ind, expected, frame_or_series, engine + ): # see gh-19589 obj = frame_or_series(pd.Series([1], ind, name="data")) - result = obj.to_csv(lineterminator="\n", header=True) + result = obj.to_csv(lineterminator="\n", header=True, engine=engine) assert result == expected - def test_to_csv_string_array_ascii(self): + def test_to_csv_string_array_ascii(self, engine): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) @@ -399,11 +425,11 @@ def test_to_csv_string_array_ascii(self): 1,"['baz', 'qux']" """ with tm.ensure_clean("str_test.csv") as path: - df.to_csv(path, encoding="ascii") + df.to_csv(path, encoding="ascii", engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected_ascii - def test_to_csv_string_array_utf8(self): + def test_to_csv_string_array_utf8(self, engine): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) @@ -413,11 +439,11 @@ def test_to_csv_string_array_utf8(self): 1,"['baz', 'qux']" """ with tm.ensure_clean("unicode_test.csv") as path: - df.to_csv(path, encoding="utf-8") + df.to_csv(path, encoding="utf-8", engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected_utf8 - def 
test_to_csv_string_with_lf(self): + def test_to_csv_string_with_lf(self, engine): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = DataFrame(data) @@ -434,24 +460,24 @@ def test_to_csv_string_with_lf(self): + b'3,"g\nh\n\ni"' + os_linesep ) - df.to_csv(path, index=False) + df.to_csv(path, index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_noarg with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) + df.to_csv(path, lineterminator="\n", index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' - df.to_csv(path, lineterminator="\r\n", index=False) + df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_crlf - def test_to_csv_string_with_crlf(self): + def test_to_csv_string_with_crlf(self, engine): # GH 20353 data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = DataFrame(data) @@ -468,13 +494,13 @@ def test_to_csv_string_with_crlf(self): + b'3,"g\r\nh\r\n\r\ni"' + os_linesep ) - df.to_csv(path, index=False) + df.to_csv(path, index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_noarg with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) + df.to_csv(path, lineterminator="\n", index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("crlf_test.csv") as path: @@ -486,17 +512,17 @@ def test_to_csv_string_with_crlf(self): 
b'2,"d\r\nef"\r\n' b'3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(path, lineterminator="\r\n", index=False) + df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) with open(path, "rb") as f: assert f.read() == expected_crlf - def test_to_csv_stdout_file(self, capsys): + def test_to_csv_stdout_file(self, capsys, engine): # GH 21561 df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) - df.to_csv(sys.stdout, encoding="ascii") + df.to_csv(sys.stdout, encoding="ascii", engine=engine) captured = capsys.readouterr() assert captured.out == expected_ascii @@ -510,7 +536,7 @@ def test_to_csv_stdout_file(self, capsys): "(https://docs.python.org/3/library/csv.html#csv.writer)" ), ) - def test_to_csv_write_to_open_file(self): + def test_to_csv_write_to_open_file(self, engine): # GH 21696 df = DataFrame({"a": ["x", "y", "z"]}) expected = """\ @@ -522,11 +548,11 @@ def test_to_csv_write_to_open_file(self): with tm.ensure_clean("test.txt") as path: with open(path, "w", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None) + df.to_csv(f, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected - def test_to_csv_write_to_open_file_with_newline_py3(self): + def test_to_csv_write_to_open_file_with_newline_py3(self, engine): # see gh-21696 # see gh-20353 df = DataFrame({"a": ["x", "y", "z"]}) @@ -535,7 +561,7 @@ def test_to_csv_write_to_open_file_with_newline_py3(self): with tm.ensure_clean("test.txt") as path: with open(path, "w", newline="", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None) + df.to_csv(f, header=None, index=None, engine=engine) with open(path, "rb") as f: assert f.read() == bytes(expected, "utf-8") @@ -543,7 +569,7 @@ def test_to_csv_write_to_open_file_with_newline_py3(self): 
@pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_to_csv_compression( - self, compression_only, read_infer, to_infer, compression_to_extension + self, compression_only, read_infer, to_infer, compression_to_extension, engine ): # see gh-15008 compression = compression_only @@ -558,11 +584,11 @@ def test_to_csv_compression( read_compression = "infer" if read_infer else compression with tm.ensure_clean(filename) as path: - df.to_csv(path, compression=to_compression) + df.to_csv(path, compression=to_compression, engine=engine) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) - def test_to_csv_compression_dict(self, compression_only): + def test_to_csv_compression_dict(self, compression_only, engine): # GH 26023 method = compression_only df = DataFrame({"ABC": [1]}) @@ -573,11 +599,11 @@ def test_to_csv_compression_dict(self, compression_only): }.get(method, method) filename += extension with tm.ensure_clean(filename) as path: - df.to_csv(path, compression={"method": method}) + df.to_csv(path, compression={"method": method}, engine=engine) read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) - def test_to_csv_compression_dict_no_method_raises(self): + def test_to_csv_compression_dict_no_method_raises(self, engine): # GH 26023 df = DataFrame({"ABC": [1]}) compression = {"some_option": True} @@ -585,16 +611,18 @@ def test_to_csv_compression_dict_no_method_raises(self): with tm.ensure_clean("out.zip") as path: with pytest.raises(ValueError, match=msg): - df.to_csv(path, compression=compression) + df.to_csv(path, compression=compression, engine=engine) @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"]) - def test_to_csv_zip_arguments(self, compression, archive_name): + def test_to_csv_zip_arguments(self, compression, archive_name, engine): # GH 
26023 df = DataFrame({"ABC": [1]}) with tm.ensure_clean("to_csv_archive_name.zip") as path: df.to_csv( - path, compression={"method": compression, "archive_name": archive_name} + path, + compression={"method": compression, "archive_name": archive_name}, + engine=engine, ) with ZipFile(path) as zp: assert len(zp.filelist) == 1 @@ -611,33 +639,35 @@ def test_to_csv_zip_arguments(self, compression, archive_name): ("archive.zip", "archive"), ], ) - def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname): + def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname, engine): # GH 39465 df = DataFrame({"ABC": [1]}) path = tmp_path / filename - df.to_csv(path, compression="zip") + df.to_csv(path, compression="zip", engine=engine) with ZipFile(path) as zp: assert len(zp.filelist) == 1 archived_file = zp.filelist[0].filename assert archived_file == expected_arcname @pytest.mark.parametrize("df_new_type", ["Int64"]) - def test_to_csv_na_rep_long_string(self, df_new_type): + def test_to_csv_na_rep_long_string(self, df_new_type, engine): # see gh-25099 df = DataFrame({"c": [float("nan")] * 3}) df = df.astype(df_new_type) expected_rows = ["c", "mynull", "mynull", "mynull"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") + result = df.to_csv( + index=False, na_rep="mynull", encoding="ascii", engine=engine + ) assert expected == result - def test_to_csv_timedelta_precision(self): + def test_to_csv_timedelta_precision(self, engine): # GH 6783 s = pd.Series([1, 1]).astype("timedelta64[ns]") buf = io.StringIO() - s.to_csv(buf) + s.to_csv(buf, engine=engine) result = buf.getvalue() expected_rows = [ ",0", @@ -647,32 +677,32 @@ def test_to_csv_timedelta_precision(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected - def test_na_rep_truncated(self): + def test_na_rep_truncated(self, engine): # 
https://github.com/pandas-dev/pandas/issues/31447 - result = pd.Series(range(8, 12)).to_csv(na_rep="-") + result = pd.Series(range(8, 12)).to_csv(na_rep="-", engine=engine) expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"]) assert result == expected - result = pd.Series([True, False]).to_csv(na_rep="nan") + result = pd.Series([True, False]).to_csv(na_rep="nan", engine=engine) expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) assert result == expected - result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") + result = pd.Series([1.1, 2.2]).to_csv(na_rep=".", engine=engine) expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) assert result == expected @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) - def test_to_csv_errors(self, errors): + def test_to_csv_errors(self, errors, engine): # GH 22610 data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) with tm.ensure_clean("test.csv") as path: - ser.to_csv(path, errors=errors) + ser.to_csv(path, errors=errors, engine=engine) # No use in reading back the data as it is not the same anymore # due to the error handling @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_binary_handle(self, mode): + def test_to_csv_binary_handle(self, mode, engine): """ Binary file objects should work (if 'mode' contains a 'b') or even without it in most cases. @@ -682,11 +712,11 @@ def test_to_csv_binary_handle(self, mode): df = tm.makeDataFrame() with tm.ensure_clean() as path: with open(path, mode="w+b") as handle: - df.to_csv(handle, mode=mode) + df.to_csv(handle, mode=mode, engine=engine) tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_encoding_binary_handle(self, mode): + def test_to_csv_encoding_binary_handle(self, mode, engine): """ Binary file objects should honor a specified encoding. 
@@ -698,34 +728,36 @@ def test_to_csv_encoding_binary_handle(self, mode): df = pd.read_csv(buffer, encoding="utf-8-sig") buffer = io.BytesIO() - df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False) + df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False, engine=engine) buffer.seek(0) # tests whether file handle wasn't closed assert buffer.getvalue().startswith(content) # example from GH 13068 with tm.ensure_clean() as path: with open(path, "w+b") as handle: - DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") + DataFrame().to_csv( + handle, mode=mode, encoding="utf-8-sig", engine=engine + ) handle.seek(0) assert handle.read().startswith(b'\xef\xbb\xbf""') -def test_to_csv_iterative_compression_name(compression): +def test_to_csv_iterative_compression_name(compression, engine): # GH 38714 df = tm.makeDataFrame() with tm.ensure_clean() as path: - df.to_csv(path, compression=compression, chunksize=1) + df.to_csv(path, compression=compression, chunksize=1, engine=engine) tm.assert_frame_equal( pd.read_csv(path, compression=compression, index_col=0), df ) -def test_to_csv_iterative_compression_buffer(compression): +def test_to_csv_iterative_compression_buffer(compression, engine): # GH 38714 df = tm.makeDataFrame() with io.BytesIO() as buffer: - df.to_csv(buffer, compression=compression, chunksize=1) + df.to_csv(buffer, compression=compression, chunksize=1, engine=engine) buffer.seek(0) tm.assert_frame_equal( pd.read_csv(buffer, compression=compression, index_col=0), df From 8328120b215b416c743247c92b1aa07617b95729 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 22 Jul 2023 17:17:28 -0700 Subject: [PATCH 02/22] pass more --- pandas/core/generic.py | 11 +++++++-- pandas/io/formats/csvs.py | 34 +++++++++++++++++--------- pandas/io/formats/format.py | 12 ++++++--- pandas/tests/io/formats/test_to_csv.py | 4 ++- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/pandas/core/generic.py 
b/pandas/core/generic.py index 79edbf9c1eebe..f8d7b2481f13b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3729,7 +3729,7 @@ def to_csv( header: bool_t | list[str] = True, index: bool_t = True, index_label: IndexLabel | None = None, - mode: str = "w", + mode: str | None = None, encoding: str | None = None, compression: CompressionOptions = "infer", quoting: int | None = None, @@ -3786,7 +3786,7 @@ def to_csv( sequence should be given if the object uses MultiIndex. If False do not print fields for index names. Use index_label=False for easier importing in R. - mode : {{'w', 'x', 'a'}}, default 'w' + mode : {{'w', 'x', 'a'}}, default 'w' (Python engine) or 'wb' (Pyarrow engine) Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control the file opening. Typical values include: @@ -3794,6 +3794,8 @@ def to_csv( - 'x', exclusive creation, failing if the file already exists. - 'a', append to the end of file if it exists. + NOTE: The pyarrow engine can only handle binary buffers. + encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. 
`encoding` is not supported if `path_or_buf` @@ -3903,6 +3905,11 @@ def to_csv( decimal=decimal, ) + if mode is None: + mode = "w" + if engine == "pyarrow": + mode += "b" + return DataFrameRenderer(formatter).to_csv( path_or_buf, engine=engine, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index db9fd8783d9eb..6970e36ebff54 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -20,6 +20,7 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas.compat import pa_version_under11p0 from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import cache_readonly @@ -253,6 +254,8 @@ def save(self) -> None: errors=self.errors, compression=self.compression, storage_options=self.storage_options, + # pyarrow engine exclusively writes bytes + is_text=self.engine == "python", ) as handles: # Note: self.encoding is irrelevant here self._save(handles.handle) @@ -262,13 +265,17 @@ def _save_pyarrow(self, handle) -> None: pa_csv = import_optional_dependency("pyarrow.csv") # Convert index to column and rename name to empty string # since we serialize the index as basically a column with no name - # TODO: this won't work for multi-indexes - obj = self.obj.reset_index(names=[""]) + # TODO: this won't work for multi-indexes (without names) + obj = self.obj + if self.index: + new_names = [ + label if label is not None else "" for label in self.obj.index.names + ] + obj = self.obj.reset_index(names=new_names) table = pa.Table.from_pandas(obj) # Map quoting arg to pyarrow equivalents - pa_quoting = None if self.quoting == csvlib.QUOTE_MINIMAL: pa_quoting = "needed" elif self.quoting == csvlib.QUOTE_ALL: @@ -278,18 +285,21 @@ def _save_pyarrow(self, handle) -> None: elif self.quoting == csvlib.QUOTE_NONE: pa_quoting = "none" else: - raise ValueError( + raise NotImplementedError( f"Quoting option {self.quoting} is not supported with engine='pyarrow'" ) - write_options = pa_csv.WriteOptions( 
- include_header=self._need_to_save_header, - batch_size=self.chunksize, - delimiter=self.sep, - quoting_style=pa_quoting, - ) - # pa_csv.write_csv(table, handle, write_options) - pa_csv.write_csv(table, self.filepath_or_buffer, write_options) + kwargs = { + "include_header": self._need_to_save_header, + "batch_size": self.chunksize, + "delimiter": self.sep, + } + + if not pa_version_under11p0: + kwargs["quoting_style"] = pa_quoting + + write_options = pa_csv.WriteOptions(**kwargs) + pa_csv.write_csv(table, handle, write_options) def _save(self, handle) -> None: if self.engine == "pyarrow": diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b89f3400675db..5ede0e080302f 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -18,7 +18,10 @@ ) from decimal import Decimal from functools import partial -from io import StringIO +from io import ( + BytesIO, + StringIO, +) import math import re from shutil import get_terminal_size @@ -1127,7 +1130,7 @@ def to_csv( if path_or_buf is None: created_buffer = True - path_or_buf = StringIO() + path_or_buf = StringIO() if engine == "python" else BytesIO() else: created_buffer = False @@ -1154,8 +1157,11 @@ def to_csv( csv_formatter.save() if created_buffer: - assert isinstance(path_or_buf, StringIO) content = path_or_buf.getvalue() + if isinstance(path_or_buf, BytesIO): + # Need to decode into string since the + # pyarrow engine only writes binary data + content = content.decode("utf-8") path_or_buf.close() return content diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index eeb2a1b8a2c56..bd17cc59c3c5a 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -546,7 +546,8 @@ def test_to_csv_write_to_open_file(self, engine): z """ with tm.ensure_clean("test.txt") as path: - with open(path, "w", encoding="utf-8") as f: + # TODO: open in bytes mode for pyarrow + with open(path, encoding="utf-8") 
as f: f.write("manual header\n") df.to_csv(f, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: @@ -559,6 +560,7 @@ def test_to_csv_write_to_open_file_with_newline_py3(self, engine): expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) with tm.ensure_clean("test.txt") as path: + # TODO: Open in bytes mode for pyarrow with open(path, "w", newline="", encoding="utf-8") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None, engine=engine) From a889ebf5fcfd730215ace4124ff180052504afb9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 3 Aug 2023 11:36:25 -0700 Subject: [PATCH 03/22] xfail everything --- pandas/tests/io/formats/test_to_csv.py | 44 ++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index bd17cc59c3c5a..a8926c3cf1ccf 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -21,7 +21,22 @@ def engine(request): return request.param +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. 
+ """ + engine = request.getfixturevalue("engine") + if engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + class TestToCSV: + @xfail_pyarrow def test_to_csv_with_single_column(self, engine): # see gh-18676, https://bugs.python.org/issue32255 # @@ -59,6 +74,7 @@ def test_to_csv_default_encoding(self, engine): df.to_csv(path, engine=engine) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + @xfail_pyarrow def test_to_csv_quotechar(self, engine): df = DataFrame({"col": [1, 2]}) expected = """\ @@ -131,12 +147,14 @@ def test_to_csv_escapechar(self, engine=engine): with open(path, encoding="utf-8") as f: assert f.read() == expected + @xfail_pyarrow def test_csv_to_string(self, engine): df = DataFrame({"col": [1, 2]}) expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(engine=engine) == expected + @xfail_pyarrow def test_to_csv_decimal(self, engine): # see gh-781 df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) @@ -176,7 +194,8 @@ def test_to_csv_decimal(self, engine): # same for a multi-index assert df.set_index(["a", "b"]).to_csv(engine=engine, decimal="^") == expected - def test_to_csv_float_format(self): + @xfail_pyarrow + def test_to_csv_float_format(self, engine): # testing if float_format is taken into account for the index # GH 11553 df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) @@ -191,7 +210,8 @@ def test_to_csv_float_format(self): == expected ) - def test_to_csv_na_rep(self): + @xfail_pyarrow + def test_to_csv_na_rep(self, engine): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
@@ -222,6 +242,7 @@ def test_to_csv_na_rep(self): expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) assert expected == csv + @xfail_pyarrow def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype, engine): # GH 29975 # Make sure full na_rep shows up when a dtype is provided @@ -231,6 +252,7 @@ def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype, engine): ) assert expected == csv + @xfail_pyarrow def test_to_csv_date_format(self, engine): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) @@ -302,6 +324,7 @@ def test_to_csv_date_format(self, engine): == expected_ymd_sec ) + @xfail_pyarrow def test_to_csv_different_datetime_formats(self, engine): # GH#21734 df = DataFrame( @@ -318,6 +341,7 @@ def test_to_csv_different_datetime_formats(self, engine): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(index=False, engine=engine) == expected + @xfail_pyarrow def test_to_csv_date_format_in_categorical(self, engine): # GH#40754 ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) @@ -335,6 +359,7 @@ def test_to_csv_date_format_in_categorical(self, engine): ser.to_csv(index=False, engine=engine, date_format="%Y-%m-%d") == expected ) + @xfail_pyarrow def test_to_csv_float_ea_float_format(self, engine): # GH#45991 df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) @@ -345,6 +370,7 @@ def test_to_csv_float_ea_float_format(self, engine): ) assert result == expected + @xfail_pyarrow def test_to_csv_float_ea_no_float_format(self, engine): # GH#45991 df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) @@ -355,6 +381,7 @@ def test_to_csv_float_ea_no_float_format(self, engine): ) assert result == expected + @xfail_pyarrow def test_to_csv_multi_index(self, engine): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) @@ -391,6 +418,7 @@ def test_to_csv_multi_index(self, engine): exp = 
tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False, engine=engine) == exp + @xfail_pyarrow @pytest.mark.parametrize( "ind,expected", [ @@ -415,6 +443,7 @@ def test_to_csv_single_level_multi_index( result = obj.to_csv(lineterminator="\n", header=True, engine=engine) assert result == expected + @xfail_pyarrow def test_to_csv_string_array_ascii(self, engine): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] @@ -429,6 +458,7 @@ def test_to_csv_string_array_ascii(self, engine): with open(path, encoding="utf-8") as f: assert f.read() == expected_ascii + @xfail_pyarrow def test_to_csv_string_array_utf8(self, engine): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] @@ -443,6 +473,7 @@ def test_to_csv_string_array_utf8(self, engine): with open(path, encoding="utf-8") as f: assert f.read() == expected_utf8 + @xfail_pyarrow def test_to_csv_string_with_lf(self, engine): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} @@ -477,6 +508,7 @@ def test_to_csv_string_with_lf(self, engine): with open(path, "rb") as f: assert f.read() == expected_crlf + @xfail_pyarrow def test_to_csv_string_with_crlf(self, engine): # GH 20353 data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} @@ -516,6 +548,7 @@ def test_to_csv_string_with_crlf(self, engine): with open(path, "rb") as f: assert f.read() == expected_crlf + @xfail_pyarrow def test_to_csv_stdout_file(self, capsys, engine): # GH 21561 df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) @@ -528,6 +561,7 @@ def test_to_csv_stdout_file(self, capsys, engine): assert captured.out == expected_ascii assert not sys.stdout.closed + @xfail_pyarrow @pytest.mark.xfail( compat.is_platform_windows(), reason=( @@ -553,6 +587,7 @@ def test_to_csv_write_to_open_file(self, engine): with open(path, encoding="utf-8") as f: assert f.read() == expected + @xfail_pyarrow def 
test_to_csv_write_to_open_file_with_newline_py3(self, engine): # see gh-21696 # see gh-20353 @@ -651,6 +686,7 @@ def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname, engin archived_file = zp.filelist[0].filename assert archived_file == expected_arcname + @xfail_pyarrow @pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type, engine): # see gh-25099 @@ -665,6 +701,7 @@ def test_to_csv_na_rep_long_string(self, df_new_type, engine): assert expected == result + @xfail_pyarrow def test_to_csv_timedelta_precision(self, engine): # GH 6783 s = pd.Series([1, 1]).astype("timedelta64[ns]") @@ -679,6 +716,7 @@ def test_to_csv_timedelta_precision(self, engine): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + @xfail_pyarrow def test_na_rep_truncated(self, engine): # https://github.com/pandas-dev/pandas/issues/31447 result = pd.Series(range(8, 12)).to_csv(na_rep="-", engine=engine) @@ -693,6 +731,7 @@ def test_na_rep_truncated(self, engine): expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) assert result == expected + @xfail_pyarrow @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) def test_to_csv_errors(self, errors, engine): # GH 22610 @@ -717,6 +756,7 @@ def test_to_csv_binary_handle(self, mode, engine): df.to_csv(handle, mode=mode, engine=engine) tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + @xfail_pyarrow @pytest.mark.parametrize("mode", ["wb", "w"]) def test_to_csv_encoding_binary_handle(self, mode, engine): """ From 1f7ffea6c7b71a911ddb2bf79c7b256cea0a44d5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 3 Aug 2023 12:55:06 -0700 Subject: [PATCH 04/22] revert unintentional change --- pandas/tests/io/formats/test_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_to_csv.py 
b/pandas/tests/io/formats/test_to_csv.py index a8926c3cf1ccf..0472285968d68 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -581,7 +581,7 @@ def test_to_csv_write_to_open_file(self, engine): """ with tm.ensure_clean("test.txt") as path: # TODO: open in bytes mode for pyarrow - with open(path, encoding="utf-8") as f: + with open(path, "w", encoding="utf-8") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: From faeed4c579472485607fc73f839d05a3c7e2bcc9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 3 Aug 2023 16:37:20 -0700 Subject: [PATCH 05/22] fix typing and tests --- pandas/io/formats/csvs.py | 22 ++++++++++++++++------ pandas/io/formats/format.py | 6 +++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6970e36ebff54..4e69aa3c47685 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -12,15 +12,20 @@ import csv as csvlib import os from typing import ( + IO, TYPE_CHECKING, Any, + AnyStr, cast, ) import numpy as np from pandas._libs import writers as libwriters -from pandas.compat import pa_version_under11p0 +from pandas.compat import ( + pa_version_under8p0, + pa_version_under11p0, +) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import cache_readonly @@ -258,9 +263,12 @@ def save(self) -> None: is_text=self.engine == "python", ) as handles: # Note: self.encoding is irrelevant here - self._save(handles.handle) - def _save_pyarrow(self, handle) -> None: + # This is a mypy bug? 
+ # error: Cannot infer type argument 1 of "_save" of "CSVFormatter" [misc] + self._save(handles.handle) # type: ignore[misc] + + def _save_pyarrow(self, handle: IO[AnyStr]) -> None: pa = import_optional_dependency("pyarrow") pa_csv = import_optional_dependency("pyarrow.csv") # Convert index to column and rename name to empty string @@ -289,19 +297,21 @@ def _save_pyarrow(self, handle) -> None: f"Quoting option {self.quoting} is not supported with engine='pyarrow'" ) - kwargs = { + kwargs: dict[str, Any] = { "include_header": self._need_to_save_header, "batch_size": self.chunksize, - "delimiter": self.sep, } + if not pa_version_under8p0: + kwargs["delimiter"] = self.sep + if not pa_version_under11p0: kwargs["quoting_style"] = pa_quoting write_options = pa_csv.WriteOptions(**kwargs) pa_csv.write_csv(table, handle, write_options) - def _save(self, handle) -> None: + def _save(self, handle: IO[AnyStr]) -> None: if self.engine == "pyarrow": self._save_pyarrow(handle) else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5ede0e080302f..8506d305d90e1 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -31,6 +31,7 @@ Any, Callable, Final, + Union, cast, ) from unicodedata import east_asian_width @@ -1157,11 +1158,14 @@ def to_csv( csv_formatter.save() if created_buffer: + path_or_buf = cast(Union[BytesIO, StringIO], path_or_buf) content = path_or_buf.getvalue() - if isinstance(path_or_buf, BytesIO): + if isinstance(content, bytes): # Need to decode into string since the # pyarrow engine only writes binary data + # content = cast(bytes, content) content = content.decode("utf-8") + # content = cast(str, content) path_or_buf.close() return content From 47d48f143e62b7bd773460b8c6781f9befd9b2e7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 3 Aug 2023 20:18:55 -0700 Subject: [PATCH 06/22] green everything? 
--- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/io/formats/test_to_csv.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 17894914b44d1..fca5d14400468 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -212,8 +212,8 @@ Other enhancements - Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`) - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) +- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_210.notable_bug_fixes: diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 0472285968d68..29eac1ea4d930 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -18,6 +18,8 @@ @pytest.fixture(params=["python", "pyarrow"]) def engine(request): # TODO: Skip if pyarrow not found + if request.param == "pyarrow": + pytest.importorskip("pyarrow") return request.param From ae9f87cbe7714092c668ce29339f2b58eb509835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Aug 2023 18:57:28 +0000 Subject: [PATCH 07/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 4f522f48569da..b7414f2d278dd 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -217,8 +217,8 @@ Other enhancements - Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`) - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) -- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) +- Allow using pyarrow to serialize 
:class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`) .. --------------------------------------------------------------------------- From c49309ca57bb75674ddbd9838e22691a2767af08 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 8 Aug 2023 02:16:58 -0700 Subject: [PATCH 08/22] move option to end --- pandas/core/generic.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 798be98a4d447..e26f526c16270 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3663,7 +3663,6 @@ def to_csv( path_or_buf: None = ..., sep: str = ..., na_rep: str = ..., - engine: str = "python", float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., header: bool_t | list[str] = ..., @@ -3682,6 +3681,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + engine: str = "python", ) -> str: ... @@ -3691,7 +3691,6 @@ def to_csv( path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str], sep: str = ..., na_rep: str = ..., - engine: str = "python", float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., header: bool_t | list[str] = ..., @@ -3710,6 +3709,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + engine: str = "python", ) -> None: ... 
@@ -3723,7 +3723,6 @@ def to_csv( path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, sep: str = ",", na_rep: str = "", - engine: str = "python", float_format: str | Callable | None = None, columns: Sequence[Hashable] | None = None, header: bool_t | list[str] = True, @@ -3742,6 +3741,7 @@ def to_csv( decimal: str = ".", errors: OpenFileErrors = "strict", storage_options: StorageOptions | None = None, + engine: str = "python", ) -> str | None: r""" Write object to a comma-separated values (csv) file. @@ -3763,13 +3763,6 @@ def to_csv( String of length 1. Field delimiter for the output file. na_rep : str, default '' Missing data representation. - engine : str, default 'python' - The engine to use. Available options are "pyarrow" or "python". - The pyarrow engine requires the pyarrow library to be installed - and is generally faster than the python engine. - - However, the python engine may be more feature complete than the - pyarrow engine. float_format : str, Callable, default None Format string for floating point numbers. If a Callable is given, it takes precedence over other numeric formatting parameters, like decimal. @@ -3856,6 +3849,16 @@ def to_csv( .. versionadded:: 1.2.0 + engine : str, default 'python' + The engine to use. Available options are "pyarrow" or "python". + The pyarrow engine requires the pyarrow library to be installed + and is generally faster than the python engine. + + However, the python engine may be more feature complete than the + pyarrow engine. + + .. 
versionadded:: 2.1.0 + Returns ------- None or str From da130914c8f4df2834835e2e68eea6067eb15d7e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:33:17 -0500 Subject: [PATCH 09/22] Update csvs.py --- pandas/io/formats/csvs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e654617447376..8aa427e1170f0 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -23,6 +23,10 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas.compat import ( + pa_version_under8p0, + pa_version_under11p0, +) from pandas.compat._optional import import_optional_dependency from pandas._typing import SequenceNotStr from pandas.util._decorators import cache_readonly From 6345ab53aba3991e4123ded010d11ec3059ecb2b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Nov 2023 12:18:56 -0500 Subject: [PATCH 10/22] Update csvs.py --- pandas/io/formats/csvs.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 8aa427e1170f0..e50cb63c1bc77 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -23,10 +23,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas.compat import ( - pa_version_under8p0, - pa_version_under11p0, -) +from pandas.compat import pa_version_under11p0 from pandas.compat._optional import import_optional_dependency from pandas._typing import SequenceNotStr from pandas.util._decorators import cache_readonly @@ -306,9 +303,7 @@ def _save_pyarrow(self, handle: IO[AnyStr]) -> None: "include_header": self._need_to_save_header, "batch_size": self.chunksize, } - - if not pa_version_under8p0: - kwargs["delimiter"] = self.sep + kwargs["delimiter"] = self.sep if not pa_version_under11p0: kwargs["quoting_style"] = pa_quoting From 
bde1a2b2ae84683dca907b7f6f7001be8b9f9351 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:37:16 -0500 Subject: [PATCH 11/22] green and move whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/formats/csvs.py | 7 +++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9817b5a12f212..51b4c4f297b07 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -273,7 +273,6 @@ Other enhancements - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) - :meth:`Index.all` and :meth:`Index.any` with floating dtypes and timedelta64 dtypes no longer raise ``TypeError``, matching the :meth:`Series.all` and :meth:`Series.any` behavior (:issue:`54566`) - :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`) -- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) - Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) - PyArrow-backed integer dtypes now support bitwise operations (:issue:`54495`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 280eb11abb781..3795784c64e75 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -102,6 +102,7 @@ Other enhancements - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the 
``duplicated`` method (:issue:`55255`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) +- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e50cb63c1bc77..4a0484bbac972 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -23,9 +23,9 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas._typing import SequenceNotStr from pandas.compat import pa_version_under11p0 from pandas.compat._optional import import_optional_dependency -from pandas._typing import SequenceNotStr from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( @@ -316,7 +316,10 @@ def _save(self, handle: IO[AnyStr]) -> None: self._save_pyarrow(handle) else: self.writer = csvlib.writer( - handle, + # error: Argument of type "IO[AnyStr@_save]" cannot be assigned + # to parameter "csvfile" of type "SupportsWrite[str]" + # in function "writer" + handle, # pyright: ignore[reportGeneralTypeIssues] lineterminator=self.lineterminator, delimiter=self.sep, quoting=self.quoting, From cb5f6cd36145628acba87db0b665aa3b7c5ac0b0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 26 Nov 2023 13:54:53 -0500 Subject: [PATCH 12/22] updates --- pandas/core/generic.py | 3 ++- pandas/io/formats/csvs.py | 9 +++++++++ pandas/tests/io/formats/test_to_csv.py | 25 
++++++++++++++++++++----- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe53bc6c08239..2f53ff68fc3e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3832,7 +3832,8 @@ def to_csv( - 'x', exclusive creation, failing if the file already exists. - 'a', append to the end of file if it exists. - NOTE: The pyarrow engine can only handle binary buffers. + .. note:: + The pyarrow engine can only handle binary buffers. encoding : str, optional A string representing the encoding to use in the output file, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 4a0484bbac972..119a0a16809d0 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,6 +11,7 @@ Sequence, ) import csv as csvlib +import io import os from typing import ( IO, @@ -253,6 +254,14 @@ def save(self) -> None: """ Create the writer & save. """ + if self.engine == "pyarrow": + if "b" not in self.mode or isinstance( + self.filepath_or_buffer, io.TextIOBase + ): + raise ValueError( + "The pyarrow engine can only open file in binary mode." 
+ ) + # apply compression and byte/text conversion with get_handle( self.filepath_or_buffer, diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 79f51f1047492..c12cac6c82937 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import contextlib import io import os import sys @@ -17,7 +18,6 @@ @pytest.fixture(params=["python", "pyarrow"]) def engine(request): - # TODO: Skip if pyarrow not found if request.param == "pyarrow": pytest.importorskip("pyarrow") return request.param @@ -31,7 +31,7 @@ def pyarrow_xfail(request): engine = request.getfixturevalue("engine") if engine == "pyarrow": mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") - request.node.add_marker(mark) + request.applymarker(mark) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -582,10 +582,17 @@ def test_to_csv_write_to_open_file(self, engine): z """ with tm.ensure_clean("test.txt") as path: - # TODO: open in bytes mode for pyarrow with open(path, "w", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None, engine=engine) + if engine == "pyarrow": + raise_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open file in abinary mode.", + ) + else: + raise_if_pyarrow = contextlib.nullcontext() + with raise_if_pyarrow: + df.to_csv(f, header=None, index=None, engine=engine) with open(path, encoding="utf-8") as f: assert f.read() == expected @@ -600,7 +607,15 @@ def test_to_csv_write_to_open_file_with_newline_py3(self, engine): # TODO: Open in bytes mode for pyarrow with open(path, "w", newline="", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None, engine=engine) + if engine == "pyarrow": + raise_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open file in abinary mode.", + ) + else: + raise_if_pyarrow = contextlib.nullcontext() + with 
raise_if_pyarrow: + df.to_csv(f, header=None, index=None, engine=engine) with open(path, "rb") as f: assert f.read() == bytes(expected, "utf-8") From 3d95a92709f92db598a147ee9b6b46cbe9fe767c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 26 Nov 2023 15:10:22 -0500 Subject: [PATCH 13/22] address code review --- pandas/io/formats/csvs.py | 34 ++- pandas/io/formats/format.py | 2 - pandas/tests/io/formats/test_to_csv.py | 320 ++++++++++++++++--------- 3 files changed, 239 insertions(+), 117 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 119a0a16809d0..3606aaaf92983 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -254,13 +254,10 @@ def save(self) -> None: """ Create the writer & save. """ - if self.engine == "pyarrow": - if "b" not in self.mode or isinstance( - self.filepath_or_buffer, io.TextIOBase - ): - raise ValueError( - "The pyarrow engine can only open file in binary mode." 
- ) + if self.engine == "pyarrow" and ( + "b" not in self.mode or isinstance(self.filepath_or_buffer, io.TextIOBase) + ): + raise ValueError("The pyarrow engine can only open files in binary mode.") # apply compression and byte/text conversion with get_handle( self.filepath_or_buffer, @@ -282,6 +279,27 @@ def save(self) -> None: def _save_pyarrow(self, handle: IO[AnyStr]) -> None: pa = import_optional_dependency("pyarrow") pa_csv = import_optional_dependency("pyarrow.csv") + + if self.quotechar is not None and self.quotechar != '"': + raise ValueError('The pyarrow engine only supports " as a quotechar.') + + unsupported_options = [ + # each pair is (option value, default, option name) + (self.decimal, ".", "decimal"), + (self.float_format, None, "float_format"), + (self.na_rep, "", "na_rep"), + (self.date_format, None, "date_format"), + (self.lineterminator, os.linesep, "lineterminator"), + (self.encoding, None, "encoding"), + (self.errors, "strict", "errors"), + ] + + for opt_val, default, option in unsupported_options: + if opt_val != default: + raise ValueError( + f"The {option} option is not supported with the pyarrow engine." + ) + # Convert index to column and rename name to empty string # since we serialize the index as basically a column with no name # TODO: this won't work for multi-indexes (without names) @@ -297,6 +315,8 @@ def _save_pyarrow(self, handle: IO[AnyStr]) -> None: # Map quoting arg to pyarrow equivalents if self.quoting == csvlib.QUOTE_MINIMAL: pa_quoting = "needed" + elif self.quotechar is None: + raise TypeError("quotechar must be set if quoting enabled") elif self.quoting == csvlib.QUOTE_ALL: # TODO: Is this a 1-1 mapping?
# This doesn't quote nulls, check if Python does this diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 87bbd6e170da4..55061e3ff37e7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1028,9 +1028,7 @@ def to_csv( if isinstance(content, bytes): # Need to decode into string since the # pyarrow engine only writes binary data - # content = cast(bytes, content) content = content.decode("utf-8") - # content = cast(str, content) path_or_buf.close() return content diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index c12cac6c82937..9e818201e62cf 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -34,6 +34,21 @@ def pyarrow_xfail(request): request.applymarker(mark) +def check_raises_if_pyarrow(option, engine): + """ + Returns a context manager that ensures that the pyarrow engine raises an + exception for unsupported options. + """ + if engine == "pyarrow": + raises_if_pyarrow = pytest.raises( + ValueError, + match=f"The {option} option is not supported with the pyarrow engine.", + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + return raises_if_pyarrow + + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -76,7 +91,6 @@ def test_to_csv_default_encoding(self, engine): df.to_csv(path, engine=engine) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) - @xfail_pyarrow def test_to_csv_quotechar(self, engine): df = DataFrame({"col": [1, 2]}) expected = """\ @@ -97,9 +111,18 @@ def test_to_csv_quotechar(self, engine): """ with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1, quotechar="$", engine=engine) - with open(path, encoding="utf-8") as f: - assert f.read() == expected + if engine == "pyarrow": + raises_if_pyarrow = pytest.raises( + ValueError, + match='The pyarrow engine only supports " as a quotechar.', + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + with 
raises_if_pyarrow: + df.to_csv(path, quoting=1, quotechar="$", engine=engine) + if engine != "pyarrow": + with open(path, encoding="utf-8") as f: + assert f.read() == expected with tm.ensure_clean("test.csv") as path: with pytest.raises(TypeError, match="quotechar"): @@ -159,104 +182,137 @@ def test_csv_to_string(self, engine): @xfail_pyarrow def test_to_csv_decimal(self, engine): # see gh-781 + raises_if_pyarrow = check_raises_if_pyarrow("decimal", engine) df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) + # This assert fails for the pyarrow engine since it quotes strings + # and the Python engine doesn't assert df.to_csv(engine=engine) == expected_default expected_rows = [";col1;col2;col3", "0;1;a;10,1"] expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(engine=engine, decimal=",", sep=";") == expected_european_excel + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, decimal=",", sep=";") + == expected_european_excel + ) expected_rows = [",col1,col2,col3", "0,1,a,10.10"] expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) - assert ( - df.to_csv(engine=engine, float_format="%.2f") - == expected_float_format_default - ) + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, float_format="%.2f") + == expected_float_format_default + ) expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) - assert ( - df.to_csv(engine=engine, decimal=",", sep=";", float_format="%.2f") - == expected_float_format - ) + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) # see gh-11553: testing if decimal is taken into account for '0.0' df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", 
"0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(engine=engine, index=False, decimal="^") == expected + with raises_if_pyarrow: + assert df.to_csv(engine=engine, index=False, decimal="^") == expected # same but for an index - assert df.set_index("a").to_csv(engine=engine, decimal="^") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, decimal="^") == expected # same for a multi-index - assert df.set_index(["a", "b"]).to_csv(engine=engine, decimal="^") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, decimal="^") == expected + ) - @xfail_pyarrow def test_to_csv_float_format(self, engine): # testing if float_format is taken into account for the index # GH 11553 + raises_if_pyarrow = check_raises_if_pyarrow("float_format", engine) df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(engine=engine, float_format="%.2f") == expected + with raises_if_pyarrow: + assert ( + df.set_index("a").to_csv(engine=engine, float_format="%.2f") == expected + ) # same for a multi-index - assert ( - df.set_index(["a", "b"]).to_csv(engine=engine, float_format="%.2f") - == expected - ) + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, float_format="%.2f") + == expected + ) - @xfail_pyarrow def test_to_csv_na_rep(self, engine): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
+ raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) # now with an index containing only NaNs df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) # check if na_rep parameter does not break anything when no NaN df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) - csv = pd.Series(["a", pd.NA, "c"]).to_csv(engine=engine, na_rep="ZZZZZ") - expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) - assert expected == csv + with raises_if_pyarrow: + csv = pd.Series(["a", pd.NA, "c"]).to_csv(engine=engine, 
na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv - @xfail_pyarrow def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype, engine): # GH 29975 # Make sure full na_rep shows up when a dtype is provided + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) - csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( - engine=engine, na_rep="ZZZZZ" - ) - assert expected == csv + with raises_if_pyarrow: + csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( + engine=engine, na_rep="ZZZZZ" + ) + assert expected == csv @xfail_pyarrow def test_to_csv_date_format(self, engine): # GH 10209 + raises_if_pyarrow = check_raises_if_pyarrow("date_format", engine) df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) @@ -280,10 +336,11 @@ def test_to_csv_date_format(self, engine): "4,2013-01-05 00:00:00", ] expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert ( - df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S", engine=engine) - == expected_ymdhms_day - ) + with raises_if_pyarrow: + assert ( + df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S", engine=engine) + == expected_ymdhms_day + ) expected_rows = [ ",A", @@ -294,7 +351,10 @@ def test_to_csv_date_format(self, engine): "4,2013-01-01", ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_ymd_sec + with raises_if_pyarrow: + assert ( + df_sec.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_ymd_sec + ) expected_rows = [ ",A", @@ -305,10 +365,13 @@ def test_to_csv_date_format(self, engine): "4,2013-01-05", ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_day.to_csv(engine=engine) == 
expected_default_day - assert ( - df_day.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_default_day - ) + with raises_if_pyarrow: + assert df_day.to_csv(engine=engine) == expected_default_day + with raises_if_pyarrow: + assert ( + df_day.to_csv(date_format="%Y-%m-%d", engine=engine) + == expected_default_day + ) # see gh-7791 # @@ -321,10 +384,11 @@ def test_to_csv_date_format(self, engine): expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) - assert ( - df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d", engine=engine) - == expected_ymd_sec - ) + with raises_if_pyarrow: + assert ( + df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d", engine=engine) + == expected_ymd_sec + ) @xfail_pyarrow def test_to_csv_different_datetime_formats(self, engine): @@ -346,6 +410,7 @@ def test_to_csv_different_datetime_formats(self, engine): @xfail_pyarrow def test_to_csv_date_format_in_categorical(self, engine): # GH#40754 + raises_if_pyarrow = check_raises_if_pyarrow("date_format", engine) ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) ser = ser.astype("category") expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) @@ -357,20 +422,23 @@ def test_to_csv_date_format_in_categorical(self, engine): ).append(pd.DatetimeIndex([pd.NaT])) ) ser = ser.astype("category") - assert ( - ser.to_csv(index=False, engine=engine, date_format="%Y-%m-%d") == expected - ) + with raises_if_pyarrow: + assert ( + ser.to_csv(index=False, engine=engine, date_format="%Y-%m-%d") + == expected + ) - @xfail_pyarrow def test_to_csv_float_ea_float_format(self, engine): # GH#45991 + raises_if_pyarrow = check_raises_if_pyarrow("float_format", engine) df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) df["a"] = df["a"].astype("Float64") - result = df.to_csv(index=False, engine=engine, float_format="%.5f") - expected = tm.convert_rows_list_to_csv_str( - ["a,b", 
"1.10000,c", "2.02000,c", ",c", "6.00001,c"] - ) - assert result == expected + with raises_if_pyarrow: + result = df.to_csv(index=False, engine=engine, float_format="%.5f") + expected = tm.convert_rows_list_to_csv_str( + ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"] + ) + assert result == expected @xfail_pyarrow def test_to_csv_float_ea_no_float_format(self, engine): @@ -440,14 +508,16 @@ def test_to_csv_single_level_multi_index( self, ind, expected, frame_or_series, engine ): # see gh-19589 + raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) obj = frame_or_series(pd.Series([1], ind, name="data")) - result = obj.to_csv(lineterminator="\n", header=True, engine=engine) - assert result == expected + with raises_if_pyarrow: + result = obj.to_csv(lineterminator="\n", header=True, engine=engine) + assert result == expected - @xfail_pyarrow def test_to_csv_string_array_ascii(self, engine): # GH 10813 + raises_if_pyarrow = check_raises_if_pyarrow("encoding", engine) str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) expected_ascii = """\ @@ -456,9 +526,10 @@ def test_to_csv_string_array_ascii(self, engine): 1,"['baz', 'qux']" """ with tm.ensure_clean("str_test.csv") as path: - df.to_csv(path, encoding="ascii", engine=engine) - with open(path, encoding="utf-8") as f: - assert f.read() == expected_ascii + with raises_if_pyarrow: + df.to_csv(path, encoding="ascii", engine=engine) + with open(path, encoding="utf-8") as f: + assert f.read() == expected_ascii @xfail_pyarrow def test_to_csv_string_array_utf8(self, engine): @@ -478,6 +549,7 @@ def test_to_csv_string_array_utf8(self, engine): @xfail_pyarrow def test_to_csv_string_with_lf(self, engine): # GH 20353 + raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = DataFrame(data) with tm.ensure_clean("lf_test.csv") as path: @@ -499,20 +571,23 @@ def 
test_to_csv_string_with_lf(self, engine): with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' - df.to_csv(path, lineterminator="\n", index=False, engine=engine) - with open(path, "rb") as f: - assert f.read() == expected_lf + with raises_if_pyarrow: + df.to_csv(path, lineterminator="\n", index=False, engine=engine) + with open(path, "rb") as f: + assert f.read() == expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' - df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) - with open(path, "rb") as f: - assert f.read() == expected_crlf + with raises_if_pyarrow: + df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) + with open(path, "rb") as f: + assert f.read() == expected_crlf @xfail_pyarrow def test_to_csv_string_with_crlf(self, engine): # GH 20353 + raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = DataFrame(data) with tm.ensure_clean("crlf_test.csv") as path: @@ -534,9 +609,10 @@ def test_to_csv_string_with_crlf(self, engine): with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' - df.to_csv(path, lineterminator="\n", index=False, engine=engine) - with open(path, "rb") as f: - assert f.read() == expected_lf + with raises_if_pyarrow: + df.to_csv(path, lineterminator="\n", index=False, engine=engine) + with open(path, "rb") as f: + assert f.read() == expected_lf with tm.ensure_clean("crlf_test.csv") as path: # case 3: CRLF as line terminator # 'lineterminator' should not change inner element @@ -546,9 +622,10 @@ def test_to_csv_string_with_crlf(self, engine): b'2,"d\r\nef"\r\n' 
b'3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) - with open(path, "rb") as f: - assert f.read() == expected_crlf + with raises_if_pyarrow: + df.to_csv(path, lineterminator="\r\n", index=False, engine=engine) + with open(path, "rb") as f: + assert f.read() == expected_crlf @xfail_pyarrow def test_to_csv_stdout_file(self, capsys, engine): @@ -587,7 +664,7 @@ def test_to_csv_write_to_open_file(self, engine): if engine == "pyarrow": raise_if_pyarrow = pytest.raises( ValueError, - match="The pyarrow engine can only open file in abinary mode.", + match="The pyarrow engine can only open files in binary mode.", ) else: raise_if_pyarrow = contextlib.nullcontext() @@ -703,20 +780,21 @@ def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname, engin archived_file = zp.filelist[0].filename assert archived_file == expected_arcname - @xfail_pyarrow @pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type, engine): # see gh-25099 + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) df = DataFrame({"c": [float("nan")] * 3}) df = df.astype(df_new_type) expected_rows = ["c", "mynull", "mynull", "mynull"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv( - index=False, na_rep="mynull", encoding="ascii", engine=engine - ) + with raises_if_pyarrow: + result = df.to_csv( + index=False, na_rep="mynull", encoding="ascii", engine=engine ) - assert expected == result + assert expected == result @xfail_pyarrow def test_to_csv_timedelta_precision(self, engine): @@ -733,29 +811,35 @@ def test_to_csv_timedelta_precision(self, engine): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected - @xfail_pyarrow def test_na_rep_truncated(self, engine): # https://github.com/pandas-dev/pandas/issues/31447 - result = pd.Series(range(8, 12)).to_csv(na_rep="-", engine=engine) - expected =
tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"]) - assert result == expected + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) + with raises_if_pyarrow: + result = pd.Series(range(8, 12)).to_csv(na_rep="-", engine=engine) + expected = tm.convert_rows_list_to_csv_str( + [",0", "0,8", "1,9", "2,10", "3,11"] + ) + assert result == expected - result = pd.Series([True, False]).to_csv(na_rep="nan", engine=engine) - expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) - assert result == expected + with raises_if_pyarrow: + result = pd.Series([True, False]).to_csv(na_rep="nan", engine=engine) + expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) + assert result == expected - result = pd.Series([1.1, 2.2]).to_csv(na_rep=".", engine=engine) - expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) - assert result == expected + with raises_if_pyarrow: + result = pd.Series([1.1, 2.2]).to_csv(na_rep=".", engine=engine) + expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) + assert result == expected - @xfail_pyarrow @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) def test_to_csv_errors(self, errors, engine): # GH 22610 + raises_if_pyarrow = check_raises_if_pyarrow("errors", engine) data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) - with tm.ensure_clean("test.csv") as path: - ser.to_csv(path, errors=errors, engine=engine) + with raises_if_pyarrow: + with tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors, engine=engine) # No use in reading back the data as it is not the same anymore # due to the error handling @@ -770,36 +854,56 @@ def test_to_csv_binary_handle(self, mode, engine): df = tm.makeDataFrame() with tm.ensure_clean() as path: with open(path, mode="w+b") as handle: - df.to_csv(handle, mode=mode, engine=engine) - tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + if engine == "pyarrow" and mode == 
"w": + raises_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open files in binary mode.", + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + with raises_if_pyarrow: + df.to_csv(handle, mode=mode, engine=engine) + if not engine == "pyarrow" and mode == "w": + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) - @xfail_pyarrow @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_encoding_binary_handle(self, mode, engine): + def test_to_csv_encoding_binary_handle(self, mode, engine, request): """ Binary file objects should honor a specified encoding. GH 23854 and GH 13068 with binary handles """ + + if mode == "w" and engine == "pyarrow": + mark = pytest.mark.xfail( + reason="pyarrow doesn't support non-binary handles." + ) + request.applymarker(mark) + + raises_if_pyarrow = check_raises_if_pyarrow("encoding", engine) # example from GH 23854 content = "a, b, 🐟".encode("utf-8-sig") buffer = io.BytesIO(content) df = pd.read_csv(buffer, encoding="utf-8-sig") buffer = io.BytesIO() - df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False, engine=engine) - buffer.seek(0) # tests whether file handle wasn't closed - assert buffer.getvalue().startswith(content) + with raises_if_pyarrow: + df.to_csv( + buffer, mode=mode, encoding="utf-8-sig", index=False, engine=engine + ) + buffer.seek(0) # tests whether file handle wasn't closed + assert buffer.getvalue().startswith(content) # example from GH 13068 with tm.ensure_clean() as path: with open(path, "w+b") as handle: - DataFrame().to_csv( - handle, mode=mode, encoding="utf-8-sig", engine=engine - ) + with raises_if_pyarrow: + DataFrame().to_csv( + handle, mode=mode, encoding="utf-8-sig", engine=engine + ) - handle.seek(0) - assert handle.read().startswith(b'\xef\xbb\xbf""') + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') def test_to_csv_iterative_compression_name(compression, engine): From ba451e1a48d93b10a41479649c90b6a688d834e2 Mon Sep 17 
00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 7 Dec 2023 10:06:16 -0500 Subject: [PATCH 14/22] fix tests --- pandas/tests/io/formats/test_to_csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index bced5ec137e14..7f99b50a40f8c 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -838,9 +838,9 @@ def test_to_csv_errors(self, errors, engine): raises_if_pyarrow = check_raises_if_pyarrow("errors", engine) data = ["\ud800foo"] with raises_if_pyarrow: - ser = pd.Series(data, index=Index(data)) - with tm.ensure_clean("test.csv") as path: - ser.to_csv(path, errors=errors) + ser = pd.Series(data, index=Index(data)) + with tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors, engine=engine) # No use in reading back the data as it is not the same anymore # due to the error handling From 8ff04f7a542dfa5d085f7e47136952bef69c4ba8 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 15:55:58 -0400 Subject: [PATCH 15/22] Move whatsnew entry to v3.0.0 --- doc/source/whatsnew/v2.2.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 15bba47bdea64..e32417e367427 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -343,7 +343,6 @@ Other enhancements - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to 
openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7ec50137c3039..a5f9f25e49032 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -225,6 +225,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - .. --------------------------------------------------------------------------- From 84b4e5930c935ee5974c1be88a4d9f25f004851a Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 16:04:43 -0400 Subject: [PATCH 16/22] Update versionadded to 3.0.0 --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 60f20772df2fc..f23fc346f0b51 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3876,7 +3876,7 @@ def to_csv( However, the python engine may be more feature complete than the pyarrow engine. - .. versionadded:: 2.1.0 + .. 
versionadded:: 3.0.0 Returns ------- From 5d6305ebdc6868a0ad1c668831fe7c8a992f8bd9 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 16:36:15 -0400 Subject: [PATCH 17/22] No need to support pyarrow < 11 anymore --- pandas/io/formats/csvs.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 531a2104bd6b9..42b678c8ca83d 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -25,7 +25,6 @@ from pandas._libs import writers as libwriters from pandas._typing import SequenceNotStr -from pandas.compat import pa_version_under11p0 from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import cache_readonly @@ -333,9 +332,7 @@ def _save_pyarrow(self, handle: IO[AnyStr]) -> None: "batch_size": self.chunksize, } kwargs["delimiter"] = self.sep - - if not pa_version_under11p0: - kwargs["quoting_style"] = pa_quoting + kwargs["quoting_style"] = pa_quoting write_options = pa_csv.WriteOptions(**kwargs) pa_csv.write_csv(table, handle, write_options) From 7da6613fb87c0dae3a9966a339e5781f94f07155 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 17:02:04 -0400 Subject: [PATCH 18/22] Fixup test --- pandas/tests/io/formats/test_to_csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 2fa27d3769973..e4c21150e926c 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -835,9 +835,9 @@ def test_to_csv_errors(self, errors, engine): raises_if_pyarrow = check_raises_if_pyarrow("errors", engine) data = ["\ud800foo"] with raises_if_pyarrow: - ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) - with tm.ensure_clean("test.csv") as path: - ser.to_csv(path, errors=errors) + ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) + with 
tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors, engine=engine) # No use in reading back the data as it is not the same anymore # due to the error handling From 737000843ae375ca37309b64c704a49057471793 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 17:26:28 -0400 Subject: [PATCH 19/22] Add escapechar to unsupported options --- pandas/io/formats/csvs.py | 1 + pandas/tests/io/formats/test_to_csv.py | 25 ++++++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 42b678c8ca83d..842c143af8a4b 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -291,6 +291,7 @@ def _save_pyarrow(self, handle: IO[AnyStr]) -> None: (self.lineterminator, os.linesep, "lineterminator"), (self.encoding, None, "encoding"), (self.errors, "strict", "errors"), + (self.escapechar, None, "escapechar"), ] for opt_val, default, option in unsupported_options: diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index e4c21150e926c..6c88a01c77431 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -146,7 +146,8 @@ def test_to_csv_doublequote(self): with pytest.raises(Error, match="escapechar"): df.to_csv(path, doublequote=False, engine=engine) # no escapechar set - def test_to_csv_escapechar(self, engine=engine): + def test_to_csv_escapechar(self, engine): + raises_if_pyarrow = check_raises_if_pyarrow("escapechar", engine) df = DataFrame({"col": ['a"a', '"bb"']}) expected = """\ "","col" @@ -154,12 +155,13 @@ def test_to_csv_escapechar(self, engine=engine): "1","\\"bb\\"" """ - with tm.ensure_clean("test.csv") as path: # QUOTE_ALL - df.to_csv( - path, quoting=1, doublequote=False, escapechar="\\", engine=engine - ) - with open(path, encoding="utf-8") as f: - assert f.read() == expected + with raises_if_pyarrow: + with tm.ensure_clean("test.csv") as path: # 
QUOTE_ALL + df.to_csv( + path, quoting=1, doublequote=False, escapechar="\\", engine=engine + ) + with open(path, encoding="utf-8") as f: + assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ @@ -168,10 +170,11 @@ def test_to_csv_escapechar(self, engine=engine): 1,\\,bb\\, """ - with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=3, escapechar="\\", engine=engine) # QUOTE_NONE - with open(path, encoding="utf-8") as f: - assert f.read() == expected + with raises_if_pyarrow: + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=3, escapechar="\\", engine=engine) # QUOTE_NONE + with open(path, encoding="utf-8") as f: + assert f.read() == expected @xfail_pyarrow def test_csv_to_string(self, engine): From a157861f136680ce008c1b677a67abe98f4e6566 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 5 Sep 2025 17:33:25 -0400 Subject: [PATCH 20/22] Sort whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a5f9f25e49032..983284e0bcd92 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -216,6 +216,7 @@ Other enhancements - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`). 
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) - Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`) +- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`) @@ -225,7 +226,6 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - .. 
--------------------------------------------------------------------------- From b19c5a3d1af51066a2c5760d68124ea13d76220a Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Sat, 6 Sep 2025 09:35:39 -0400 Subject: [PATCH 21/22] Fix type ignore --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 842c143af8a4b..51a55da9bb2cd 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -348,7 +348,7 @@ def _save(self, handle: IO[AnyStr]) -> None: # in function "writer" # error: Argument "quoting" to "writer" has incompatible type "int"; # expected "Literal[0, 1, 2, 3]" - handle, # pyright: ignore[reportGeneralTypeIssues] + handle, # type: ignore[arg-type] lineterminator=self.lineterminator, delimiter=self.sep, quoting=self.quoting, # type: ignore[arg-type] From 8a13c4b94b587054a30ff1405f5a112caa4b31b4 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Sat, 6 Sep 2025 15:43:07 -0400 Subject: [PATCH 22/22] Hopefully fix test_to_csv_single_level_multi_index on Windows This test was passing even though xfailed, so remove the raises_if_pyarrow so it will fail due to that (hopefully). 
--- pandas/tests/io/formats/test_to_csv.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 6c88a01c77431..be31852ad14b9 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -512,12 +512,9 @@ def test_to_csv_single_level_multi_index( self, ind, expected, frame_or_series, engine ): # see gh-19589 - raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) obj = frame_or_series(pd.Series([1], ind, name="data")) - - with raises_if_pyarrow: - result = obj.to_csv(lineterminator="\n", header=True, engine=engine) - assert result == expected + result = obj.to_csv(lineterminator="\n", header=True, engine=engine) + assert result == expected def test_to_csv_string_array_ascii(self, engine): # GH 10813