Commit 3e26250

Improving cast for date columns

1 parent 8e5853b · commit 3e26250

File tree: 4 files changed, +35 -4 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 > Utility belt to handle data on AWS.
 
-[![Release](https://img.shields.io/badge/release-0.0.22-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-0.0.23-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)

awswrangler/__version__.py
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0.22"
+__version__ = "0.0.23"
 __license__ = "Apache License 2.0"

awswrangler/pandas.py
Lines changed: 2 additions & 2 deletions

@@ -488,7 +488,7 @@ def _apply_dates_to_generator(generator, parse_dates):
         for df in generator:
             if len(df.index) > 0:
                 for col in parse_dates:
-                    df[col] = df[col].dt.date
+                    df[col] = df[col].dt.date.replace(to_replace={pd.NaT: None})
             yield df
 
     def to_csv(
@@ -788,7 +788,7 @@ def _cast_pandas(dataframe: pd.DataFrame, cast_columns: Dict[str, str]) -> pd.Da
             if pandas_type == "datetime64":
                 dataframe[col] = pd.to_datetime(dataframe[col])
             elif pandas_type == "date":
-                dataframe[col] = pd.to_datetime(dataframe[col]).dt.date
+                dataframe[col] = pd.to_datetime(dataframe[col]).dt.date.replace(to_replace={pd.NaT: None})
             else:
                 dataframe[col] = dataframe[col].astype(pandas_type, skipna=True)
         return dataframe

testing/test_awswrangler/test_pandas.py
Lines changed: 31 additions & 0 deletions

@@ -1249,3 +1249,34 @@ def test_to_parquet_date_null(session, bucket, database):
 
     assert df[df.col1 == "val2"].iloc[0].datecol == df2[df2.col1 == "val2"].iloc[0].datecol
     assert df2[df2.col1 == "val2"].iloc[0].datecol == df3[df3.col1 == "val2"].iloc[0].datecol is None
+
+
+def test_to_parquet_date_null_at_first(session, bucket, database):
+    df = pd.DataFrame({
+        "col1": ["val0", "val1", "val2", "val3", "val4", "val5", "val6", "val7", "val8", "val9"],
+        "datecol": [None, pd.NaT, None, pd.NaT, None, pd.NaT, None, pd.NaT, None,
+                    date(2019, 11, 9)],
+    })
+    path = f"s3://{bucket}/test/"
+    session.pandas.to_parquet(dataframe=df,
+                              database=database,
+                              table="test",
+                              path=path,
+                              mode="overwrite",
+                              preserve_index=False,
+                              procs_cpu_bound=1,
+                              cast_columns={"datecol": "date"})
+    df2 = None
+    for counter in range(10):  # Retrying to workaround s3 eventual consistency
+        sleep(1)
+        df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
+        if len(df.index) == len(df2.index):
+            break
+
+    session.s3.delete_objects(path=path)
+
+    assert len(list(df.columns)) == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
+
+    assert df[df.col1 == "val9"].iloc[0].datecol == df2[df2.col1 == "val9"].iloc[0].datecol == date(2019, 11, 9)
+    assert df[df.col1 == "val0"].iloc[0].datecol == df2[df2.col1 == "val0"].iloc[0].datecol is None
