Commit 3e26250

Improving cast for date columns

1 parent 8e5853b · commit 3e26250

File tree: 4 files changed, +35 -4 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 > Utility belt to handle data on AWS.
 
-[![Release](https://img.shields.io/badge/release-0.0.22-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-0.0.23-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)

awswrangler/__version__.py
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0.22"
+__version__ = "0.0.23"
 __license__ = "Apache License 2.0"

awswrangler/pandas.py
Lines changed: 2 additions & 2 deletions

@@ -488,7 +488,7 @@ def _apply_dates_to_generator(generator, parse_dates):
         for df in generator:
             if len(df.index) > 0:
                 for col in parse_dates:
-                    df[col] = df[col].dt.date
+                    df[col] = df[col].dt.date.replace(to_replace={pd.NaT: None})
             yield df
 
     def to_csv(
@@ -788,7 +788,7 @@ def _cast_pandas(dataframe: pd.DataFrame, cast_columns: Dict[str, str]) -> pd.Da
             if pandas_type == "datetime64":
                 dataframe[col] = pd.to_datetime(dataframe[col])
             elif pandas_type == "date":
-                dataframe[col] = pd.to_datetime(dataframe[col]).dt.date
+                dataframe[col] = pd.to_datetime(dataframe[col]).dt.date.replace(to_replace={pd.NaT: None})
             else:
                 dataframe[col] = dataframe[col].astype(pandas_type, skipna=True)
         return dataframe

testing/test_awswrangler/test_pandas.py
Lines changed: 31 additions & 0 deletions

@@ -1249,3 +1249,34 @@ def test_to_parquet_date_null(session, bucket, database):
 
     assert df[df.col1 == "val2"].iloc[0].datecol == df2[df2.col1 == "val2"].iloc[0].datecol
     assert df2[df2.col1 == "val2"].iloc[0].datecol == df3[df3.col1 == "val2"].iloc[0].datecol is None
+
+
+def test_to_parquet_date_null_at_first(session, bucket, database):
+    df = pd.DataFrame({
+        "col1": ["val0", "val1", "val2", "val3", "val4", "val5", "val6", "val7", "val8", "val9"],
+        "datecol": [None, pd.NaT, None, pd.NaT, None, pd.NaT, None, pd.NaT, None,
+                    date(2019, 11, 9)],
+    })
+    path = f"s3://{bucket}/test/"
+    session.pandas.to_parquet(dataframe=df,
+                              database=database,
+                              table="test",
+                              path=path,
+                              mode="overwrite",
+                              preserve_index=False,
+                              procs_cpu_bound=1,
+                              cast_columns={"datecol": "date"})
+    df2 = None
+    for counter in range(10):  # Retrying to workaround s3 eventual consistency
+        sleep(1)
+        df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
+        if len(df.index) == len(df2.index):
+            break
+
+    session.s3.delete_objects(path=path)
+
+    assert len(list(df.columns)) == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
+
+    assert df[df.col1 == "val9"].iloc[0].datecol == df2[df2.col1 == "val9"].iloc[0].datecol == date(2019, 11, 9)
+    assert df[df.col1 == "val0"].iloc[0].datecol == df2[df2.col1 == "val0"].iloc[0].datecol is None
