Add timestamp_as_object option to database read methods (#1132)

jaidisido · web-flow · commit d7ba8b08b6b4 · 2022-01-26T12:13:16.000Z
* Add timestamp_as_object option to the database read_sql_query/read_sql_table methods

* Add PostgreSQL test for reading a timestamp outside the nanosecond range
diff --git a/awswrangler/_databases.py b/awswrangler/_databases.py
@@ -128,6 +128,7 @@ def _records2df(
     index: Optional[Union[str, List[str]]],
     safe: bool,
     dtype: Optional[Dict[str, pa.DataType]],
+    timestamp_as_object: bool,
 ) -> pd.DataFrame:
     arrays: List[pa.Array] = []
     for col_values, col_name in zip(tuple(zip(*records)), cols_names):  # Transposing
@@ -155,6 +156,7 @@ def _records2df(
             date_as_object=True,
             types_mapper=_data_types.pyarrow2pandas_extension,
             safe=safe,
+            timestamp_as_object=timestamp_as_object,
         )
     if index is not None:
         df.set_index(index, inplace=True)
@@ -175,6 +177,7 @@ def _iterate_results(
     index_col: Optional[Union[str, List[str]]],
     safe: bool,
     dtype: Optional[Dict[str, pa.DataType]],
+    timestamp_as_object: bool,
 ) -> Iterator[pd.DataFrame]:
     with con.cursor() as cursor:
         cursor.execute(*cursor_args)
@@ -183,7 +186,14 @@ def _iterate_results(
             records = cursor.fetchmany(chunksize)
             if not records:
                 break
-            yield _records2df(records=records, cols_names=cols_names, index=index_col, safe=safe, dtype=dtype)
+            yield _records2df(
+                records=records,
+                cols_names=cols_names,
+                index=index_col,
+                safe=safe,
+                dtype=dtype,
+                timestamp_as_object=timestamp_as_object,
+            )
 
 
 def _fetch_all_results(
@@ -192,6 +202,7 @@ def _fetch_all_results(
     index_col: Optional[Union[str, List[str]]] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> pd.DataFrame:
     with con.cursor() as cursor:
         cursor.execute(*cursor_args)
@@ -202,6 +213,7 @@ def _fetch_all_results(
             index=index_col,
             dtype=dtype,
             safe=safe,
+            timestamp_as_object=timestamp_as_object,
         )
 
 
@@ -213,6 +225,7 @@ def read_sql_query(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Read SQL Query (generic)."""
     args = _convert_params(sql, params)
@@ -224,6 +237,7 @@ def read_sql_query(
                 index_col=index_col,
                 dtype=dtype,
                 safe=safe,
+                timestamp_as_object=timestamp_as_object,
             )
 
         return _iterate_results(
@@ -233,6 +247,7 @@ def read_sql_query(
             index_col=index_col,
             dtype=dtype,
             safe=safe,
+            timestamp_as_object=timestamp_as_object,
         )
     except Exception as ex:
         con.rollback()
diff --git a/awswrangler/mysql.py b/awswrangler/mysql.py
@@ -174,6 +174,7 @@ def read_sql_query(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -197,6 +198,8 @@ def read_sql_query(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -218,7 +221,14 @@ def read_sql_query(
     """
     _validate_connection(con=con)
     return _db_utils.read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
@@ -231,6 +241,7 @@ def read_sql_table(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding the table.
 
@@ -257,6 +268,8 @@ def read_sql_table(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -279,7 +292,14 @@ def read_sql_table(
     """
     sql: str = f"SELECT * FROM `{table}`" if schema is None else f"SELECT * FROM `{schema}`.`{table}`"
     return read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
diff --git a/awswrangler/postgresql.py b/awswrangler/postgresql.py
@@ -169,6 +169,7 @@ def read_sql_query(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -192,6 +193,8 @@ def read_sql_query(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -213,7 +216,14 @@ def read_sql_query(
     """
     _validate_connection(con=con)
     return _db_utils.read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
@@ -226,6 +236,7 @@ def read_sql_table(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding the table.
 
@@ -252,6 +263,8 @@ def read_sql_table(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -274,7 +287,14 @@ def read_sql_table(
     """
     sql: str = f'SELECT * FROM "{table}"' if schema is None else f'SELECT * FROM "{schema}"."{table}"'
     return read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
diff --git a/awswrangler/redshift.py b/awswrangler/redshift.py
@@ -574,6 +574,7 @@ def read_sql_query(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -602,6 +603,8 @@ def read_sql_query(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -623,7 +626,14 @@ def read_sql_query(
     """
     _validate_connection(con=con)
     return _db_utils.read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
@@ -636,6 +646,7 @@ def read_sql_table(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding the table.
 
@@ -667,6 +678,8 @@ def read_sql_table(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -689,7 +702,14 @@ def read_sql_table(
     """
     sql: str = f'SELECT * FROM "{table}"' if schema is None else f'SELECT * FROM "{schema}"."{table}"'
     return read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
diff --git a/awswrangler/sqlserver.py b/awswrangler/sqlserver.py
@@ -190,6 +190,7 @@ def read_sql_query(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding to the result set of the query string.
 
@@ -213,6 +214,8 @@ def read_sql_query(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -233,7 +236,14 @@ def read_sql_query(
     """
     _validate_connection(con=con)
     return _db_utils.read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
@@ -247,6 +257,7 @@ def read_sql_table(
     chunksize: Optional[int] = None,
     dtype: Optional[Dict[str, pa.DataType]] = None,
     safe: bool = True,
+    timestamp_as_object: bool = False,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Return a DataFrame corresponding the table.
 
@@ -273,6 +284,8 @@ def read_sql_table(
         The keys should be the column names and the values should be the PyArrow types.
     safe : bool
         Check for overflows or other unsafe data type conversions.
+    timestamp_as_object : bool
+        Cast non-nanosecond timestamps (np.datetime64) to objects.
 
     Returns
     -------
@@ -295,7 +308,14 @@ def read_sql_table(
     table_identifier = _get_table_identifier(schema, table)
     sql: str = f"SELECT * FROM {table_identifier}"
     return read_sql_query(
-        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
+        sql=sql,
+        con=con,
+        index_col=index_col,
+        params=params,
+        chunksize=chunksize,
+        dtype=dtype,
+        safe=safe,
+        timestamp_as_object=timestamp_as_object,
     )
 
 
diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from decimal import Decimal
 
 import pandas as pd
@@ -371,3 +372,18 @@ def test_upsert_multiple_conflict_columns(postgresql_table, postgresql_con):
     df7["c1"] = df7["c1"].astype("Int64")
     df7["c2"] = df7["c2"].astype("Int64")
     assert df6.equals(df7)
+
+
+def test_timestamp_overflow(postgresql_table, postgresql_con):
+    df = pd.DataFrame({"c0": [datetime.strptime("1677-01-01 00:00:00.0", "%Y-%m-%d %H:%M:%S.%f")]})
+    wr.postgresql.to_sql(df=df, con=postgresql_con, schema="public", table=postgresql_table)
+
+    with pytest.raises(pa._lib.ArrowInvalid):
+        wr.postgresql.read_sql_table(
+            con=postgresql_con, schema="public", table=postgresql_table, timestamp_as_object=False
+        )
+
+    df2 = wr.postgresql.read_sql_table(
+        con=postgresql_con, schema="public", table=postgresql_table, timestamp_as_object=True
+    )
+    assert df.c0.values[0] == df2.c0.values[0]