@@ -222,6 +222,7 @@ def _fetch_parquet_result(
     boto3_session: boto3.Session,
     s3_additional_kwargs: Optional[Dict[str, Any]],
     temp_table_fqn: Optional[str] = None,
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
     chunked: Union[bool, int] = False if chunksize is None else chunksize
@@ -249,6 +250,7 @@ def _fetch_parquet_result(
         chunked=chunked,
         categories=categories,
         ignore_index=True,
+        pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )
     if chunked is False:
         ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
@@ -337,6 +339,7 @@ def _resolve_query_with_cache(
     use_threads: bool,
     session: Optional[boto3.Session],
     s3_additional_kwargs: Optional[Dict[str, Any]],
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Fetch cached data and return it as a pandas DataFrame (or list of DataFrames)."""
     _logger.debug("cache_info:\n%s", cache_info)
@@ -358,6 +361,7 @@ def _resolve_query_with_cache(
             use_threads=use_threads,
             boto3_session=session,
             s3_additional_kwargs=s3_additional_kwargs,
+            pyarrow_additional_kwargs=pyarrow_additional_kwargs,
         )
     if cache_info.file_format == "csv":
         return _fetch_csv_result(
@@ -389,6 +393,7 @@ def _resolve_query_without_cache_ctas(
     use_threads: bool,
     s3_additional_kwargs: Optional[Dict[str, Any]],
     boto3_session: boto3.Session,
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     path: str = f"{s3_output}/{name}"
     ext_location: str = "\n" if wg_config.enforced is True else f",\n    external_location = '{path}'\n"
@@ -465,6 +470,7 @@ def _resolve_query_without_cache_ctas(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
         temp_table_fqn=fully_qualified_name,
+        pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )
@@ -532,6 +538,7 @@ def _resolve_query_without_cache(
     use_threads: bool,
     s3_additional_kwargs: Optional[Dict[str, Any]],
     boto3_session: boto3.Session,
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """
     Execute a query in Athena and returns results as DataFrame, back to `read_sql_query`.
@@ -565,6 +572,7 @@ def _resolve_query_without_cache(
                 use_threads=use_threads,
                 s3_additional_kwargs=s3_additional_kwargs,
                 boto3_session=boto3_session,
+                pyarrow_additional_kwargs=pyarrow_additional_kwargs,
             )
         finally:
             catalog.delete_table_if_exists(
@@ -612,6 +620,7 @@ def read_sql_query(
     data_source: Optional[str] = None,
     params: Optional[Dict[str, Any]] = None,
     s3_additional_kwargs: Optional[Dict[str, Any]] = None,
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.
@@ -781,6 +790,14 @@ def read_sql_query(
     s3_additional_kwargs : Optional[Dict[str, Any]]
         Forwarded to botocore requests.
         e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
+    pyarrow_additional_kwargs : Optional[Dict[str, Any]]
+        Forwarded to the ParquetFile class and to the Arrow-to-Pandas conversion; currently only the
+        "coerce_int96_timestamp_unit" and "timestamp_as_object" arguments are considered. If you are reading
+        Parquet files whose timestamps cannot be represented as pandas Timestamp[ns], consider setting
+        timestamp_as_object=True to allow timestamp units larger than "ns". If you are reading Parquet data
+        that still uses INT96 timestamps (as Athena outputs do), coerce_int96_timestamp_unit specifies the
+        unit INT96 values are decoded to (by default "ns"; if you know the Parquet output came from a system
+        that encodes timestamps in a particular unit, set it to that same unit, e.g. coerce_int96_timestamp_unit="ms").

     Returns
     -------
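
For illustration, a minimal sketch of how a caller might pass the new argument to wr.athena.read_sql_query once this change lands; the table and database names are hypothetical, and the example assumes the Athena result Parquet carries INT96 timestamps that should be decoded to milliseconds:

import awswrangler as wr

# Hypothetical table/database; decode INT96 timestamps in the Athena result
# Parquet to millisecond precision instead of the default "ns".
df = wr.athena.read_sql_query(
    sql="SELECT * FROM my_table",
    database="my_database",
    ctas_approach=True,
    pyarrow_additional_kwargs={"coerce_int96_timestamp_unit": "ms"},
)
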
@@ -837,6 +854,7 @@ def read_sql_query(
                     use_threads=use_threads,
                     session=session,
                     s3_additional_kwargs=s3_additional_kwargs,
+                    pyarrow_additional_kwargs=pyarrow_additional_kwargs,
                 )
             except Exception as e:  # pylint: disable=broad-except
                 _logger.error(e)  # if there is anything wrong with the cache, just fallback to the usual path
@@ -859,6 +877,7 @@ def read_sql_query(
         use_threads=use_threads,
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=session,
+        pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )
@@ -885,6 +904,7 @@ def read_sql_table(
     max_local_cache_entries: int = 100,
     data_source: Optional[str] = None,
     s3_additional_kwargs: Optional[Dict[str, Any]] = None,
+    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Extract the full table AWS Athena and return the results as a Pandas DataFrame.
@@ -1045,6 +1065,15 @@ def read_sql_table(
     s3_additional_kwargs : Optional[Dict[str, Any]]
         Forwarded to botocore requests.
         e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
+    pyarrow_additional_kwargs : Optional[Dict[str, Any]]
+        Forwarded to the ParquetFile class and to the Arrow-to-Pandas conversion; currently only the
+        "coerce_int96_timestamp_unit" and "timestamp_as_object" arguments are considered. If you are
+        reading Parquet files whose timestamps cannot be represented as pandas Timestamp[ns], consider
+        setting timestamp_as_object=True to allow timestamp units larger than "ns". If you are reading
+        Parquet data that still uses INT96 timestamps (as Athena outputs do), coerce_int96_timestamp_unit
+        specifies the unit INT96 values are decoded to (by default "ns"; if you know the Parquet output
+        came from a system that encodes timestamps in a particular unit, set it to that same unit, e.g.
+        coerce_int96_timestamp_unit="ms").

     Returns
     -------
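
Similarly, a minimal sketch of the other supported key with wr.athena.read_sql_table; the table and database names are hypothetical, and timestamp_as_object=True keeps timestamps that do not fit in pandas Timestamp[ns] as Python datetime objects instead of raising:

import awswrangler as wr

# Hypothetical table/database; keep out-of-bounds timestamps as Python objects
# rather than forcing them into Timestamp[ns].
df = wr.athena.read_sql_table(
    table="my_table",
    database="my_database",
    pyarrow_additional_kwargs={"timestamp_as_object": True},
)
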
@@ -1081,6 +1110,7 @@ def read_sql_table(
         max_remote_cache_entries=max_remote_cache_entries,
         max_local_cache_entries=max_local_cache_entries,
         s3_additional_kwargs=s3_additional_kwargs,
+        pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )