Skip to content

Commit f045512

Browse files
authored
Merge pull request #128 from awslabs/pandas-v1
Bumping Pandas version to 1.0.0
2 parents 60c3377 + aae3e44 commit f045512

File tree

6 files changed

+65
-11
lines changed

6 files changed

+65
-11
lines changed

awswrangler/data_types.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def athena2pandas(dtype: str) -> str:
2020
elif dtype == "boolean":
2121
return "bool"
2222
elif dtype in ("string", "char", "varchar"):
23-
return "str"
23+
return "string"
2424
elif dtype in ("timestamp", "timestamp with time zone"):
2525
return "datetime64"
2626
elif dtype == "date":
@@ -117,6 +117,8 @@ def pandas2athena(dtype: str) -> str:
117117
return "double"
118118
elif dtype == "bool":
119119
return "boolean"
120+
elif dtype == "string":
121+
return "string"
120122
elif dtype == "object":
121123
return "string"
122124
elif dtype.startswith("datetime64"):
@@ -137,7 +139,9 @@ def pandas2redshift(dtype: str, varchar_length: int = 256) -> str:
137139
return "FLOAT8"
138140
elif dtype == "bool":
139141
return "BOOLEAN"
140-
elif dtype == "object" and isinstance(dtype, str):
142+
elif dtype == "string":
143+
return f"VARCHAR({varchar_length})"
144+
elif dtype == "object":
141145
return f"VARCHAR({varchar_length})"
142146
elif dtype[:10] == "datetime64":
143147
return "TIMESTAMP"
@@ -375,11 +379,13 @@ def extract_pyarrow_schema_from_pandas(dataframe: pd.DataFrame,
375379
if indexes_position not in ("right", "left"):
376380
raise ValueError(f"indexes_position must be \"right\" or \"left\"")
377381

378-
# Handle exception data types (e.g. Int64)
382+
# Handle exception data types (e.g. Int64, string)
379383
for name, dtype in dataframe.dtypes.to_dict().items():
380384
dtype = str(dtype)
381385
if dtype == "Int64":
382386
cols_dtypes[name] = "int64"
387+
elif dtype == "string":
388+
cols_dtypes[name] = "string"
383389
else:
384390
cols.append(name)
385391

awswrangler/glue.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -235,12 +235,9 @@ def _build_schema(
235235
preserve_index: bool,
236236
indexes_position: str,
237237
cast_columns: Optional[Dict[str, str]] = None) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
238-
if cast_columns is None:
239-
cast_columns = {}
238+
cast_columns = {} if cast_columns is None else cast_columns
239+
partition_cols = [] if partition_cols is None else partition_cols
240240
logger.debug(f"dataframe.dtypes:\n{dataframe.dtypes}")
241-
if partition_cols is None:
242-
partition_cols = []
243-
244241
pyarrow_schema: List[Tuple[str, Any]] = data_types.extract_pyarrow_schema_from_pandas(
245242
dataframe=dataframe, preserve_index=preserve_index, indexes_position=indexes_position)
246243

requirements-dev.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ mypy~=0.761
33
flake8~=3.7.9
44
pytest-cov~=2.8.1
55
scikit-learn~=0.22.1
6-
cfn-lint~=0.27.2
6+
cfn-lint~=0.27.3
77
twine~=3.1.1
8-
wheel~=0.34.0
8+
wheel~=0.34.2
99
sphinx~=2.3.1
1010
pyspark~=2.4.4
1111
pyspark-stubs~=2.4.0.post7

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
numpy~=1.18.1
2-
pandas~=0.25.3
2+
pandas~=1.0.0
33
pyarrow~=0.15.1
44
botocore>=1.13.34
55
boto3>=1.10.34

testing/test_awswrangler/test_pandas.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2369,6 +2369,7 @@ def test_s3_overall_nan(bucket, database):
23692369

23702370
def test_aurora_postgres_load_varchar(bucket, postgres_parameters):
23712371
df = pd.DataFrame({"id": [1, 2, 3], "varchar3": ["foo", "boo", "bar"], "varchar1": ["a", "b", "c"]})
2372+
df["varchar3"] = df["varchar3"].astype("string")
23722373
path = f"s3://{bucket}/test_aurora_postgres_load_varchar"
23732374
wr.pandas.to_aurora(dataframe=df,
23742375
connection="aws-data-wrangler-postgres",
@@ -2404,6 +2405,7 @@ def test_aurora_postgres_load_varchar(bucket, postgres_parameters):
24042405

24052406
def test_aurora_mysql_load_varchar(bucket):
24062407
df = pd.DataFrame({"id": [1, 2, 3], "varchar3": ["foo", "boo", "bar"], "varchar1": ["a", "b", "c"]})
2408+
df["varchar3"] = df["varchar3"].astype("string")
24072409
path = f"s3://{bucket}/test_aurora_mysql_load_varchar"
24082410
wr.pandas.to_aurora(dataframe=df,
24092411
connection="aws-data-wrangler-mysql",
@@ -2430,3 +2432,51 @@ def test_aurora_mysql_load_varchar(bucket):
24302432
assert rows[1][2] == "b"
24312433
assert rows[2][2] == "c"
24322434
conn.close()
2435+
2436+
2437+
def test_to_parquet_string(bucket, database):
2438+
path = f"s3://{bucket}/test_to_parquet_string"
2439+
wr.s3.delete_objects(path=path)
2440+
df = pd.DataFrame({
2441+
"id": [1, 2, 3, 4, 5],
2442+
"c_str": ["foo", None, None, "bar", None],
2443+
})
2444+
df["id"] = df["id"].astype("Int64")
2445+
df["c_str"] = df["c_str"].astype("string")
2446+
wr.pandas.to_parquet(dataframe=df,
2447+
database=database,
2448+
path=path,
2449+
mode="overwrite",
2450+
preserve_index=False,
2451+
procs_cpu_bound=5,
2452+
inplace=False)
2453+
sleep(15)
2454+
df2 = wr.pandas.read_sql_athena(database=database,
2455+
sql="SELECT * FROM test_to_parquet_string ORDER BY id",
2456+
ctas_approach=False)
2457+
wr.s3.delete_objects(path=path)
2458+
assert df.equals(df2)
2459+
2460+
2461+
def test_to_csv_string(bucket, database):
2462+
path = f"s3://{bucket}/test_to_csv_string"
2463+
wr.s3.delete_objects(path=path)
2464+
df = pd.DataFrame({
2465+
"id": [1, 2, 3, 4, 5],
2466+
"c_str": ["foo", None, None, "bar", None],
2467+
})
2468+
df["id"] = df["id"].astype("Int64")
2469+
df["c_str"] = df["c_str"].astype("string")
2470+
wr.pandas.to_parquet(dataframe=df,
2471+
database=database,
2472+
path=path,
2473+
mode="overwrite",
2474+
preserve_index=False,
2475+
procs_cpu_bound=5,
2476+
inplace=False)
2477+
sleep(5)
2478+
df2 = wr.pandas.read_sql_athena(database=database,
2479+
sql="SELECT * FROM test_to_csv_string ORDER BY id",
2480+
ctas_approach=False)
2481+
wr.s3.delete_objects(path=path)
2482+
assert df.equals(df2)

testing/test_awswrangler/test_redshift.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,7 @@ def test_spectrum_csv(bucket, glue_database, external_schema):
870870

871871
def test_to_redshift_pandas_varchar(bucket, redshift_parameters):
872872
df = pd.DataFrame({"id": [1, 2, 3], "varchar3": ["foo", "boo", "bar"], "varchar1": ["a", "b", "c"]})
873+
df["varchar3"] = df["varchar3"].astype("string")
873874
path = f"s3://{bucket}/test_to_redshift_pandas_varchar"
874875
wr.pandas.to_redshift(dataframe=df,
875876
path=path,

0 commit comments

Comments (0)