
Commit 54fad1e

Bumping version to 0.0.10
1 parent 5c02b21 commit 54fad1e

33 files changed: +1989 -1754 lines

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -134,6 +134,6 @@ python/
 
 # SAM
 .aws-sam
-testing/*parameters-*.json
+testing/*parameters-*.properties
 testing/*requirements*.txt
 building/*requirements*.txt

README.md

Lines changed: 65 additions & 4 deletions
@@ -2,11 +2,11 @@
 
 > Utility belt to handle data on AWS.
 
-[![Release](https://img.shields.io/badge/release-0.0.9-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-0.0.10-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
-[![Coverage](https://img.shields.io/badge/coverage-83%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Coverage](https://img.shields.io/badge/coverage-87%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/awslabs/aws-data-wrangler.svg)](http://isitmaintained.com/project/awslabs/aws-data-wrangler "Average time to resolve an issue")
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

@@ -35,7 +35,8 @@
 
 ### PySpark
 * PySpark -> Redshift (Parallel)
-* Register Glue table from Dataframe stored on S3 (NEW :star:)
+* Register Glue table from Dataframe stored on S3
+* Flatten nested DataFrames (NEW :star:)
 
 ### General
 * List S3 objects (Parallel)
@@ -45,7 +46,12 @@
 * Copy listed S3 objects (Parallel)
 * Get the size of S3 objects (Parallel)
 * Get CloudWatch Logs Insights query results
-* Load partitions on Athena/Glue table (repair table) (NEW :star:)
+* Load partitions on Athena/Glue table (repair table)
+* Create EMR cluster (For humans) (NEW :star:)
+* Terminate EMR cluster (NEW :star:)
+* Get EMR cluster state (NEW :star:)
+* Submit EMR step (For humans) (NEW :star:)
+* Get EMR step state (NEW :star:)
 
 ## Installation
 
@@ -195,6 +201,16 @@ session.spark.create_glue_table(dataframe=dataframe,
                                 database="my_database")
 ```
 
+#### Flatten nested PySpark DataFrame
+
+```py3
+session = awswrangler.Session(spark_session=spark)
+dfs = session.spark.flatten(df=df_nested)
+for name, df_flat in dfs:
+    print(name)
+    df_flat.show()
+```
+
 ### General
 
 #### Deleting a bunch of S3 objects (parallel)
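As an aside on the new `flatten` example above: the snippet assumes a nested `df_nested` already exists. A minimal sketch of such an input (the column layout is illustrative and assumes an active `spark` session):

```py3
from pyspark.sql import Row

# Any DataFrame with struct columns works as input to session.spark.flatten
df_nested = spark.createDataFrame([
    Row(id=1, address=Row(city="Seattle", zip="98101")),
    Row(id=2, address=Row(city="Boston", zip="02110")),
])
```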
@@ -221,6 +237,51 @@ session = awswrangler.Session()
 session.athena.repair_table(database="db_name", table="tbl_name")
 ```
 
+#### Create EMR cluster
+
+```py3
+session = awswrangler.Session()
+cluster_id = session.emr.create_cluster(
+    cluster_name="wrangler_cluster",
+    logging_s3_path="s3://BUCKET_NAME/emr-logs/",
+    emr_release="emr-5.27.0",
+    subnet_id="SUBNET_ID",
+    emr_ec2_role="EMR_EC2_DefaultRole",
+    emr_role="EMR_DefaultRole",
+    instance_type_master="m5.xlarge",
+    instance_type_core="m5.xlarge",
+    instance_type_task="m5.xlarge",
+    instance_ebs_size_master=50,
+    instance_ebs_size_core=50,
+    instance_ebs_size_task=50,
+    instance_num_on_demand_master=1,
+    instance_num_on_demand_core=1,
+    instance_num_on_demand_task=1,
+    instance_num_spot_master=0,
+    instance_num_spot_core=1,
+    instance_num_spot_task=1,
+    spot_bid_percentage_of_on_demand_master=100,
+    spot_bid_percentage_of_on_demand_core=100,
+    spot_bid_percentage_of_on_demand_task=100,
+    spot_provisioning_timeout_master=5,
+    spot_provisioning_timeout_core=5,
+    spot_provisioning_timeout_task=5,
+    spot_timeout_to_on_demand_master=True,
+    spot_timeout_to_on_demand_core=True,
+    spot_timeout_to_on_demand_task=True,
+    python3=True,
+    spark_glue_catalog=True,
+    hive_glue_catalog=True,
+    presto_glue_catalog=True,
+    bootstraps_paths=None,
+    debugging=True,
+    applications=["Hadoop", "Spark", "Ganglia", "Hive"],
+    visible_to_all_users=True,
+    key_pair_name=None,
+)
+print(cluster_id)
+```
+
 ## Diving Deep
 

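The feature list above also announces terminating clusters, getting cluster and step state, and submitting steps, but only the `create_cluster` README snippet appears in this diff. A hedged sketch of the rest of the lifecycle; the method names and parameters below (`get_cluster_state`, `submit_step`, `get_step_state`, `terminate_cluster`) are assumptions inferred from the feature list, not confirmed by this commit:

```py3
session = awswrangler.Session()

# Assumed API, mirroring the feature list above
print(session.emr.get_cluster_state(cluster_id=cluster_id))
step_id = session.emr.submit_step(cluster_id=cluster_id,
                                  name="my_step",
                                  cmd="spark-submit s3://BUCKET_NAME/script.py")
print(session.emr.get_step_state(cluster_id=cluster_id, step_id=step_id))
session.emr.terminate_cluster(cluster_id=cluster_id)
```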
awswrangler/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 from awswrangler.cloudwatchlogs import CloudWatchLogs  # noqa
 from awswrangler.glue import Glue  # noqa
 from awswrangler.redshift import Redshift  # noqa
+from awswrangler.emr import EMR  # noqa
 import awswrangler.utils  # noqa
 import awswrangler.data_types  # noqa

awswrangler/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0.9"
+__version__ = "0.0.10"
 __license__ = "Apache License 2.0"

awswrangler/athena.py

Lines changed: 11 additions & 26 deletions
@@ -15,14 +15,10 @@
 class Athena:
     def __init__(self, session):
         self._session = session
-        self._client_athena = session.boto3_session.client(
-            service_name="athena", config=session.botocore_config
-        )
+        self._client_athena = session.boto3_session.client(service_name="athena", config=session.botocore_config)
 
     def get_query_columns_metadata(self, query_execution_id):
-        response = self._client_athena.get_query_results(
-            QueryExecutionId=query_execution_id, MaxResults=1
-        )
+        response = self._client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
         col_info = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
         return {x["Name"]: x["Type"] for x in col_info}
 
@@ -54,11 +50,8 @@ def create_athena_bucket(self):
 
         :return: Bucket s3 path (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)
         """
-        account_id = (
-            self._session.boto3_session.client(
-                service_name="sts", config=self._session.botocore_config
-            ).get_caller_identity().get("Account")
-        )
+        account_id = (self._session.boto3_session.client(
+            service_name="sts", config=self._session.botocore_config).get_caller_identity().get("Account"))
         session_region = self._session.boto3_session.region_name
         s3_output = f"s3://aws-athena-query-results-{account_id}-{session_region}/"
         s3_resource = self._session.boto3_session.resource("s3")
@@ -80,12 +73,10 @@ def run_query(self, query, database, s3_output=None, workgroup=None):
         if workgroup is None:
             workgroup = self._session.athena_workgroup
         logger.debug(f"Workgroup: {workgroup}")
-        response = self._client_athena.start_query_execution(
-            QueryString=query,
-            QueryExecutionContext={"Database": database},
-            ResultConfiguration={"OutputLocation": s3_output},
-            WorkGroup=workgroup
-        )
+        response = self._client_athena.start_query_execution(QueryString=query,
+                                                             QueryExecutionContext={"Database": database},
+                                                             ResultConfiguration={"OutputLocation": s3_output},
+                                                             WorkGroup=workgroup)
         return response["QueryExecutionId"]
 
     def wait_query(self, query_execution_id):
@@ -103,9 +94,7 @@ def wait_query(self, query_execution_id):
         response = self._client_athena.get_query_execution(QueryExecutionId=query_execution_id)
         state = response["QueryExecution"]["Status"]["State"]
         logger.debug(f"state: {state}")
-        logger.debug(
-            f"StateChangeReason: {response['QueryExecution']['Status'].get('StateChangeReason')}"
-        )
+        logger.debug(f"StateChangeReason: {response['QueryExecution']['Status'].get('StateChangeReason')}")
         if state == "FAILED":
             raise QueryFailed(response["QueryExecution"]["Status"].get("StateChangeReason"))
         elif state == "CANCELLED":
@@ -129,17 +118,13 @@ def repair_table(self, database, table, s3_output=None, workgroup=None):
         :return: Query execution ID
         """
         query = f"MSCK REPAIR TABLE {table};"
-        query_id = self.run_query(
-            query=query, database=database, s3_output=s3_output, workgroup=workgroup
-        )
+        query_id = self.run_query(query=query, database=database, s3_output=s3_output, workgroup=workgroup)
         self.wait_query(query_execution_id=query_id)
         return query_id
 
     @staticmethod
     def _normalize_name(name):
-        name = "".join(
-            c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn"
-        )
+        name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")
         name = name.replace(" ", "_")
         name = name.replace("-", "_")
         name = name.replace(".", "_")

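The athena.py changes above are formatting-only, so behavior is preserved. For context, a minimal usage sketch of the methods touched here, using the signatures visible in the diff (database and table names are placeholders):

```py3
session = awswrangler.Session()

# run_query returns the QueryExecutionId; wait_query polls until a terminal state
query_id = session.athena.run_query(query="SELECT 1;", database="my_database")
session.athena.wait_query(query_execution_id=query_id)  # raises QueryFailed / QueryCancelled

# repair_table wraps MSCK REPAIR TABLE and waits for completion
session.athena.repair_table(database="my_database", table="my_table")
```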
awswrangler/cloudwatchlogs.py

Lines changed: 18 additions & 26 deletions
@@ -12,18 +12,14 @@
 class CloudWatchLogs:
     def __init__(self, session):
         self._session = session
-        self._client_logs = session.boto3_session.client(
-            service_name="logs", config=session.botocore_config
-        )
+        self._client_logs = session.boto3_session.client(service_name="logs", config=session.botocore_config)
 
-    def start_query(
-        self,
-        query,
-        log_group_names,
-        start_time=datetime(year=1970, month=1, day=1),
-        end_time=datetime.utcnow(),
-        limit=None
-    ):
+    def start_query(self,
+                    query,
+                    log_group_names,
+                    start_time=datetime(year=1970, month=1, day=1),
+                    end_time=datetime.utcnow(),
+                    limit=None):
         """
         Run a query against AWS CloudWatchLogs Insights and wait for the results
         https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/CWL_QuerySyntax.html
@@ -72,14 +68,12 @@ def wait_query(self, query_id):
             raise QueryCancelled(f"query ID: {query_id}")
         return response
 
-    def query(
-        self,
-        query,
-        log_group_names,
-        start_time=datetime(year=1970, month=1, day=1),
-        end_time=datetime.utcnow(),
-        limit=None
-    ):
+    def query(self,
+              query,
+              log_group_names,
+              start_time=datetime(year=1970, month=1, day=1),
+              end_time=datetime.utcnow(),
+              limit=None):
         """
         Run a query against AWS CloudWatchLogs Insights and wait for the results
         https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/CWL_QuerySyntax.html
@@ -91,12 +85,10 @@ def query(
         :param limit: The maximum number of log events to return in the query.
         :return: Results
         """
-        query_id = self.start_query(
-            query=query,
-            log_group_names=log_group_names,
-            start_time=start_time,
-            end_time=end_time,
-            limit=limit
-        )
+        query_id = self.start_query(query=query,
+                                    log_group_names=log_group_names,
+                                    start_time=start_time,
+                                    end_time=end_time,
+                                    limit=limit)
         response = self.wait_query(query_id=query_id)
         return response["results"]

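The cloudwatchlogs.py changes are likewise cosmetic. A minimal usage sketch of `query` as refactored above, assuming the client is exposed as `session.cloudwatchlogs` (the log group name is a placeholder). Note that the `end_time=datetime.utcnow()` default is evaluated once at import time, so long-lived processes should pass it explicitly:

```py3
from datetime import datetime

session = awswrangler.Session()
results = session.cloudwatchlogs.query(
    query="fields @timestamp, @message | limit 5",
    log_group_names=["my_log_group"],  # placeholder log group
    end_time=datetime.utcnow(),  # passed explicitly; the default binds at import time
)
```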
awswrangler/data_types.py

Lines changed: 2 additions & 3 deletions
@@ -294,9 +294,8 @@ def convert_schema(func: Callable, schema: List[Tuple[str, str]]) -> Dict[str, str]:
     return {name: func(dtype) for name, dtype in schema}
 
 
-def extract_pyarrow_schema_from_pandas(
-    dataframe: pd.DataFrame, preserve_index: bool, indexes_position: str = "right"
-) -> List[Tuple[str, str]]:
+def extract_pyarrow_schema_from_pandas(dataframe: pd.DataFrame, preserve_index: bool,
+                                       indexes_position: str = "right") -> List[Tuple[str, str]]:
     """
     Extract the related Pyarrow schema from any Pandas DataFrame
 

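The new two-line signature of `extract_pyarrow_schema_from_pandas` keeps the same parameters, so existing calls are unaffected. A minimal sketch of calling it directly (the printed type names are a guess; the exact strings depend on pyarrow):

```py3
import pandas as pd

from awswrangler.data_types import extract_pyarrow_schema_from_pandas

df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})
schema = extract_pyarrow_schema_from_pandas(dataframe=df, preserve_index=False)
print(schema)  # e.g. [("id", "int64"), ("name", "string")]
```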