From 26e92d087a888fe78430a404f44ff68a4cbaee06 Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Fri, 18 Jul 2025 13:37:02 -0700 Subject: [PATCH 1/3] feature: s3 bucket path checking --- .githooks/pre-commit | 31 ++++++++++++++++++++++++++ s3_bucket_validator.py | 49 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 .githooks/pre-commit create mode 100644 s3_bucket_validator.py diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100644 index 0000000000..159b85a818 --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,31 @@ +#!/bin/sh +# This pre-commit hook validates S3 bucket references in modified files + +set -e + +# Get list of staged files +staged_files=$(git diff --cached --name-only --diff-filter=ACMR | grep -E '\.(py|ipynb|md|rst|yaml|yml|json)$' || true) + +if [ -z "$staged_files" ]; then + echo "No relevant files to check for S3 bucket references." + exit 0 +fi + +echo "Checking S3 bucket references in staged files..." +has_invalid_buckets=0 + +for file in $staged_files; do + echo "Validating S3 references in $file" + python s3_bucket_validator.py "$file" + if [ $? -ne 0 ]; then + has_invalid_buckets=1 + fi +done + +if [ $has_invalid_buckets -ne 0 ]; then + echo "ERROR: Invalid S3 bucket references found. Please fix them before committing." + exit 1 +fi + +echo "S3 bucket validation passed." +exit 0 diff --git a/s3_bucket_validator.py b/s3_bucket_validator.py new file mode 100644 index 0000000000..f73b8c3871 --- /dev/null +++ b/s3_bucket_validator.py @@ -0,0 +1,49 @@ +import re + +import boto3 +from botocore.exceptions import ClientError + + +def is_bucket_accessible(bucket_name): + s3 = boto3.client('s3') + try: + s3.head_bucket(Bucket=bucket_name) + return True + except ClientError as e: + error_code = int(e.response['Error']['Code']) + if error_code == 403: + print(f"Bucket {bucket_name} exists, but you don't have permission to access it.") + elif error_code == 404: + print(f"Bucket {bucket_name} does not exist.") + else: + print(f"Error checking bucket {bucket_name}: {e}") + return False + +def validate_s3_references(file_path): + with open(file_path, 'r') as file: + content = file.read() + + s3_pattern = re.compile(r's3:\/\/([a-zA-Z0-9._-]+)') + matches = s3_pattern.findall(content) + + invalid_buckets = [] + for bucket in matches: + if not is_bucket_accessible(bucket): + invalid_buckets.append(bucket) + + return invalid_buckets + +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print("Usage: python s3_bucket_validator.py ") + sys.exit(1) + + file_path = sys.argv[1] + invalid_buckets = validate_s3_references(file_path) + + if invalid_buckets: + print(f"Invalid or inaccessible S3 buckets found: {', '.join(invalid_buckets)}") + sys.exit(1) + else: + print("All referenced S3 buckets are valid and accessible.") \ No newline at end of file From ee438e4f049037caf57982fe2baa63191d8dfe5c Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Fri, 18 Jul 2025 14:18:16 -0700 Subject: [PATCH 2/3] format: run black --- s3_bucket_validator.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/s3_bucket_validator.py b/s3_bucket_validator.py index f73b8c3871..1dfa271c57 100644 --- a/s3_bucket_validator.py +++ b/s3_bucket_validator.py @@ -5,12 +5,12 @@ def is_bucket_accessible(bucket_name): - s3 = boto3.client('s3') + s3 = boto3.client("s3") try: s3.head_bucket(Bucket=bucket_name) return True except ClientError as e: - error_code = int(e.response['Error']['Code']) + error_code = int(e.response["Error"]["Code"]) if error_code == 403: print(f"Bucket {bucket_name} exists, but you don't have permission to access it.") elif error_code == 404: @@ -19,11 +19,12 @@ def is_bucket_accessible(bucket_name): print(f"Error checking bucket {bucket_name}: {e}") return False + def validate_s3_references(file_path): - with open(file_path, 'r') as file: + with open(file_path, "r") as file: content = file.read() - s3_pattern = re.compile(r's3:\/\/([a-zA-Z0-9._-]+)') + s3_pattern = re.compile(r"s3:\/\/([a-zA-Z0-9._-]+)") matches = s3_pattern.findall(content) invalid_buckets = [] @@ -33,8 +34,10 @@ def validate_s3_references(file_path): return invalid_buckets + if __name__ == "__main__": import sys + if len(sys.argv) < 2: print("Usage: python s3_bucket_validator.py ") sys.exit(1) @@ -46,4 +49,4 @@ def validate_s3_references(file_path): print(f"Invalid or inaccessible S3 buckets found: {', '.join(invalid_buckets)}") sys.exit(1) else: - print("All referenced S3 buckets are valid and accessible.") \ No newline at end of file + print("All referenced S3 buckets are valid and accessible.") From 13696de059b1a8d81389e61cca7835d562a4246c Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Fri, 18 Jul 2025 14:26:14 -0700 Subject: [PATCH 3/3] format: add future import --- s3_bucket_validator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/s3_bucket_validator.py b/s3_bucket_validator.py index 1dfa271c57..4d86294096 100644 --- a/s3_bucket_validator.py +++ b/s3_bucket_validator.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import + import re import boto3