
Commit 4a4aafb

Merge pull request #793 from rust-lang/chore-gcp-backup-transfer-jobs-at-6-am
chore(gcp-backup): transfer jobs at 6 AM
2 parents: 5c4eaf5 + 9330388

File tree

4 files changed: +718 −0 lines changed


extract_failed_files.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Extract all failed files from Google Cloud Storage Transfer Service operations
"""
import json
import subprocess
import sys
from collections import defaultdict

def run_gcloud_command(cmd):
    """Run gcloud command and return JSON output"""
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"Error running command: {cmd}", file=sys.stderr)
            print(f"Error: {result.stderr}", file=sys.stderr)
            return None
        return json.loads(result.stdout) if result.stdout.strip() else None
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from command: {cmd}", file=sys.stderr)
        print(f"Output: {result.stdout}", file=sys.stderr)
        return None
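
# A minimal usage sketch for this helper (illustrative only: the operation
# name below is a placeholder, not one of the real operations handled in
# main(); the describe command itself is the same one the script issues):
#
#   data = run_gcloud_command(
#       'gcloud transfer operations describe transferOperations/EXAMPLE '
#       '--project=rust-asset-backup-production --format=json'
#   )
#   # -> parsed JSON dict, or None if the command or JSON parsing failed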

def extract_failed_files_from_operation(operation_name, project_id):
    """Extract failed files from a single transfer operation"""
    cmd = f'gcloud transfer operations describe {operation_name} --project={project_id} --format=json'
    operation_data = run_gcloud_command(cmd)

    if not operation_data or 'metadata' not in operation_data:
        return []

    failed_files = []
    metadata = operation_data['metadata']

    # Extract job name for context
    job_name = metadata.get('transferJobName', 'unknown')
    bucket_name = 'unknown'

    if 'transferSpec' in metadata:
        if 'awsS3DataSource' in metadata['transferSpec']:
            bucket_name = metadata['transferSpec']['awsS3DataSource'].get('bucketName', 'unknown')

    # Extract error breakdowns
    if 'errorBreakdowns' in metadata:
        for error_breakdown in metadata['errorBreakdowns']:
            error_code = error_breakdown.get('errorCode', 'UNKNOWN')
            error_count = int(error_breakdown.get('errorCount', 0))

            # Note: errorLogEntries only shows a sample, not all failed files
            if 'errorLogEntries' in error_breakdown:
                for error_entry in error_breakdown['errorLogEntries']:
                    failed_files.append({
                        'operation': operation_name,
                        'job_name': job_name,
                        'bucket': bucket_name,
                        'url': error_entry.get('url', ''),
                        'error_code': error_code,
                        'error_details': error_entry.get('errorDetails', []),
                        'total_errors_this_type': error_count
                    })

    return failed_files
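
# For illustration, an assumed sketch of the operation metadata this function
# consumes, reconstructed only from the fields read above; the bucket name,
# error code, and URL are hypothetical:
#
#   {
#       "transferJobName": "transferJobs/transfer-example",
#       "transferSpec": {"awsS3DataSource": {"bucketName": "example-bucket"}},
#       "errorBreakdowns": [{
#           "errorCode": "NOT_FOUND",
#           "errorCount": "42",
#           "errorLogEntries": [
#               {"url": "s3://example-bucket/some/key",
#                "errorDetails": ["..."]}
#           ]
#       }]
#   }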

def main():
    project_id = 'rust-asset-backup-production'

    # List of failed operations we know about
    failed_operations = [
        'transferOperations/transferJobs-transfer-crates-io-8112795250505597565',
        'transferOperations/transferJobs-transfer-static-rust-lang-org-205732933237355629',
        'transferOperations/transferJobs-transfer-crates-io-14989467690258957078',
        'transferOperations/transferJobs-transfer-static-rust-lang-org-6742344679027984831'
    ]

    all_failed_files = []
    error_summary = defaultdict(int)
    bucket_summary = defaultdict(lambda: defaultdict(int))

    print("Extracting failed files from transfer operations...")
    print("=" * 60)

    for operation in failed_operations:
        print(f"Processing {operation}")
        failed_files = extract_failed_files_from_operation(operation, project_id)
        all_failed_files.extend(failed_files)

        # Update summaries
        for file_info in failed_files:
            error_summary[file_info['error_code']] += 1
            bucket_summary[file_info['bucket']][file_info['error_code']] += 1

    # Print summary
    print(f"\nSUMMARY:")
    print(f"Total sample failed files extracted: {len(all_failed_files)}")
    print(f"\nError types:")
    for error_code, count in error_summary.items():
        print(f"  {error_code}: {count} sample files")

    print(f"\nBy bucket:")
    for bucket, errors in bucket_summary.items():
        print(f"  {bucket}:")
        for error_code, count in errors.items():
            print(f"    {error_code}: {count} sample files")

    # Group by error type and bucket
    print(f"\n" + "=" * 80)
    print("DETAILED FAILED FILES LIST")
    print("=" * 80)

    # Group files by bucket and error code
    grouped_files = defaultdict(lambda: defaultdict(list))
    for file_info in all_failed_files:
        grouped_files[file_info['bucket']][file_info['error_code']].append(file_info)

    for bucket, error_groups in grouped_files.items():
        print(f"\n🪣 BUCKET: {bucket}")
        print("-" * 50)

        for error_code, files in error_groups.items():
            print(f"\n  ❌ ERROR TYPE: {error_code}")

            # Show total count for this error type
            total_count = files[0]['total_errors_this_type'] if files else 0
            print(f"  Total files with this error: {total_count}")
            print(f"  Sample files shown: {len(files)}")
            print()

            for i, file_info in enumerate(files, 1):
                file_url = file_info['url'].replace(f"s3://{bucket}/", "")
                print(f"  {i:2d}. {file_url}")
                if file_info['error_details']:
                    error_detail = file_info['error_details'][0]
                    # Truncate very long error messages
                    if len(error_detail) > 100:
                        error_detail = error_detail[:100] + "..."
                    print(f"      Error: {error_detail}")

    # Write detailed JSON output
    output_file = '/Users/marco/proj/simpleinfra/failed_files_detailed.json'
    with open(output_file, 'w') as f:
        json.dump({
            'summary': {
                'total_sample_files': len(all_failed_files),
                'error_summary': dict(error_summary),
                'bucket_summary': {k: dict(v) for k, v in bucket_summary.items()}
            },
            'failed_files': all_failed_files
        }, f, indent=2)

    print(f"\n📄 Detailed JSON output written to: {output_file}")

    # Important note about limitations
    print(f"\n" + "⚠️ " * 20)
    print("IMPORTANT NOTE:")
    print("The Google Cloud Transfer Service API only returns a SAMPLE of failed files")
    print("in the errorLogEntries (typically 5 per error type). The actual number of")
    print("failed files is shown in the 'total_errors_this_type' field.")
    print("")
    print("From the operations analyzed:")
    for operation in failed_operations:
        cmd = f'gcloud transfer operations describe {operation} --project={project_id} --format="value(metadata.counters.objectsFromSourceFailed)"'
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if result.returncode == 0 and result.stdout.strip():
            failed_count = result.stdout.strip()
            operation_short = operation.split('/')[-1]
            print(f"  {operation_short}: {failed_count} total failed files")


if __name__ == '__main__':
    main()
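
A minimal invocation sketch, assuming Python 3 and a gcloud CLI already authenticated against the rust-asset-backup-production project (the script needs nothing beyond the standard library and gcloud):

    python3 extract_failed_files.py

It prints the per-bucket and per-error summaries to stdout and writes the detailed report to the hard-coded JSON path in main().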

0 commit comments