Skip to content

Commit 0dcb2ca

Browse files
committed
Add tests
1 parent 096ca82 commit 0dcb2ca

File tree

2 files changed

+424
-0
lines changed

2 files changed

+424
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
"""
2+
Module for running post-indexing validation.
3+
validate_manifest takes as input a manifest, a Gen3Auth instance, and an output path
4+
then attempts to obtain a pre-signed url and download a file from each bucket
5+
to verify it has been indexed, and then generate a report in csv format.
6+
7+
The output format is as follows:
8+
| ACL | Bucket | Protocol | Presigned URL Status | Download Status | GUID |
9+
"""
10+
11+
12+
import csv
13+
from cdislogging import get_logger, get_stream_handler
14+
from gen3.file import Gen3File
15+
import requests
16+
17+
18+
# Module-level logger: records at INFO and above are passed to the
# stream handler supplied by cdislogging.
logger = get_logger(__name__)
logger.setLevel("INFO")
logger.addHandler(get_stream_handler())
21+
22+
23+
class GuidError(Exception):
    """Raised when a manifest has no 'GUID', 'guid', or 'id' column."""
25+
26+
27+
class Record:
    """One indexed file used to probe a (bucket, protocol, acl) combination.

    Attributes:
        guid: identifier passed to ``get_presigned_url``.
        bucket, protocol, acl: values that identify what this record probes.
        commons: endpoint of the commons the record belongs to.
        size: file size exactly as read from the manifest (not converted).
        response_status: HTTP status of the pre-signed URL request, or -1
            if no status has been obtained.
        download_status: HTTP status of the download attempt, or -1 if the
            download was not attempted or raised.
        headers: JSON accept / bearer authorization headers built from
            ``access_token``.
    """

    def __init__(self, guid, bucket, protocol, acl, access_token, commons, size):
        self.guid = guid
        self.bucket = bucket
        self.protocol = protocol
        self.acl = acl
        self.commons = commons
        self.size = size
        # -1 is the "no status yet" sentinel for both checks.
        self.response_status = -1
        self.download_status = -1
        self.headers = {
            "accept": "application/json",
            "authorization": f"bearer {access_token}",
        }

    def check_record(self, gen3file):
        """
        Request a pre-signed URL for this record and, on success, try to download it.

        The outcome is stored on the instance:
        - ``response_status`` is 200 when ``get_presigned_url`` returns without
          raising, the status code of the failed response when it raises
          ``requests.HTTPError``, or -1 if that error carries no response.
        - ``download_status`` is the status code of the GET on the returned
          URL, or -1 when the download was skipped or raised.

        Exceptions other than ``requests.HTTPError`` raised while obtaining the
        pre-signed URL are not handled here and propagate to the caller.

        Args:
            gen3file (object): An object that provides the `get_presigned_url` method.

        Returns:
            None
        """
        logger.info(f"Checking record {self.guid}")
        url = None
        try:
            resp = gen3file.get_presigned_url(self.guid)
            url = resp.get("url")
            response_status = 200
            logger.info(
                f"Pre-signed url successfully generated for record {self.guid} with status code {response_status}"
            )
        except requests.HTTPError as err:
            # Compare against None explicitly: a requests Response is falsy
            # for error statuses, so a plain truthiness test would be wrong.
            if err.response is not None:
                response_status = err.response.status_code
            else:
                response_status = -1
            logger.info(f"Pre-signed url generation failed for record {self.guid}")
        self.response_status = response_status

        download_success = -1
        if response_status == 200:
            try:
                download_success = requests.get(url).status_code
                logger.info(
                    f"Download process complete with status code {download_success}"
                )
            except Exception as err:
                # Best-effort check: record the failure and keep going, but do
                # not swallow KeyboardInterrupt/SystemExit as a bare except would.
                logger.warning(f"Download failed for record {self.guid}: {err}")
                download_success = -1
        self.download_status = download_success
        return
84+
85+
86+
class Records:
    """Collection of records selected from a manifest, keyed by (bucket, protocol, acl).

    Attributes:
        auth: the auth object given to the constructor; it must provide
            ``get_access_token()`` and an ``endpoint`` attribute.
        access_token: token obtained from ``auth.get_access_token()``.
        commons: value of ``auth.endpoint``.
        record_dict: maps ``(bucket, protocol, acl)`` to the chosen Record.
        record_sizes: initialised empty; not used by the methods of this class.
        headers: JSON accept / bearer authorization headers.
    """

    def __init__(self, auth):
        self.auth = auth
        self.access_token = auth.get_access_token()
        self.commons = auth.endpoint
        self.record_dict = {}
        self.record_sizes = {}
        self.headers = {
            "accept": "application/json",
            "authorization": f"bearer {self.access_token}",
        }

    def read_records_from_manifest(self, manifest):
        """
        Parses a manifest and fills ``self.record_dict`` with Record objects.

        The file is read as tab-separated when its name ends in "tsv" and as
        comma-separated otherwise. Each row must have "size", "acl" and "url"
        columns plus one GUID column. For every non-"admin" acl and every url
        containing "://", one record is kept per (bucket, protocol, acl).

        Args:
            manifest (str): the location of a manifest file

        Raises:
            GuidError: if the manifest has no 'GUID', 'guid', or 'id' column
                (including when the manifest is empty).
        """
        sep = "\t" if manifest.endswith("tsv") else ","
        with open(manifest, mode="r", newline="") as f:
            csv_reader = csv.DictReader(f, delimiter=sep)
            rows = list(csv_reader)
            # Read the header while the file is still open; it is None for
            # an empty file.
            fieldnames = csv_reader.fieldnames or []

        # Fixed preference order so the choice is deterministic when more
        # than one candidate column is present.
        guid_col = next(
            (name for name in ("GUID", "guid", "id") if name in fieldnames), None
        )
        if guid_col is None:
            raise GuidError(
                "Manifest file has no column named 'GUID', 'guid', or 'id'"
            )

        for row in rows:
            url_parsed = False
            size = row["size"]
            guid = row[guid_col]
            for acl in row["acl"].split(" "):
                if acl == "admin":
                    continue
                for url in row["url"].split(" "):
                    if "://" not in url:
                        continue
                    # "[" is stripped so a url written as the first element
                    # of a bracketed list still yields a clean protocol.
                    protocol = url.split("://")[0].replace("[", "")
                    bucket = url.split("/")[2]
                    key = (bucket, protocol, acl)
                    # Keep the first record seen for a key, then replace it
                    # with any non-empty file that is not larger.
                    # NOTE(review): a first record of size 0 is never replaced
                    # by a larger file - confirm this is intended.
                    if key not in self.record_dict or (
                        int(self.record_dict[key].size) >= int(size)
                        and int(size) != 0
                    ):
                        self.record_dict[key] = Record(
                            guid,
                            bucket,
                            protocol,
                            acl,
                            self.access_token,
                            self.commons,
                            size,
                        )
                        url_parsed = True

            # url_parsed is only set when this row was stored for some key,
            # so the warning also fires for rows that were parsed but not kept.
            if not url_parsed:
                logger.warning(f"No url parsed for record {guid}")

    def check_records(self):
        """
        Runs ``check_record`` on every record in ``self.record_dict``.

        A single ``Gen3File`` built from ``self.auth`` is shared by all checks.

        Returns:
            None
        """
        gen3file = Gen3File(self.auth)
        for record in self.record_dict.values():
            record.check_record(gen3file)

    def save_download_check_results_to_csv(self, csv_filename):
        """
        Writes the results of presigned url generation and file downloads to a csv.
        Output format is: | ACL | Bucket | Protocol | Presigned URL Status | Download Status | GUID |

        The results are also stored on ``self.download_results`` as a list of
        dicts with the keys "acl", "bucket", "protocol",
        "presigned_url_status", "download_status" and "guid". When there are
        no records a warning is logged and no file is written.

        Args:
            csv_filename (str): the relative file path of the output csv
        """
        download_results = [
            {
                "acl": record.acl,
                "bucket": record.bucket,
                "protocol": record.protocol,
                "presigned_url_status": record.response_status,
                "download_status": record.download_status,
                "guid": record.guid,
            }
            for record in self.record_dict.values()
        ]

        self.download_results = download_results

        if not download_results:
            logger.warning("No results to save.")
            return

        # CSV header -> key in each result dict. The header names must be the
        # DictWriter fieldnames, otherwise writerow raises ValueError.
        columns = {
            "ACL": "acl",
            "Bucket": "bucket",
            "Protocol": "protocol",
            "Presigned URL Status": "presigned_url_status",
            "Download Status": "download_status",
            "GUID": "guid",
        }

        with open(csv_filename, mode="w", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(columns))
            writer.writeheader()
            writer.writerows(
                {header: result[key] for header, key in columns.items()}
                for result in download_results
            )

        logger.info(f"Results saved to {csv_filename}")
232+
233+
234+
def validate_manifest(MANIFEST, auth, output_file="results.csv"):
    """
    Run post-indexing validation for a manifest and write a csv report.

    Records are read from the manifest, each selected record is checked
    (pre-signed url, then download), and the results are saved to
    ``output_file``.

    Args:
        MANIFEST (str): the location of a manifest file
        auth: a Gen3Auth instance
        output_file (str): path of the csv report to write

    Returns:
        The Records object holding the checked records.
    """
    logger.info("Starting...")
    validated = Records(auth)
    validated.read_records_from_manifest(MANIFEST)
    validated.check_records()
    validated.save_download_check_results_to_csv(output_file)
    return validated

0 commit comments

Comments
 (0)