Skip to content

Commit 8259871

Browse files
committed
print as CSV when DRYRUN; add total count during range
1 parent 291234c commit 8259871

File tree

1 file changed

+37
-16
lines changed

1 file changed

+37
-16
lines changed

tools/analyze_data_synchronization/analyze.py

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -242,32 +242,47 @@ def create_months(start_month, end_month):
242242
months.append(f"{year}-{str(month).zfill(2)}")
243243
return months
244244

245-
def page_count_and_setup(collection, query, range_label, invalid_data):
245+
def page_count_and_setup(collection, all_query, missing_query, range_label, invalid_data):
246246
"""Get the count of pages and set up the stats for the range"""
247-
all_docs_count = collection.count_documents(query,
248-
max_time_ms=10000000)
249-
initialize_stats(range_label, all_docs_count, invalid_data)
250-
if all_docs_count == 0:
251-
print(f"No documents found with missing licenses in {range_label}.")
247+
all_docs_count = collection.count_documents(
248+
all_query,
249+
max_time_ms=10000000
250+
)
251+
252+
docs_with_missing_count = collection.count_documents(
253+
missing_query,
254+
max_time_ms=10000000
255+
)
256+
initialize_stats(range_label, docs_with_missing_count, invalid_data)
257+
if docs_with_missing_count == 0:
258+
if DRYRUN:
259+
print(f"{range_label}, {all_docs_count}, 0, 0%, 0, 0")
260+
else:
261+
print(f"No documents found with missing licenses out of {all_docs_count} total in {range_label}.")
252262
return 0
253263

254264
if INITIAL_SKIP > 0:
255265
print(f"Skipping {INITIAL_SKIP} documents")
256-
all_docs_count -= INITIAL_SKIP
266+
docs_with_missing_count -= INITIAL_SKIP
257267

258-
page_count = all_docs_count // PAGE_SIZE
259-
if all_docs_count % PAGE_SIZE:
268+
page_count = docs_with_missing_count // PAGE_SIZE
269+
if docs_with_missing_count % PAGE_SIZE:
260270
page_count += 1
261271

262272
est_hours_to_complete = round(page_count * 2.5 / 60)
263273
est_completion_time = datetime.now() + timedelta(hours=est_hours_to_complete)
264-
print(f"Found {all_docs_count} documents missing licenses in {range_label}. Estimated time to complete is {est_hours_to_complete} hours ending at {est_completion_time}.")
274+
275+
if DRYRUN:
276+
print(f"{range_label}, {all_docs_count}, {docs_with_missing_count}, {round(docs_with_missing_count/all_docs_count, 4)*100}%, {est_hours_to_complete}, {est_hours_to_complete / 24}")
277+
else:
278+
print(f"Found {docs_with_missing_count} documents missing licenses out of {all_docs_count} total in {range_label}. Estimated time to complete is {est_hours_to_complete} hours ending at {est_completion_time}.")
265279

266280
return page_count
267281

268282
def analyze_docs(collection, query, range_label, invalid_data, one_pass=False):
269283
"""Analyze the documents in the collection for the given query"""
270-
page_count = page_count_and_setup(collection, query, range_label, invalid_data)
284+
missing_query = {**query, "licensed.declared": {"$exists": False}}
285+
page_count = page_count_and_setup(collection, query, missing_query, range_label, invalid_data)
271286
if page_count == 0 or DRYRUN:
272287
return
273288

@@ -279,7 +294,7 @@ def analyze_docs(collection, query, range_label, invalid_data, one_pass=False):
279294
skip = INITIAL_SKIP
280295
while True:
281296
print(f"Processing page {page+1} of {page_count} in {range_label} starting at offset {skip} - {datetime.now()}")
282-
docs = collection.find(query).skip(skip).limit(PAGE_SIZE).max_time_ms(10000000)
297+
docs = collection.find(missing_query).skip(skip).limit(PAGE_SIZE).max_time_ms(10000000)
283298
new_docs_count, new_invalid_count = analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_label, invalid_data)
284299
running_count_invalid += new_invalid_count
285300
running_count_docs += new_docs_count
@@ -395,13 +410,15 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
395410
print("Processing custom date range")
396411
print(f" START_DATE: {START_DATE}")
397412
print(f" END_DATE: {END_DATE}")
398-
413+
414+
if DRYRUN:
415+
print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")
416+
399417
label = custom_range_label() if not DRYRUN else f"{custom_range_label()}_dryrun"
400418
analyze_docs(
401419
collection,
402420
{
403421
"_meta.updated": {"$gte": START_DATE, "$lte": END_DATE},
404-
"licensed.declared": {"$exists": False},
405422
},
406423
label,
407424
invalid_data,
@@ -415,15 +432,19 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
415432
months = create_months(START_MONTH, END_MONTH)
416433
print(f" {months}")
417434

435+
if DRYRUN:
436+
print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")
437+
418438
for month in months:
419-
print(f"Processing {month}")
439+
if not DRYRUN:
440+
print(f"Processing {month}")
420441

421442
label = month if not DRYRUN else f"{month}_dryrun"
443+
422444
analyze_docs(
423445
collection,
424446
{
425447
"_meta.updated": {"$gte": f"{month}-01", "$lte": f"{month}-31"},
426-
"licensed.declared": {"$exists": False},
427448
},
428449
label,
429450
invalid_data,

0 commit comments

Comments
 (0)