@@ -242,32 +242,47 @@ def create_months(start_month, end_month):
242242 months .append (f"{ year } -{ str (month ).zfill (2 )} " )
243243 return months
244244
def page_count_and_setup(collection, all_query, missing_query, range_label, invalid_data):
    """Count the documents in a range, set up its stats, and return the page count.

    Args:
        collection: Object exposing ``count_documents(filter, max_time_ms=...)``
            (presumably a PyMongo collection -- confirm against the caller).
        all_query: Filter matching every document in the range.
        missing_query: Filter matching only the documents that still need
            processing (the ones missing a license).
        range_label: Label for the range, used in the stats and in the output.
        invalid_data: Passed through unchanged to ``initialize_stats``.

    Returns:
        The number of ``PAGE_SIZE`` pages needed to walk the documents matched
        by ``missing_query`` after ``INITIAL_SKIP`` is applied, or 0 when there
        is nothing to process.
    """
    all_docs_count = collection.count_documents(
        all_query,
        max_time_ms=10000000,
    )
    docs_with_missing_count = collection.count_documents(
        missing_query,
        max_time_ms=10000000,
    )
    initialize_stats(range_label, docs_with_missing_count, invalid_data)

    if docs_with_missing_count == 0:
        if DRYRUN:
            # Same column order as the non-zero row below:
            # range, # all docs, # missing, % missing, est hours, est days.
            print(f"{range_label}, {all_docs_count}, 0, 0%, 0, 0")
        else:
            print(f"No documents found with missing licenses out of {all_docs_count} total in {range_label}.")
        return 0

    if INITIAL_SKIP > 0:
        print(f"Skipping {INITIAL_SKIP} documents")
        # Clamp at zero: skipping more documents than exist must not produce a
        # negative count, which would turn into a negative page count.
        docs_with_missing_count = max(docs_with_missing_count - INITIAL_SKIP, 0)

    # Ceiling division: a partially filled last page still counts as a page.
    page_count = (docs_with_missing_count + PAGE_SIZE - 1) // PAGE_SIZE

    # NOTE(review): 2.5 looks like an assumed number of minutes per page -- confirm.
    est_hours_to_complete = round(page_count * 2.5 / 60)
    est_completion_time = datetime.now() + timedelta(hours=est_hours_to_complete)

    if DRYRUN:
        # Scale to a percentage before rounding so float noise
        # (e.g. 12.340000000000002) does not leak into the CSV output.
        if all_docs_count:
            percent_missing = round(docs_with_missing_count / all_docs_count * 100, 2)
        else:
            percent_missing = 0.0
        est_days_to_complete = round(est_hours_to_complete / 24, 2)
        print(f"{range_label}, {all_docs_count}, {docs_with_missing_count}, {percent_missing}%, {est_hours_to_complete}, {est_days_to_complete}")
    else:
        print(f"Found {docs_with_missing_count} documents missing licenses out of {all_docs_count} total in {range_label}. Estimated time to complete is {est_hours_to_complete} hours ending at {est_completion_time}.")

    return page_count
267281
268282def analyze_docs (collection , query , range_label , invalid_data , one_pass = False ):
269283 """Analyze the documents in the collection for the given query"""
270- page_count = page_count_and_setup (collection , query , range_label , invalid_data )
284+ missing_query = {** query , "licensed.declared" : {"$exists" : False }}
285+ page_count = page_count_and_setup (collection , query , missing_query , range_label , invalid_data )
271286 if page_count == 0 or DRYRUN :
272287 return
273288
@@ -279,7 +294,7 @@ def analyze_docs(collection, query, range_label, invalid_data, one_pass=False):
279294 skip = INITIAL_SKIP
280295 while True :
281296 print (f"Processing page { page + 1 } of { page_count } in { range_label } starting at offset { skip } - { datetime .now ()} " )
282- docs = collection .find (query ).skip (skip ).limit (PAGE_SIZE ).max_time_ms (10000000 )
297+ docs = collection .find (missing_query ).skip (skip ).limit (PAGE_SIZE ).max_time_ms (10000000 )
283298 new_docs_count , new_invalid_count = analyze_page_of_docs (docs , running_count_docs , running_count_invalid , range_label , invalid_data )
284299 running_count_invalid += new_invalid_count
285300 running_count_docs += new_docs_count
@@ -395,13 +410,15 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
395410 print ("Processing custom date range" )
396411 print (f" START_DATE: { START_DATE } " )
397412 print (f" END_DATE: { END_DATE } " )
398-
413+
if DRYRUN:
    # Column header for the comma-separated dry-run rows; the stray "}" after
    # "est hours to complete" has been removed.
    print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")
416+
399417 label = custom_range_label () if not DRYRUN else f"{ custom_range_label ()} _dryrun"
400418 analyze_docs (
401419 collection ,
402420 {
403421 "_meta.updated" : {"$gte" : START_DATE , "$lte" : END_DATE },
404- "licensed.declared" : {"$exists" : False },
405422 },
406423 label ,
407424 invalid_data ,
@@ -415,15 +432,19 @@ def analyze_page_of_docs(docs, running_count_docs, running_count_invalid, range_
415432 months = create_months (START_MONTH , END_MONTH )
416433 print (f" { months } " )
417434
if DRYRUN:
    # Column header for the comma-separated dry-run rows; the stray "}" after
    # "est hours to complete" has been removed.
    print("Range, # all docs, # missing, % missing, est hours to complete, est days to complete")
437+
418438 for month in months :
419- print (f"Processing { month } " )
439+ if not DRYRUN :
440+ print (f"Processing { month } " )
420441
421442 label = month if not DRYRUN else f"{ month } _dryrun"
443+
422444 analyze_docs (
423445 collection ,
424446 {
425447 "_meta.updated" : {"$gte" : f"{ month } -01" , "$lte" : f"{ month } -31" },
426- "licensed.declared" : {"$exists" : False },
427448 },
428449 label ,
429450 invalid_data ,
0 commit comments