1515import math
1616
1717from delphi .epidata .acquisition .rvdss .constants import (
18- DASHBOARD_BASE_URLS_2023 , HISTORIC_SEASON_URL ,
18+ DASHBOARD_BASE_URLS_2023_2024_SEASON , HISTORIC_SEASON_URL ,
1919 ALTERNATIVE_SEASON_BASE_URL , SEASON_BASE_URL , LAST_WEEK_OF_YEAR ,
2020 RESP_COUNTS_OUTPUT_FILE , POSITIVE_TESTS_OUTPUT_FILE
2121 )
@@ -91,9 +91,8 @@ def get_report_date(week,start_year,epi=False):
9191 report_date = str (epi_week )
9292
9393 return (report_date )
94-
9594
96- def parse_table_captions (soup ):
95+ def extract_captions_of_interest (soup ):
9796 """
9897 finds all the table captions for the current week so tables can be identified
9998
@@ -369,6 +368,8 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
369368 return (table )
370369
371370def get_season_reports (url ):
371+ # From the url, go to the main landing page for a season
372+ # which contains all the links to each week in the season
372373 page = requests .get (url )
373374 soup = BeautifulSoup (page .text ,'html.parser' )
374375
@@ -387,7 +388,9 @@ def get_season_reports(url):
387388 current_week = weeks [week_num ]
388389 current_week_end = end_dates [week_num ]
389390
390- # Skip empty pages
391+ # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
392+ # the abbreviations table and the headers for the respiratory detections
393+ # table, so they are effectively empty and are skipped
391394 if season [0 ] == '2019' :
392395 if current_week == 5 or current_week == 47 :
393396 continue
@@ -396,7 +399,7 @@ def get_season_reports(url):
396399 temp_url = urls [week_num ]
397400 temp_page = requests .get (temp_url )
398401 new_soup = BeautifulSoup (temp_page .text , 'html.parser' )
399- captions = parse_table_captions (new_soup )
402+ captions = extract_captions_of_interest (new_soup )
400403 modified_date = get_modified_dates (new_soup ,current_week_end )
401404
402405 positive_tables = []
@@ -405,55 +408,87 @@ def get_season_reports(url):
405408 caption = captions [i ]
406409 tab = caption .find_next ('table' )
407410
408- # Remove footers from tables
411+ # Remove footers from tables so the text isn't read in as a table row
409412 if tab .find ('tfoot' ):
410413 tab .tfoot .decompose ()
411414
412- # Delete duplicate entry from week 35 of the 2019-2020 season
415+ # In the positive adenovirus table in week 35 of the 2019-2020 season,
416+ # the week number has been duplicated, which shifts all the entries in the table
417+ # one column to the right of where they should be. To fix this, the duplicated
418+ # entry in the table (which is the first "td" element in the html) is deleted
413419 if season [0 ] == '2019' and current_week == 35 :
414420 if "Positive Adenovirus" in caption .text :
415421 tab .select_one ('td' ).decompose ()
416422
417423 # Replace commas with periods
424+ # Some "number of detections" tables have numbers with commas (e.g. 1,000).
425+ # In that case the commas must be deleted; otherwise the commas are turned into periods,
426+ # because some tables have commas instead of decimal points
418427 if "number" not in caption .text .lower ():
419428 tab = re .sub ("," ,r"." ,str (tab ))
420429 else :
421430 tab = re .sub ("," ,"" ,str (tab ))
422431
423- # Read table
432+ # Read table, coding all the abbreviations for missing data into NA
433+ # Also use dropna because removing footers causes the html to have an empty row
424434 na_values = ['N.A.' ,'N.A' , 'N.C.' ,'N.R.' ,'Not Available' ,'Not Tested' ,"N.D." ,"-" ]
425435 table = pd .read_html (tab ,na_values = na_values )[0 ].dropna (how = "all" )
426436
427437 # Check for multiline headers
438+ # If there are any, combine them into a single line header
428439 if isinstance (table .columns , pd .MultiIndex ):
429440 table .columns = [c [0 ] + " " + c [1 ] if c [0 ] != c [1 ] else c [0 ] for c in table .columns ]
430441
431442 # Make column names lowercase
432443 table .columns = table .columns .str .lower ()
433444
445+ # One-off edge cases where tables need to be manually adjusted because
446+ # they will cause errors otherwise
434447 if season [0 ] == '2017' :
435448 if current_week == 35 and "entero" in caption .text .lower ():
436- # Remove french from headers in week 35 for the entero table
449+ # The positive enterovirus table in week 35 of the 2017-2018 season has French
450+ # in the headers, so the French needs to be removed
437451 table .columns = ['week' , 'week end' , 'canada tests' , 'entero/rhino%' , 'at tests' ,
438452 'entero/rhino%.1' , 'qc tests' , 'entero/rhino%.2' , 'on tests' ,
439453 'entero/rhino%.3' , 'pr tests' , 'entero/rhino%.4' , 'bc tests' ,
440454 'entero/rhino%.5' ]
441455 elif current_week == 35 and "adeno" in caption .text .lower ():
442- # Remove > from column name
456+ # In week 35 of the 2017-2018 season, the positive adenovirus table has ">week end"
457+ # instead of "week end", so remove > from the column name
443458 table = table .rename (columns = {'>week end' :"week end" })
444459 elif current_week == 47 and "rsv" in caption .text .lower ():
445- # fix date written as 201-11-25
460+ # In week 47 of the 2017-2018 season, a date is written as 201-11-25,
461+ # instead of 2017-11-25
446462 table .loc [table ['week' ] == 47 , 'week end' ] = "2017-11-25"
447463 elif season [0 ] == '2015' and current_week == 41 :
448- # Fix date written m-d-y not d-m-y
464+ # In week 41 of the 2015-2016 season, a date is written in m-d-y format instead of d-m-y
449465 table = table .replace ("10-17-2015" ,"17-10-2015" ,regex = True )
450466 elif season [0 ] == '2022' and current_week == 11 and "hmpv" in caption .text .lower ():
451- # fix date written as 022-09-03
467+ # In week 11 of the 2022-2023 season, in the positive hmpv table,
468+ # a date is written as 022-09-03, instead of 2022-09-03
452469 table .loc [table ['week' ] == 35 , 'week end' ] = "2022-09-03"
453470
454471 # Rename columns
455472 table = preprocess_table_columns (table )
456473
474+ # If "reporting laboratory" is one of the columns of the table, the table must be
475+ # the "Respiratory virus detections " table for a given week
476+ # this is the lab level table that has weekly positive tests for each virus, with no revisions
477+ # and each row represents a lab
478+
479+ # If "number" is in the table caption, the table must be the
480+ # "Number of positive respiratory detections" table, for a given week
481+ # this is a national level table, reporting the number of detections for each virus,
482+ # this table has revisions, so each row is a week in the season, with weeks going from the
483+ # start of the season up to and including the current week
484+
485+ # If "positive" is in the table caption, the table must be one of the
486+ # "Positive [virus] Tests (%)" tables, for a given week
487+ # This is a region level table, reporting the total tests and percent positive tests for each virus,
488+ # this table has revisions, so each row is a week in the season, with weeks going from the
489+ # start of the season up to and including the current week
490+ # The columns have the region information (e.g. Pr tests, meaning this column has the tests for the Prairies)
491+
457492 if "reporting laboratory" in str (table .columns ):
458493 respiratory_detection_table = create_detections_table (table ,modified_date ,current_week ,current_week_end ,season [0 ])
459494 respiratory_detection_table = respiratory_detection_table .set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
@@ -465,9 +500,13 @@ def get_season_reports(url):
465500 flu = " influenza" in caption .text .lower ()
466501
467502 # tables are missing week 53
468- if season [0 ]== "2014" and current_week == 2 :
469- overwrite_weeks = True
470- elif season [0 ]== "2014" and current_week == 3 :
503+ # In the 2014-2015 season the year ends at week 53 before starting at week 1 again.
504+ # Weeks 53, 2 and 3 skip week 53 in the positive detection tables, going from 52 to 1,
505+ # this means the week numbers following 52 are 1 larger than they should be
506+ # fix this by overwriting the week number columns
507+
508+ missing_week_53 = [53 ,2 ,3 ]
509+ if season [0 ]== "2014" and current_week in missing_week_53 :
471510 overwrite_weeks = True
472511 else :
473512 overwrite_weeks = False
@@ -491,6 +530,8 @@ def get_season_reports(url):
491530
492531 # Check if the indices are already in the season table
493532 # If not, add the weeks tables into the season table
533+
534+ # check for deduplication pandas
494535 if not respiratory_detection_table .index .isin (all_respiratory_detection_table .index ).any ():
495536 all_respiratory_detection_table = pd .concat ([all_respiratory_detection_table ,respiratory_detection_table ])
496537
@@ -519,7 +560,7 @@ def main():
519560 old_detection_data = pd .read_csv ('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
520561 old_positive_data = pd .read_csv ('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
521562
522- for base_url in DASHBOARD_BASE_URLS_2023 :
563+ for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON :
523564 # Get weekly dashboard data
524565 weekly_data = get_weekly_data (base_url ,2023 ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
525566 positive_data = get_revised_data (base_url )
0 commit comments