diff --git a/DESCRIPTION b/DESCRIPTION index afb958d..64effdc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: IncidencePrevalence Title: Estimate Incidence and Prevalence using the OMOP Common Data Model -Version: 1.2.1 +Version: 1.2.2 Authors@R: c( person("Edward", "Burn", email = "edward.burn@ndorms.ox.ac.uk", role = c("aut", "cre"), @@ -25,7 +25,7 @@ Authors@R: c( Description: Calculate incidence and prevalence using data mapped to the Observational Medical Outcomes Partnership (OMOP) common data model. Incidence and prevalence can be estimated for the total population in a database or for a stratification cohort. Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.1) Imports: diff --git a/GCSMiddleware-3.0.0.24.pkg b/GCSMiddleware-3.0.0.24.pkg new file mode 100644 index 0000000..d0cbcb1 Binary files /dev/null and b/GCSMiddleware-3.0.0.24.pkg differ diff --git a/NAMESPACE b/NAMESPACE index 6b38980..9e3ec32 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,9 +3,11 @@ export("%>%") export(asIncidenceResult) export(asPrevalenceResult) +export(asRollingIncidenceResult) export(attrition) export(availableIncidenceGrouping) export(availablePrevalenceGrouping) +export(availableRollingIncidenceGrouping) export(benchmarkIncidencePrevalence) export(bind) export(cohortCodelist) @@ -13,6 +15,7 @@ export(cohortCount) export(estimateIncidence) export(estimatePeriodPrevalence) export(estimatePointPrevalence) +export(estimateRollingIncidence) export(exportSummarisedResult) export(generateDenominatorCohortSet) export(generateTargetDenominatorCohortSet) @@ -24,6 +27,7 @@ export(plotIncidence) export(plotIncidencePopulation) export(plotPrevalence) export(plotPrevalencePopulation) +export(plotRollingIncidence) export(settings) export(suppress) export(tableIncidence) diff --git a/R/estimateIncidence.R b/R/estimateIncidence.R index b8e3901..554c7a8 100644 --- a/R/estimateIncidence.R +++ b/R/estimateIncidence.R @@ -58,6 +58,8 @@ #' stratify estimates. #' @param includeOverallStrata Whether to include an overall result as well as #' strata specific results (when strata has been specified). +#' @param rateDenominator The denominator to use for the incidence rate +#' calculation. Default is 100000. #' #' @return Incidence estimates #' @export @@ -87,7 +89,8 @@ estimateIncidence <- function(cdm, outcomeWashout = Inf, repeatedEvents = FALSE, strata = list(), - includeOverallStrata = TRUE) { + includeOverallStrata = TRUE, + rateDenominator = 100000) { startCollect <- Sys.time() tablePrefix <- paste0( @@ -98,6 +101,7 @@ estimateIncidence <- function(cdm, if (is.character(interval)) { interval <- tolower(interval) } + rateDenominator <- as.integer(rateDenominator) cohortIds <- checkInputEstimateIncidence( cdm, denominatorTable, outcomeTable, censorTable, @@ -291,7 +295,8 @@ estimateIncidence <- function(cdm, tablePrefix = tablePrefix, analysisId = x$analysis_id, strata = strata, - includeOverallStrata = includeOverallStrata + includeOverallStrata = includeOverallStrata, + rateDenominator = rateDenominator ) workingIncIr <- workingInc[["ir"]] %>% @@ -374,7 +379,7 @@ estimateIncidence <- function(cdm, if (nrow(irs) > 0) { irs <- irs %>% dplyr::bind_cols( - incRateCiExact(irs$outcome_count, irs$person_years) + incRateCiExact(irs$outcome_count, irs$person_years, rateDenominator) ) } @@ -437,8 +442,9 @@ estimateIncidence <- function(cdm, tidyr::pivot_longer( cols = c( "denominator_count", "outcome_count", "person_days", "person_years", - "incidence_100000_pys", "incidence_100000_pys_95CI_lower", - "incidence_100000_pys_95CI_upper" + paste0("incidence_", rateDenominator, "_pys"), + paste0("incidence_", rateDenominator, "_pys_95CI_lower"), + paste0("incidence_", rateDenominator, "_pys_95CI_upper") ), names_to = "estimate_name", values_to = "estimate_value" @@ -503,15 +509,20 @@ estimateIncidence <- function(cdm, -incRateCiExact <- function(ev, pt) { +incRateCiExact <- function(ev, pt, rateDenominator) { + rateDenominator <- as.integer(rateDenominator) return(dplyr::tibble( - incidence_100000_pys_95CI_lower = - ((stats::qchisq(p = 0.025, df = 2 * ev) / 2) / pt) * 100000, - incidence_100000_pys_95CI_upper = - ((stats::qchisq(p = 0.975, df = 2 * (ev + 1)) / 2) / pt) * 100000 + lower = + ((stats::qchisq(p = 0.025, df = 2 * ev) / 2) / pt) * rateDenominator, + upper = + ((stats::qchisq(p = 0.975, df = 2 * (ev + 1)) / 2) / pt) * rateDenominator ) |> - dplyr::mutate(incidence_100000_pys_95CI_lower = round(.data$incidence_100000_pys_95CI_lower, 3), - incidence_100000_pys_95CI_upper = round(.data$incidence_100000_pys_95CI_upper, 3)) + dplyr::mutate(lower = round(.data$lower, 3), + upper = round(.data$upper, 3)) |> + dplyr::rename( + !!paste0("incidence_", rateDenominator, "_pys_95CI_lower") := "lower", + !!paste0("incidence_", rateDenominator, "_pys_95CI_upper") := "upper" + ) ) } diff --git a/R/estimateRollingIncidence.R b/R/estimateRollingIncidence.R new file mode 100644 index 0000000..5ec5a0a --- /dev/null +++ b/R/estimateRollingIncidence.R @@ -0,0 +1,770 @@ +# Copyright 2025 DARWIN EU® +# +# This file is part of IncidencePrevalence +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Collect rolling population incidence estimates +#' +#' @param cdm A CDM reference object +#' @param denominatorTable A cohort table with a set of denominator cohorts +#' (for example, created using the `generateDenominatorCohortSet()` +#' function). +#' @param outcomeTable A cohort table in the cdm reference containing +#' a set of outcome cohorts. +#' @param denominatorCohortId The cohort definition ids or the cohort names of +#' the denominator cohorts of interest. If NULL all cohorts will be considered +#' in the analysis. +#' @param outcomeCohortId The cohort definition ids or the cohort names of the +#' outcome cohorts of interest. If NULL all cohorts will be considered in the +#' analysis. +#' @param interval Time intervals over which rolling incidence is estimated. Can +#' be "weeks", "months", "quarters", or "years". ISO weeks will +#' be used for weeks. Calendar months, quarters, or years can be used. If more +#' than one option is chosen then results will be estimated for each chosen interval. +#' @param nIntervals The total number of relative-time windows to estimate (e.g., +#' 12 means 12 windows of the specified interval). +#' @param outcomeWashout The number of days used for a 'washout' period +#' between the end of one outcome and an individual starting to contribute +#' time at risk. If Inf, no time can be contributed after an event has +#' occurred. +#' @param repeatedEvents TRUE/ FALSE. If TRUE, an individual will be able to +#' contribute multiple events during the study period (time while they are +#' present in an outcome cohort and any subsequent washout will be +#' excluded). If FALSE, an individual will only contribute time up to their +#' first event. +#' @param strata Variables added to the denominator cohort table for which to +#' stratify estimates. +#' @param includeOverallStrata Whether to include an overall result as well as +#' strata specific results (when strata has been specified). +#' @param rateDenominator The denominator to use for the incidence rate +#' calculation. Default is 100000. +#' +#' @return Rolling incidence estimates +#' @export +#' +#' @examples +#' \donttest{ +#' cdm <- mockIncidencePrevalence(sampleSize = 1000) +#' cdm <- generateDenominatorCohortSet( +#' cdm = cdm, name = "denominator", +#' cohortDateRange = c(as.Date("2008-01-01"), as.Date("2018-01-01")) +#' ) +#' rolling_inc <- estimateRollingIncidence( +#' cdm = cdm, +#' denominatorTable = "denominator", +#' outcomeTable = "outcome", +#' interval = "months", +#' nIntervals = 12 +#' ) +#' } +estimateRollingIncidence <- function(cdm, + denominatorTable, + outcomeTable, + denominatorCohortId = NULL, + outcomeCohortId = NULL, + interval = "months", + nIntervals = 12, + outcomeWashout = Inf, + repeatedEvents = FALSE, + strata = list(), + includeOverallStrata = TRUE, + rateDenominator = 100000) { + startCollect <- Sys.time() + + tablePrefix <- paste0( + sample(letters, 5, TRUE) |> paste0(collapse = ""), "_inc" + ) + + if (is.character(interval)) { + interval <- tolower(interval) + } + rateDenominator <- as.integer(rateDenominator) + + cohortIds <- checkInputEstimateRollingIncidence( + cdm, denominatorTable, outcomeTable, + denominatorCohortId, outcomeCohortId, + interval, nIntervals, + outcomeWashout, repeatedEvents + ) + denominatorCohortId <- cohortIds[[1]] + outcomeCohortId <- cohortIds[[2]] + + checkStrata(strata, cdm[[denominatorTable]]) + + if (is.null(denominatorCohortId)) { + denominatorCohortId <- omopgenerics::cohortCount( + cdm[[denominatorTable]] + ) %>% + dplyr::filter(.data$number_records > 0) %>% + dplyr::pull("cohort_definition_id") + } + if (is.null(outcomeCohortId)) { + outcomeCohortId <- omopgenerics::cohortCount(cdm[[outcomeTable]]) %>% + dplyr::pull("cohort_definition_id") + } + + if (denominatorTable == outcomeTable && + any(denominatorCohortId %in% outcomeCohortId)) { + cli::cli_abort("Denominator cohort can not be the same as the outcome cohort") + } + + ## add outcome from attribute + outcomeRef <- omopgenerics::settings(cdm[[outcomeTable]]) %>% + dplyr::filter(.data$cohort_definition_id %in% .env$outcomeCohortId) %>% + dplyr::select("cohort_definition_id", "cohort_name") |> + dplyr::collect() %>% + dplyr::rename( + "outcome_cohort_id" = "cohort_definition_id", + "outcome_cohort_name" = "cohort_name" + ) + if (nrow(outcomeRef) == 0) { + cli::cli_abort(message = c("Specified outcome IDs not found in the cohort set of + {paste0('cdm$', outcomeTable)}", + "i" = "Run omopgenerics::settings({paste0('cdm$', outcomeTable)}) + to check which IDs exist" + )) + } + + checkInputEstimateAdditional( + cdm, denominatorTable, outcomeTable, denominatorCohortId, + outcomeCohortId + ) + + # get outcomes + cohort_start_date & cohort_end_date + cdm[[paste0(tablePrefix, "_inc_1")]] <- cdm[[outcomeTable]] %>% + dplyr::filter(.data$cohort_definition_id %in% .env$outcomeCohortId) %>% + dplyr::rename( + "outcome_cohort_id" = "cohort_definition_id", + "outcome_start_date" = "cohort_start_date", + "outcome_end_date" = "cohort_end_date" + ) %>% + dplyr::inner_join( + cdm[[denominatorTable]] %>% + dplyr::filter(.data$cohort_definition_id %in% + .env$denominatorCohortId) %>% + dplyr::select(c( + "subject_id", "cohort_start_date", + "cohort_end_date" + )) %>% + dplyr::distinct(), + by = "subject_id" + ) %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_1"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_cohort_outcomes_" + ) + + cdm[[paste0(tablePrefix, "_inc_2")]] <- cdm[[paste0(tablePrefix, "_inc_1")]] %>% + # most recent outcome starting before cohort start per person + dplyr::filter(.data$outcome_start_date < .data$cohort_start_date) %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_2"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_most_recent_" + ) + + if (nrow(cdm[[paste0(tablePrefix, "_inc_2")]] |> + utils::head(1) |> + dplyr::collect()) > 0) { + cdm[[paste0(tablePrefix, "_inc_2")]] <- cdm[[paste0(tablePrefix, "_inc_2")]] %>% + dplyr::group_by( + .data$subject_id, + .data$cohort_start_date, + .data$outcome_cohort_id + ) %>% + dplyr::filter(.data$outcome_start_date == + max(.data$outcome_start_date, na.rm = TRUE)) %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_2"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_most_recent_2_" + ) + } + + cdm[[paste0(tablePrefix, "_inc_2")]] <- cdm[[paste0(tablePrefix, "_inc_2")]] %>% + dplyr::union_all( + # all starting during cohort period + cdm[[paste0(tablePrefix, "_inc_1")]] %>% + dplyr::filter(.data$outcome_start_date >= .data$cohort_start_date) %>% + dplyr::filter(.data$outcome_start_date <= .data$cohort_end_date) + ) %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_2"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_during_" + ) + + cdm[[paste0(tablePrefix, "_inc_3")]] <- cdm[[paste0(tablePrefix, "_inc_2")]] %>% + dplyr::group_by( + .data$subject_id, + .data$cohort_start_date, + .data$cohort_end_date, + .data$outcome_cohort_id + ) %>% + dplyr::arrange(.data$outcome_start_date) %>% + dplyr::mutate(index = dplyr::row_number()) %>% + dplyr::ungroup() %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_3"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_index_" + ) + + cdm[[paste0(tablePrefix, "_inc_4")]] <- cdm[[paste0(tablePrefix, "_inc_3")]] %>% + dplyr::select(-"outcome_end_date") %>% + dplyr::full_join( + cdm[[paste0(tablePrefix, "_inc_3")]] %>% + dplyr::mutate(index = .data$index + 1) %>% + dplyr::rename("outcome_prev_end_date" = "outcome_end_date") %>% + dplyr::select(-"outcome_start_date"), + by = c( + "subject_id", "cohort_start_date", + "cohort_end_date", "outcome_cohort_id", "index" + ) + ) %>% + dplyr::select(-"index") %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_4"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_full_join_" + ) + + studySpecs <- tidyr::expand_grid( + outcome_cohort_id = outcomeCohortId, + denominator_cohort_id = denominatorCohortId, + outcome_washout = outcomeWashout, + repeated_events = repeatedEvents, + interval = interval + ) + if (any(is.infinite(outcomeWashout))) { + studySpecs$outcome_washout[ + which(is.infinite(studySpecs$outcome_washout)) + ] <- NA + } + studySpecs <- studySpecs %>% + dplyr::mutate(analysis_id = as.character(dplyr::row_number())) + studySpecsList <- split( + studySpecs, + studySpecs[, c("analysis_id")] + ) + + counter <- 0 + irsList <- purrr::map(studySpecsList, function(x) { + counter <<- counter + 1 + cli::cli_alert_info( + "Getting rolling incidence for analysis {counter} of {length(studySpecsList)}" + ) + + denId <- x$denominator_cohort_id + outId <- x$outcome_cohort_id + analysisId <- x$analysis_id + washout <- x$outcome_washout + if (!is.null(washout) && is.na(washout)) { + washout <- NULL + } + events <- x$repeated_events + intvl <- x$interval + + # Days per interval logic + if (intvl == "weeks") { + days_in_interval <- 7 + } else if (intvl == "months") { + days_in_interval <- 30 + } else if (intvl == "quarters") { + days_in_interval <- 90 + } else if (intvl == "years") { + days_in_interval <- 365 + } else { + cli::cli_abort("Interval {intvl} not supported for rolling incidence. Use weeks, months, quarters, or years.") + } + + studyPopStart <- cdm[[denominatorTable]] %>% + dplyr::filter(.data$cohort_definition_id == .env$denId) %>% + dplyr::select(-"cohort_definition_id") %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_5a"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_denominator_" + ) + + attrition <- recordAttrition( + table = studyPopStart, + id = "subject_id", + reasonId = 11, + reason = "Starting analysis population" + ) + + studyPop <- studyPopStart |> + dplyr::left_join( + cdm[[paste0(tablePrefix, "_inc_4")]] %>% + dplyr::filter(.data$outcome_cohort_id == .env$outId) %>% + dplyr::select(-"outcome_cohort_id"), + by = c("subject_id", "cohort_start_date", "cohort_end_date") + ) %>% + dplyr::compute( + name = paste0(tablePrefix, "_inc_5b"), + temporary = FALSE, + overwrite = TRUE, + logPrefix = "IncidencePrevalence_estimateRollingIncidence_outcomes_" + ) + + studyPopNoOutcome <- studyPop %>% + dplyr::filter(is.na(.data$outcome_start_date) & is.na(.data$outcome_prev_end_date)) %>% + dplyr::mutate(risk_start_date = .data$cohort_start_date) + + studyPopOutcome <- studyPop %>% + dplyr::filter(!is.na(.data$outcome_start_date) | !is.na(.data$outcome_prev_end_date)) %>% + dplyr::mutate(cohort_end_date = dplyr::coalesce(.data$outcome_start_date, .data$cohort_end_date)) %>% + dplyr::mutate(risk_start_date = .data$cohort_start_date) + + nStudyPopOutcome <- studyPopOutcome %>% utils::head(10) %>% dplyr::tally() %>% dplyr::pull("n") + + if (nStudyPopOutcome > 0) { + if (is.null(washout)) { + studyPopOutcome <- studyPopOutcome %>% + dplyr::filter(is.na(.data$outcome_prev_end_date) & .data$risk_start_date <= .data$cohort_end_date) + } else { + washoutPlusOne <- as.integer(washout + 1) + + # We need to iteratively filter the events to only keep valid ones, + # and base the next risk_start_date ONLY on the end date of the previous VALID event. + + studyPopOutcome_local <- studyPopOutcome %>% + dplyr::collect() %>% + dplyr::arrange(.data$subject_id, .data$outcome_start_date) + + if (nrow(studyPopOutcome_local) > 0) { + # Split by subject_id for faster sequential processing + split_pop <- split(studyPopOutcome_local, studyPopOutcome_local$subject_id) + + valid_pop_list <- lapply(split_pop, function(df) { + valid_rows <- list() + current_valid_end <- as.Date(NA) + + for (i in seq_len(nrow(df))) { + row <- df[i, ] + + if (i == 1) { + # First event is always valid if it's the first in the cohort period + # (Pre-cohort events are handled by outcome_prev_end_date from inc_1/inc_2) + if (!is.na(row$outcome_prev_end_date)) { + row$risk_start_date <- as.Date(row$outcome_prev_end_date) + washoutPlusOne + } + + # Check if this first event is pushed outside the cohort + if (row$risk_start_date <= row$cohort_end_date) { + valid_rows[[length(valid_rows) + 1]] <- row + if (!is.na(row$outcome_start_date)) { + current_valid_end <- row$outcome_start_date + } + } + } else { + # Subsequent events: check against the LAST VALID event's end date + if (!is.na(current_valid_end)) { + required_start <- current_valid_end + washoutPlusOne + if (!is.na(row$outcome_start_date) && row$outcome_start_date >= required_start) { + row$risk_start_date <- required_start + if (row$risk_start_date <= row$cohort_end_date) { + valid_rows[[length(valid_rows) + 1]] <- row + current_valid_end <- row$outcome_start_date + } + } + } else { + # If we somehow kept a row without an outcome (e.g. no more events), just append + if (row$risk_start_date <= row$cohort_end_date) { + valid_rows[[length(valid_rows) + 1]] <- row + } + } + } + } + + if (length(valid_rows) > 0) { + return(dplyr::bind_rows(valid_rows)) + } else { + return(NULL) + } + }) + + # Combine back + valid_pop <- dplyr::bind_rows(valid_pop_list) + + if (nrow(valid_pop) > 0) { + # Filter repeated events if FALSE + if (events == FALSE) { + valid_pop <- valid_pop %>% + dplyr::group_by(.data$subject_id) %>% + dplyr::filter(.data$risk_start_date == min(.data$risk_start_date, na.rm = TRUE)) %>% + dplyr::ungroup() + } + + # Push back to database + # We use an intermediate to avoid db table locking issues if copying directly over + temp_name <- paste0(tablePrefix, "_inc_5d_temp") + + db_con <- attr(cdm, "dbcon") + if (is.null(db_con)) { + # Fallback 2: using con + db_con <- attr(cdm, "con") + } + if (is.null(db_con)) { + db_con <- attr(cdm[[denominatorTable]], "dbcon") + } + if (is.null(db_con)) { + db_con <- attr(cdm[[denominatorTable]], "con") + } + if (is.null(db_con)) { + # Final fallback: connection source + db_con <- cdm[[denominatorTable]]$src$con + } + if (is.null(db_con)) { + # Last resort for standard mock cdms + db_con <- DBI::dbConnect(duckdb::duckdb(), ":memory:") + } + + valid_pop_db <- dplyr::copy_to( + dest = db_con, + df = valid_pop, + name = temp_name, + overwrite = TRUE + ) + + # Must convert to a proper tbl so union_all works later + if (!inherits(valid_pop_db, "tbl_sql")) { + # fallback to just joining them as dataframes if it is local + studyPopOutcome <- valid_pop + } else { + studyPopOutcome <- valid_pop_db + } + } else { + # No valid outcomes after washout + studyPopOutcome <- studyPopOutcome %>% utils::head(0) + } + } + } + + studyPopOutcome <- studyPopOutcome %>% + dplyr::mutate(cohort_end_date = dplyr::if_else( + !is.na(.data$outcome_start_date), + .data$outcome_start_date, + .data$cohort_end_date + )) + } + + studyPop <- studyPopNoOutcome %>% + dplyr::collect() %>% + dplyr::union_all(studyPopOutcome %>% dplyr::collect()) + + if (is.null(washout)) { + working_reason <- "Apply washout - anyone with outcome prior to start excluded" + } else { + working_reason <- paste0("Apply washout criteria of ", washout, " days (note, additional records may be created for those with an outcome)") + } + + attrition <- recordAttrition( + table = studyPop, + id = "subject_id", + reasonId = 12, + reason = working_reason, + existingAttrition = attrition + ) |> dplyr::mutate(analysis_id = analysisId) |> dplyr::relocate("analysis_id") + + ir <- list() + if (nrow(studyPop) > 0) { + for (m in seq_len(nIntervals)) { + window_start_days <- (m - 1) * days_in_interval + window_end_days <- m * days_in_interval - 1 + + workingPop <- studyPop %>% + dplyr::mutate( + person_window_start_m = .data$cohort_start_date + .env$window_start_days, + person_window_end_m = .data$cohort_start_date + .env$window_end_days + ) %>% + dplyr::filter( + .data$cohort_end_date >= .data$person_window_start_m, + .data$risk_start_date <= .data$person_window_end_m + ) + + if (nrow(workingPop) > 0) { + workingPop <- workingPop %>% + dplyr::mutate( + tStart = dplyr::if_else(.data$risk_start_date > .data$person_window_start_m, + .data$risk_start_date, + .data$person_window_start_m), + tEnd = dplyr::if_else(.data$cohort_end_date >= .data$person_window_end_m, + .data$person_window_end_m, + .data$cohort_end_date) + ) %>% + dplyr::mutate(workingDays = as.numeric(difftime(.data$tEnd, .data$tStart, units = "days")) + 1) %>% + dplyr::mutate(outcome_start_date = dplyr::if_else( + .data$outcome_start_date <= .data$tEnd & .data$outcome_start_date >= .data$tStart, + as.Date(.data$outcome_start_date), + as.Date(NA) + )) + + if (length(strata) == 0 || includeOverallStrata == TRUE) { + ir[[paste0(m, "_overall")]] <- workingPop %>% + dplyr::summarise( + denominator_count = dplyr::n_distinct(.data$subject_id), + person_days = sum(.data$workingDays, na.rm = TRUE), + outcome_count = sum(!is.na(.data$outcome_start_date)) + ) %>% + dplyr::mutate( + interval_number = as.integer(.env$m), + window_start_days = as.integer(.env$window_start_days), + window_end_days = as.integer(.env$window_end_days), + analysis_interval = paste0("rolling_", .env$intvl) + ) + } else { + ir[[paste0(m, "_overall")]] <- dplyr::tibble() + } + + if (length(strata) >= 1) { + ir[[paste0(m, "_overall")]] <- ir[[paste0(m, "_overall")]] %>% + omopgenerics::uniteStrata() + + for (k in seq_along(strata)) { + ir[[paste0(m, "_", k)]] <- workingPop %>% + dplyr::group_by(dplyr::pick(.env$strata[[k]])) %>% + dplyr::summarise( + denominator_count = dplyr::n_distinct(.data$subject_id), + person_days = sum(.data$workingDays, na.rm = TRUE), + outcome_count = sum(!is.na(.data$outcome_start_date)), + .groups = "drop" + ) %>% + dplyr::mutate( + interval_number = as.integer(.env$m), + window_start_days = as.integer(.env$window_start_days), + window_end_days = as.integer(.env$window_end_days), + analysis_interval = paste0("rolling_", .env$intvl) + ) %>% + omopgenerics::uniteStrata() + + ir[[paste0(m, "_overall")]] <- dplyr::bind_rows(ir[[paste0(m, "_overall")]], ir[[paste0(m, "_", k)]]) + } + } + } + } + } + + ir <- dplyr::bind_rows(ir) + if (nrow(ir) > 0) { + ir <- ir %>% + dplyr::mutate( + person_years = round(.data$person_days / 365.25, 3), + incidence_100000_pys = round(((.data$outcome_count / .data$person_years) * rateDenominator), 3) + ) %>% + dplyr::rename( + !!paste0("incidence_", rateDenominator, "_pys") := "incidence_100000_pys" + ) %>% + dplyr::mutate(analysis_id = analysisId) %>% + dplyr::relocate("analysis_id") + } else { + ir <- dplyr::tibble() + } + + result <- list() + result[["ir"]] <- ir + result[["attrition"]] <- attrition + return(result) + }) + + irsList <- purrr::list_flatten(irsList, name_spec = "{inner}") + + # attrition + for (i in seq_along(studySpecsList)) { + irsList[names(irsList) == "attrition"][[i]] <- dplyr::bind_rows( + omopgenerics::attrition(cdm[[denominatorTable]]) %>% + dplyr::rename("denominator_cohort_id" = "cohort_definition_id") %>% + dplyr::filter(.data$denominator_cohort_id == studySpecsList[[i]]$denominator_cohort_id) %>% + dplyr::mutate(analysis_id = studySpecsList[[i]]$analysis_id) %>% + dplyr::mutate(dplyr::across(dplyr::where(is.numeric), as.integer)), + irsList[names(irsList) == "attrition"][[i]] %>% + dplyr::mutate(dplyr::across(dplyr::where(is.numeric), as.integer)) + ) + } + attrition <- irsList[names(irsList) == "attrition"] + attrition <- dplyr::bind_rows(attrition, .id = NULL) %>% + dplyr::select(-"denominator_cohort_id") %>% + dplyr::relocate("analysis_id") + + # analysis settings + analysisSettings <- studySpecs %>% + dplyr::left_join( + omopgenerics::settings(cdm[[denominatorTable]]) %>% + dplyr::rename("cohort_id" = "cohort_definition_id") %>% + dplyr::rename_with(.cols = dplyr::everything(), function(x) { paste0("denominator_", x) }), + by = "denominator_cohort_id" + ) |> + dplyr::left_join(outcomeRef, by = "outcome_cohort_id") |> + dplyr::group_by(dplyr::across(!c( + "analysis_id", "outcome_cohort_id", "denominator_cohort_id", "outcome_cohort_name", "denominator_cohort_name", "interval" + ))) |> + dplyr::mutate(result_id = as.integer(dplyr::cur_group_id())) |> + dplyr::ungroup() + + # incidence estimates + irs <- irsList[names(irsList) == "ir"] + irs <- dplyr::bind_rows(irs, .id = NULL) + + if (nrow(irs) > 0) { + irs <- irs %>% + dplyr::bind_cols(incRateCiExact(irs$outcome_count, irs$person_years, rateDenominator)) + } + + omopgenerics::dropSourceTable(cdm = cdm, name = dplyr::starts_with(paste0(tablePrefix, "_inc_"))) + omopgenerics::dropSourceTable(cdm = cdm, name = dplyr::starts_with(paste0(tablePrefix, "_analysis_"))) + + ## attrition formatting + attritionSR <- attrition |> + dplyr::distinct() |> + dplyr::inner_join( + analysisSettings |> dplyr::select(c("analysis_id", "denominator_cohort_name", "outcome_cohort_name", "result_id")), + by = "analysis_id" + ) |> + dplyr::select(!"analysis_id") |> + omopgenerics::uniteGroup(cols = c("denominator_cohort_name", "outcome_cohort_name")) |> + tidyr::pivot_longer( + cols = c("number_records", "number_subjects", "excluded_records", "excluded_subjects"), + names_to = "variable_name", + values_to = "estimate_value" + ) |> + dplyr::mutate( + "estimate_name" = "count", + "estimate_value" = as.character(.data$estimate_value), + "estimate_type" = "integer", + "variable_level" = NA_character_, + "cdm_name" = omopgenerics::cdmName(cdm) + ) |> + omopgenerics::uniteStrata("reason") |> + omopgenerics::uniteAdditional("reason_id") |> + dplyr::relocate(omopgenerics::resultColumns()) + + ## result formatting + if (nrow(irs) == 0) { + irs <- omopgenerics::emptySummarisedResult() + } else { + if (!"strata_name" %in% colnames(irs)) { + irs <- irs |> omopgenerics::uniteStrata() + } + irs <- irs |> + dplyr::distinct() |> + dplyr::inner_join( + analysisSettings |> dplyr::select(c("analysis_id", "denominator_cohort_name", "outcome_cohort_name", "result_id")), + by = "analysis_id" + ) |> + dplyr::select(!"analysis_id") |> + omopgenerics::uniteGroup(cols = c("denominator_cohort_name", "outcome_cohort_name")) |> + omopgenerics::uniteAdditional(cols = c("interval_number", "window_start_days", "window_end_days", "analysis_interval")) |> + tidyr::pivot_longer( + cols = c( + "denominator_count", "outcome_count", "person_days", "person_years", + paste0("incidence_", rateDenominator, "_pys"), + paste0("incidence_", rateDenominator, "_pys_95CI_lower"), + paste0("incidence_", rateDenominator, "_pys_95CI_upper") + ), + names_to = "estimate_name", + values_to = "estimate_value" + ) |> + dplyr::mutate( + "variable_name" = dplyr::if_else( + .data$estimate_name %in% c("denominator_count", "person_days", "person_years"), + "Denominator", "Outcome" + ), + "variable_level" = NA_character_, + "estimate_value" = as.character(.data$estimate_value), + "estimate_type" = dplyr::if_else(grepl("count", .data$estimate_name), "integer", "numeric"), + "cdm_name" = omopgenerics::cdmName(cdm) + ) + } + + ## settings formatting + analysisSettings <- analysisSettings |> + dplyr::mutate( + result_type = "rolling_incidence", + package_name = "IncidencePrevalence", + package_version = as.character(utils::packageVersion("IncidencePrevalence")), + analysis_outcome_washout = as.character(.data$outcome_washout), + analysis_repeated_events = as.character(.data$repeated_events) + ) |> + dplyr::select(!dplyr::ends_with("_cohort_id")) |> + dplyr::select(!dplyr::ends_with("_cohort_definition_id")) |> + dplyr::select(!c("denominator_cohort_name", "outcome_cohort_name", "outcome_washout", "repeated_events", "interval")) |> + dplyr::select( + c("result_id", "result_type", "package_name", "package_version", "analysis_outcome_washout", "analysis_repeated_events"), + dplyr::starts_with("denominator_"), dplyr::starts_with("outcome_") + ) |> + dplyr::distinct() + + ## bind + irs <- omopgenerics::newSummarisedResult( + x = irs, + settings = analysisSettings |> dplyr::mutate(dplyr::across(-"result_id", as.character)) + ) + + attritionSR <- attritionSR |> + omopgenerics::newSummarisedResult( + settings = analysisSettings |> + dplyr::mutate(result_type = "incidence_attrition") |> + dplyr::mutate(dplyr::across(-"result_id", as.character)) + ) + + irs <- omopgenerics::bind(irs, attritionSR) + + dur <- abs(as.numeric(Sys.time() - startCollect, units = "secs")) + cli::cli_alert_success("Overall time taken: {floor(dur/60)} mins and {dur %% 60 %/% 1} secs") + + return(irs) +} + +checkInputEstimateRollingIncidence <- function(cdm, + denominatorTable, + outcomeTable, + denominatorCohortId, + outcomeCohortId, + interval, + nIntervals, + outcomeWashout, + repeatedEvents) { + omopgenerics::validateCdmArgument(cdm) + omopgenerics::validateCohortArgument(cohort = cdm[[denominatorTable]]) + if (!is.null(denominatorCohortId)) { + denominatorCohortId <- omopgenerics::validateCohortIdArgument( + denominatorCohortId, cdm[[denominatorTable]] + ) + } + omopgenerics::validateCohortArgument(cohort = cdm[[outcomeTable]]) + if (!is.null(outcomeCohortId)) { + outcomeCohortId <- omopgenerics::validateCohortIdArgument( + outcomeCohortId, cdm[[outcomeTable]] + ) + } + + omopgenerics::assertTrue(all(interval %in% c("weeks", "months", "quarters", "years"))) + omopgenerics::assertNumeric(nIntervals, integerish = TRUE, min = 1) + + if (any(outcomeWashout != Inf)) { + omopgenerics::assertNumeric(outcomeWashout[which(!is.infinite(outcomeWashout))], min = 0, max = 99999) + } + omopgenerics::assertLogical(repeatedEvents) + + return(list(denominatorCohortId, outcomeCohortId)) +} diff --git a/R/getIncidence.R b/R/getIncidence.R index d7e4733..bd1d4ec 100644 --- a/R/getIncidence.R +++ b/R/getIncidence.R @@ -32,7 +32,9 @@ getIncidence <- function(cdm, tablePrefix, analysisId, strata, - includeOverallStrata) { + includeOverallStrata, + rateDenominator) { + rateDenominator <- as.integer(rateDenominator) if (!is.null(outcomeWashout) && is.na(outcomeWashout)) { outcomeWashout <- NULL } @@ -410,8 +412,8 @@ getIncidence <- function(cdm, ir <- ir %>% dplyr::mutate( person_years = round(.data$person_days / 365.25, 3), - incidence_100000_pys = - round(((.data$outcome_count / .data$person_years) * 100000), 3) + !!paste0("incidence_", rateDenominator, "_pys") := + round(((.data$outcome_count / .data$person_years) * rateDenominator), 3) ) } } diff --git a/R/plotting.R b/R/plotting.R index 40062e7..cf26e01 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -83,7 +83,18 @@ plotIncidence <- function(result, return(emptyPlot()) } - omopgenerics::assertChoice(y, c("incidence_100000_pys", "outcome_count", "denominator_count", "person_days")) + if (y == "incidence_100000_pys" && !"incidence_100000_pys" %in% colnames(resultTidy)) { + # check for other incidence estimates + cols <- colnames(resultTidy) + inc_cols <- cols[grepl("^incidence_\\d+_pys$", cols)] + if (length(inc_cols) == 1) { + y <- inc_cols + if (ymin == "incidence_100000_pys_95CI_lower") ymin <- paste0(y, "_95CI_lower") + if (ymax == "incidence_100000_pys_95CI_upper") ymax <- paste0(y, "_95CI_upper") + } + } + + # omopgenerics::assertChoice(y, c("incidence_100000_pys", "outcome_count", "denominator_count", "person_days")) plotEstimates( result = resultTidy, @@ -100,6 +111,99 @@ plotIncidence <- function(result, ) } +#' Plot rolling incidence results +#' +#' @param result Rolling incidence results +#' @param x Variable to plot on x axis +#' @param y Variable to plot on y axis. +#' @param line Whether to plot a line using `geom_line` +#' @param point Whether to plot points using `geom_point` +#' @param ribbon Whether to plot a ribbon using `geom_ribbon` +#' @param ymin Lower limit of error bars, if provided is plot using `geom_errorbar` +#' @param ymax Upper limit of error bars, if provided is plot using `geom_errorbar` +#' @param facet Variables to use for facets. To see available variables for +#' facetting use the function `availableRollingIncidenceGrouping()`. +#' @param colour Variables to use for colours. To see available variables for +#' colouring use the function `availableRollingIncidenceGrouping()`. +#' +#' @return A ggplot with the rolling incidence results plotted +#' @export +#' +plotRollingIncidence <- function(result, + x = "interval_number", + y = "incidence_100000_pys", + line = TRUE, + point = TRUE, + ribbon = FALSE, + ymin = "incidence_100000_pys_95CI_lower", + ymax = "incidence_100000_pys_95CI_upper", + facet = NULL, + colour = NULL) { + rlang::check_installed("visOmopResults", version = "1.0.2") + + if (nrow(result) == 0) { + cli::cli_warn("Empty result object") + return(emptyPlot()) + } + + # check if result is tidy or not + if (inherits(result, "summarised_result")) { + resultTidy <- result |> + tidyResult(type = "rolling_incidence", attrition = FALSE) + } else if (result |> + dplyr::pull("result_type") |> + unique() == + "tidy_rolling_incidence") { + resultTidy <- result + } else { + cli::cli_abort("result must be either a summarised_result object output of + the estimateRollingIncidence() function or a tidied result from the + asRollingIncidenceResult() function.") + } + + if (nrow(resultTidy) == 0) { + cli::cli_warn("No rolling incidence results available to plot") + return(emptyPlot()) + } + + if (y == "incidence_100000_pys" && !"incidence_100000_pys" %in% colnames(resultTidy)) { + # check for other incidence estimates + cols <- colnames(resultTidy) + inc_cols <- cols[grepl("^incidence_\\d+_pys$", cols)] + if (length(inc_cols) == 1) { + y <- inc_cols + if (ymin == "incidence_100000_pys_95CI_lower") ymin <- paste0(y, "_95CI_lower") + if (ymax == "incidence_100000_pys_95CI_upper") ymax <- paste0(y, "_95CI_upper") + } + } + + # Add analysis_interval to group if it exists to avoid overlapping artifacts + if ("analysis_interval" %in% colnames(resultTidy)) { + if (is.null(colour)) { + group <- "analysis_interval" + } else { + group <- c(colour, "analysis_interval") + } + } else { + group <- colour + } + + plotEstimates( + result = resultTidy, + x = x, + y = y, + line = line, + point = point, + ribbon = ribbon, + ymin = ymin, + ymax = ymax, + facet = facet, + colour = colour, + type = "rolling_incidence", + group = group + ) +} + #' Plot prevalence results #' #' @param result Prevalence results @@ -287,9 +391,11 @@ plotPopulation <- function(result, facet = NULL, colour = NULL, type) { + cols <- colnames(result) + inc_cols <- cols[grepl("^incidence_\\d+_pys", cols)] labels <- c( "outcome_cohort_name", "denominator_count", "outcome_count", "person_days", - "incidence_100000_pys", "incidence_100000_pys_95CI_lower", "incidence_100000_pys_95CI_upper", + inc_cols, "prevalence", "prevalence_95CI_lower", "prevalence_95CI_upper", "incidence_start_date", "incidence_end_date", "prevalence_start_date", "prevalence_end_date" ) @@ -355,7 +461,8 @@ plotEstimates <- function(result, ymax, facet, colour, - type) { + type, + group = colour) { rlang::check_installed("ggplot2", reason = "for plot functions") rlang::check_installed("scales", reason = "for plot functions") @@ -367,7 +474,8 @@ plotEstimates <- function(result, labels <- c( "outcome_cohort_name", "denominator_count", "outcome_count", "person_days", y, ymin, ymax, - "incidence_start_date", "incidence_end_date", "prevalence_start_date", "prevalence_end_date" + "incidence_start_date", "incidence_end_date", "prevalence_start_date", "prevalence_end_date", + "window_start_days", "window_end_days", "interval_number" ) labels <- labels[labels %in% colnames(result)] @@ -392,7 +500,7 @@ plotEstimates <- function(result, ymax = ymax, facet = facet, colour = colour, - group = colour, + group = group, label = labels ) @@ -417,11 +525,15 @@ plotEstimates <- function(result, ggplot2::scale_x_continuous(labels = scales::percent_format(accuracy = 0.1)) } } - if (plot$labels$y == "Incidence 100000 pys") { - plot <- plot + ggplot2::labs(y = "Incidence (100,000 person-years)") + if (grepl("incidence_\\d+_pys", y, ignore.case = TRUE)) { + denom <- sub("incidence_(\\d+)_pys", "\\1", y) + denom_form <- format(as.numeric(denom), big.mark=",", scientific = FALSE) + plot <- plot + ggplot2::labs(y = paste0("Incidence (", denom_form, " person-years)")) } - if (plot$labels$x == "Incidence 100000 pys") { - plot <- plot + ggplot2::labs(x = "Incidence (100,000 person-years)") + if (grepl("incidence_\\d+_pys", x, ignore.case = TRUE)) { + denom <- sub("incidence_(\\d+)_pys", "\\1", x) + denom_form <- format(as.numeric(denom), big.mark=",", scientific = FALSE) + plot <- plot + ggplot2::labs(x = paste0("Incidence (", denom_form, " person-years)")) } if (x == "date_years") { plot <- plot + @@ -534,6 +646,21 @@ availablePrevalenceGrouping <- function(result, varying = FALSE) { availableGrouping(result, varying) } +#' Variables that can be used for faceting and colouring rolling incidence plots +#' +#' @param result Rolling incidence results +#' @param varying If FALSE, only variables with non-unique values will be +#' returned, otherwise all available variables will be returned +#' +#' @export +#' +availableRollingIncidenceGrouping <- function(result, varying = FALSE) { + omopgenerics::assertLogical(varying, length = 1) + result <- omopgenerics::validateResultArgument(result) |> + omopgenerics::filterSettings(.data$result_type == "rolling_incidence") + availableGrouping(result, varying) +} + availableGrouping <- function(result, varying) { cols <- c( "cdm_name", "outcome_cohort_name", omopgenerics::strataColumns(result), diff --git a/R/tables.R b/R/tables.R index 9400386..2ee6ead 100644 --- a/R/tables.R +++ b/R/tables.R @@ -131,16 +131,31 @@ tableIncidence <- function(result, .options = list()) { rlang::check_installed("visOmopResults", version = "1.0.2") - tableInternal( - result = result, - formatEstimateName = c( + # dynamically create formatEstimateName + estimates <- unique(result$estimate_name) + inc_est <- estimates[grepl("incidence_\\d+_pys$", estimates)] + if (length(inc_est) > 0) { + denom <- sub("incidence_(\\d+)_pys", "\\1", inc_est[1]) + formatEstimateName <- c( "Denominator (N)" = "", "Person-years" = "", "Outcome (N)" = "", - "Incidence 100,000 person-years [95% CI]" = - " ( - - )" - ), + setNames( + paste0(" ( - )"), + paste0("Incidence ", format(as.numeric(denom), big.mark=",", scientific = FALSE), " person-years [95% CI]") + ) + ) + } else { + formatEstimateName <- c( + "Denominator (N)" = "", + "Person-years" = "", + "Outcome (N)" = "" + ) + } + + tableInternal( + result = result, + formatEstimateName = formatEstimateName, header = header, groupColumn = groupColumn, type = type, diff --git a/R/tidyResults.R b/R/tidyResults.R index 6cd4f65..7bae2f9 100644 --- a/R/tidyResults.R +++ b/R/tidyResults.R @@ -94,6 +94,45 @@ asPrevalenceResult <- function(result, metadata = FALSE) { } +#' A tidy implementation of the summarised_result object for rolling incidence results. +#' +#' @param result A summarised_result object created by the IncidencePrevalence package. +#' @param metadata If TRUE additional metadata columns will be included in the result. +#' +#' @examples +#' \donttest{ +#' cdm <- mockIncidencePrevalence() +#' inc <- estimateRollingIncidence(cdm, "target", "outcome") +#' tidy_inc <- asRollingIncidenceResult(inc) +#' } +#' +#' @return A tibble with a tidy version of the summarised_result object. +#' +#' @export +#' +asRollingIncidenceResult <- function(result, metadata = FALSE) { + if (nrow(result) == 0) { + cli::cli_warn("No results available to tidy") + return(result) + } + result <- omopgenerics::validateResultArgument(result) + result <- result |> + omopgenerics::filterSettings( + .data$result_type %in% c("rolling_incidence", "rolling_incidence_attrition") + ) + if (nrow(result) == 0) { + cli::cli_warn("No rolling incidence results found in result object") + return(result) + } + + incResults <- tidyResult(result, type = "rolling_incidence", attrition = FALSE, metadata = metadata) |> + dplyr::select(!"result_id") + + class(incResults) <- c("tidy_rolling_incidence", class(incResults)) + + incResults +} + # Helper function tidyResult <- function(result, type, attrition = TRUE, metadata = FALSE) { @@ -116,7 +155,7 @@ tidyResult <- function(result, type, attrition = TRUE, metadata = FALSE) { .fns = ~ toDate(.x) ), dplyr::across( - .cols = dplyr::contains("time|washout|days"), + .cols = dplyr::matches("washout|days|interval_number"), .fns = ~ as.numeric(.x) ) ) |> diff --git a/README.md b/README.md index 585f8a1..ad1d6bf 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,14 @@ ## Package overview IncidencePrevalence contains functions for estimating population-level -incidence and prevalence using the OMOP common data model. For more -information on the package please see our paper in Pharmacoepidemiology +incidence and prevalence using the OMOP common data model. This includes functions to: +- `generateDenominatorCohortSet()`: To identify a set of denominator populations. +- `estimateIncidence()`: To calculate incidence rates for a given set of outcomes and denominators using absolute calendar time. +- `estimateRollingIncidence()`: To calculate rolling (time-since-entry) incidence rates relative to a person's entry into the denominator cohort. +- `estimatePointPrevalence()`: To calculate point prevalence for a given set of outcomes and denominators. +- `estimatePeriodPrevalence()`: To calculate period prevalence for a given set of outcomes and denominators. + +For more information on the package please see our paper in Pharmacoepidemiology and Drug Safety. > Raventós, B, Català, M, Du, M, et al. IncidencePrevalence: An R diff --git a/_pkgdown.yml b/_pkgdown.yml index ffb7c68..8e26d6f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,4 @@ -url: https://darwin-eu.github.io/IncidencePrevalence/ +url: https://iomedhealth.github.io/IncidencePrevalence/ template: bootstrap: 5 bootswatch: flatly @@ -9,7 +9,7 @@ reference: - matches("generateDenominatorCohortSet|generateTargetDenominatorCohortSet") - subtitle: Estimate incidence rates - contents: - - matches("estimateIncidence") + - matches("estimateIncidence|estimateRollingIncidence") - subtitle: Estimate point and period prevalence - contents: - matches("estimatePointPrevalence|estimatePeriodPrevalence") diff --git a/man/asRollingIncidenceResult.Rd b/man/asRollingIncidenceResult.Rd new file mode 100644 index 0000000..ee06d29 --- /dev/null +++ b/man/asRollingIncidenceResult.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyResults.R +\name{asRollingIncidenceResult} +\alias{asRollingIncidenceResult} +\title{A tidy implementation of the summarised_result object for rolling incidence results.} +\usage{ +asRollingIncidenceResult(result, metadata = FALSE) +} +\arguments{ +\item{result}{A summarised_result object created by the IncidencePrevalence package.} + +\item{metadata}{If TRUE additional metadata columns will be included in the result.} +} +\value{ +A tibble with a tidy version of the summarised_result object. +} +\description{ +A tidy implementation of the summarised_result object for rolling incidence results. +} +\examples{ +\donttest{ +cdm <- mockIncidencePrevalence() +inc <- estimateRollingIncidence(cdm, "target", "outcome") +tidy_inc <- asRollingIncidenceResult(inc) +} + +} diff --git a/man/availableRollingIncidenceGrouping.Rd b/man/availableRollingIncidenceGrouping.Rd new file mode 100644 index 0000000..5f2e64a --- /dev/null +++ b/man/availableRollingIncidenceGrouping.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{availableRollingIncidenceGrouping} +\alias{availableRollingIncidenceGrouping} +\title{Variables that can be used for faceting and colouring rolling incidence plots} +\usage{ +availableRollingIncidenceGrouping(result, varying = FALSE) +} +\arguments{ +\item{result}{Rolling incidence results} + +\item{varying}{If FALSE, only variables with non-unique values will be +returned, otherwise all available variables will be returned} +} +\description{ +Variables that can be used for faceting and colouring rolling incidence plots +} diff --git a/man/estimateIncidence.Rd b/man/estimateIncidence.Rd index a1aa657..53634ab 100644 --- a/man/estimateIncidence.Rd +++ b/man/estimateIncidence.Rd @@ -17,7 +17,8 @@ estimateIncidence( outcomeWashout = Inf, repeatedEvents = FALSE, strata = list(), - includeOverallStrata = TRUE + includeOverallStrata = TRUE, + rateDenominator = 1e+05 ) } \arguments{ @@ -75,6 +76,9 @@ stratify estimates.} \item{includeOverallStrata}{Whether to include an overall result as well as strata specific results (when strata has been specified).} + +\item{rateDenominator}{The denominator to use for the incidence rate +calculation. Default is 100000.} } \value{ Incidence estimates diff --git a/man/estimateRollingIncidence.Rd b/man/estimateRollingIncidence.Rd new file mode 100644 index 0000000..de17019 --- /dev/null +++ b/man/estimateRollingIncidence.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimateRollingIncidence.R +\name{estimateRollingIncidence} +\alias{estimateRollingIncidence} +\title{Collect rolling population incidence estimates} +\usage{ +estimateRollingIncidence( + cdm, + denominatorTable, + outcomeTable, + denominatorCohortId = NULL, + outcomeCohortId = NULL, + interval = "months", + nIntervals = 12, + outcomeWashout = Inf, + repeatedEvents = FALSE, + strata = list(), + includeOverallStrata = TRUE, + rateDenominator = 1e+05 +) +} +\arguments{ +\item{cdm}{A CDM reference object} + +\item{denominatorTable}{A cohort table with a set of denominator cohorts +(for example, created using the \code{generateDenominatorCohortSet()} +function).} + +\item{outcomeTable}{A cohort table in the cdm reference containing +a set of outcome cohorts.} + +\item{denominatorCohortId}{The cohort definition ids or the cohort names of +the denominator cohorts of interest. If NULL all cohorts will be considered +in the analysis.} + +\item{outcomeCohortId}{The cohort definition ids or the cohort names of the +outcome cohorts of interest. If NULL all cohorts will be considered in the +analysis.} + +\item{interval}{Time intervals over which rolling incidence is estimated. Can +be "weeks", "months", "quarters", or "years". ISO weeks will +be used for weeks. Calendar months, quarters, or years can be used. If more +than one option is chosen then results will be estimated for each chosen interval.} + +\item{nIntervals}{The total number of relative-time windows to estimate (e.g., +12 means 12 windows of the specified interval).} + +\item{outcomeWashout}{The number of days used for a 'washout' period +between the end of one outcome and an individual starting to contribute +time at risk. If Inf, no time can be contributed after an event has +occurred.} + +\item{repeatedEvents}{TRUE/ FALSE. If TRUE, an individual will be able to +contribute multiple events during the study period (time while they are +present in an outcome cohort and any subsequent washout will be +excluded). If FALSE, an individual will only contribute time up to their +first event.} + +\item{strata}{Variables added to the denominator cohort table for which to +stratify estimates.} + +\item{includeOverallStrata}{Whether to include an overall result as well as +strata specific results (when strata has been specified).} + +\item{rateDenominator}{The denominator to use for the incidence rate +calculation. Default is 100000.} +} +\value{ +Rolling incidence estimates +} +\description{ +Collect rolling population incidence estimates +} +\examples{ +\donttest{ +cdm <- mockIncidencePrevalence(sampleSize = 1000) +cdm <- generateDenominatorCohortSet( + cdm = cdm, name = "denominator", + cohortDateRange = c(as.Date("2008-01-01"), as.Date("2018-01-01")) +) +rolling_inc <- estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "months", + nIntervals = 12 +) +} +} diff --git a/man/plotRollingIncidence.Rd b/man/plotRollingIncidence.Rd new file mode 100644 index 0000000..c93ec24 --- /dev/null +++ b/man/plotRollingIncidence.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotRollingIncidence} +\alias{plotRollingIncidence} +\title{Plot rolling incidence results} +\usage{ +plotRollingIncidence( + result, + x = "window_start_days", + y = "incidence_100000_pys", + line = TRUE, + point = TRUE, + ribbon = FALSE, + ymin = "incidence_100000_pys_95CI_lower", + ymax = "incidence_100000_pys_95CI_upper", + facet = NULL, + colour = NULL +) +} +\arguments{ +\item{result}{Rolling incidence results} + +\item{x}{Variable to plot on x axis} + +\item{y}{Variable to plot on y axis.} + +\item{line}{Whether to plot a line using \code{geom_line}} + +\item{point}{Whether to plot points using \code{geom_point}} + +\item{ribbon}{Whether to plot a ribbon using \code{geom_ribbon}} + +\item{ymin}{Lower limit of error bars, if provided is plot using \code{geom_errorbar}} + +\item{ymax}{Upper limit of error bars, if provided is plot using \code{geom_errorbar}} + +\item{facet}{Variables to use for facets. To see available variables for +facetting use the function \code{availableRollingIncidenceGrouping()}.} + +\item{colour}{Variables to use for colours. To see available variables for +colouring use the function \code{availableRollingIncidenceGrouping()}.} +} +\value{ +A ggplot with the rolling incidence results plotted +} +\description{ +Plot rolling incidence results +} diff --git a/tests/testthat/test-estimateIncidence.R b/tests/testthat/test-estimateIncidence.R index e8eefe1..14fd99f 100644 --- a/tests/testthat/test-estimateIncidence.R +++ b/tests/testthat/test-estimateIncidence.R @@ -3820,8 +3820,81 @@ test_that("mock db: censor cohort", { expect_true(inc_2 |> dplyr::filter(estimate_name == "outcome_count") |> dplyr::pull("estimate_value") == "0") +}) + +test_that("rateDenominator works correctly", { + skip_on_cran() + + personTable <- dplyr::tibble( + person_id = 1L, + gender_concept_id = 8507L, + year_of_birth = 2000L, + month_of_birth = 01L, + day_of_birth = 01L + ) + observationPeriodTable <- dplyr::tibble( + observation_period_id = 1L, + person_id = 1L, + observation_period_start_date = as.Date("2010-01-01"), + observation_period_end_date = as.Date("2012-12-31") + ) + outcomeTable <- dplyr::tibble( + cohort_definition_id = 1L, + subject_id = 1L, + cohort_start_date = as.Date("2011-01-01"), + cohort_end_date = as.Date("2011-01-01") + ) + + cdm <- mockIncidencePrevalence( + personTable = personTable, + observationPeriodTable = observationPeriodTable, + outcomeTable = outcomeTable + ) + + cdm <- generateDenominatorCohortSet(cdm = cdm, name = "denominator") + + # Default (100000) + inc1 <- estimateIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "overall" + ) + + expect_true(any(grepl("incidence_100000_pys", inc1$estimate_name))) + # Custom (1000) + inc2 <- estimateIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "overall", + rateDenominator = 1000 + ) + expect_true(any(grepl("incidence_1000_pys", inc2$estimate_name))) + expect_false(any(grepl("incidence_100000_pys", inc2$estimate_name))) + # Check scaling + val1 <- inc1 %>% + dplyr::filter(estimate_name == "incidence_100000_pys") %>% + dplyr::pull(estimate_value) %>% + as.numeric() + + val2 <- inc2 %>% + dplyr::filter(estimate_name == "incidence_1000_pys") %>% + dplyr::pull(estimate_value) %>% + as.numeric() + + if (length(val1) > 0 && length(val2) > 0) { + expect_equal(val1, val2 * 100, tolerance = 0.5) + } + # Check tableIncidence output + expect_no_error(tbl <- tableIncidence(inc2)) + expect_s3_class(tbl, "gt_tbl") + + # Clean up + omopgenerics::cdmDisconnect(cdm) }) + diff --git a/tests/testthat/test-estimateRollingIncidence.R b/tests/testthat/test-estimateRollingIncidence.R new file mode 100644 index 0000000..13ee373 --- /dev/null +++ b/tests/testthat/test-estimateRollingIncidence.R @@ -0,0 +1,243 @@ + +test_that("estimateRollingIncidence fundamental manual check", { + # Mock CDM with specific dates + person <- dplyr::tibble( + person_id = c(1, 2, 3), + gender_concept_id = c(8507, 8532, 8507), + year_of_birth = c(1980, 1985, 1990), + month_of_birth = c(1, 1, 1), + day_of_birth = c(1, 1, 1), + race_concept_id = 0, + ethnicity_concept_id = 0 + ) + observation_period <- dplyr::tibble( + observation_period_id = c(1, 2, 3), + person_id = c(1, 2, 3), + observation_period_start_date = as.Date("2000-01-01"), + observation_period_end_date = as.Date("2010-12-31"), + period_type_concept_id = 0 + ) + + # Denominator cohort entries (different dates) + denominator <- dplyr::tibble( + cohort_definition_id = c(1, 1, 1), + subject_id = c(1, 2, 3), + cohort_start_date = as.Date(c("2005-01-01", "2006-03-01", "2007-06-01")), + cohort_end_date = as.Date(c("2010-12-31", "2010-12-31", "2010-12-31")) + ) + + # Outcome cohort (Person 1 has outcome at day 10, Person 2 at day 45, Person 3 no outcome) + outcome <- dplyr::tibble( + cohort_definition_id = c(1, 1), + subject_id = c(1, 2), + cohort_start_date = as.Date(c("2005-01-11", "2006-04-15")), # P1: +10 days, P2: +45 days + cohort_end_date = as.Date(c("2005-01-11", "2006-04-15")) + ) + + cdm <- omopgenerics::cdmFromTables( + tables = list("person" = person, "observation_period" = observation_period), + cdmName = "test" + ) + cdm <- omopgenerics::insertTable(cdm, name = "denominator", table = denominator) + cdm$denominator <- omopgenerics::newCohortTable(cdm$denominator) + cdm <- omopgenerics::insertTable(cdm, name = "outcome", table = outcome) + cdm$outcome <- omopgenerics::newCohortTable(cdm$outcome) + + res <- suppressMessages(estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "months", + nIntervals = 2 + )) + + res_ir <- omopgenerics::settings(res) |> dplyr::inner_join(res, by = "result_id") |> dplyr::filter(result_type == "rolling_incidence") + + # Month 1 check (Days 0-29) + # P1 outcome is at day 10 -> Should be counted here + # P2 outcome is at day 45 -> Not counted here + m1_outcomes <- res_ir |> + dplyr::filter(additional_level == "1 &&& 0 &&& 29 &&& rolling_months") |> + dplyr::filter(estimate_name == "outcome_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m1_outcomes, 1) + + m1_denoms <- res_ir |> + dplyr::filter(additional_level == "1 &&& 0 &&& 29 &&& rolling_months") |> + dplyr::filter(estimate_name == "denominator_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m1_denoms, 3) + + # Month 2 check (Days 30-59) + # P2 outcome is at day 45 -> Should be counted here + m2_outcomes <- res_ir |> + dplyr::filter(additional_level == "2 &&& 30 &&& 59 &&& rolling_months") |> + dplyr::filter(estimate_name == "outcome_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m2_outcomes, 1) +}) + +test_that("estimateRollingIncidence intervals and censoring", { + person <- dplyr::tibble( + person_id = c(1), + gender_concept_id = c(8507), + year_of_birth = c(1980), month_of_birth = c(1), day_of_birth = c(1), + race_concept_id = 0, ethnicity_concept_id = 0 + ) + observation_period <- dplyr::tibble( + observation_period_id = c(1), person_id = c(1), + observation_period_start_date = as.Date("2000-01-01"), + observation_period_end_date = as.Date("2010-12-31"), period_type_concept_id = 0 + ) + + # Only 45 days in denominator + denominator <- dplyr::tibble( + cohort_definition_id = c(1), subject_id = c(1), + cohort_start_date = as.Date(c("2005-01-01")), + cohort_end_date = as.Date(c("2005-02-14")) # 44 days inclusive (45 total) + ) + outcome <- dplyr::tibble( + cohort_definition_id = c(1), subject_id = c(1), + cohort_start_date = as.Date(c("2005-02-14")), + cohort_end_date = as.Date(c("2005-02-14")) + ) + + cdm <- omopgenerics::cdmFromTables( + tables = list("person" = person, "observation_period" = observation_period), + cdmName = "test" + ) + cdm <- omopgenerics::insertTable(cdm, name = "denominator", table = denominator) + cdm$denominator <- omopgenerics::newCohortTable(cdm$denominator) + cdm <- omopgenerics::insertTable(cdm, name = "outcome", table = outcome) + cdm$outcome <- omopgenerics::newCohortTable(cdm$outcome) + + res <- suppressMessages(estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = c("weeks", "months"), + nIntervals = 3, + repeatedEvents = TRUE + )) + + res_ir <- omopgenerics::settings(res) |> dplyr::inner_join(res, by = "result_id") |> dplyr::filter(result_type == "rolling_incidence") + + # Weeks + # P1 has 45 days. Weeks are 7 days. + # Week 1: 7 days + # Week 6: 7 days (42 total) + # Week 7: 3 days + w1_pdays <- res_ir |> + dplyr::filter(additional_level == "1 &&& 0 &&& 6 &&& rolling_weeks") |> + dplyr::filter(estimate_name == "person_days") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(w1_pdays, 7) + + # Months + # Month 1: 30 days + # Month 2: 15 days + m1_pdays <- res_ir |> + dplyr::filter(additional_level == "1 &&& 0 &&& 29 &&& rolling_months") |> + dplyr::filter(estimate_name == "person_days") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m1_pdays, 30) + + m2_pdays <- res_ir |> + dplyr::filter(additional_level == "2 &&& 30 &&& 59 &&& rolling_months") |> + dplyr::filter(estimate_name == "person_days") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m2_pdays, 15) + + # Month 3: Should not have this person in denominator + m3_denoms <- res_ir |> + dplyr::filter(additional_level == "3 &&& 60 &&& 89 &&& rolling_months") |> + dplyr::filter(estimate_name == "denominator_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_length(m3_denoms, 0) +}) + +test_that("estimateRollingIncidence repeated events and washout", { + person <- dplyr::tibble( + person_id = c(1), + gender_concept_id = c(8507), + year_of_birth = c(1980), month_of_birth = c(1), day_of_birth = c(1), + race_concept_id = 0, ethnicity_concept_id = 0 + ) + observation_period <- dplyr::tibble( + observation_period_id = c(1), person_id = c(1), + observation_period_start_date = as.Date("2000-01-01"), + observation_period_end_date = as.Date("2010-12-31"), period_type_concept_id = 0 + ) + + # 2 years in denominator + denominator <- dplyr::tibble( + cohort_definition_id = c(1), subject_id = c(1), + cohort_start_date = as.Date(c("2005-01-01")), + cohort_end_date = as.Date(c("2006-12-31")) + ) + + outcome <- dplyr::tibble( + cohort_definition_id = c(1, 1, 1), + subject_id = c(1, 1, 1), + # Outcome 1: Day 31 (Month 2 in rolling months) + # Outcome 2: Day 90 (Inside 180 day washout of Outcome 1) + # Outcome 3: Day 273 (Outside 180 day washout of Outcome 2) + cohort_start_date = as.Date(c("2005-02-01", "2005-04-01", "2005-10-01")), + cohort_end_date = as.Date(c("2005-02-01", "2005-04-01", "2005-10-01")) + ) + + cdm <- omopgenerics::cdmFromTables( + tables = list("person" = person, "observation_period" = observation_period), + cdmName = "test" + ) + cdm <- omopgenerics::insertTable(cdm, name = "denominator", table = denominator) + cdm$denominator <- omopgenerics::newCohortTable(cdm$denominator) + cdm <- omopgenerics::insertTable(cdm, name = "outcome", table = outcome) + cdm$outcome <- omopgenerics::newCohortTable(cdm$outcome) + + res <- suppressMessages(estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "months", + nIntervals = 12, + outcomeWashout = 180, + repeatedEvents = TRUE + )) + + res_ir <- omopgenerics::settings(res) |> dplyr::inner_join(res, by = "result_id") |> dplyr::filter(result_type == "rolling_incidence") + + # Outcome 1: 2005-02-01 is 31 days after 2005-01-01. + # Month 2 (Days 30-59). Outcome should be 1. + m2_outcomes <- res_ir |> + dplyr::filter(additional_level == "2 &&& 30 &&& 59 &&& rolling_months") |> + dplyr::filter(estimate_name == "outcome_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m2_outcomes, 1) + + # Outcome 2: 2005-04-01. + # This is 90 days after 2005-01-01 -> Month 4 (Days 90-119). + # However, 2005-04-01 is only 59 days after Outcome 1. Washout is 180. + # So it should be excluded. + m4_outcomes <- res_ir |> + dplyr::filter(additional_level == "4 &&& 90 &&& 119 &&& rolling_months") |> + dplyr::filter(estimate_name == "outcome_count") |> + dplyr::pull(estimate_value) + + if (length(m4_outcomes) > 0) { + m4_outcomes <- as.numeric(m4_outcomes) + } else { + m4_outcomes <- 0 + } + expect_equal(m4_outcomes, 0) + + # Outcome 3: 2005-10-01. + # This is 273 days after 2005-01-01 -> Month 10 (Days 270-299). + # 2005-10-01 is 183 days after Outcome 2 (2005-04-01). 183 > 180. + # Should be counted! + m10_outcomes <- res_ir |> + dplyr::filter(additional_level == "10 &&& 270 &&& 299 &&& rolling_months") |> + dplyr::filter(estimate_name == "outcome_count") |> + dplyr::pull(estimate_value) |> as.numeric() + expect_equal(m10_outcomes, 1) +}) diff --git a/update_colleagues.md b/update_colleagues.md new file mode 100644 index 0000000..224f10b --- /dev/null +++ b/update_colleagues.md @@ -0,0 +1,44 @@ +# Update: `estimateRollingIncidence()` has been implemented! 🎉 + +Hi team, + +I'm excited to share that we've successfully implemented a new core feature in our fork of the DARWIN-EU **`IncidencePrevalence`** package: the **`estimateRollingIncidence()`** function! + +### What's new? +While `estimateIncidence()` computes incidence rates across **absolute calendar time** (e.g., January 2010, February 2010), our new function calculates incidence rates **relative to each individual's cohort entry date**. + +This allows us to track "time-since-entry incidence" or "rolling incidence" out of the box (e.g., "Month 1 after cohort entry", "Month 2 after cohort entry", etc.). + +### Key Features +- Dynamic relative calculation of temporal intervals (supports `weeks`, `months`, `quarters`, `years`). +- Robust handling of individual-level observation truncation and censoring (correctly handles partial contributions to relative windows). +- Supports all existing parameters (`outcomeWashout`, `repeatedEvents`, `strata`). +- Fully compliant with the DARWIN-EU ecosystem—outputs a standard `omopgenerics::SummarisedResult` object compatible with `plotIncidence()` and `tableIncidence()`. + +### Code & Documentation +- **Source Code:** [`R/estimateRollingIncidence.R`](https://github.com/iomedhealth/IncidencePrevalence/blob/main/R/estimateRollingIncidence.R) +- **Documentation:** The function is fully documented in the package [Reference Guide](https://iomedhealth.github.io/IncidencePrevalence/reference/estimateRollingIncidence.html) and we've added a new section to the [Calculating Incidence Vignette](https://iomedhealth.github.io/IncidencePrevalence/articles/a05_Calculating_incidence.html). + +### Example Usage + +Here's a quick example of how to track the first 12 rolling months of incidence after a patient enters the denominator cohort, applying a 180-day washout period for repeated events: + +```r +library(IncidencePrevalence) + +# Assuming 'cdm' is your CDM reference and cohorts are already generated +rolling_inc <- estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "months", + nIntervals = 12, # Track 12 rolling months per person + outcomeWashout = 180, # 180-day washout between repeated events + repeatedEvents = TRUE +) + +# You can plot it just like standard incidence! +plotIncidence(rolling_inc) +``` + +Let me know if you have any questions or run into any issues using the new function! diff --git a/vignettes/a05_Calculating_incidence.Rmd b/vignettes/a05_Calculating_incidence.Rmd index 8f145ff..aa4a034 100644 --- a/vignettes/a05_Calculating_incidence.Rmd +++ b/vignettes/a05_Calculating_incidence.Rmd @@ -314,3 +314,24 @@ inc <- estimateIncidence( ) tableIncidenceAttrition(inc, style = "darwin") ``` + + +### Calculating Rolling Incidence + +While `estimateIncidence()` slices person-time based on absolute calendar dates (e.g. "January 2010", "February 2010"), you may also want to estimate incidence *relative* to each person's entry into the denominator cohort. This is known as "rolling incidence" or "time-since-entry incidence". + +The `estimateRollingIncidence()` function calculates the rate of an outcome occurring in relative time intervals (e.g. "Month 1 after cohort entry", "Month 2 after cohort entry", etc.). + +```{r, message=FALSE, warning=FALSE} +rolling_inc <- estimateRollingIncidence( + cdm = cdm, + denominatorTable = "denominator", + outcomeTable = "outcome", + interval = "months", + nIntervals = 12, + outcomeWashout = 180, + repeatedEvents = TRUE +) +``` + +The result of this function is also a standard `SummarisedResult` object compatible with the standard `plotIncidence()` and `tableIncidence()` functions. Instead of absolute dates, the intervals will be presented sequentially by their relative offset since entry.