tidymodels · Wander03 · Nov 11, 2024 · Nov 13, 2024 · Nov 17, 2024 · Nov 18, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,6 +39,7 @@ Imports:
     utils,
     vctrs (>= 0.5.0)
 Suggests:
+    arules,
     cluster,
     ClusterR,
     clustMixType (>= 0.3-5),

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,12 +3,14 @@
 S3method(as_tibble,cluster_metric_set)
 S3method(augment,cluster_fit)
 S3method(check_args,default)
+S3method(check_args,freq_itemsets)
 S3method(check_args,hier_clust)
 S3method(check_args,k_means)
 S3method(extract_cluster_assignment,KMeansCluster)
 S3method(extract_cluster_assignment,cluster_fit)
 S3method(extract_cluster_assignment,cluster_spec)
 S3method(extract_cluster_assignment,hclust)
+S3method(extract_cluster_assignment,itemsets)
 S3method(extract_cluster_assignment,kmeans)
 S3method(extract_cluster_assignment,kmodes)
 S3method(extract_cluster_assignment,kproto)
@@ -18,6 +20,7 @@ S3method(extract_fit_summary,KMeansCluster)
 S3method(extract_fit_summary,cluster_fit)
 S3method(extract_fit_summary,cluster_spec)
 S3method(extract_fit_summary,hclust)
+S3method(extract_fit_summary,itemsets)
 S3method(extract_fit_summary,kmeans)
 S3method(extract_fit_summary,kmodes)
 S3method(extract_fit_summary,kproto)
@@ -37,6 +40,7 @@ S3method(print,cluster_fit)
 S3method(print,cluster_metric_set)
 S3method(print,cluster_spec)
 S3method(print,control_cluster)
+S3method(print,freq_itemsets)
 S3method(print,hier_clust)
 S3method(print,k_means)
 S3method(required_pkgs,cluster_fit)
@@ -58,23 +62,28 @@ S3method(sse_within_total,cluster_spec)
 S3method(sse_within_total,workflow)
 S3method(tidy,cluster_fit)
 S3method(translate_tidyclust,default)
+S3method(translate_tidyclust,freq_itemsets)
 S3method(translate_tidyclust,hier_clust)
 S3method(translate_tidyclust,k_means)
 S3method(tunable,cluster_spec)
+S3method(tunable,freq_itemsets)
 S3method(tunable,k_means)
 S3method(tune_args,cluster_spec)
 S3method(tune_cluster,cluster_spec)
 S3method(tune_cluster,default)
 S3method(tune_cluster,workflow)
+S3method(update,freq_itemsets)
 S3method(update,hier_clust)
 S3method(update,k_means)
 export("%>%")
+export(.freq_itemsets_fit_arules)
 export(.hier_clust_fit_stats)
 export(.k_means_fit_ClusterR)
 export(.k_means_fit_clustMixType)
 export(.k_means_fit_klaR)
 export(.k_means_fit_stats)
 export(augment)
+export(augment_itemset_predict)
 export(cluster_metric_set)
 export(control_cluster)
 export(cut_height)
@@ -84,6 +93,7 @@ export(extract_fit_engine)
 export(extract_fit_parsnip)
 export(extract_fit_summary)
 export(extract_parameter_set_dials)
+export(extract_predictions)
 export(extract_preprocessor)
 export(extract_spec_parsnip)
 export(finalize_model_tidyclust)
@@ -92,6 +102,7 @@ export(fit)
 export(fit.cluster_spec)
 export(fit_xy)
 export(fit_xy.cluster_spec)
+export(freq_itemsets)
 export(get_tidyclust_colors)
 export(glance)
 export(hier_clust)

diff --git a/R/augment_itemset_predict.R b/R/augment_itemset_predict.R
@@ -0,0 +1,119 @@
+#' Augment Itemset Predictions with Truth Values
+#'
+#' This function processes the output of a `predict()` call for frequent itemset models
+#' and joins it with the corresponding ground truth data. It's designed to prepare
+#' the prediction and truth values in a format suitable for calculating evaluation metrics
+#' using packages like `yardstick`.
+#'
+#' @param pred_output A data frame that is the output of `predict()` from a `freq_itemsets` model.
+#'   It is expected to have a column named `.pred_cluster`, where each cell contains
+#'   a data frame with prediction details (including `.pred_item`, `.obs_item`, and `item`).
+#' @param truth_output A data frame representing the ground truth. It should have a similar
+#'   structure to the input data used for prediction, where columns represent items
+#'   and rows represent transactions.
+#'
+#' @details
+#' The function first extracts and combines all individual item prediction data frames
+#' nested within the `pred_output`. It then filters for items where a prediction was made
+#' (i.e., `!is.na(.pred_item)`) and standardizes item names by removing backticks.
+#' The `truth_output` is pivoted to a long format to match the structure of the predictions.
+#' Finally, an inner join is performed to ensure that only predicted items are included in
+#' the final result, aligning predictions with their corresponding true values.
+#'
+#' @return A data frame with the following columns:
+#'   \itemize{
+#'     \item `item`: The name of the item.
+#'     \item `row_id`: An identifier for the transaction (row) from which the prediction came.
+#'     \item `preds`: The predicted value for the item (either raw probability or binary prediction).
+#'     \item `truth`: The true value for the item from `truth_output`.
+#'   }
+#'   This output is suitable for direct use with `yardstick` metric functions.
+#' @export
+
+augment_itemset_predict <- function(pred_output, truth_output) {
+  # Extract all predictions (bind all .pred_cluster dataframes)
+  preds_df <- dplyr::bind_rows(pred_output$.pred_cluster, .id = "row_id") %>%
+    dplyr::filter(!is.na(.pred_item)) %>%  # Keep only rows with predictions
+    dplyr::mutate(item = stringr::str_remove_all(item, "`")) %>% # Remove backticks from item names
+    dplyr::select(row_id, item, preds = .pred_item)  # Standardize column names
+
+  # Pivot truth data to long format (to match predictions)
+  truth_long <- truth_output %>%
+    tibble::rownames_to_column("row_id") %>%
+    tidyr::pivot_longer(
+      cols = -row_id,
+      names_to = "item",
+      values_to = "truth_value"
+    )
+
+  # Join predictions with truth (inner join to keep only predicted items)
+  result <- preds_df %>%
+    dplyr::inner_join(truth_long, by = c("row_id", "item"))
+
+  # Return simplified output (preds vs truth)
+  dplyr::select(result, item, row_id, preds, truth = truth_value)
+}
+
+#' Generate Dataframe with Random NAs and Corresponding Truth
+#'
+#' @description
+#' This helper function creates a new data frame by randomly introducing `NA` values
+#' into an input data frame. It also returns the original data frame as a "truth"
+#' reference, which can be useful for simulating scenarios with missing data
+#' for prediction tasks.
+#'
+#' @param df The input data frame to which `NA` values will be introduced.
+#'   It is typically a transactional dataset where columns are items and rows are transactions.
+#' @param na_prob The probability (between 0 and 1) that any given cell in the
+#'   input data frame will be replaced with `NA`.
+#'
+#' @return A list containing two data frames:
+#'   \itemize{
+#'     \item `na_data`: The data frame with `NA` values randomly introduced.
+#'     \item `truth`: The original input data frame, serving as the ground truth.
+#'   }
+#' @examples
+#' # Create a sample data frame
+#' sample_df <- data.frame(
+#'   itemA = c(1, 0, 1),
+#'   itemB = c(0, 1, 1),
+#'   itemC = c(1, 1, 0)
+#' )
+#'
+#' # Generate NA data and truth with 30% NA probability
+#' set.seed(123)
+#' na_data_list <- random_na_with_truth(sample_df, na_prob = 0.3)
+#'
+#' # View the NA data
+#' print(na_data_list$na_data)
+#'
+#' # View the truth data
+#' print(na_data_list$truth)
+#'
+#' This function is not exported as it was used to test and provide examples in
+#' the vignettes, it may be formally introduced in the future.
+random_na_with_truth <- function(df, na_prob = 0.3) {
+  # Create a copy of the original dataframe to store truth values
+  truth_df <- df
+
+  # Create a mask of NAs (TRUE = becomes NA)
+  na_mask <- matrix(
+    sample(
+      c(TRUE, FALSE),
+      size = nrow(df) * ncol(df),
+      replace = TRUE,
+      prob = c(na_prob, 1 - na_prob)
+    ),
+    nrow = nrow(df)
+  )
+
+  # Apply the mask to create NA values
+  na_df <- df
+  na_df[na_mask] <- NA
+
+  # Return both the NA-filled dataframe and the truth
+  list(
+    na_data = na_df,
+    truth = truth_df
+  )
+}
diff --git a/R/extract_cluster_assignment.R b/R/extract_cluster_assignment.R
@@ -159,6 +159,79 @@ extract_cluster_assignment.hclust <- function(
   cluster_assignment_tibble(clusters, length(unique(clusters)), ...)
 }
 
+#' @export
+extract_cluster_assignment.itemsets <- function(object, ...) {
+  max_iter = 1000
+  items <- attr(object, "item_names")
+  itemsets <- arules::DATAFRAME(object)
+
+  itemset_list <- lapply(strsplit(gsub("[{}]", "", itemsets$items), ","), stringr::str_trim)
+  support <- itemsets$support
+  clusters <- numeric(length(items))
+  changed <- TRUE  # Flag to track convergence
+  iter <- 0 # Initialize iteration counter
+
+  # Continue until no changes occur
+  while (changed && iter < max_iter) {
+    changed <- FALSE
+    iter <- iter + 1
+    for (i in 1:length(items)) {
+      current_item <- items[i]
+      relevant_itemsets <- which(sapply(itemset_list, function(x) current_item %in% x))
+
+      if (length(relevant_itemsets) == 0) next  # Skip if no itemsets
+
+      # Find the best itemset (largest size, then highest support)
+      best_itemset <- relevant_itemsets[
+        which.max(
+          sapply(itemset_list[relevant_itemsets], length) * 1000 +  # Size dominates
+          support[relevant_itemsets]                                # Support breaks ties
+          )
+        ]
+      best_itemset_size <- length(itemset_list[[best_itemset]])
+      best_itemset_support <- support[best_itemset]
+
+      # Current cluster info (if any)
+      current_cluster <- clusters[i]
+      current_cluster_size <- if (current_cluster != 0)
+        length(itemset_list[[current_cluster]]) else 0
+      current_cluster_support <- if (current_cluster != 0)
+        support[current_cluster] else 0
+
+      # Reassign if:
+      # 1. No current cluster, OR
+      # 2. New itemset is larger, OR
+      # 3. Same size but higher support
+      if (current_cluster == 0 ||
+          best_itemset_size > current_cluster_size ||
+          (best_itemset_size == current_cluster_size &&
+          best_itemset_support > current_cluster_support)) {
+
+        # Assign all items in the best itemset to its cluster
+        new_cluster <- best_itemset
+        items_in_best <- match(itemset_list[[best_itemset]], items)
+
+        if (!all(clusters[items_in_best] == new_cluster)) {
+          clusters[items_in_best] <- new_cluster
+          changed <- TRUE  # Mark that a change occurred
+        }
+      }
+    }
+  }
+
+  if (iter == max_iter && changed) {
+    rlang::warn(
+      paste0(
+        "Cluster assignment did not converge within the maximum of ",
+        max_iter,
+        " iterations. Returning the current cluster assignments."
+      )
+    )
+  }
+
+  item_assignment_tibble_w_outliers(clusters, ...)
+}
+
 # ------------------------------------------------------------------------------
 
 cluster_assignment_tibble <- function(
@@ -173,3 +246,34 @@ cluster_assignment_tibble <- function(
 
   tibble::tibble(.cluster = factor(res))
 }
+
+item_assignment_tibble_w_outliers <- function(clusters,
+                                              ...,
+                                              prefix = "Cluster_") {
+  # Vector to store the resulting cluster names
+  res <- character(length(clusters))
+
+  # For items with cluster value 0, assign to "Cluster_0"
+  res[clusters == 0] <- "Cluster_0"
+  zero_count <- 0
+  res <- sapply(res, function(x) {
+    if (x == "Cluster_0") {
+      zero_count <<- zero_count + 1
+      paste0("Cluster_0_", zero_count)
+    } else {
+      x
+    }
+  })
+
+  # For non-zero clusters, assign sequential cluster numbers starting from "Cluster_1"
+  non_zero_clusters <- clusters[clusters != 0]
+  unique_non_zero_clusters <- unique(non_zero_clusters)
+
+  # Map each unique non-zero cluster to a new cluster starting from Cluster_1
+  cluster_map <- setNames(paste0(prefix, seq_along(unique_non_zero_clusters)), unique_non_zero_clusters)
-  cluster_map <- setNames(paste0(prefix, seq_along(unique_non_zero_clusters)), unique_non_zero_clusters)
+  cluster_map <- stats::setNames(paste0(prefix, seq_along(unique_non_zero_clusters)), unique_non_zero_clusters)
-  cluster_map <- setNames(paste0(prefix, seq_along(unique_non_zero_clusters)), unique_non_zero_clusters)
+  cluster_map <- stats::setNames(paste0(prefix, seq_along(unique_non_zero_clusters)), unique_non_zero_clusters)
+
+  # Assign the corresponding cluster names to the non-zero clusters
+  res[clusters != 0] <- cluster_map[as.character(non_zero_clusters)]
+
+  tibble::tibble(.cluster = factor(res))
+}
diff --git a/R/extract_fit_summary.R b/R/extract_fit_summary.R
@@ -192,3 +192,15 @@ extract_fit_summary.hclust <- function(object, ...) {
     cluster_assignments = clusts
   )
 }
+
+#' @export
+extract_fit_summary.itemsets <- function(object, ...,
+                                         call = rlang::caller_env(n = 0)) {
+  rlang::abort(
+    paste(
+      "Centroids are not usfeul for frequent itemsets, we suggust looking at the frequent itemsets directly.\n",
+      "Please use arules::inspect() on the fit of your cluster specification."
+    ),
+    call = call
+  )
+}
diff --git a/R/extract_predictions.R b/R/extract_predictions.R
@@ -0,0 +1,30 @@
+#' Extract Predictions from Observation Data Frames
+#'
+#' This function processes a data frame containing observation data frames and extracts non-NA values.
+#'
+#' Returns recommender predictions with predicted values imputed into dataset
+#' Notes: currently imputes thresholded probabilities
+#'
+#' @param pred_output A data frame with one column, where each cell contains a data frame.
+#' @return A data frame with items as columns and non-NA values as rows.
+#' @export
+
+extract_predictions <- function(pred_output) {
+  # Extract the list of data frames from the single column
+  data_frames <- pred_output$.pred_cluster
+
+  # Process each observation and combine results using reduce
+  result_df <- data_frames %>%
+    purrr::reduce(.f = ~ {
+      # Process each observation (data frame)
+      processed <- .y %>%
+        dplyr::mutate(value = ifelse(!is.na(.obs_item), .obs_item, .pred_item)) %>%
+        dplyr::select(item, value) %>%
+        tidyr::pivot_wider(names_from = item, values_from = value)
+
+      # Combine the processed data frame with the accumulated results
+      dplyr::bind_rows(.x, processed)
+    }, .init = NULL)
+
+  return(result_df)
+}