From 8ed95d5416cce5de6de245d602c8d669c30bbdb1 Mon Sep 17 00:00:00 2001
From: sumny
Date: Tue, 11 Aug 2020 19:28:53 +0200
Subject: [PATCH 1/7] add cross-method for training of impact encoding

---
 DESCRIPTION                               |   2 +-
 R/PipeOpEncodeImpact.R                    | 210 ++++++++++++++++------
 man/mlr_pipeops_encodeimpact.Rd           |  56 +++---
 tests/testthat/test_pipeop_encodeimpact.R | 113 ++++++++++--
 4 files changed, 290 insertions(+), 91 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index eeda3b631..666c43991 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -87,7 +87,7 @@ Encoding: UTF-8
 LazyData: true
 NeedsCompilation: no
 Roxygen: list(markdown = TRUE, r6 = FALSE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.1.9000
 Collate:
     'Graph.R'
     'GraphLearner.R'
diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R
index 1360ef048..a89e1c2b8 100644
--- a/R/PipeOpEncodeImpact.R
+++ b/R/PipeOpEncodeImpact.R
@@ -2,18 +2,23 @@
 #'
 #' @usage NULL
 #' @name mlr_pipeops_encodeimpact
-#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`].
 #'
 #' @description
-#' Encodes columns of type `factor`, `character` and `ordered`.
+#' Encodes columns of type `factor` and `ordered`.
 #'
-#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each (factorial) column
-#' to the difference between each target level's conditional log-likelihood
-#' given this level, and the target level's global log-likelihood.
+#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each
+#' (factorial) column to the difference between each target level's conditional log-likelihood given
+#' this level, and the target level's global log-likelihood.
 #'
-#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) column
-#' to the difference between the target's conditional mean given
-#' this level, and the target's global mean.
+#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial)
+#' column to the difference between the target's conditional mean given this level, and the target's
+#' global mean.
+#'
+#' During training, the impact coding is done using a cross-method. This means that the training
+#' [`Task`][mlr3::Task] is split into several folds via [`ResamplingCV`][mlr3::ResamplingCV] and, for
+#' each fold, impact coding is performed for the test set based on the respective training set.
+#' This helps prevent nested model bias.
 #'
 #' Treats new levels during prediction like missing values.
 #'
@@ -31,24 +36,35 @@
 #' @section Input and Output Channels:
 #' Input and output channels are inherited from [`PipeOpTaskPreproc`].
 #'
-#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, `character` or
-#' `ordered` parameters encoded.
+#' The output is the input [`Task`][mlr3::Task] with all affected `factor` or `ordered` parameters encoded.
 #'
 #' @section State:
 #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
-#' * `impact` :: a named `list`\cr
+#' * `train_task_hash` :: `character(1)`\cr
+#' The hash (unique identifier) for the training [`Task`][mlr3::Task].
+#' * `rsmp_cv_instance` :: a `data.table`\cr
+#' If `folds` is larger than one, the resampling instance of the [`ResamplingCV`][mlr3::ResamplingCV] used during training.
+#' * `impact_predict` :: a named `list`\cr
 #' A list with an element for each affected feature:\cr
-#' For regression each element is a single column matrix of impact values for each level of that feature.\cr
-#' For classification, it is a list with an element for each *feature level*, which is a vector giving the impact of
-#' this feature level on each *outcome level*.
+#' For regression, each element is a single column matrix of impact values for each level of that feature.\cr
+#' For classification, this is a list with an element for each *feature level*, which is a vector
+#' giving the impact of this feature level on each *outcome level*.
+#' This list is used to encode the impact for the prediction [`Task`][mlr3::Task].
+#' * `impact_cv` :: a `list` of named `lists`\cr
+#' A list of length `folds` with each element holding a list like `impact_predict` above.
+#' These lists are used to encode the impact for the training [`Task`][mlr3::Task].
 #'
 #' @section Parameters:
-#' * `smoothing` :: `numeric(1)` \cr
+#' * `smoothing` :: `numeric(1)` \cr
 #' A finite positive value used for smoothing. Mostly relevant for [classification Tasks][mlr3::TaskClassif] if
 #' a factor does not coincide with a target factor level (and would otherwise give an infinite logit value).
 #' Initialized to `1e-4`.
 #' * `impute_zero` :: `logical(1)`\cr
-#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default `FALSE`.
+#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default is `FALSE`.
+#' * `folds` :: `integer(1)`\cr
+#' Number of folds used in the cross-method and passed to [`ResamplingCV`][mlr3::ResamplingCV]. Default is `3`.
+#' If set to `1`, no cross-method is applied, i.e., the whole training
+#' [`Task`][mlr3::Task] is used for the impact encoding.
 #'
 #' @section Internals:
 #' Uses laplace smoothing, mostly to avoid infinite values for [classification Task][mlr3::TaskClassif].
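For intuition, the following is a minimal standalone R sketch of the smoothed log-odds
computation described above (the same quantity the new `get_impact()` helper below computes
per fold). The toy vectors `x` and `y` are hypothetical and mirror the example task in this
patch; NA handling and the per-fold splitting are omitted:

    # hypothetical toy data: factor feature x, factor target y
    x = factor(c("a", "a", "b", "b", "b"))
    y = factor(c("a", "a", "a", "b", "b"))
    smoothing = 1e-4

    # global (smoothed) log-odds of target level "a"
    tprop = (sum(y == "a") + smoothing) / (length(y) + 2 * smoothing)
    tplogit = log(tprop / (1 - tprop))

    # conditional (smoothed) log-odds of y == "a" given feature level x == "a"
    condprob = (sum(y[x == "a"] == "a") + smoothing) / (sum(x == "a") + 2 * smoothing)
    cplogit = log(condprob / (1 - condprob))

    # impact of feature level "a" on target level "a": difference of the log-odds;
    # large and positive here, since x == "a" almost perfectly predicts y == "a"
    cplogit - tplogit

With `folds > 1`, this computation is repeated once per fold on the respective training rows
only, so every training row is encoded with statistics that were computed without that row.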
@@ -61,10 +77,10 @@ #' poe = po("encodeimpact") #' #' task = TaskClassif$new("task", -#' data.table::data.table( -#' x = factor(c("a", "a", "a", "b", "b")), -#' y = factor(c("a", "a", "b", "b", "b"))), -#' "x") +#' backend = data.table::data.table( +#' x = factor(c("a", "a", "b", "b", "b")), +#' y = factor(c("a", "a", "a", "b", "b"))), +#' target = "y") #' #' poe$train(list(task))[[1]]$data() #' @@ -73,53 +89,96 @@ #' @include PipeOpTaskPreproc.R #' @export PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", - inherit = PipeOpTaskPreprocSimple, + inherit = PipeOpTaskPreproc, public = list( initialize = function(id = "encodeimpact", param_vals = list()) { ps = ParamSet$new(params = list( - ParamDbl$new("smoothing", 0, Inf, tags = c("train", "required")), - ParamLgl$new("impute_zero", tags = c("train", "required")) + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required"), default = 1e-4), + ParamLgl$new("impute_zero", tags = c("train", "required"), default = FALSE), + ParamInt$new("folds", lower = 1L, tags = c("train", "required"), default = 3L) )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE, folds = 3L) super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("factor", "ordered")) } ), private = list( - .get_state_dt = function(dt, levels, target) { - task_type = if (is.numeric(target)) "regr" else "classif" - state = list() + .train_task = function(task) { + dt_columns = private$.select_cols(task) + cols = dt_columns + if (!length(cols)) { + self$state = list(dt_columns = dt_columns) + return(task) # early exit + } + dt = task$data(cols = cols) + target = task$truth() + task_type = task$task_type + row_ids = task$row_ids + row_seq = seq_len(task$nrow) smoothing = self$param_set$values$smoothing + impute_zero = self$param_set$values$impute_zero + folds = self$param_set$values$folds + folds_seq = seq_len(folds) + + # note that matching the row_ids below is necessary because of the resampling + + # impact encoding for the prediction task + impact_predict = get_impact(task_type, folds_seq = 1L, train_sets = list(row_seq), dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero)[[1L]] + + if (folds > 1L) { + # cross-method + rcv = ResamplingCV$new() + rcv$param_set$values$folds = folds + rcv$instantiate(task) + + train_sets = map(folds_seq, function(fold) match(rcv$train_set(fold), row_ids)) + test_sets = map(folds_seq, .f = function(fold) match(rcv$test_set(fold), row_ids)) + + impact_cv = get_impact(task_type, folds_seq = folds_seq, train_sets = train_sets, dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero) - # different funs depending on task.type - list(impact = switch(task_type, - classif = sapply(dt, function(col) - sapply(levels(target), function(tl) { - tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - map_dbl(c(setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), - function(cl) { - if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) - condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / - (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - }) - }), simplify = FALSE), - regr = { - meanimp = mean(target) - sapply(dt, function(col) - t(t(c(sapply(levels(col), function(lvl) { - (sum(target[col == lvl], na.rm = TRUE) + smoothing 
* meanimp) / - (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp - }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))), simplify = FALSE) - })) + } else { + # no cross-method + test_sets = list(row_seq) + + impact_cv = list(impact_predict) + } + + self$state = list(train_task_hash = task$hash, rsmp_cv_instance = if (folds > 1L) rcv$instance else data.table(), impact_predict = impact_predict, impact_cv = impact_cv, dt_columns = dt_columns) + + # cross-method (folds > 1) will encode test_set of fold i using the impact encoding trained on train_set of fold i + dt = imap(dt, .f = function(curdat, idx) { + fold_dt = map(folds_seq, .f = function(fold) { + impact_test = self$state$impact_cv[[fold]] + test_set = test_sets[[fold]] + curdat = as.character(curdat[test_set]) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(impact_test[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names. + # otherwise we want the naming scheme . + impact_test[[idx]][match(curdat, rownames(impact_test[[idx]])), , drop = is.null(colnames(impact_test[[idx]]))] + }) + switch(task_type, + classif = do.call(rbind, fold_dt), + regr = unlist(fold_dt) + ) + }) + + dt = as.data.table(dt) + dt = dt[match(row_seq, unlist(test_sets)), ] # row ids have to be reordered because of resampling + task$select(setdiff(task$feature_names, cols))$cbind(dt) }, - .transform_dt = function(dt, levels) { - impact = self$state$impact - imap(dt, function(curdat, idx) { + .predict_task = function(task) { + cols = self$state$dt_columns + if (!length(cols)) { + return(task) + } + dt = task$data(cols = cols) + + # impact encoding for the prediction task always relies on the encoding of the whole training task + impact = self$state$impact_predict + dt = imap(dt, function(curdat, idx) { curdat = as.character(curdat) curdat[is.na(curdat)] = ".TEMP.MISSING" curdat[curdat %nin% rownames(impact[[idx]])] = ".TEMP.MISSING" @@ -127,8 +186,53 @@ PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", # otherwise we want the naming scheme . 
impact[[idx]][match(curdat, rownames(impact[[idx]])), , drop = is.null(colnames(impact[[idx]]))] }) + + dt = as.data.table(dt) + task$select(setdiff(task$feature_names, cols))$cbind(dt) } ) ) mlr_pipeops$add("encodeimpact", PipeOpEncodeImpact) + +get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, impute_zero) { + switch(task_type, + classif = map(folds_seq, .f = function(fold) { + target_lvls = levels(target) + train_set = train_sets[[fold]] + dt_train = dt[train_set, ] + target_train = target[train_set] + + map(dt_train, .f = function(col) { + col_lvls = levels(col) + + do.call(cbind, stats::setNames(map(target_lvls, .f = function(tl) { + tprop = (sum(target_train == tl) + smoothing) / (length(target_train) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + + map_dbl(c(stats::setNames(col_lvls, nm = col_lvls), c(.TEMP.MISSING = NA)), .f = function(cl) { + if (!impute_zero && is.na(cl)) return(NA_real_) # early exit + condprob = (sum(target_train[is.na(cl) | (col == cl)] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | (col == cl), na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + }) + }), nm = target_lvls)) + }) + }), + regr = map(folds_seq, .f = function(fold) { + train_set = train_sets[[fold]] + dt_train = dt[train_set, ] + target_train = target[train_set] + + meanimp = mean(target_train) + + map(dt_train, .f = function(col) { + col_lvls = levels(col) + + as.matrix(c(stats::setNames(map_dbl(col_lvls, .f = function(lvl) { + (sum(target_train[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), nm = col_lvls), if (impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA))) + }) + }) + ) +} diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index 8e59a4161..5b1273573 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -5,18 +5,23 @@ \alias{PipeOpEncodeImpact} \title{Conditional Target Value Impact Encoding} \format{ -\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Encodes columns of type \code{factor}, \code{character} and \code{ordered}. +Encodes columns of type \code{factor}, and \code{ordered}. -Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each (factorial) column -to the difference between each target level's conditional log-likelihood -given this level, and the target level's global log-likelihood. +Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each +(factorial) column to the difference between each target level's conditional log-likelihood given +this level, and the target level's global log-likelihood. -Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) column -to the difference between the target's conditional mean given -this level, and the target's global mean. +Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) +column to the difference between the target's conditional mean given this level, and the target's +global mean. + +During training, the impact coding is done using a cross-method. 
This means that the training +\code{\link[mlr3:Task]{Task}} is split into several folds via \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} and for +each fold, impact coding is performed for each test set based on the respective training set. +This is helpful to prevent nested model bias. Treats new levels during prediction like missing values. } @@ -36,31 +41,42 @@ otherwise be set during construction. Default \code{list()}. Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, \code{character} or -\code{ordered} parameters encoded. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, or \code{ordered} parameters encoded. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{impact} :: a named \code{list}\cr +\item \code{train_task_hash} :: \code{character(1)}\cr +The hash (unique identifier) for the training \code{\link[mlr3:Task]{Task}}. +\item \code{rsmp_cv_instance} :: a \code{data.table}\cr +If \code{folds} is larger than one, the resampling instance of the \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} used during training. +\item \code{impact_predict} :: a named \code{list}\cr A list with an element for each affected feature:\cr -For regression each element is a single column matrix of impact values for each level of that feature.\cr -For classification, it is a list with an element for each \emph{feature level}, which is a vector giving the impact of -this feature level on each \emph{outcome level}. +For regression, each element is a single column matrix of impact values for each level of that feature.\cr +For classification, this is a list with an element for each \emph{feature level}, which is a vector +giving the impact of this feature level on each \emph{outcome level}. +This list is used to encode impact of the prediction \code{\link[mlr3:Task]{Task}}. +\item \code{impact_cv} :: a \code{list} of named \code{lists}\cr +A list of length \code{folds} with each element holding a list like \code{impact_predict} above. +These lists are used to encode impact of the training \code{\link[mlr3:Task]{Task}}. } } \section{Parameters}{ \itemize{ -\item \code{smoothing} :: \code{numeric(1)} \cr +\item \code{smoothing} :: \code{numeric(1)} \cr A finite positive value used for smoothing. Mostly relevant for \link[mlr3:TaskClassif]{classification Tasks} if a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). Initialized to \code{1e-4}. \item \code{impute_zero} :: \code{logical(1)}\cr -If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default \code{FALSE}. +If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default is \code{FALSE}. +\item \code{folds} :: \code{integer(1)}\cr +Number of folds used in the cross-method and passed to \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}}. Default is \code{3}. +If set to \code{1}, no cross-method will be applied during training, i.e., the whole training +\code{\link[mlr3:Task]{Task}} is used to encode impact during training. 
} } @@ -79,10 +95,10 @@ library("mlr3") poe = po("encodeimpact") task = TaskClassif$new("task", - data.table::data.table( - x = factor(c("a", "a", "a", "b", "b")), - y = factor(c("a", "a", "b", "b", "b"))), - "x") + backend = data.table::data.table( + x = factor(c("a", "a", "b", "b", "b")), + y = factor(c("a", "a", "a", "b", "b"))), + target = "y") poe$train(list(task))[[1]]$data() diff --git a/tests/testthat/test_pipeop_encodeimpact.R b/tests/testthat/test_pipeop_encodeimpact.R index c3a7000db..0c23367ac 100644 --- a/tests/testthat/test_pipeop_encodeimpact.R +++ b/tests/testthat/test_pipeop_encodeimpact.R @@ -8,13 +8,25 @@ test_that("PipeOpEncodeImpact", { t2 = po("histbin")$train(list(tsk("iris")))[[1]] - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = task) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = t2) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = mlr_tasks$get("iris")) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = task) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = t2) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = mlr_tasks$get("iris")) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = task, + predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = t2, + predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = mlr_tasks$get("iris"), + predict_like_train = FALSE, deterministic_train = FALSE) op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 expect_pipeop(op) nt = train_pipeop(op, inputs = list(task))[[1L]] @@ -29,10 +41,8 @@ test_that("PipeOpEncodeImpact", { # factor cols are removed expect_true(all(tsk("iris")$feature_names %nin% fn)) expect_true("factor" %nin% nt$feature_types$type) - }) - test_that("PipeOpImpactEncode on Classification", { testdf = data.frame( @@ -43,6 +53,7 @@ test_that("PipeOpImpactEncode on Classification", { testtask = TaskClassif$new("test", testdf, "t") op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 expect_equal(op$train(list(tsk("iris")))[[1]], tsk("iris")) @@ -57,15 +68,16 @@ test_that("PipeOpImpactEncode on Classification", { op$train(list(testtask)) - expect_equal(op$state$impact$a, expm) + expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$a, expm) op$param_set$values$smoothing = 1e-4 op$train(list(testtask)) - expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE), 0.5e-4) + expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE), 0.5e-4) op$param_set$values$smoothing = 1e-8 op$train(list(testtask)) - expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE) * 1e4, 0.5e-4) + expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE) * 1e4, 0.5e-4) op$param_set$values$smoothing = 6.362e-9 # similar to what glm uses encoded = op$train(list(testtask))[[1]]$data() @@ -77,11 +89,12 @@ test_that("PipeOpImpactEncode on Classification", { expm2 = rbind(expm2, c(NA, NA)) rownames(expm2) = c("a", "b", ".TEMP.MISSING") - expect_equal(op$state$impact$b, expm2, tolerance = 1e-5) + 
expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$b, expm2, tolerance = 1e-5) expect_equal(encoded, - data.table(t = testdf$t, a = op$state$impact$a[testdf$a, ], - b = op$state$impact$b[testdf$b, ])) + data.table(t = testdf$t, a = op$state$impact_predict$a[testdf$a, ], + b = op$state$impact_predict$b[testdf$b, ])) # test NA handling / imputation @@ -100,7 +113,6 @@ test_that("PipeOpImpactEncode on Classification", { encoded = op$train(list(testtask2))[[1]]$data() expect_equal(as.numeric(as.matrix(encoded)[c(11, 17, 24, 30)]), c(0, 0, 0, 0)) # imputation by 0 - }) test_that("PipeOpImpactEncode on Regression", { @@ -118,13 +130,16 @@ test_that("PipeOpImpactEncode on Regression", { t = c(1, 2, 3, 1, 2, 3)) op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 op$param_set$values$smoothing = 0 expect_equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE) + expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method - expect_equal(op$state$impact$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA)))) - expect_equal(op$state$impact$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA)))) + expect_equal(op$state$impact_predict$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA)))) + expect_equal(op$state$impact_predict$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA)))) op$param_set$values$smoothing = 1e-4 expect_false(isTRUE(all.equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE, tolerance = 1e-5))) @@ -165,12 +180,12 @@ test_that("PipeOpImpactEncode on Regression", { encoded = op$train(list(testtask2))[[1]]$data() expect_equal(which(is.na(encoded)), c(11, 18)) - }) test_that("PipeOpImpactEncode factor level ``", { op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 testdf3 = iris levels(testdf3$Species) = c("setosa", "versicolor", "") @@ -181,5 +196,69 @@ test_that("PipeOpImpactEncode factor level ``", { train_out3ref = op$train(list(testtask3ref))[[1L]] expect_equal(train_out3$data(), train_out3ref$data()) +}) + +test_that("PipeOpImpactEncode cross-method on Classification", { + # FIXME: could also add some more technical tests + library(mlr3learners) + set.seed(2409) + n = 300L + x = as.factor(rep(c("x1", "x2"), each = n / 2L)) + y = as.factor(c(sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.9, 0.1)), sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.1, 0.9)))) + z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) + dat = data.table(y = y, x = x, z = z) + + task = TaskClassif$new("test", backend = dat, target = "y") + + learner = lrn("classif.log_reg", id = "l") # baseline + graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method + graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method + + # check if nested resampling for the cross-method would work + train = sample(task$row_ids, size = 200L) + test = setdiff(task$row_ids, train) + + learner$train(task, row_ids = train) + graphlearner1$train(task, row_ids = train) + graphlearner2$train(task, row_ids = train) + + ce = c(suppressWarnings(learner$predict(task, row_ids = test)$score(msr("classif.ce"))), + suppressWarnings(graphlearner1$predict(task, row_ids = test)$score(msr("classif.ce"))), + suppressWarnings(graphlearner2$predict(task, row_ids 
= test)$score(msr("classif.ce")))) + expect_true(all(exp(diff(log(ce))) - 1 < 0.1)) # ratios of mean ce's should be around 1 +}) + +test_that("PipeOpImpactEncode cross-method on Regression", { + # FIXME: could also add some more technical tests + + library(mlr3learners) + set.seed(2409) + n = 300L + x = as.factor(rep(c("x1", "x2"), each = n / 2L)) # x1 ~ N(-5, 2), x2 ~ N(5, 2) + y = c(rnorm(n / 2L, mean = -5, sd = 2), rnorm(n / 2L, mean = 5, sd = 2)) + # aggregate(y ~ x, FUN = mean, data = dat) + # aggregate(y ~ x, FUN = sd, data = dat) + z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) # random + dat = data.table(y = y, x = x, z = z) + + task = TaskRegr$new("test", backend = dat, target = "y") + + learner = lrn("regr.lm", id = "l") # baseline + graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method + graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method + + # check if nested resampling for the cross-method would work + train = sample(task$row_ids, size = 200L) + test = setdiff(task$row_ids, train) + + learner$train(task, row_ids = train) + graphlearner1$train(task, row_ids = train) + graphlearner2$train(task, row_ids = train) + + mse = c(learner$predict(task, row_ids = test)$score(msr("regr.mse")), + graphlearner1$predict(task, row_ids = test)$score(msr("regr.mse")), + graphlearner2$predict(task, row_ids = test)$score(msr("regr.mse"))) + expect_true(all(exp(diff(log(mse))) - 1 < 0.1)) # ratios of mean mse's should be around 1 }) + From 84311b403fc68550db0b3528e82634823512a079 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 24 Aug 2020 21:00:59 +0200 Subject: [PATCH 2/7] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index c5945446f..6fb0e23a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # mlr3pipelines 0.2.1-9000 * NULL input channels accept any kind of input +* PipeOpEncodeImpact now allows for using a cross-method during training # mlr3pipelines 0.2.1 From c72f75cc3a244c84b13171a7ed9164ef9e01de00 Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 24 Sep 2020 14:44:57 +0200 Subject: [PATCH 3/7] test --- R/PipeOpEncodeImpact.R | 80 ++++++++++++++++++++++++++++++++++++++++++ R/zzz.R | 2 ++ 2 files changed, 82 insertions(+) diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R index a89e1c2b8..74fb89882 100644 --- a/R/PipeOpEncodeImpact.R +++ b/R/PipeOpEncodeImpact.R @@ -236,3 +236,83 @@ get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, i }) ) } + +LearnerEncodeImpact = R6Class("LearnerEncodeImpact", inherit = Learner) + +LearnerEncodeImpactClassif = R6Class("LearnerEncodeImpactClassif", inherit = LearnerEncodeImpact, + public = list( + initialize = function(id, param_set = ParamSet$new(), predict_types = "impact", feature_types = character(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "classif", param_set = param_set, feature_types = feature_types, + predict_types = predict_types, properties = properties, data_formats = data_formats, packages = packages, man = man) + } + ) +) + +LearnerEncodeImpactClassifSimple = R6Class("LearnerEncodeImpactClassifSimple", inherit = LearnerEncodeImpactClassif, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamUty$new("affect_columns", custom_check = check_function_or_null, default = selector_all(), tags = "train"), + 
ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.classif.simple", + feature_types = c("factor", "ordered"), + predict_types = "impact", + param_set = ps, + properties = c("twoclass", "multiclass", "missings"), + man = NA_character_ + ) + } + ), + + private = list( + .train = function(task) { + # FIXME: affect_columns + dt = task$data(cols = task$feature_names) + target = task$truth() + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + } + ) + }) + }, simplify = FALSE) + set_class(model, "encode.impact.classif.simple_model") + }, + + .predict = function(task) { + model = self$state$model + dt = task$data(cols = task$feature_names) + impact = imap(dt, function(curdat, idx) { + curdat = as.character(curdat) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names. + # otherwise we want the naming scheme . + model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] + }) + list(impact = impact) + + } + ) +) + +check_prediction_data.PredictionDataEncodeImpact = function(pdata) { + browser() + pdata +} + +as_prediction.PredictionDataEncodeImpact = function(x, check = TRUE) { + invoke(PredictionEncodeImpact$new, check = check, .args = x) +} diff --git a/R/zzz.R b/R/zzz.R index 1d3ce9691..e112c79d7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -14,6 +14,8 @@ register_mlr3 = function() { x$pipeops$valid_tags = unique(c(x$pipeops$valid_tags, c("abstract", "meta", "missings", "feature selection", "imbalanced data", "data transform", "target transform", "ensemble", "robustify", "learner", "encode", "multiplicity"))) + x$learner_predict_types$classif$impact = "impact" + x$learner_predict_types$regr$impact = "impact" } .onLoad = function(libname, pkgname) { # nocov start From be4a53ad2959d52b7fd5d2c6a16b6359a6c4e743 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:36:24 +0200 Subject: [PATCH 4/7] reset changes to PipeOpEncodeImpact.R and PipeOpLearnerCV.R, add new encoder classes --- R/ImpactEncoder.R | 215 ++++++++++++++++++++++++++++++ R/PipeOpEncodeImpact.R | 290 ++++++++--------------------------------- R/PipeOpLearnerCV.R | 4 +- 3 files changed, 270 insertions(+), 239 deletions(-) create mode 100644 R/ImpactEncoder.R diff --git a/R/ImpactEncoder.R b/R/ImpactEncoder.R new file mode 100644 index 000000000..9fccbac0c --- /dev/null +++ b/R/ImpactEncoder.R @@ -0,0 +1,215 @@ +ImpactEncoderClassif = R6Class("ImpactEncoderClassif", inherit = Learner, + public = list( + initialize = function(id, param_set = ParamSet$new(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "classif", param_set = param_set, predict_types = 
"impact", feature_types = c("factor", "ordered"), properties = properties, packages = packages, man = man) + } + ), + private = list( + .predict = function(task) { + impact = get_impact(task$data(cols = task$feature_names), model = self$state$model) + list(response = factor(rep_len(NA_character_, length.out = task$nrow), levels = task$levels(task$target_names)[[1L]]), impact = impact) + } + ) +) + +ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, + public = list( + initialize = function(id, param_set = ParamSet$new(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "regr", param_set = param_set, predict_types = "impact", feature_types = c("factor", "ordered"), properties = properties, packages = packages, man = man) + } + ), + private = list( + .predict = function(task) { + impact = get_impact(task$data(cols = task$feature_names), model = self$state$model) + list(response = rep_len(NA_real_, length.out = task$nrow), impact = impact) + } + ) +) + +ImpactEncoderClassifSimple = R6Class("ImpactEncoderClassifSimple", inherit = ImpactEncoderClassif, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.classif.simple", + param_set = ps, + properties = c("twoclass", "multiclass"), + man = "FIXME" + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + } + ) + }) + }, simplify = FALSE) + set_class(model, "encode.impact.classif.simple_model") + } + ) +) + +ImpactEncoderRegrSimple = R6Class("ImpactEncoderRegrSimple", inherit = ImpactEncoderRegr, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.regr.simple", + param_set = ps, + man = "FIXME" + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + meanimp = mean(target) + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + t(t(c(sapply(levels(col), function(lvl) { + (sum(target[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))) + }, simplify = FALSE) + set_class(model, "encode.impact.regr.simple_model") + } + ) +) + +ImpactEncoderClassifGlmm = R6Class("ImpactEncoderClassifGlmm", inherit = 
ImpactEncoderClassif, + public = list( + initialize = function() { + ps = ParamSet$new() # FIXME: + super$initialize( + id = "encode.impact.classif.glmm", + param_set = ps, + properties = c("twoclass", "multiclass"), + man = "FIXME" + # FIXME: properties missings? + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + lvls = levels(target) + model = if (length(lvls) > 2L) { + # binomial glmm + binary_target = sapply(levels(target), function(x) factor(identical(x, target)), simplify = FALSE) + sapply(dt, function(col) { + tmp = sapply(lvls, function(lvl) { + fitGlmer(binary_target[[lvl]], feature = col) + }, simplify = FALSE) + tmp = do.call(cbind, tmp) + colnames(tmp) = lvls + tmp + }, simplify = FALSE) + } else { + # one vs. rest binomial glmm + sapply(dt, function(col) { + tmp = fitGlmer(target, feature = col) + tmp = cbind(-tmp, tmp) # required for the other target level + colnames(tmp) = lvls + tmp + }, simplify = FALSE) + } + set_class(model, "encode.impact.classif.glmm_model") + } + ) +) + +ImpactEncoderRegrGlmm = R6Class("ImpactEncoderRegrGlmm", inherit = ImpactEncoderRegr, + public = list( + initialize = function() { + ps = ParamSet$new() # FIXME: + super$initialize( + id = "encode.impact.regr.glmm", + param_set = ps, + man = "FIXME" + # FIXME: properties missings? + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + model = sapply(dt, function(col) { + fitLmer(target, feature = col) + }, simplify = FALSE) + set_class(model, "encode.impact.regr.glmm_model") + } + ) +) + +get_impact = function(dt, model) { + imap(dt, function(curdat, idx) { + curdat = as.character(curdat) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names + # otherwise we want the naming scheme . + model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] + }) +} + +# Regr helper function around lme4::lmer +# FIXME: params +fitLmer = function(target, feature) { + args = list(formula = y ~ 1 + (1 | x), + data = data.table(y = target, x = feature), + na.action = na.omit, + control = lme4::lmerControl(calc.derivs = FALSE) + ) + mod = invoke(lme4::lmer, .args = args) + coefs = stats::coef(mod)$x + lvls = rownames(coefs) + coefs = coefs[[1L]] + names(coefs) = lvls + intercept = unname(lme4::fixef(mod)) + coefs[is.na(coefs)] = intercept + coefs = c(coefs, .TEMP.MISSING = intercept) + t(t(coefs)) +} + +# Classif helper function around lme4::glmer +# FIXME: params +fitGlmer = function(target, feature) { + args = list(formula = y ~ 1 + (1 | x), + data = data.table(y = target, x = feature), + family = stats::binomial, + na.action = na.omit, + control = lme4::glmerControl(calc.derivs = FALSE) + ) + mod = invoke(lme4::glmer, .args = args) + coefs = stats::coef(mod)$x + lvls = rownames(coefs) + coefs = coefs[[1L]] + names(coefs) = lvls + intercept = unname(lme4::fixef(mod)) + coefs[is.na(coefs)] = intercept + coefs = c(coefs, .TEMP.MISSING = intercept) + t(t(coefs)) +} diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R index 74fb89882..57820c5d9 100644 --- a/R/PipeOpEncodeImpact.R +++ b/R/PipeOpEncodeImpact.R @@ -2,23 +2,18 @@ #' #' @usage NULL #' @name mlr_pipeops_encodeimpact -#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Encodes columns of type `factor`, and `ordered`. +#' Encodes columns of type `factor`, `character` and `ordered`. #' -#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each -#' (factorial) column to the difference between each target level's conditional log-likelihood given -#' this level, and the target level's global log-likelihood. +#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each (factorial) column +#' to the difference between each target level's conditional log-likelihood +#' given this level, and the target level's global log-likelihood. #' -#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) -#' column to the difference between the target's conditional mean given this level, and the target's -#' global mean. -#' -#' During training, the impact coding is done using a cross-method. This means that the training -#' [`Task`][mlr3::Task] is split into several folds via [`ResamplingCV`][mlr3::ResamplingCV] and for -#' each fold, impact coding is performed for each test set based on the respective training set. -#' This is helpful to prevent nested model bias. +#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) column +#' to the difference between the target's conditional mean given +#' this level, and the target's global mean. #' #' Treats new levels during prediction like missing values. #' @@ -36,35 +31,24 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, or `ordered` parameters encoded. +#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, `character` or +#' `ordered` parameters encoded. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: -#' * `train_task_hash` :: `character(1)`\cr -#' The hash (unique identifier) for the training [`Task`][mlr3::Task]. -#' * `rsmp_cv_instance` :: a `data.table`\cr -#' If `folds` is larger than one, the resampling instance of the [`ResamplingCV`][mlr3::ResamplingCV] used during training. -#' * `impact_predict` :: a named `list`\cr +#' * `impact` :: a named `list`\cr #' A list with an element for each affected feature:\cr -#' For regression, each element is a single column matrix of impact values for each level of that feature.\cr -#' For classification, this is a list with an element for each *feature level*, which is a vector -#' giving the impact of this feature level on each *outcome level*. -#' This list is used to encode impact of the prediction [`Task`][mlr3::Task]. -#' * `impact_cv` :: a `list` of named `lists`\cr -#' A list of length `folds` with each element holding a list like `impact_predict` above. -#' These lists are used to encode impact of the training [`Task`][mlr3::Task]. +#' For regression each element is a single column matrix of impact values for each level of that feature.\cr +#' For classification, it is a list with an element for each *feature level*, which is a vector giving the impact of +#' this feature level on each *outcome level*. 
#' #' @section Parameters: -#' * `smoothing` :: `numeric(1)` \cr +#' * `smoothing` :: `numeric(1)` \cr #' A finite positive value used for smoothing. Mostly relevant for [classification Tasks][mlr3::TaskClassif] if #' a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). #' Initialized to `1e-4`. #' * `impute_zero` :: `logical(1)`\cr -#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default is `FALSE`. -#' * `folds` :: `integer(1)`\cr -#' Number of folds used in the cross-method and passed to [`ResamplingCV`][mlr3::ResamplingCV]. Default is `3`. -#' If set to `1`, no cross-method will be applied during training, i.e., the whole training -#' [`Task`][mlr3::Task] is used to encode impact during training. +#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default `FALSE`. #' #' @section Internals: #' Uses laplace smoothing, mostly to avoid infinite values for [classification Task][mlr3::TaskClassif]. @@ -77,10 +61,10 @@ #' poe = po("encodeimpact") #' #' task = TaskClassif$new("task", -#' backend = data.table::data.table( -#' x = factor(c("a", "a", "b", "b", "b")), -#' y = factor(c("a", "a", "a", "b", "b"))), -#' target = "y") +#' data.table::data.table( +#' x = factor(c("a", "a", "a", "b", "b")), +#' y = factor(c("a", "a", "b", "b", "b"))), +#' "x") #' #' poe$train(list(task))[[1]]$data() #' @@ -89,96 +73,53 @@ #' @include PipeOpTaskPreproc.R #' @export PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", - inherit = PipeOpTaskPreproc, + inherit = PipeOpTaskPreprocSimple, public = list( initialize = function(id = "encodeimpact", param_vals = list()) { ps = ParamSet$new(params = list( - ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required"), default = 1e-4), - ParamLgl$new("impute_zero", tags = c("train", "required"), default = FALSE), - ParamInt$new("folds", lower = 1L, tags = c("train", "required"), default = 3L) + ParamDbl$new("smoothing", 0, Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE, folds = 3L) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("factor", "ordered")) } ), private = list( - .train_task = function(task) { - dt_columns = private$.select_cols(task) - cols = dt_columns - if (!length(cols)) { - self$state = list(dt_columns = dt_columns) - return(task) # early exit - } - dt = task$data(cols = cols) - target = task$truth() + .get_state_dt = function(dt, levels, target) { + task_type = if (is.numeric(target)) "regr" else "classif" + state = list() - task_type = task$task_type - row_ids = task$row_ids - row_seq = seq_len(task$nrow) smoothing = self$param_set$values$smoothing - impute_zero = self$param_set$values$impute_zero - folds = self$param_set$values$folds - folds_seq = seq_len(folds) - - # note that matching the row_ids below is necessary because of the resampling - - # impact encoding for the prediction task - impact_predict = get_impact(task_type, folds_seq = 1L, train_sets = list(row_seq), dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero)[[1L]] - - if (folds > 1L) { - # cross-method - rcv = ResamplingCV$new() - rcv$param_set$values$folds = folds - rcv$instantiate(task) - - train_sets = map(folds_seq, function(fold) match(rcv$train_set(fold), row_ids)) - 
test_sets = map(folds_seq, .f = function(fold) match(rcv$test_set(fold), row_ids)) - impact_cv = get_impact(task_type, folds_seq = folds_seq, train_sets = train_sets, dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero) - - } else { - # no cross-method - test_sets = list(row_seq) - - impact_cv = list(impact_predict) - } - - self$state = list(train_task_hash = task$hash, rsmp_cv_instance = if (folds > 1L) rcv$instance else data.table(), impact_predict = impact_predict, impact_cv = impact_cv, dt_columns = dt_columns) - - # cross-method (folds > 1) will encode test_set of fold i using the impact encoding trained on train_set of fold i - dt = imap(dt, .f = function(curdat, idx) { - fold_dt = map(folds_seq, .f = function(fold) { - impact_test = self$state$impact_cv[[fold]] - test_set = test_sets[[fold]] - curdat = as.character(curdat[test_set]) - curdat[is.na(curdat)] = ".TEMP.MISSING" - curdat[curdat %nin% rownames(impact_test[[idx]])] = ".TEMP.MISSING" - # we only want to "drop" if there are no column names. - # otherwise we want the naming scheme . - impact_test[[idx]][match(curdat, rownames(impact_test[[idx]])), , drop = is.null(colnames(impact_test[[idx]]))] - }) - switch(task_type, - classif = do.call(rbind, fold_dt), - regr = unlist(fold_dt) - ) - }) - - dt = as.data.table(dt) - dt = dt[match(row_seq, unlist(test_sets)), ] # row ids have to be reordered because of resampling - task$select(setdiff(task$feature_names, cols))$cbind(dt) + # different funs depending on task.type + list(impact = switch(task_type, + classif = sapply(dt, function(col) + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / + (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + }) + }), simplify = FALSE), + regr = { + meanimp = mean(target) + sapply(dt, function(col) + t(t(c(sapply(levels(col), function(lvl) { + (sum(target[col == lvl], na.rm = TRUE) + smoothing * meanimp) / + (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))), simplify = FALSE) + })) }, - .predict_task = function(task) { - cols = self$state$dt_columns - if (!length(cols)) { - return(task) - } - dt = task$data(cols = cols) - - # impact encoding for the prediction task always relies on the encoding of the whole training task - impact = self$state$impact_predict - dt = imap(dt, function(curdat, idx) { + .transform_dt = function(dt, levels) { + impact = self$state$impact + imap(dt, function(curdat, idx) { curdat = as.character(curdat) curdat[is.na(curdat)] = ".TEMP.MISSING" curdat[curdat %nin% rownames(impact[[idx]])] = ".TEMP.MISSING" @@ -186,133 +127,8 @@ PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", # otherwise we want the naming scheme . 
impact[[idx]][match(curdat, rownames(impact[[idx]])), , drop = is.null(colnames(impact[[idx]]))] }) - - dt = as.data.table(dt) - task$select(setdiff(task$feature_names, cols))$cbind(dt) } ) ) mlr_pipeops$add("encodeimpact", PipeOpEncodeImpact) - -get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, impute_zero) { - switch(task_type, - classif = map(folds_seq, .f = function(fold) { - target_lvls = levels(target) - train_set = train_sets[[fold]] - dt_train = dt[train_set, ] - target_train = target[train_set] - - map(dt_train, .f = function(col) { - col_lvls = levels(col) - - do.call(cbind, stats::setNames(map(target_lvls, .f = function(tl) { - tprop = (sum(target_train == tl) + smoothing) / (length(target_train) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - - map_dbl(c(stats::setNames(col_lvls, nm = col_lvls), c(.TEMP.MISSING = NA)), .f = function(cl) { - if (!impute_zero && is.na(cl)) return(NA_real_) # early exit - condprob = (sum(target_train[is.na(cl) | (col == cl)] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | (col == cl), na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - }) - }), nm = target_lvls)) - }) - }), - regr = map(folds_seq, .f = function(fold) { - train_set = train_sets[[fold]] - dt_train = dt[train_set, ] - target_train = target[train_set] - - meanimp = mean(target_train) - - map(dt_train, .f = function(col) { - col_lvls = levels(col) - - as.matrix(c(stats::setNames(map_dbl(col_lvls, .f = function(lvl) { - (sum(target_train[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp - }), nm = col_lvls), if (impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA))) - }) - }) - ) -} - -LearnerEncodeImpact = R6Class("LearnerEncodeImpact", inherit = Learner) - -LearnerEncodeImpactClassif = R6Class("LearnerEncodeImpactClassif", inherit = LearnerEncodeImpact, - public = list( - initialize = function(id, param_set = ParamSet$new(), predict_types = "impact", feature_types = character(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { - super$initialize(id = id, task_type = "classif", param_set = param_set, feature_types = feature_types, - predict_types = predict_types, properties = properties, data_formats = data_formats, packages = packages, man = man) - } - ) -) - -LearnerEncodeImpactClassifSimple = R6Class("LearnerEncodeImpactClassifSimple", inherit = LearnerEncodeImpactClassif, - public = list( - initialize = function() { - ps = ParamSet$new(list( - ParamUty$new("affect_columns", custom_check = check_function_or_null, default = selector_all(), tags = "train"), - ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), - ParamLgl$new("impute_zero", tags = c("train", "required")) - )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE) - super$initialize( - id = "encode.impact.classif.simple", - feature_types = c("factor", "ordered"), - predict_types = "impact", - param_set = ps, - properties = c("twoclass", "multiclass", "missings"), - man = NA_character_ - ) - } - ), - - private = list( - .train = function(task) { - # FIXME: affect_columns - dt = task$data(cols = task$feature_names) - target = task$truth() - smoothing = self$param_set$values$smoothing - model = sapply(dt, function(col) { - sapply(levels(target), function(tl) { - tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - 
map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), - function(cl) { - if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) - condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - } - ) - }) - }, simplify = FALSE) - set_class(model, "encode.impact.classif.simple_model") - }, - - .predict = function(task) { - model = self$state$model - dt = task$data(cols = task$feature_names) - impact = imap(dt, function(curdat, idx) { - curdat = as.character(curdat) - curdat[is.na(curdat)] = ".TEMP.MISSING" - curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" - # we only want to "drop" if there are no column names. - # otherwise we want the naming scheme . - model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] - }) - list(impact = impact) - - } - ) -) - -check_prediction_data.PredictionDataEncodeImpact = function(pdata) { - browser() - pdata -} - -as_prediction.PredictionDataEncodeImpact = function(x, check = TRUE) { - invoke(PredictionEncodeImpact$new, check = check, .args = x) -} diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index 9569a4a59..b8aba7bf6 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -173,8 +173,8 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", if (pv$method != "insample") { rdesc = mlr_resamplings$get(pv$method) if (pv$method == "cv") rdesc$param_set$values = list(folds = pv$folds) - res = resample(task, private$.learner, rdesc) - prds = rbindlist(lapply(map(res$data$prediction, "test"), as.data.table)) + rr = resample(task, private$.learner, rdesc) + prds = as.data.table(rr$prediction(predict_sets = "test")) } else { prds = as.data.table(private$.learner$predict(task)) } From f01a2c05049c06e59745ea0fb718946e2c411e09 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:44:50 +0200 Subject: [PATCH 5/7] add some todos --- R/ImpactEncoder.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/ImpactEncoder.R b/R/ImpactEncoder.R index 9fccbac0c..6eee35784 100644 --- a/R/ImpactEncoder.R +++ b/R/ImpactEncoder.R @@ -10,6 +10,7 @@ ImpactEncoderClassif = R6Class("ImpactEncoderClassif", inherit = Learner, list(response = factor(rep_len(NA_character_, length.out = task$nrow), levels = task$levels(task$target_names)[[1L]]), impact = impact) } ) + # FIXME: check for the structure of the model saved during train ) ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, @@ -24,6 +25,7 @@ ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, list(response = rep_len(NA_real_, length.out = task$nrow), impact = impact) } ) + # FIXME: check for the structure of the model saved during train ) ImpactEncoderClassifSimple = R6Class("ImpactEncoderClassifSimple", inherit = ImpactEncoderClassif, From 0e49c688dd0ea1c1a729dcbb74a24c12a21af0af Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:49:08 +0200 Subject: [PATCH 6/7] revert some earlier changes --- man/mlr_pipeops_encodeimpact.Rd | 56 ++++------- tests/testthat/test_pipeop_encodeimpact.R | 113 ++++------------------ 2 files changed, 37 insertions(+), 132 deletions(-) diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index e5c266f09..45907d0d2 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -5,23 +5,18 @@ \alias{PipeOpEncodeImpact} \title{Conditional Target Value Impact Encoding} 
\format{ -\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Encodes columns of type \code{factor}, and \code{ordered}. +Encodes columns of type \code{factor}, \code{character} and \code{ordered}. -Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each -(factorial) column to the difference between each target level's conditional log-likelihood given -this level, and the target level's global log-likelihood. +Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each (factorial) column +to the difference between each target level's conditional log-likelihood +given this level, and the target level's global log-likelihood. -Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) -column to the difference between the target's conditional mean given this level, and the target's -global mean. - -During training, the impact coding is done using a cross-method. This means that the training -\code{\link[mlr3:Task]{Task}} is split into several folds via \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} and for -each fold, impact coding is performed for each test set based on the respective training set. -This is helpful to prevent nested model bias. +Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) column +to the difference between the target's conditional mean given +this level, and the target's global mean. Treats new levels during prediction like missing values. } @@ -41,42 +36,31 @@ otherwise be set during construction. Default \code{list()}. Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, or \code{ordered} parameters encoded. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, \code{character} or +\code{ordered} parameters encoded. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{train_task_hash} :: \code{character(1)}\cr -The hash (unique identifier) for the training \code{\link[mlr3:Task]{Task}}. -\item \code{rsmp_cv_instance} :: a \code{data.table}\cr -If \code{folds} is larger than one, the resampling instance of the \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} used during training. -\item \code{impact_predict} :: a named \code{list}\cr +\item \code{impact} :: a named \code{list}\cr A list with an element for each affected feature:\cr -For regression, each element is a single column matrix of impact values for each level of that feature.\cr -For classification, this is a list with an element for each \emph{feature level}, which is a vector -giving the impact of this feature level on each \emph{outcome level}. -This list is used to encode impact of the prediction \code{\link[mlr3:Task]{Task}}. -\item \code{impact_cv} :: a \code{list} of named \code{lists}\cr -A list of length \code{folds} with each element holding a list like \code{impact_predict} above. -These lists are used to encode impact of the training \code{\link[mlr3:Task]{Task}}. 
+For regression each element is a single column matrix of impact values for each level of that feature.\cr +For classification, it is a list with an element for each \emph{feature level}, which is a vector giving the impact of +this feature level on each \emph{outcome level}. } } \section{Parameters}{ \itemize{ -\item \code{smoothing} :: \code{numeric(1)} \cr +\item \code{smoothing} :: \code{numeric(1)} \cr A finite positive value used for smoothing. Mostly relevant for \link[mlr3:TaskClassif]{classification Tasks} if a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). Initialized to \code{1e-4}. \item \code{impute_zero} :: \code{logical(1)}\cr -If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default is \code{FALSE}. -\item \code{folds} :: \code{integer(1)}\cr -Number of folds used in the cross-method and passed to \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}}. Default is \code{3}. -If set to \code{1}, no cross-method will be applied during training, i.e., the whole training -\code{\link[mlr3:Task]{Task}} is used to encode impact during training. +If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default \code{FALSE}. } } @@ -95,10 +79,10 @@ library("mlr3") poe = po("encodeimpact") task = TaskClassif$new("task", - backend = data.table::data.table( - x = factor(c("a", "a", "b", "b", "b")), - y = factor(c("a", "a", "a", "b", "b"))), - target = "y") + data.table::data.table( + x = factor(c("a", "a", "a", "b", "b")), + y = factor(c("a", "a", "b", "b", "b"))), + "x") poe$train(list(task))[[1]]$data() diff --git a/tests/testthat/test_pipeop_encodeimpact.R b/tests/testthat/test_pipeop_encodeimpact.R index 0c23367ac..c3a7000db 100644 --- a/tests/testthat/test_pipeop_encodeimpact.R +++ b/tests/testthat/test_pipeop_encodeimpact.R @@ -8,25 +8,13 @@ test_that("PipeOpEncodeImpact", { t2 = po("histbin")$train(list(tsk("iris")))[[1]] - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = task) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = t2) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = mlr_tasks$get("iris")) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = task, - predict_like_train = FALSE, deterministic_train = FALSE) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = t2, - predict_like_train = FALSE, deterministic_train = FALSE) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = mlr_tasks$get("iris"), - predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = task) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = t2) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = mlr_tasks$get("iris")) op = PipeOpEncodeImpact$new() - op$param_set$values$folds = 1 expect_pipeop(op) nt = train_pipeop(op, inputs = list(task))[[1L]] @@ -41,8 +29,10 @@ test_that("PipeOpEncodeImpact", { # factor cols are removed expect_true(all(tsk("iris")$feature_names %nin% fn)) expect_true("factor" %nin% nt$feature_types$type) + }) + test_that("PipeOpImpactEncode on Classification", { testdf = data.frame( 
@@ -53,7 +43,6 @@ test_that("PipeOpImpactEncode on Classification", {
   testtask = TaskClassif$new("test", testdf, "t")

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1

   expect_equal(op$train(list(tsk("iris")))[[1]], tsk("iris"))

@@ -68,16 +57,15 @@ test_that("PipeOpImpactEncode on Classification", {

   op$train(list(testtask))

-  expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$a, expm)
+  expect_equal(op$state$impact$a, expm)

   op$param_set$values$smoothing = 1e-4
   op$train(list(testtask))
-  expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE), 0.5e-4)
+  expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE), 0.5e-4)

   op$param_set$values$smoothing = 1e-8
   op$train(list(testtask))
-  expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE) * 1e4, 0.5e-4)
+  expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE) * 1e4, 0.5e-4)

   op$param_set$values$smoothing = 6.362e-9 # similar to what glm uses
   encoded = op$train(list(testtask))[[1]]$data()
@@ -89,12 +77,11 @@ test_that("PipeOpImpactEncode on Classification", {
   expm2 = rbind(expm2, c(NA, NA))
   rownames(expm2) = c("a", "b", ".TEMP.MISSING")

-  expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$b, expm2, tolerance = 1e-5)
+  expect_equal(op$state$impact$b, expm2, tolerance = 1e-5)

   expect_equal(encoded,
-    data.table(t = testdf$t, a = op$state$impact_predict$a[testdf$a, ],
-      b = op$state$impact_predict$b[testdf$b, ]))
+    data.table(t = testdf$t, a = op$state$impact$a[testdf$a, ],
+      b = op$state$impact$b[testdf$b, ]))

   # test NA handling / imputation

@@ -113,6 +100,7 @@ test_that("PipeOpImpactEncode on Classification", {
   encoded = op$train(list(testtask2))[[1]]$data()

   expect_equal(as.numeric(as.matrix(encoded)[c(11, 17, 24, 30)]), c(0, 0, 0, 0)) # imputation by 0
+
 })

 test_that("PipeOpImpactEncode on Regression", {
@@ -130,16 +118,13 @@ test_that("PipeOpImpactEncode on Regression", {
     t = c(1, 2, 3, 1, 2, 3))

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1
   op$param_set$values$smoothing = 0

   expect_equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE)

-  expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA))))
-  expect_equal(op$state$impact_predict$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA))))
+  expect_equal(op$state$impact$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA))))
+  expect_equal(op$state$impact$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA))))

   op$param_set$values$smoothing = 1e-4
   expect_false(isTRUE(all.equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE, tolerance = 1e-5)))
@@ -180,12 +165,12 @@ test_that("PipeOpImpactEncode on Regression", {
   encoded = op$train(list(testtask2))[[1]]$data()

   expect_equal(which(is.na(encoded)), c(11, 18))
+
 })

 test_that("PipeOpImpactEncode factor level ``", {

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1

   testdf3 = iris
   levels(testdf3$Species) = c("setosa", "versicolor", "")
@@ -196,69 +181,5 @@ test_that("PipeOpImpactEncode factor level ``", {
   train_out3ref = op$train(list(testtask3ref))[[1L]]

   expect_equal(train_out3$data(), train_out3ref$data())
-})
-
-test_that("PipeOpImpactEncode cross-method on Classification", {
-  # FIXME: could also add some more technical tests
-  library(mlr3learners)
-  set.seed(2409)
-  n = 300L
-  x = as.factor(rep(c("x1", "x2"), each = n / 2L))
-  y = as.factor(c(sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.9, 0.1)), sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.1, 0.9))))
-  z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE))
-  dat = data.table(y = y, x = x, z = z)
-
-  task = TaskClassif$new("test", backend = dat, target = "y")
-
-  learner = lrn("classif.log_reg", id = "l") # baseline
-  graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method
-  graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method
-
-  # check if nested resampling for the cross-method would work
-  train = sample(task$row_ids, size = 200L)
-  test = setdiff(task$row_ids, train)
-
-  learner$train(task, row_ids = train)
-  graphlearner1$train(task, row_ids = train)
-  graphlearner2$train(task, row_ids = train)
-
-  ce = c(suppressWarnings(learner$predict(task, row_ids = test)$score(msr("classif.ce"))),
-    suppressWarnings(graphlearner1$predict(task, row_ids = test)$score(msr("classif.ce"))),
-    suppressWarnings(graphlearner2$predict(task, row_ids = test)$score(msr("classif.ce"))))
-  expect_true(all(exp(diff(log(ce))) - 1 < 0.1)) # ratios of mean ce's should be around 1
-})
-
-test_that("PipeOpImpactEncode cross-method on Regression", {
-  # FIXME: could also add some more technical tests
-
-  library(mlr3learners)
-  set.seed(2409)
-  n = 300L
-  x = as.factor(rep(c("x1", "x2"), each = n / 2L)) # x1 ~ N(-5, 2), x2 ~ N(5, 2)
-  y = c(rnorm(n / 2L, mean = -5, sd = 2), rnorm(n / 2L, mean = 5, sd = 2))
-  # aggregate(y ~ x, FUN = mean, data = dat)
-  # aggregate(y ~ x, FUN = sd, data = dat)
-  z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) # random
-  dat = data.table(y = y, x = x, z = z)
-
-  task = TaskRegr$new("test", backend = dat, target = "y")
-
-  learner = lrn("regr.lm", id = "l") # baseline
-  graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method
-  graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method
-
-  # check if nested resampling for the cross-method would work
-  train = sample(task$row_ids, size = 200L)
-  test = setdiff(task$row_ids, train)
-
-  learner$train(task, row_ids = train)
-  graphlearner1$train(task, row_ids = train)
-  graphlearner2$train(task, row_ids = train)
-
-  mse = c(learner$predict(task, row_ids = test)$score(msr("regr.mse")),
-    graphlearner1$predict(task, row_ids = test)$score(msr("regr.mse")),
-    graphlearner2$predict(task, row_ids = test)$score(msr("regr.mse")))
-  expect_true(all(exp(diff(log(mse))) - 1 < 0.1)) # ratios of mean mse's should be around 1
 })
-

From 2489e605427994ebb2ea0ef95e8de099eb1f66bc Mon Sep 17 00:00:00 2001
From: sumny
Date: Thu, 1 Oct 2020 18:07:09 +0200
Subject: [PATCH 7/7] drop response if predict_type = "impact"

---
 R/PipeOpLearnerCV.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R
index b8aba7bf6..40db58376 100644
--- a/R/PipeOpLearnerCV.R
+++ b/R/PipeOpLearnerCV.R
@@ -191,7 +191,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV",
     pred_to_task = function(prds, task) {
       if (!is.null(prds$truth)) prds[, truth := NULL]
-      if (!self$param_set$values$resampling.keep_response && self$learner$predict_type == "prob") {
+      if (!self$param_set$values$resampling.keep_response && self$learner$predict_type %in% c("impact", "prob")) {
         prds[, response := NULL]
       }
       renaming = setdiff(colnames(prds), "row_id")
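The cross-method tests removed in PATCH 6/7 show how the folds parameter added by this series is meant to be used end to end. Below is a minimal usage sketch on toy data; it assumes this branch (where po("encodeimpact") exposes folds, as in the removed tests) together with mlr3 and mlr3pipelines, and classif.rpart stands in for any downstream learner.

    library(mlr3)
    library(mlr3pipelines)
    library(data.table)

    set.seed(1)
    dat = data.table(
      y = factor(sample(c("y1", "y2"), size = 120L, replace = TRUE)),
      x = factor(sample(c("x1", "x2"), size = 120L, replace = TRUE)),
      z = factor(sample(c("z1", "z2", "z3"), size = 120L, replace = TRUE)))
    task = TaskClassif$new("toy", backend = dat, target = "y")

    # folds = 1L disables the cross-method; folds > 1L encodes each training
    # fold with impacts estimated on the remaining folds
    glrn = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% lrn("classif.rpart"))
    glrn$train(task)
    glrn$predict(task)$score(msr("classif.ce"))

With folds = 1L the encoder behaves like the released version; with folds > 1L no training row is encoded with impacts fitted on itself, which is what keeps the downstream learner from overfitting to the encoding.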