From 8ed95d5416cce5de6de245d602c8d669c30bbdb1 Mon Sep 17 00:00:00 2001
From: sumny
Date: Tue, 11 Aug 2020 19:28:53 +0200
Subject: [PATCH 1/7] add cross-method for training of impact encoding

---
 DESCRIPTION                               |   2 +-
 R/PipeOpEncodeImpact.R                    | 210 ++++++++++++++++------
 man/mlr_pipeops_encodeimpact.Rd           |  56 +++---
 tests/testthat/test_pipeop_encodeimpact.R | 113 ++++++++++--
 4 files changed, 290 insertions(+), 91 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index eeda3b631..666c43991 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -87,7 +87,7 @@ Encoding: UTF-8
 LazyData: true
 NeedsCompilation: no
 Roxygen: list(markdown = TRUE, r6 = FALSE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.1.9000
 Collate:
     'Graph.R'
     'GraphLearner.R'
diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R
index 1360ef048..a89e1c2b8 100644
--- a/R/PipeOpEncodeImpact.R
+++ b/R/PipeOpEncodeImpact.R
@@ -2,18 +2,23 @@
 #'
 #' @usage NULL
 #' @name mlr_pipeops_encodeimpact
-#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`].
 #'
 #' @description
-#' Encodes columns of type `factor`, `character` and `ordered`.
+#' Encodes columns of type `factor` and `ordered`.
 #'
-#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each (factorial) column
-#' to the difference between each target level's conditional log-likelihood
-#' given this level, and the target level's global log-likelihood.
+#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each
+#' (factorial) column to the difference between each target level's conditional log-likelihood given
+#' this level, and the target level's global log-likelihood.
 #'
-#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) column
-#' to the difference between the target's conditional mean given
-#' this level, and the target's global mean.
+#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial)
+#' column to the difference between the target's conditional mean given this level, and the target's
+#' global mean.
+#'
+#' During training, the impact coding is done using a cross-method. This means that the training
+#' [`Task`][mlr3::Task] is split into several folds via [`ResamplingCV`][mlr3::ResamplingCV] and, for
+#' each fold, impact coding is performed for the test set based on the respective training set.
+#' This helps prevent nested model bias.
 #'
 #' Treats new levels during prediction like missing values.
 #'
@@ -31,24 +36,35 @@
 #' @section Input and Output Channels:
 #' Input and output channels are inherited from [`PipeOpTaskPreproc`].
 #'
-#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, `character` or
-#' `ordered` parameters encoded.
+#' The output is the input [`Task`][mlr3::Task] with all affected `factor` or `ordered` parameters encoded.
 #'
 #' @section State:
 #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
-#' * `impact` :: a named `list`\cr
+#' * `train_task_hash` :: `character(1)`\cr
+#' The hash (unique identifier) for the training [`Task`][mlr3::Task].
+#' * `rsmp_cv_instance` :: a `data.table`\cr
+#' If `folds` is larger than one, the resampling instance of the [`ResamplingCV`][mlr3::ResamplingCV] used during training.
+#' * `impact_predict` :: a named `list`\cr
 #' A list with an element for each affected feature:\cr
-#' For regression each element is a single column matrix of impact values for each level of that feature.\cr
-#' For classification, it is a list with an element for each *feature level*, which is a vector giving the impact of
-#' this feature level on each *outcome level*.
+#' For regression, each element is a single column matrix of impact values for each level of that feature.\cr
+#' For classification, this is a list with an element for each *feature level*, which is a vector
+#' giving the impact of this feature level on each *outcome level*.
+#' This list is used to encode the impact for the prediction [`Task`][mlr3::Task].
+#' * `impact_cv` :: a `list` of named `lists`\cr
+#' A list of length `folds` with each element holding a list like `impact_predict` above.
+#' These lists are used to encode the impact for the training [`Task`][mlr3::Task].
 #'
 #' @section Parameters:
-#' * `smoothing` :: `numeric(1)` \cr
+#' * `smoothing` :: `numeric(1)` \cr
 #' A finite positive value used for smoothing. Mostly relevant for [classification Tasks][mlr3::TaskClassif] if
 #' a factor does not coincide with a target factor level (and would otherwise give an infinite logit value).
 #' Initialized to `1e-4`.
 #' * `impute_zero` :: `logical(1)`\cr
-#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default `FALSE`.
+#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default is `FALSE`.
+#' * `folds` :: `integer(1)`\cr
+#' Number of folds used in the cross-method and passed to [`ResamplingCV`][mlr3::ResamplingCV]. Default is `3`.
+#' If set to `1`, no cross-method is applied, i.e., the whole training
+#' [`Task`][mlr3::Task] is used for the impact encoding.
 #'
 #' @section Internals:
 #' Uses laplace smoothing, mostly to avoid infinite values for [classification Task][mlr3::TaskClassif].
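For intuition, the following is a minimal standalone R sketch of the smoothed log-odds
computation described above (the same quantity the new `get_impact()` helper below computes
per fold). The toy vectors `x` and `y` are hypothetical and mirror the example task in this
patch; NA handling and the per-fold splitting are omitted:

    # hypothetical toy data: factor feature x, factor target y
    x = factor(c("a", "a", "b", "b", "b"))
    y = factor(c("a", "a", "a", "b", "b"))
    smoothing = 1e-4

    # global (smoothed) log-odds of target level "a"
    tprop = (sum(y == "a") + smoothing) / (length(y) + 2 * smoothing)
    tplogit = log(tprop / (1 - tprop))

    # conditional (smoothed) log-odds of y == "a" given feature level x == "a"
    condprob = (sum(y[x == "a"] == "a") + smoothing) / (sum(x == "a") + 2 * smoothing)
    cplogit = log(condprob / (1 - condprob))

    # impact of feature level "a" on target level "a": difference of the log-odds;
    # large and positive here, since x == "a" almost perfectly predicts y == "a"
    cplogit - tplogit

With `folds > 1`, this computation is repeated once per fold on the respective training rows
only, so every training row is encoded with statistics that were computed without that row.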
@@ -61,10 +77,10 @@ #' poe = po("encodeimpact") #' #' task = TaskClassif$new("task", -#' data.table::data.table( -#' x = factor(c("a", "a", "a", "b", "b")), -#' y = factor(c("a", "a", "b", "b", "b"))), -#' "x") +#' backend = data.table::data.table( +#' x = factor(c("a", "a", "b", "b", "b")), +#' y = factor(c("a", "a", "a", "b", "b"))), +#' target = "y") #' #' poe$train(list(task))[[1]]$data() #' @@ -73,53 +89,96 @@ #' @include PipeOpTaskPreproc.R #' @export PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", - inherit = PipeOpTaskPreprocSimple, + inherit = PipeOpTaskPreproc, public = list( initialize = function(id = "encodeimpact", param_vals = list()) { ps = ParamSet$new(params = list( - ParamDbl$new("smoothing", 0, Inf, tags = c("train", "required")), - ParamLgl$new("impute_zero", tags = c("train", "required")) + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required"), default = 1e-4), + ParamLgl$new("impute_zero", tags = c("train", "required"), default = FALSE), + ParamInt$new("folds", lower = 1L, tags = c("train", "required"), default = 3L) )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE, folds = 3L) super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("factor", "ordered")) } ), private = list( - .get_state_dt = function(dt, levels, target) { - task_type = if (is.numeric(target)) "regr" else "classif" - state = list() + .train_task = function(task) { + dt_columns = private$.select_cols(task) + cols = dt_columns + if (!length(cols)) { + self$state = list(dt_columns = dt_columns) + return(task) # early exit + } + dt = task$data(cols = cols) + target = task$truth() + task_type = task$task_type + row_ids = task$row_ids + row_seq = seq_len(task$nrow) smoothing = self$param_set$values$smoothing + impute_zero = self$param_set$values$impute_zero + folds = self$param_set$values$folds + folds_seq = seq_len(folds) + + # note that matching the row_ids below is necessary because of the resampling + + # impact encoding for the prediction task + impact_predict = get_impact(task_type, folds_seq = 1L, train_sets = list(row_seq), dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero)[[1L]] + + if (folds > 1L) { + # cross-method + rcv = ResamplingCV$new() + rcv$param_set$values$folds = folds + rcv$instantiate(task) + + train_sets = map(folds_seq, function(fold) match(rcv$train_set(fold), row_ids)) + test_sets = map(folds_seq, .f = function(fold) match(rcv$test_set(fold), row_ids)) + + impact_cv = get_impact(task_type, folds_seq = folds_seq, train_sets = train_sets, dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero) - # different funs depending on task.type - list(impact = switch(task_type, - classif = sapply(dt, function(col) - sapply(levels(target), function(tl) { - tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - map_dbl(c(setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), - function(cl) { - if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) - condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / - (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - }) - }), simplify = FALSE), - regr = { - meanimp = mean(target) - sapply(dt, function(col) - t(t(c(sapply(levels(col), function(lvl) { - (sum(target[col == lvl], na.rm = TRUE) + smoothing 
* meanimp) / - (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp - }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))), simplify = FALSE) - })) + } else { + # no cross-method + test_sets = list(row_seq) + + impact_cv = list(impact_predict) + } + + self$state = list(train_task_hash = task$hash, rsmp_cv_instance = if (folds > 1L) rcv$instance else data.table(), impact_predict = impact_predict, impact_cv = impact_cv, dt_columns = dt_columns) + + # cross-method (folds > 1) will encode test_set of fold i using the impact encoding trained on train_set of fold i + dt = imap(dt, .f = function(curdat, idx) { + fold_dt = map(folds_seq, .f = function(fold) { + impact_test = self$state$impact_cv[[fold]] + test_set = test_sets[[fold]] + curdat = as.character(curdat[test_set]) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(impact_test[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names. + # otherwise we want the naming scheme . + impact_test[[idx]][match(curdat, rownames(impact_test[[idx]])), , drop = is.null(colnames(impact_test[[idx]]))] + }) + switch(task_type, + classif = do.call(rbind, fold_dt), + regr = unlist(fold_dt) + ) + }) + + dt = as.data.table(dt) + dt = dt[match(row_seq, unlist(test_sets)), ] # row ids have to be reordered because of resampling + task$select(setdiff(task$feature_names, cols))$cbind(dt) }, - .transform_dt = function(dt, levels) { - impact = self$state$impact - imap(dt, function(curdat, idx) { + .predict_task = function(task) { + cols = self$state$dt_columns + if (!length(cols)) { + return(task) + } + dt = task$data(cols = cols) + + # impact encoding for the prediction task always relies on the encoding of the whole training task + impact = self$state$impact_predict + dt = imap(dt, function(curdat, idx) { curdat = as.character(curdat) curdat[is.na(curdat)] = ".TEMP.MISSING" curdat[curdat %nin% rownames(impact[[idx]])] = ".TEMP.MISSING" @@ -127,8 +186,53 @@ PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", # otherwise we want the naming scheme . 
impact[[idx]][match(curdat, rownames(impact[[idx]])), , drop = is.null(colnames(impact[[idx]]))] }) + + dt = as.data.table(dt) + task$select(setdiff(task$feature_names, cols))$cbind(dt) } ) ) mlr_pipeops$add("encodeimpact", PipeOpEncodeImpact) + +get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, impute_zero) { + switch(task_type, + classif = map(folds_seq, .f = function(fold) { + target_lvls = levels(target) + train_set = train_sets[[fold]] + dt_train = dt[train_set, ] + target_train = target[train_set] + + map(dt_train, .f = function(col) { + col_lvls = levels(col) + + do.call(cbind, stats::setNames(map(target_lvls, .f = function(tl) { + tprop = (sum(target_train == tl) + smoothing) / (length(target_train) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + + map_dbl(c(stats::setNames(col_lvls, nm = col_lvls), c(.TEMP.MISSING = NA)), .f = function(cl) { + if (!impute_zero && is.na(cl)) return(NA_real_) # early exit + condprob = (sum(target_train[is.na(cl) | (col == cl)] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | (col == cl), na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + }) + }), nm = target_lvls)) + }) + }), + regr = map(folds_seq, .f = function(fold) { + train_set = train_sets[[fold]] + dt_train = dt[train_set, ] + target_train = target[train_set] + + meanimp = mean(target_train) + + map(dt_train, .f = function(col) { + col_lvls = levels(col) + + as.matrix(c(stats::setNames(map_dbl(col_lvls, .f = function(lvl) { + (sum(target_train[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), nm = col_lvls), if (impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA))) + }) + }) + ) +} diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index 8e59a4161..5b1273573 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -5,18 +5,23 @@ \alias{PipeOpEncodeImpact} \title{Conditional Target Value Impact Encoding} \format{ -\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Encodes columns of type \code{factor}, \code{character} and \code{ordered}. +Encodes columns of type \code{factor}, and \code{ordered}. -Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each (factorial) column -to the difference between each target level's conditional log-likelihood -given this level, and the target level's global log-likelihood. +Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each +(factorial) column to the difference between each target level's conditional log-likelihood given +this level, and the target level's global log-likelihood. -Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) column -to the difference between the target's conditional mean given -this level, and the target's global mean. +Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) +column to the difference between the target's conditional mean given this level, and the target's +global mean. + +During training, the impact coding is done using a cross-method. 
This means that the training +\code{\link[mlr3:Task]{Task}} is split into several folds via \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} and for +each fold, impact coding is performed for each test set based on the respective training set. +This is helpful to prevent nested model bias. Treats new levels during prediction like missing values. } @@ -36,31 +41,42 @@ otherwise be set during construction. Default \code{list()}. Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, \code{character} or -\code{ordered} parameters encoded. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, or \code{ordered} parameters encoded. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{impact} :: a named \code{list}\cr +\item \code{train_task_hash} :: \code{character(1)}\cr +The hash (unique identifier) for the training \code{\link[mlr3:Task]{Task}}. +\item \code{rsmp_cv_instance} :: a \code{data.table}\cr +If \code{folds} is larger than one, the resampling instance of the \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} used during training. +\item \code{impact_predict} :: a named \code{list}\cr A list with an element for each affected feature:\cr -For regression each element is a single column matrix of impact values for each level of that feature.\cr -For classification, it is a list with an element for each \emph{feature level}, which is a vector giving the impact of -this feature level on each \emph{outcome level}. +For regression, each element is a single column matrix of impact values for each level of that feature.\cr +For classification, this is a list with an element for each \emph{feature level}, which is a vector +giving the impact of this feature level on each \emph{outcome level}. +This list is used to encode impact of the prediction \code{\link[mlr3:Task]{Task}}. +\item \code{impact_cv} :: a \code{list} of named \code{lists}\cr +A list of length \code{folds} with each element holding a list like \code{impact_predict} above. +These lists are used to encode impact of the training \code{\link[mlr3:Task]{Task}}. } } \section{Parameters}{ \itemize{ -\item \code{smoothing} :: \code{numeric(1)} \cr +\item \code{smoothing} :: \code{numeric(1)} \cr A finite positive value used for smoothing. Mostly relevant for \link[mlr3:TaskClassif]{classification Tasks} if a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). Initialized to \code{1e-4}. \item \code{impute_zero} :: \code{logical(1)}\cr -If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default \code{FALSE}. +If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default is \code{FALSE}. +\item \code{folds} :: \code{integer(1)}\cr +Number of folds used in the cross-method and passed to \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}}. Default is \code{3}. +If set to \code{1}, no cross-method will be applied during training, i.e., the whole training +\code{\link[mlr3:Task]{Task}} is used to encode impact during training. 
} } @@ -79,10 +95,10 @@ library("mlr3") poe = po("encodeimpact") task = TaskClassif$new("task", - data.table::data.table( - x = factor(c("a", "a", "a", "b", "b")), - y = factor(c("a", "a", "b", "b", "b"))), - "x") + backend = data.table::data.table( + x = factor(c("a", "a", "b", "b", "b")), + y = factor(c("a", "a", "a", "b", "b"))), + target = "y") poe$train(list(task))[[1]]$data() diff --git a/tests/testthat/test_pipeop_encodeimpact.R b/tests/testthat/test_pipeop_encodeimpact.R index c3a7000db..0c23367ac 100644 --- a/tests/testthat/test_pipeop_encodeimpact.R +++ b/tests/testthat/test_pipeop_encodeimpact.R @@ -8,13 +8,25 @@ test_that("PipeOpEncodeImpact", { t2 = po("histbin")$train(list(tsk("iris")))[[1]] - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = task) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = t2) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = mlr_tasks$get("iris")) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = task) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = t2) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 1L)), task = mlr_tasks$get("iris")) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = task, + predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = t2, + predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, + constargs = list(param_vals = list(folds = 2L)), task = mlr_tasks$get("iris"), + predict_like_train = FALSE, deterministic_train = FALSE) op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 expect_pipeop(op) nt = train_pipeop(op, inputs = list(task))[[1L]] @@ -29,10 +41,8 @@ test_that("PipeOpEncodeImpact", { # factor cols are removed expect_true(all(tsk("iris")$feature_names %nin% fn)) expect_true("factor" %nin% nt$feature_types$type) - }) - test_that("PipeOpImpactEncode on Classification", { testdf = data.frame( @@ -43,6 +53,7 @@ test_that("PipeOpImpactEncode on Classification", { testtask = TaskClassif$new("test", testdf, "t") op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 expect_equal(op$train(list(tsk("iris")))[[1]], tsk("iris")) @@ -57,15 +68,16 @@ test_that("PipeOpImpactEncode on Classification", { op$train(list(testtask)) - expect_equal(op$state$impact$a, expm) + expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$a, expm) op$param_set$values$smoothing = 1e-4 op$train(list(testtask)) - expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE), 0.5e-4) + expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE), 0.5e-4) op$param_set$values$smoothing = 1e-8 op$train(list(testtask)) - expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE) * 1e4, 0.5e-4) + expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE) * 1e4, 0.5e-4) op$param_set$values$smoothing = 6.362e-9 # similar to what glm uses encoded = op$train(list(testtask))[[1]]$data() @@ -77,11 +89,12 @@ test_that("PipeOpImpactEncode on Classification", { expm2 = rbind(expm2, c(NA, NA)) rownames(expm2) = c("a", "b", ".TEMP.MISSING") - expect_equal(op$state$impact$b, expm2, tolerance = 1e-5) + 
expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$b, expm2, tolerance = 1e-5) expect_equal(encoded, - data.table(t = testdf$t, a = op$state$impact$a[testdf$a, ], - b = op$state$impact$b[testdf$b, ])) + data.table(t = testdf$t, a = op$state$impact_predict$a[testdf$a, ], + b = op$state$impact_predict$b[testdf$b, ])) # test NA handling / imputation @@ -100,7 +113,6 @@ test_that("PipeOpImpactEncode on Classification", { encoded = op$train(list(testtask2))[[1]]$data() expect_equal(as.numeric(as.matrix(encoded)[c(11, 17, 24, 30)]), c(0, 0, 0, 0)) # imputation by 0 - }) test_that("PipeOpImpactEncode on Regression", { @@ -118,13 +130,16 @@ test_that("PipeOpImpactEncode on Regression", { t = c(1, 2, 3, 1, 2, 3)) op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 op$param_set$values$smoothing = 0 expect_equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE) + expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method + expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method - expect_equal(op$state$impact$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA)))) - expect_equal(op$state$impact$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA)))) + expect_equal(op$state$impact_predict$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA)))) + expect_equal(op$state$impact_predict$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA)))) op$param_set$values$smoothing = 1e-4 expect_false(isTRUE(all.equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE, tolerance = 1e-5))) @@ -165,12 +180,12 @@ test_that("PipeOpImpactEncode on Regression", { encoded = op$train(list(testtask2))[[1]]$data() expect_equal(which(is.na(encoded)), c(11, 18)) - }) test_that("PipeOpImpactEncode factor level ``", { op = PipeOpEncodeImpact$new() + op$param_set$values$folds = 1 testdf3 = iris levels(testdf3$Species) = c("setosa", "versicolor", "") @@ -181,5 +196,69 @@ test_that("PipeOpImpactEncode factor level ``", { train_out3ref = op$train(list(testtask3ref))[[1L]] expect_equal(train_out3$data(), train_out3ref$data()) +}) + +test_that("PipeOpImpactEncode cross-method on Classification", { + # FIXME: could also add some more technical tests + library(mlr3learners) + set.seed(2409) + n = 300L + x = as.factor(rep(c("x1", "x2"), each = n / 2L)) + y = as.factor(c(sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.9, 0.1)), sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.1, 0.9)))) + z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) + dat = data.table(y = y, x = x, z = z) + + task = TaskClassif$new("test", backend = dat, target = "y") + + learner = lrn("classif.log_reg", id = "l") # baseline + graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method + graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method + + # check if nested resampling for the cross-method would work + train = sample(task$row_ids, size = 200L) + test = setdiff(task$row_ids, train) + + learner$train(task, row_ids = train) + graphlearner1$train(task, row_ids = train) + graphlearner2$train(task, row_ids = train) + + ce = c(suppressWarnings(learner$predict(task, row_ids = test)$score(msr("classif.ce"))), + suppressWarnings(graphlearner1$predict(task, row_ids = test)$score(msr("classif.ce"))), + suppressWarnings(graphlearner2$predict(task, row_ids 
= test)$score(msr("classif.ce")))) + expect_true(all(exp(diff(log(ce))) - 1 < 0.1)) # ratios of mean ce's should be around 1 +}) + +test_that("PipeOpImpactEncode cross-method on Regression", { + # FIXME: could also add some more technical tests + + library(mlr3learners) + set.seed(2409) + n = 300L + x = as.factor(rep(c("x1", "x2"), each = n / 2L)) # x1 ~ N(-5, 2), x2 ~ N(5, 2) + y = c(rnorm(n / 2L, mean = -5, sd = 2), rnorm(n / 2L, mean = 5, sd = 2)) + # aggregate(y ~ x, FUN = mean, data = dat) + # aggregate(y ~ x, FUN = sd, data = dat) + z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) # random + dat = data.table(y = y, x = x, z = z) + + task = TaskRegr$new("test", backend = dat, target = "y") + + learner = lrn("regr.lm", id = "l") # baseline + graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method + graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method + + # check if nested resampling for the cross-method would work + train = sample(task$row_ids, size = 200L) + test = setdiff(task$row_ids, train) + + learner$train(task, row_ids = train) + graphlearner1$train(task, row_ids = train) + graphlearner2$train(task, row_ids = train) + + mse = c(learner$predict(task, row_ids = test)$score(msr("regr.mse")), + graphlearner1$predict(task, row_ids = test)$score(msr("regr.mse")), + graphlearner2$predict(task, row_ids = test)$score(msr("regr.mse"))) + expect_true(all(exp(diff(log(mse))) - 1 < 0.1)) # ratios of mean mse's should be around 1 }) + From 84311b403fc68550db0b3528e82634823512a079 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 24 Aug 2020 21:00:59 +0200 Subject: [PATCH 2/7] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index c5945446f..6fb0e23a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # mlr3pipelines 0.2.1-9000 * NULL input channels accept any kind of input +* PipeOpEncodeImpact now allows for using a cross-method during training # mlr3pipelines 0.2.1 From c72f75cc3a244c84b13171a7ed9164ef9e01de00 Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 24 Sep 2020 14:44:57 +0200 Subject: [PATCH 3/7] test --- R/PipeOpEncodeImpact.R | 80 ++++++++++++++++++++++++++++++++++++++++++ R/zzz.R | 2 ++ 2 files changed, 82 insertions(+) diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R index a89e1c2b8..74fb89882 100644 --- a/R/PipeOpEncodeImpact.R +++ b/R/PipeOpEncodeImpact.R @@ -236,3 +236,83 @@ get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, i }) ) } + +LearnerEncodeImpact = R6Class("LearnerEncodeImpact", inherit = Learner) + +LearnerEncodeImpactClassif = R6Class("LearnerEncodeImpactClassif", inherit = LearnerEncodeImpact, + public = list( + initialize = function(id, param_set = ParamSet$new(), predict_types = "impact", feature_types = character(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "classif", param_set = param_set, feature_types = feature_types, + predict_types = predict_types, properties = properties, data_formats = data_formats, packages = packages, man = man) + } + ) +) + +LearnerEncodeImpactClassifSimple = R6Class("LearnerEncodeImpactClassifSimple", inherit = LearnerEncodeImpactClassif, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamUty$new("affect_columns", custom_check = check_function_or_null, default = selector_all(), tags = "train"), + 
ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.classif.simple", + feature_types = c("factor", "ordered"), + predict_types = "impact", + param_set = ps, + properties = c("twoclass", "multiclass", "missings"), + man = NA_character_ + ) + } + ), + + private = list( + .train = function(task) { + # FIXME: affect_columns + dt = task$data(cols = task$feature_names) + target = task$truth() + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + } + ) + }) + }, simplify = FALSE) + set_class(model, "encode.impact.classif.simple_model") + }, + + .predict = function(task) { + model = self$state$model + dt = task$data(cols = task$feature_names) + impact = imap(dt, function(curdat, idx) { + curdat = as.character(curdat) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names. + # otherwise we want the naming scheme . + model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] + }) + list(impact = impact) + + } + ) +) + +check_prediction_data.PredictionDataEncodeImpact = function(pdata) { + browser() + pdata +} + +as_prediction.PredictionDataEncodeImpact = function(x, check = TRUE) { + invoke(PredictionEncodeImpact$new, check = check, .args = x) +} diff --git a/R/zzz.R b/R/zzz.R index 1d3ce9691..e112c79d7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -14,6 +14,8 @@ register_mlr3 = function() { x$pipeops$valid_tags = unique(c(x$pipeops$valid_tags, c("abstract", "meta", "missings", "feature selection", "imbalanced data", "data transform", "target transform", "ensemble", "robustify", "learner", "encode", "multiplicity"))) + x$learner_predict_types$classif$impact = "impact" + x$learner_predict_types$regr$impact = "impact" } .onLoad = function(libname, pkgname) { # nocov start From be4a53ad2959d52b7fd5d2c6a16b6359a6c4e743 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:36:24 +0200 Subject: [PATCH 4/7] reset changes to PipeOpEncodeImpact.R and PipeOpLearnerCV.R, add new encoder classes --- R/ImpactEncoder.R | 215 ++++++++++++++++++++++++++++++ R/PipeOpEncodeImpact.R | 290 ++++++++--------------------------------- R/PipeOpLearnerCV.R | 4 +- 3 files changed, 270 insertions(+), 239 deletions(-) create mode 100644 R/ImpactEncoder.R diff --git a/R/ImpactEncoder.R b/R/ImpactEncoder.R new file mode 100644 index 000000000..9fccbac0c --- /dev/null +++ b/R/ImpactEncoder.R @@ -0,0 +1,215 @@ +ImpactEncoderClassif = R6Class("ImpactEncoderClassif", inherit = Learner, + public = list( + initialize = function(id, param_set = ParamSet$new(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "classif", param_set = param_set, predict_types = 
"impact", feature_types = c("factor", "ordered"), properties = properties, packages = packages, man = man) + } + ), + private = list( + .predict = function(task) { + impact = get_impact(task$data(cols = task$feature_names), model = self$state$model) + list(response = factor(rep_len(NA_character_, length.out = task$nrow), levels = task$levels(task$target_names)[[1L]]), impact = impact) + } + ) +) + +ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, + public = list( + initialize = function(id, param_set = ParamSet$new(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { + super$initialize(id = id, task_type = "regr", param_set = param_set, predict_types = "impact", feature_types = c("factor", "ordered"), properties = properties, packages = packages, man = man) + } + ), + private = list( + .predict = function(task) { + impact = get_impact(task$data(cols = task$feature_names), model = self$state$model) + list(response = rep_len(NA_real_, length.out = task$nrow), impact = impact) + } + ) +) + +ImpactEncoderClassifSimple = R6Class("ImpactEncoderClassifSimple", inherit = ImpactEncoderClassif, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.classif.simple", + param_set = ps, + properties = c("twoclass", "multiclass"), + man = "FIXME" + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + } + ) + }) + }, simplify = FALSE) + set_class(model, "encode.impact.classif.simple_model") + } + ) +) + +ImpactEncoderRegrSimple = R6Class("ImpactEncoderRegrSimple", inherit = ImpactEncoderRegr, + public = list( + initialize = function() { + ps = ParamSet$new(list( + ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) + )) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) + super$initialize( + id = "encode.impact.regr.simple", + param_set = ps, + man = "FIXME" + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + meanimp = mean(target) + smoothing = self$param_set$values$smoothing + model = sapply(dt, function(col) { + t(t(c(sapply(levels(col), function(lvl) { + (sum(target[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))) + }, simplify = FALSE) + set_class(model, "encode.impact.regr.simple_model") + } + ) +) + +ImpactEncoderClassifGlmm = R6Class("ImpactEncoderClassifGlmm", inherit = 
ImpactEncoderClassif, + public = list( + initialize = function() { + ps = ParamSet$new() # FIXME: + super$initialize( + id = "encode.impact.classif.glmm", + param_set = ps, + properties = c("twoclass", "multiclass"), + man = "FIXME" + # FIXME: properties missings? + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + lvls = levels(target) + model = if (length(lvls) > 2L) { + # binomial glmm + binary_target = sapply(levels(target), function(x) factor(identical(x, target)), simplify = FALSE) + sapply(dt, function(col) { + tmp = sapply(lvls, function(lvl) { + fitGlmer(binary_target[[lvl]], feature = col) + }, simplify = FALSE) + tmp = do.call(cbind, tmp) + colnames(tmp) = lvls + tmp + }, simplify = FALSE) + } else { + # one vs. rest binomial glmm + sapply(dt, function(col) { + tmp = fitGlmer(target, feature = col) + tmp = cbind(-tmp, tmp) # required for the other target level + colnames(tmp) = lvls + tmp + }, simplify = FALSE) + } + set_class(model, "encode.impact.classif.glmm_model") + } + ) +) + +ImpactEncoderRegrGlmm = R6Class("ImpactEncoderRegrGlmm", inherit = ImpactEncoderRegr, + public = list( + initialize = function() { + ps = ParamSet$new() # FIXME: + super$initialize( + id = "encode.impact.regr.glmm", + param_set = ps, + man = "FIXME" + # FIXME: properties missings? + ) + } + ), + private = list( + .train = function(task) { + dt = task$data(cols = task$feature_names) + target = task$truth() + model = sapply(dt, function(col) { + fitLmer(target, feature = col) + }, simplify = FALSE) + set_class(model, "encode.impact.regr.glmm_model") + } + ) +) + +get_impact = function(dt, model) { + imap(dt, function(curdat, idx) { + curdat = as.character(curdat) + curdat[is.na(curdat)] = ".TEMP.MISSING" + curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" + # we only want to "drop" if there are no column names + # otherwise we want the naming scheme . + model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] + }) +} + +# Regr helper function around lme4::lmer +# FIXME: params +fitLmer = function(target, feature) { + args = list(formula = y ~ 1 + (1 | x), + data = data.table(y = target, x = feature), + na.action = na.omit, + control = lme4::lmerControl(calc.derivs = FALSE) + ) + mod = invoke(lme4::lmer, .args = args) + coefs = stats::coef(mod)$x + lvls = rownames(coefs) + coefs = coefs[[1L]] + names(coefs) = lvls + intercept = unname(lme4::fixef(mod)) + coefs[is.na(coefs)] = intercept + coefs = c(coefs, .TEMP.MISSING = intercept) + t(t(coefs)) +} + +# Classif helper function around lme4::glmer +# FIXME: params +fitGlmer = function(target, feature) { + args = list(formula = y ~ 1 + (1 | x), + data = data.table(y = target, x = feature), + family = stats::binomial, + na.action = na.omit, + control = lme4::glmerControl(calc.derivs = FALSE) + ) + mod = invoke(lme4::glmer, .args = args) + coefs = stats::coef(mod)$x + lvls = rownames(coefs) + coefs = coefs[[1L]] + names(coefs) = lvls + intercept = unname(lme4::fixef(mod)) + coefs[is.na(coefs)] = intercept + coefs = c(coefs, .TEMP.MISSING = intercept) + t(t(coefs)) +} diff --git a/R/PipeOpEncodeImpact.R b/R/PipeOpEncodeImpact.R index 74fb89882..57820c5d9 100644 --- a/R/PipeOpEncodeImpact.R +++ b/R/PipeOpEncodeImpact.R @@ -2,23 +2,18 @@ #' #' @usage NULL #' @name mlr_pipeops_encodeimpact -#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Encodes columns of type `factor`, and `ordered`. +#' Encodes columns of type `factor`, `character` and `ordered`. #' -#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each -#' (factorial) column to the difference between each target level's conditional log-likelihood given -#' this level, and the target level's global log-likelihood. +#' Impact coding for [classification Tasks][mlr3::TaskClassif] converts factor levels of each (factorial) column +#' to the difference between each target level's conditional log-likelihood +#' given this level, and the target level's global log-likelihood. #' -#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) -#' column to the difference between the target's conditional mean given this level, and the target's -#' global mean. -#' -#' During training, the impact coding is done using a cross-method. This means that the training -#' [`Task`][mlr3::Task] is split into several folds via [`ResamplingCV`][mlr3::ResamplingCV] and for -#' each fold, impact coding is performed for each test set based on the respective training set. -#' This is helpful to prevent nested model bias. +#' Impact coding for [regression Tasks][mlr3::TaskRegr] converts factor levels of each (factorial) column +#' to the difference between the target's conditional mean given +#' this level, and the target's global mean. #' #' Treats new levels during prediction like missing values. #' @@ -36,35 +31,24 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, or `ordered` parameters encoded. +#' The output is the input [`Task`][mlr3::Task] with all affected `factor`, `character` or +#' `ordered` parameters encoded. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: -#' * `train_task_hash` :: `character(1)`\cr -#' The hash (unique identifier) for the training [`Task`][mlr3::Task]. -#' * `rsmp_cv_instance` :: a `data.table`\cr -#' If `folds` is larger than one, the resampling instance of the [`ResamplingCV`][mlr3::ResamplingCV] used during training. -#' * `impact_predict` :: a named `list`\cr +#' * `impact` :: a named `list`\cr #' A list with an element for each affected feature:\cr -#' For regression, each element is a single column matrix of impact values for each level of that feature.\cr -#' For classification, this is a list with an element for each *feature level*, which is a vector -#' giving the impact of this feature level on each *outcome level*. -#' This list is used to encode impact of the prediction [`Task`][mlr3::Task]. -#' * `impact_cv` :: a `list` of named `lists`\cr -#' A list of length `folds` with each element holding a list like `impact_predict` above. -#' These lists are used to encode impact of the training [`Task`][mlr3::Task]. +#' For regression each element is a single column matrix of impact values for each level of that feature.\cr +#' For classification, it is a list with an element for each *feature level*, which is a vector giving the impact of +#' this feature level on each *outcome level*. 
#' #' @section Parameters: -#' * `smoothing` :: `numeric(1)` \cr +#' * `smoothing` :: `numeric(1)` \cr #' A finite positive value used for smoothing. Mostly relevant for [classification Tasks][mlr3::TaskClassif] if #' a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). #' Initialized to `1e-4`. #' * `impute_zero` :: `logical(1)`\cr -#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default is `FALSE`. -#' * `folds` :: `integer(1)`\cr -#' Number of folds used in the cross-method and passed to [`ResamplingCV`][mlr3::ResamplingCV]. Default is `3`. -#' If set to `1`, no cross-method will be applied during training, i.e., the whole training -#' [`Task`][mlr3::Task] is used to encode impact during training. +#' If `TRUE`, impute missing values as impact 0; otherwise the respective impact is coded as `NA`. Default `FALSE`. #' #' @section Internals: #' Uses laplace smoothing, mostly to avoid infinite values for [classification Task][mlr3::TaskClassif]. @@ -77,10 +61,10 @@ #' poe = po("encodeimpact") #' #' task = TaskClassif$new("task", -#' backend = data.table::data.table( -#' x = factor(c("a", "a", "b", "b", "b")), -#' y = factor(c("a", "a", "a", "b", "b"))), -#' target = "y") +#' data.table::data.table( +#' x = factor(c("a", "a", "a", "b", "b")), +#' y = factor(c("a", "a", "b", "b", "b"))), +#' "x") #' #' poe$train(list(task))[[1]]$data() #' @@ -89,96 +73,53 @@ #' @include PipeOpTaskPreproc.R #' @export PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", - inherit = PipeOpTaskPreproc, + inherit = PipeOpTaskPreprocSimple, public = list( initialize = function(id = "encodeimpact", param_vals = list()) { ps = ParamSet$new(params = list( - ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required"), default = 1e-4), - ParamLgl$new("impute_zero", tags = c("train", "required"), default = FALSE), - ParamInt$new("folds", lower = 1L, tags = c("train", "required"), default = 3L) + ParamDbl$new("smoothing", 0, Inf, tags = c("train", "required")), + ParamLgl$new("impute_zero", tags = c("train", "required")) )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE, folds = 3L) + ps$values = list(smoothing = 1e-4, impute_zero = FALSE) super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("factor", "ordered")) } ), private = list( - .train_task = function(task) { - dt_columns = private$.select_cols(task) - cols = dt_columns - if (!length(cols)) { - self$state = list(dt_columns = dt_columns) - return(task) # early exit - } - dt = task$data(cols = cols) - target = task$truth() + .get_state_dt = function(dt, levels, target) { + task_type = if (is.numeric(target)) "regr" else "classif" + state = list() - task_type = task$task_type - row_ids = task$row_ids - row_seq = seq_len(task$nrow) smoothing = self$param_set$values$smoothing - impute_zero = self$param_set$values$impute_zero - folds = self$param_set$values$folds - folds_seq = seq_len(folds) - - # note that matching the row_ids below is necessary because of the resampling - - # impact encoding for the prediction task - impact_predict = get_impact(task_type, folds_seq = 1L, train_sets = list(row_seq), dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero)[[1L]] - - if (folds > 1L) { - # cross-method - rcv = ResamplingCV$new() - rcv$param_set$values$folds = folds - rcv$instantiate(task) - - train_sets = map(folds_seq, function(fold) match(rcv$train_set(fold), row_ids)) - 
test_sets = map(folds_seq, .f = function(fold) match(rcv$test_set(fold), row_ids)) - impact_cv = get_impact(task_type, folds_seq = folds_seq, train_sets = train_sets, dt = dt, target = target, smoothing = smoothing, impute_zero = impute_zero) - - } else { - # no cross-method - test_sets = list(row_seq) - - impact_cv = list(impact_predict) - } - - self$state = list(train_task_hash = task$hash, rsmp_cv_instance = if (folds > 1L) rcv$instance else data.table(), impact_predict = impact_predict, impact_cv = impact_cv, dt_columns = dt_columns) - - # cross-method (folds > 1) will encode test_set of fold i using the impact encoding trained on train_set of fold i - dt = imap(dt, .f = function(curdat, idx) { - fold_dt = map(folds_seq, .f = function(fold) { - impact_test = self$state$impact_cv[[fold]] - test_set = test_sets[[fold]] - curdat = as.character(curdat[test_set]) - curdat[is.na(curdat)] = ".TEMP.MISSING" - curdat[curdat %nin% rownames(impact_test[[idx]])] = ".TEMP.MISSING" - # we only want to "drop" if there are no column names. - # otherwise we want the naming scheme . - impact_test[[idx]][match(curdat, rownames(impact_test[[idx]])), , drop = is.null(colnames(impact_test[[idx]]))] - }) - switch(task_type, - classif = do.call(rbind, fold_dt), - regr = unlist(fold_dt) - ) - }) - - dt = as.data.table(dt) - dt = dt[match(row_seq, unlist(test_sets)), ] # row ids have to be reordered because of resampling - task$select(setdiff(task$feature_names, cols))$cbind(dt) + # different funs depending on task.type + list(impact = switch(task_type, + classif = sapply(dt, function(col) + sapply(levels(target), function(tl) { + tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) + tplogit = log(tprop / (1 - tprop)) + map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), + function(cl) { + if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) + condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / + (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) + cplogit = log(condprob / (1 - condprob)) + cplogit - tplogit + }) + }), simplify = FALSE), + regr = { + meanimp = mean(target) + sapply(dt, function(col) + t(t(c(sapply(levels(col), function(lvl) { + (sum(target[col == lvl], na.rm = TRUE) + smoothing * meanimp) / + (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp + }), if (self$param_set$values$impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA)))), simplify = FALSE) + })) }, - .predict_task = function(task) { - cols = self$state$dt_columns - if (!length(cols)) { - return(task) - } - dt = task$data(cols = cols) - - # impact encoding for the prediction task always relies on the encoding of the whole training task - impact = self$state$impact_predict - dt = imap(dt, function(curdat, idx) { + .transform_dt = function(dt, levels) { + impact = self$state$impact + imap(dt, function(curdat, idx) { curdat = as.character(curdat) curdat[is.na(curdat)] = ".TEMP.MISSING" curdat[curdat %nin% rownames(impact[[idx]])] = ".TEMP.MISSING" @@ -186,133 +127,8 @@ PipeOpEncodeImpact = R6Class("PipeOpEncodeImpact", # otherwise we want the naming scheme . 
impact[[idx]][match(curdat, rownames(impact[[idx]])), , drop = is.null(colnames(impact[[idx]]))] }) - - dt = as.data.table(dt) - task$select(setdiff(task$feature_names, cols))$cbind(dt) } ) ) mlr_pipeops$add("encodeimpact", PipeOpEncodeImpact) - -get_impact = function(task_type, folds_seq, train_sets, dt, target, smoothing, impute_zero) { - switch(task_type, - classif = map(folds_seq, .f = function(fold) { - target_lvls = levels(target) - train_set = train_sets[[fold]] - dt_train = dt[train_set, ] - target_train = target[train_set] - - map(dt_train, .f = function(col) { - col_lvls = levels(col) - - do.call(cbind, stats::setNames(map(target_lvls, .f = function(tl) { - tprop = (sum(target_train == tl) + smoothing) / (length(target_train) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - - map_dbl(c(stats::setNames(col_lvls, nm = col_lvls), c(.TEMP.MISSING = NA)), .f = function(cl) { - if (!impute_zero && is.na(cl)) return(NA_real_) # early exit - condprob = (sum(target_train[is.na(cl) | (col == cl)] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | (col == cl), na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - }) - }), nm = target_lvls)) - }) - }), - regr = map(folds_seq, .f = function(fold) { - train_set = train_sets[[fold]] - dt_train = dt[train_set, ] - target_train = target[train_set] - - meanimp = mean(target_train) - - map(dt_train, .f = function(col) { - col_lvls = levels(col) - - as.matrix(c(stats::setNames(map_dbl(col_lvls, .f = function(lvl) { - (sum(target_train[col == lvl], na.rm = TRUE) + smoothing * meanimp) / (sum(col == lvl, na.rm = TRUE) + smoothing) - meanimp - }), nm = col_lvls), if (impute_zero) c(.TEMP.MISSING = 0) else c(.TEMP.MISSING = NA))) - }) - }) - ) -} - -LearnerEncodeImpact = R6Class("LearnerEncodeImpact", inherit = Learner) - -LearnerEncodeImpactClassif = R6Class("LearnerEncodeImpactClassif", inherit = LearnerEncodeImpact, - public = list( - initialize = function(id, param_set = ParamSet$new(), predict_types = "impact", feature_types = character(), properties = character(), data_formats = "data.table", packages = character(), man = NA_character_) { - super$initialize(id = id, task_type = "classif", param_set = param_set, feature_types = feature_types, - predict_types = predict_types, properties = properties, data_formats = data_formats, packages = packages, man = man) - } - ) -) - -LearnerEncodeImpactClassifSimple = R6Class("LearnerEncodeImpactClassifSimple", inherit = LearnerEncodeImpactClassif, - public = list( - initialize = function() { - ps = ParamSet$new(list( - ParamUty$new("affect_columns", custom_check = check_function_or_null, default = selector_all(), tags = "train"), - ParamDbl$new("smoothing", lower = 0, upper = Inf, tags = c("train", "required")), - ParamLgl$new("impute_zero", tags = c("train", "required")) - )) - ps$values = list(smoothing = 1e-4, impute_zero = FALSE) - super$initialize( - id = "encode.impact.classif.simple", - feature_types = c("factor", "ordered"), - predict_types = "impact", - param_set = ps, - properties = c("twoclass", "multiclass", "missings"), - man = NA_character_ - ) - } - ), - - private = list( - .train = function(task) { - # FIXME: affect_columns - dt = task$data(cols = task$feature_names) - target = task$truth() - smoothing = self$param_set$values$smoothing - model = sapply(dt, function(col) { - sapply(levels(target), function(tl) { - tprop = (sum(target == tl) + smoothing) / (length(target) + 2 * smoothing) - tplogit = log(tprop / (1 - tprop)) - 
map_dbl(c(stats::setNames(levels(col), levels(col)), c(.TEMP.MISSING = NA)), - function(cl) { - if (!self$param_set$values$impute_zero && is.na(cl)) return(NA_real_) - condprob = (sum(target[is.na(cl) | col == cl] == tl, na.rm = TRUE) + smoothing) / (sum(is.na(cl) | col == cl, na.rm = TRUE) + 2 * smoothing) - cplogit = log(condprob / (1 - condprob)) - cplogit - tplogit - } - ) - }) - }, simplify = FALSE) - set_class(model, "encode.impact.classif.simple_model") - }, - - .predict = function(task) { - model = self$state$model - dt = task$data(cols = task$feature_names) - impact = imap(dt, function(curdat, idx) { - curdat = as.character(curdat) - curdat[is.na(curdat)] = ".TEMP.MISSING" - curdat[curdat %nin% rownames(model[[idx]])] = ".TEMP.MISSING" - # we only want to "drop" if there are no column names. - # otherwise we want the naming scheme . - model[[idx]][match(curdat, rownames(model[[idx]])), , drop = is.null(colnames(model[[idx]]))] - }) - list(impact = impact) - - } - ) -) - -check_prediction_data.PredictionDataEncodeImpact = function(pdata) { - browser() - pdata -} - -as_prediction.PredictionDataEncodeImpact = function(x, check = TRUE) { - invoke(PredictionEncodeImpact$new, check = check, .args = x) -} diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index 9569a4a59..b8aba7bf6 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -173,8 +173,8 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", if (pv$method != "insample") { rdesc = mlr_resamplings$get(pv$method) if (pv$method == "cv") rdesc$param_set$values = list(folds = pv$folds) - res = resample(task, private$.learner, rdesc) - prds = rbindlist(lapply(map(res$data$prediction, "test"), as.data.table)) + rr = resample(task, private$.learner, rdesc) + prds = as.data.table(rr$prediction(predict_sets = "test")) } else { prds = as.data.table(private$.learner$predict(task)) } From f01a2c05049c06e59745ea0fb718946e2c411e09 Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:44:50 +0200 Subject: [PATCH 5/7] add some todos --- R/ImpactEncoder.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/ImpactEncoder.R b/R/ImpactEncoder.R index 9fccbac0c..6eee35784 100644 --- a/R/ImpactEncoder.R +++ b/R/ImpactEncoder.R @@ -10,6 +10,7 @@ ImpactEncoderClassif = R6Class("ImpactEncoderClassif", inherit = Learner, list(response = factor(rep_len(NA_character_, length.out = task$nrow), levels = task$levels(task$target_names)[[1L]]), impact = impact) } ) + # FIXME: check for the structure of the model saved during train ) ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, @@ -24,6 +25,7 @@ ImpactEncoderRegr = R6Class("ImpactEncoderRegr", inherit = Learner, list(response = rep_len(NA_real_, length.out = task$nrow), impact = impact) } ) + # FIXME: check for the structure of the model saved during train ) ImpactEncoderClassifSimple = R6Class("ImpactEncoderClassifSimple", inherit = ImpactEncoderClassif, From 0e49c688dd0ea1c1a729dcbb74a24c12a21af0af Mon Sep 17 00:00:00 2001 From: sumny Date: Mon, 28 Sep 2020 19:49:08 +0200 Subject: [PATCH 6/7] revert some earlier changes --- man/mlr_pipeops_encodeimpact.Rd | 56 ++++------- tests/testthat/test_pipeop_encodeimpact.R | 113 ++++------------------ 2 files changed, 37 insertions(+), 132 deletions(-) diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index e5c266f09..45907d0d2 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -5,23 +5,18 @@ \alias{PipeOpEncodeImpact} \title{Conditional Target Value Impact Encoding} 
\format{ -\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Encodes columns of type \code{factor}, and \code{ordered}. +Encodes columns of type \code{factor}, \code{character} and \code{ordered}. -Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each -(factorial) column to the difference between each target level's conditional log-likelihood given -this level, and the target level's global log-likelihood. +Impact coding for \link[mlr3:TaskClassif]{classification Tasks} converts factor levels of each (factorial) column +to the difference between each target level's conditional log-likelihood +given this level, and the target level's global log-likelihood. -Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) -column to the difference between the target's conditional mean given this level, and the target's -global mean. - -During training, the impact coding is done using a cross-method. This means that the training -\code{\link[mlr3:Task]{Task}} is split into several folds via \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} and for -each fold, impact coding is performed for each test set based on the respective training set. -This is helpful to prevent nested model bias. +Impact coding for \link[mlr3:TaskRegr]{regression Tasks} converts factor levels of each (factorial) column +to the difference between the target's conditional mean given +this level, and the target's global mean. Treats new levels during prediction like missing values. } @@ -41,42 +36,31 @@ otherwise be set during construction. Default \code{list()}. Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. -The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, or \code{ordered} parameters encoded. +The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor}, \code{character} or +\code{ordered} parameters encoded. } \section{State}{ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{train_task_hash} :: \code{character(1)}\cr -The hash (unique identifier) for the training \code{\link[mlr3:Task]{Task}}. -\item \code{rsmp_cv_instance} :: a \code{data.table}\cr -If \code{folds} is larger than one, the resampling instance of the \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}} used during training. -\item \code{impact_predict} :: a named \code{list}\cr +\item \code{impact} :: a named \code{list}\cr A list with an element for each affected feature:\cr -For regression, each element is a single column matrix of impact values for each level of that feature.\cr -For classification, this is a list with an element for each \emph{feature level}, which is a vector -giving the impact of this feature level on each \emph{outcome level}. -This list is used to encode impact of the prediction \code{\link[mlr3:Task]{Task}}. -\item \code{impact_cv} :: a \code{list} of named \code{lists}\cr -A list of length \code{folds} with each element holding a list like \code{impact_predict} above. -These lists are used to encode impact of the training \code{\link[mlr3:Task]{Task}}. 
+For regression each element is a single column matrix of impact values for each level of that feature.\cr +For classification, it is a list with an element for each \emph{feature level}, which is a vector giving the impact of +this feature level on each \emph{outcome level}. } } \section{Parameters}{ \itemize{ -\item \code{smoothing} :: \code{numeric(1)} \cr +\item \code{smoothing} :: \code{numeric(1)} \cr A finite positive value used for smoothing. Mostly relevant for \link[mlr3:TaskClassif]{classification Tasks} if a factor does not coincide with a target factor level (and would otherwise give an infinite logit value). Initialized to \code{1e-4}. \item \code{impute_zero} :: \code{logical(1)}\cr -If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default is \code{FALSE}. -\item \code{folds} :: \code{integer(1)}\cr -Number of folds used in the cross-method and passed to \code{\link[mlr3:mlr_resamplings_cv]{ResamplingCV}}. Default is \code{3}. -If set to \code{1}, no cross-method will be applied during training, i.e., the whole training -\code{\link[mlr3:Task]{Task}} is used to encode impact during training. +If \code{TRUE}, impute missing values as impact 0; otherwise the respective impact is coded as \code{NA}. Default \code{FALSE}. } } @@ -95,10 +79,10 @@ library("mlr3") poe = po("encodeimpact") task = TaskClassif$new("task", - backend = data.table::data.table( - x = factor(c("a", "a", "b", "b", "b")), - y = factor(c("a", "a", "a", "b", "b"))), - target = "y") + data.table::data.table( + x = factor(c("a", "a", "a", "b", "b")), + y = factor(c("a", "a", "b", "b", "b"))), + "x") poe$train(list(task))[[1]]$data() diff --git a/tests/testthat/test_pipeop_encodeimpact.R b/tests/testthat/test_pipeop_encodeimpact.R index 0c23367ac..c3a7000db 100644 --- a/tests/testthat/test_pipeop_encodeimpact.R +++ b/tests/testthat/test_pipeop_encodeimpact.R @@ -8,25 +8,13 @@ test_that("PipeOpEncodeImpact", { t2 = po("histbin")$train(list(tsk("iris")))[[1]] - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = task) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = t2) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 1L)), task = mlr_tasks$get("iris")) - - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = task, - predict_like_train = FALSE, deterministic_train = FALSE) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = t2, - predict_like_train = FALSE, deterministic_train = FALSE) - expect_datapreproc_pipeop_class(PipeOpEncodeImpact, - constargs = list(param_vals = list(folds = 2L)), task = mlr_tasks$get("iris"), - predict_like_train = FALSE, deterministic_train = FALSE) + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = task) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = t2) + + expect_datapreproc_pipeop_class(PipeOpEncodeImpact, task = mlr_tasks$get("iris")) op = PipeOpEncodeImpact$new() - op$param_set$values$folds = 1 expect_pipeop(op) nt = train_pipeop(op, inputs = list(task))[[1L]] @@ -41,8 +29,10 @@ test_that("PipeOpEncodeImpact", { # factor cols are removed expect_true(all(tsk("iris")$feature_names %nin% fn)) expect_true("factor" %nin% nt$feature_types$type) + }) + test_that("PipeOpImpactEncode on Classification", { testdf = data.frame( 
@@ -53,7 +43,6 @@ test_that("PipeOpImpactEncode on Classification", {
   testtask = TaskClassif$new("test", testdf, "t")

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1

   expect_equal(op$train(list(tsk("iris")))[[1]], tsk("iris"))

@@ -68,16 +57,15 @@ test_that("PipeOpImpactEncode on Classification", {

   op$train(list(testtask))

-  expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$a, expm)
+  expect_equal(op$state$impact$a, expm)

   op$param_set$values$smoothing = 1e-4
   op$train(list(testtask))
-  expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE), 0.5e-4)
+  expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE), 0.5e-4)

   op$param_set$values$smoothing = 1e-8
   op$train(list(testtask))
-  expect_equal(mean(abs(op$state$impact_predict$a - expm), na.rm = TRUE) * 1e4, 0.5e-4)
+  expect_equal(mean(abs(op$state$impact$a - expm), na.rm = TRUE) * 1e4, 0.5e-4)

   op$param_set$values$smoothing = 6.362e-9 # similar to what glm uses
   encoded = op$train(list(testtask))[[1]]$data()
@@ -89,12 +77,11 @@ test_that("PipeOpImpactEncode on Classification", {
   expm2 = rbind(expm2, c(NA, NA))
   rownames(expm2) = c("a", "b", ".TEMP.MISSING")

-  expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$b, expm2, tolerance = 1e-5)
+  expect_equal(op$state$impact$b, expm2, tolerance = 1e-5)

   expect_equal(encoded,
-    data.table(t = testdf$t, a = op$state$impact_predict$a[testdf$a, ],
-      b = op$state$impact_predict$b[testdf$b, ]))
+    data.table(t = testdf$t, a = op$state$impact$a[testdf$a, ],
+      b = op$state$impact$b[testdf$b, ]))

   # test NA handling / imputation

@@ -113,6 +100,7 @@ test_that("PipeOpImpactEncode on Classification", {
   encoded = op$train(list(testtask2))[[1]]$data()

   expect_equal(as.numeric(as.matrix(encoded)[c(11, 17, 24, 30)]), c(0, 0, 0, 0)) # imputation by 0
+
 })

 test_that("PipeOpImpactEncode on Regression", {
@@ -130,16 +118,13 @@ test_that("PipeOpImpactEncode on Regression", {
     t = c(1, 2, 3, 1, 2, 3))

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1
   op$param_set$values$smoothing = 0

   expect_equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE)

-  expect_equal(op$state$impact_predict$a, op$state$impact_cv[[1]]$a) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$b, op$state$impact_cv[[1]]$b) # folds = 1, no cross-method
-  expect_equal(op$state$impact_predict$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA))))
-  expect_equal(op$state$impact_predict$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA))))
+  expect_equal(op$state$impact$a, t(t(c(a = 0, b = 0, .TEMP.MISSING = NA))))
+  expect_equal(op$state$impact$b, t(t(c(a = -1/4, b = 1/2, .TEMP.MISSING = NA))))

   op$param_set$values$smoothing = 1e-4
   expect_false(isTRUE(all.equal(op$train(list(testtask))[[1]]$data(), expect, ignore.col.order = TRUE, tolerance = 1e-5)))
@@ -180,12 +165,12 @@ test_that("PipeOpImpactEncode on Regression", {
   encoded = op$train(list(testtask2))[[1]]$data()

   expect_equal(which(is.na(encoded)), c(11, 18))
+
 })

 test_that("PipeOpImpactEncode factor level ``", {

   op = PipeOpEncodeImpact$new()
-  op$param_set$values$folds = 1

   testdf3 = iris
   levels(testdf3$Species) = c("setosa", "versicolor", "")
@@ -196,69 +181,5 @@ test_that("PipeOpImpactEncode factor level ``", {
   train_out3ref = op$train(list(testtask3ref))[[1L]]

   expect_equal(train_out3$data(), train_out3ref$data())
-})
-
-test_that("PipeOpImpactEncode cross-method on Classification", {
-  # FIXME: could also add some more technical tests
-  library(mlr3learners)
-  set.seed(2409)
-  n = 300L
-  x = as.factor(rep(c("x1", "x2"), each = n / 2L))
-  y = as.factor(c(sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.9, 0.1)), sample(c("y1", "y2"), size = n / 2L, replace = TRUE, prob = c(0.1, 0.9))))
-  z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE))
-  dat = data.table(y = y, x = x, z = z)
-
-  task = TaskClassif$new("test", backend = dat, target = "y")
-
-  learner = lrn("classif.log_reg", id = "l") # baseline
-  graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method
-  graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method
-
-  # check if nested resampling for the cross-method would work
-  train = sample(task$row_ids, size = 200L)
-  test = setdiff(task$row_ids, train)
-
-  learner$train(task, row_ids = train)
-  graphlearner1$train(task, row_ids = train)
-  graphlearner2$train(task, row_ids = train)
-
-  ce = c(suppressWarnings(learner$predict(task, row_ids = test)$score(msr("classif.ce"))),
-    suppressWarnings(graphlearner1$predict(task, row_ids = test)$score(msr("classif.ce"))),
-    suppressWarnings(graphlearner2$predict(task, row_ids = test)$score(msr("classif.ce"))))
-  expect_true(all(exp(diff(log(ce))) - 1 < 0.1)) # ratios of mean ce's should be around 1
-})
-
-test_that("PipeOpImpactEncode cross-method on Regression", {
-  # FIXME: could also add some more technical tests
-
-  library(mlr3learners)
-  set.seed(2409)
-  n = 300L
-  x = as.factor(rep(c("x1", "x2"), each = n / 2L)) # x1 ~ N(-5, 2), x2 ~ N(5, 2)
-  y = c(rnorm(n / 2L, mean = -5, sd = 2), rnorm(n / 2L, mean = 5, sd = 2))
-  # aggregate(y ~ x, FUN = mean, data = dat)
-  # aggregate(y ~ x, FUN = sd, data = dat)
-  z = as.factor(sample(c("z1", "z2", "z3"), size = n, replace = TRUE)) # random
-  dat = data.table(y = y, x = x, z = z)
-
-  task = TaskRegr$new("test", backend = dat, target = "y")
-
-  learner = lrn("regr.lm", id = "l") # baseline
-  graphlearner1 = GraphLearner$new(po("encodeimpact", folds = 1L) %>>% learner, id = "gl1") # no cross-method
-  graphlearner2 = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% learner, id = "gl2") # cross-method
-
-  # check if nested resampling for the cross-method would work
-  train = sample(task$row_ids, size = 200L)
-  test = setdiff(task$row_ids, train)
-
-  learner$train(task, row_ids = train)
-  graphlearner1$train(task, row_ids = train)
-  graphlearner2$train(task, row_ids = train)
-
-  mse = c(learner$predict(task, row_ids = test)$score(msr("regr.mse")),
-    graphlearner1$predict(task, row_ids = test)$score(msr("regr.mse")),
-    graphlearner2$predict(task, row_ids = test)$score(msr("regr.mse")))
-  expect_true(all(exp(diff(log(mse))) - 1 < 0.1)) # ratios of mean mse's should be around 1
 })
-

From 2489e605427994ebb2ea0ef95e8de099eb1f66bc Mon Sep 17 00:00:00 2001
From: sumny
Date: Thu, 1 Oct 2020 18:07:09 +0200
Subject: [PATCH 7/7] drop response if predict_type = "impact"

---
 R/PipeOpLearnerCV.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R
index b8aba7bf6..40db58376 100644
--- a/R/PipeOpLearnerCV.R
+++ b/R/PipeOpLearnerCV.R
@@ -191,7 +191,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV",
     pred_to_task = function(prds, task) {
       if (!is.null(prds$truth)) prds[, truth := NULL]
-      if (!self$param_set$values$resampling.keep_response && self$learner$predict_type == "prob") {
+      if (!self$param_set$values$resampling.keep_response && self$learner$predict_type %in% c("impact", "prob")) {
         prds[, response := NULL]
       }
       renaming = setdiff(colnames(prds), "row_id")
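The cross-method tests removed in PATCH 6/7 show how the folds parameter added by this series is meant to be used end to end. Below is a minimal usage sketch on toy data; it assumes this branch (where po("encodeimpact") exposes folds, as in the removed tests) together with mlr3 and mlr3pipelines, and classif.rpart stands in for any downstream learner.

    library(mlr3)
    library(mlr3pipelines)
    library(data.table)

    set.seed(1)
    dat = data.table(
      y = factor(sample(c("y1", "y2"), size = 120L, replace = TRUE)),
      x = factor(sample(c("x1", "x2"), size = 120L, replace = TRUE)),
      z = factor(sample(c("z1", "z2", "z3"), size = 120L, replace = TRUE)))
    task = TaskClassif$new("toy", backend = dat, target = "y")

    # folds = 1L disables the cross-method; folds > 1L encodes each training
    # fold with impacts estimated on the remaining folds
    glrn = GraphLearner$new(po("encodeimpact", folds = 2L) %>>% lrn("classif.rpart"))
    glrn$train(task)
    glrn$predict(task)$score(msr("classif.ce"))

With folds = 1L the encoder behaves like the released version; with folds > 1L no training row is encoded with impacts fitted on itself, which is what keeps the downstream learner from overfitting to the encoding.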