hooks.R

              #' Add a hook to a tundraContainer.
#'
#' Hooks are useful for defining additional checks that should be
#' performed prior to and during training and prediction. For example,
#' one might want to issue a warning if the user is predicting on 
#' rows that were used for training, or a sanity check might be 
#' present prior to training to ensure a dependent variable is present.
#'
#' The following hooks are available.
#'
#' \enumerate{
#'   \item{train_pre_munge}{This hook runs during a call to the
#'     container's \code{train} method, just prior to invoking the
#'     \code{munge_procedure} to clean up the dataset. It could be
#'     useful for defining pre-conditions on the dataset to ensure
#'     it can be munged successfully.}
#'   \item{train_post_munge}{This hook runs during a call to the
#'     container's \code{train} method, just after invoking the
#'     \code{munge_procedure} to clean up the dataset. It could be
#'     useful for defining post-conditions on the dataset to ensure
#'     it was munged successfully.}
#'   \item{train_finalize}{This hook runs just after the \code{train}
#'     method calls the \code{train_function}. It could be used to
#'     verify presence or validate properties of the trained model.}
#'   \item{predict_pre_munge}{This hook runs during a call to the
#'     container's \code{predict} method, just prior to invoking the
#'     \code{munge_procedure} to clean up the dataset. It could be
#'     useful for defining pre-conditions on the dataset to ensure
#'     it can be munged successfully.}
#'   \item{predict_post_munge}{This hook runs during a call to the
#'     container's \code{predict} method, just after invoking the
#'     \code{munge_procedure} to clean up the dataset. It could be
#'     useful for defining post-conditions on the dataset to ensure
#'     it was munged successfully.}
#' }
#'
#' Each hook will be provided the \code{tundraContainer} as input
#' (unless it has no arguments, in which case it will simply be called).
#'
#' @name hooks
#' @param hook_name character. The hook to run. Must be one of the available
#'    hooks.
run_hooks <- function(hook_name) {
  for (hook in self$.hooks[[hook_name]]) {
    if (length(formals(hook)) > 0) {
      hook(self)
    } else {
      hook()
    }
  }
}

#' Add a hook to a tundraContainer.
#'
#' @param hook_function function. The hook to execute. It will be provided
#'    the \code{tundraContainer} as its only argument.
#' @rdname hooks
add_hook <- function(hook_name, hook_function) {
  stopifnot(is.simple_string(hook_name),
            is.function(hook_function))

  allowed_types <- c("train_pre_munge", "predict_pre_munge",
                     "train_post_munge", "predict_post_munge",
                     "train_finalize")
  hook_name <- match.arg(hook_name, allowed_types)

  self$.hooks[[hook_name]] <- c(self$.hooks[[hook_name]], hook_function)
}

package.tundra.R

              #' Tundra is a standardized classifier container format for R.
#'
#' Deploying models in production systems is generally a cumbersome process.
#' If analysis is performed in a language like R or SAS, the coefficients of the
#' model are usually extracted and translated to a "production-ready" language like
#' R or Java.
#'
#' However, this approach is flawed. The translation process is time consuming
#' and error-prone. R is demonstrably capable of serving models
#' in production environments as long as submillisecond latency is not a
#' requirement. This means it should be possible to push analysis performed in
#' R to directly score records in production systems without an intermediary.
#' This significantly decreases the cost of iterating on machine learning
#' models.
#'
#' A tundraContainer is a simple bundling of the two critical components of 
#' any machine learning model.
#'
#' \itemize{
#'   \item{The data preparation required to convert raw production data to
#'     a record that is acceptable to a trained classifier. For example,
#'     a regression-based model may need discretization of non-categorical
#'     variables or imputation of missing values.}
#'   \item{The trained classifier, usually a native R S3 object with
#'     a \code{train} method.}
#' }
#'
#' The former is provided by the \href{https://github.com/syberia/mungebits2}{mungebits2}
#' package, while the latter is fully customizable to any R function. This
#' approach allows arbitrary data preparation and statistical methods, unlike
#' attempts such as PMML (Predictive Modeling Markup Language) which constrain
#' the space of possible data preparation methodologies and statistical
#' methodologies to a very limited subset.
#'
#' @name tundra
#' @docType package
NULL

tundraContainer-initialize.R

              #' Initialize a tundraContainer object.
#'
#' @param keyword character. The name of the classifier; for example,
#'    "lm" or "knn".
#' @param train_function function. The function used to train the model.
#'    Its first argument will be a data.frame, and the second argument
#'    a list of additional parameters used for training the model.
#' @param predict_function function. The function used to predict
#'    on new datasets. Its first argument will be a data.frame,
#'    the dataset to predict on, and its second (optional)
#'    argument will be additional parameters used for prediction
#'    output (such as whether to return a probabilistic or absolute
#'    value).
#' @param munge_procedure list. A list of trained
#'    \code{\link[mungebits2]{mungepiece}}s to apply to data sets
#'    during prediction.
#' @param default_args list. A list of default arguments to provide to
#'    the second argument of the \code{train_function}. The additional
#'    arguments provided to the \code{tundraContainer}'s \code{train}
#'    method will be merged on top of these defaults.
#' @param internal list. Internal metadata that should accompany the
#'    model. Usually this is domain/organization specific, and can
#'    include things such as a list of primary keys used for training
#'    the model, identifiers or names of data sources used for
#'    training the model, etc. It is a playground entirely under
#'    your control, and can be used by other packages or a production
#'    server hosting the model to achieve additional behavior.
initialize <- function(keyword, train_function = identity,
                       predict_function = identity, munge_procedure = list(),
                       default_args = list(), internal = list()) {
  if (!(is.list(munge_procedure) || is(munge_procedure, "stageRunner"))) {
    stop("The ", sQuote("munge_procedure"), " parameter must be a list or ",
         "stageRunner object.")
  }

  self$.keyword          <- keyword
  self$.train_function   <- train_function
  self$.predict_function <- predict_function
  self$.munge_procedure  <- munge_procedure
  self$.default_args     <- default_args
  self$.internal         <- internal
  self$.input            <- list_to_env(list())
  lockEnvironment(self$.input)
  self$.output           <- list_to_env(list())
  self$.internal         <- list_to_env(list())
  self$.hooks            <- list()
}

tundraContainer-predict.R

              #' Predict on a dataset using a trained tundraContainer.
#'
#' @param dataframe data.frame. The dataset to generate predictions on
#'    with the trained model. The data will be preprocessed with the
#'    \code{tundraContainer}'s trained \code{munge_procedure} and
#'    then passed as the first argument to the \code{tundraContainer}'s
#'    \code{predict_function}.
#' @param predict_args list. A list of arguments to pass to the
#'    \code{tundraContainer}'s \code{predict_function} as its second argument.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#'    default the latter. If \code{TRUE}, then output produced by
#'    running the \code{munge_procedure} or the \code{predict_function}
#'    will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#'    default the former. If \code{TRUE}, the \code{munge_procedure}
#'    provided to the container during initialization will be used to
#'    preprocess the given \code{dataframe}.
#' @return The value returned by the \code{tundraContainer}'s
#'    \code{predict_function}, usually a numeric vector or
#'    \code{data.frame} of predictions.
predict <- function(dataframe, predict_args = list(), verbose = FALSE, munge = TRUE) {
  if (!isTRUE(self$.trained)) {
    stop("Tundra model ", sQuote(self$.keyword), " has not been trained yet.")
  }

  force(verbose)
  force(munge)
  force(predict_args)

  private$run_hooks("predict_pre_munge")
  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
    initial_nrow <- NROW(dataframe)
    dataframe <- mungebits2::munge(dataframe, self$.munge_procedure, verbose)
    if (NROW(dataframe) != initial_nrow) {
      warning("Some rows were removed during data preparation. ",
              "Predictions will not match input dataframe.")
    }
  }
  private$run_hooks("predict_post_munge")

  if (length(formals(self$.predict_function)) < 2 || missing(predict_args)) {
    args <- list(dataframe)
  } else {
    args <- list(dataframe, predict_args)
  }

  call_with(
    self$.predict_function,
    args,
    list(input = self$.input, output = self$.output)
  )
}

tundraContainer-train.R

              #' Train a model encapsulated within a tundraContainer.
#'
#' @param dataframe data.frame. The dataset to train the model on. This
#'    will be preprocessed with the \code{tundraContainer}'s 
#'    \code{munge_procedure} and then passed as the first argument to
#'    the \code{tundraContainer}'s \code{train_function}.
#' @param train_args list. A list of arguments to make available
#'    to the \code{tundraContainer}'s \code{train_function} through
#'    use of the \code{input} keyword. See the examples.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#'    default the latter. If \code{TRUE}, then output produced by
#'    running the \code{munge_procedure} or the \code{train_function}
#'    will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#'    default the former. If \code{FALSE}, the \code{munge_procedure}
#'    provided to the container during initialization will be assumed
#'    to have been trained, and the \code{dataframe} provided will not
#'    be run through it.
#' @return The value returned by the \code{tundraContainer}'s
#'    \code{train_function}. Since the \code{train_function} has side effects
#'    on the container as its primary purpose, this can usually be
#'    \code{invisible(NULL)}.
train <- function(dataframe, train_args = list(), verbose = FALSE, munge = TRUE) {
  if (isTRUE(self$.trained)) {
    stop("The tundra ", sQuote(self$.keyword), " model has already been trained.")
  }

  force(train_args)
  force(verbose)
  force(munge)

  private$run_hooks("train_pre_munge")
  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
    dataframe <- munge(dataframe, self$.munge_procedure, verbose)
    attr(dataframe, "mungepieces") <- NULL
  }
  private$run_hooks("train_post_munge")

  output <- call_with(
    self$.train_function,
    list(dataframe),
    list(
      input = list_to_env(list_merge(self$.default_args, train_args), self$.input),
      output = self$.output
    )
  )

  private$run_hooks("train_finalize")
  self$.trained <<- TRUE

  output
}

munge <- function(dataframe, munge_procedure, verbose) {
  if (isTRUE(verbose)) {
    capture.output(Recall(dataframe, munge_procedure, FALSE))
  } else {
    mungebits2::munge(dataframe, munge_procedure)
  }
}

tundraContainer.R

              #' A standard container format for classifiers developed in R.
#'
#' @docType class
#' @name tundraContainer
#' @export
tundraContainer <- R6::R6Class("tundraContainer",
  public = list(
    .keyword          = NULL,  # character
    .train_function   = NULL,  # function
    .predict_function = NULL,  # function
    .munge_procedure  = NULL,  # list of mungepieces
    .default_args     = NULL,  # list
    .trained          = FALSE, # logical
    .input            = NULL,  # environment
    .output           = NULL,  # environment
    .internal         = NULL,  # environment
    .hooks            = NULL,  # list

    initialize = initialize,
    train      = train,
    predict    = predict,
    add_hook   = add_hook,
    munge      = function(dataframe, steps = TRUE) {
      mungebits2::munge(dataframe, munge_procedure[steps])
    },
    show       = function() {
      cat(paste0("A tundraContainer of type ", sQuote(self$.keyword), "\n"))
      invisible(self)
    }
  ),
  private = list(
    run_hooks = run_hooks
  )
)

#' @export
tundra_container <- tundraContainer

#' @export
print.tundraContainer <- function(x, ...) { x$show() }

#' @export
summary.tundraContainer <- function(x, ...) { summary(x$.output$model, ...) }

utils.r

              `%||%` <- function(x, y) if (is.null(x)) y else x

list_to_env <- function(obj, parent = emptyenv()) {
  if (length(obj) == 0) {
    new.env(parent = parent)
  } else {
    list2env(obj, parent = parent)
  }
}

#' Evaluate a function while injecting some locals.
#'
#' Instead of modifying a closure's parent environment directly,
#' sometimes it may be desirable to do a one-time injection that
#' overrides what would normally be accessible through the closure.
#' \code{call_with} allows this by extending the usual \code{do.call}
#' to a third argument that is a list or environment temporarily
#' injected during the course of the call.
#'
#' @param fn function.
#' @param args list. The arguments to call the \code{fn} with.
#' @param with list or environment. Additional locals to make available
#'    during the call.
#' @return The result of calling \code{fn} with the injection provided
#'    by the \code{with} parameter.
#' @examples \dontrun{
#' fn <- local({ x <- 1; function(y) { x + y } })
#' stopifnot(fn(1) == 2)
#' stopifnot(call_with(fn, list(1), list(x = 2)) == 3)
#' }
call_with <- function(fn, args, with) {
  stopifnot(is.list(with) || is.environment(with))
  debugged <- isdebugged(fn)
  copy_fn <- fn
  if (debugged) debug(copy_fn)
  env <- with
  if (!is.environment(with)) {
    with <- list_to_env(with, parent = environment(copy_fn))
  }
  environment(fn) <- with
  do.call(fn, args)
}

#' Merge two lists and overwrite latter entries with former entries
#' if names are the same.
#'
#' For example, \code{list_merge(list(a = 1, b = 2), list(b = 3, c = 4))}
#' will be \code{list(a = 1, b = 3, c = 4)}.
#' @param list1 list
#' @param list2 list
#' @return the merged list.
#' @examples \dontrun{
#' stopifnot(identical(list_merge(list(a = 1, b = 2), list(b = 3, c = 4)),
#'                     list(a = 1, b = 3, c = 4)))
#' stopifnot(identical(list_merge(NULL, list(a = 1)), list(a = 1)))
#' }
list_merge <- function(list1, list2) {
  list1 <- list1 %||% list()
  # Pre-allocate memory to make this slightly faster.
  list1[Filter(function(x) nchar(x) > 0, names(list2) %||% c())] <- NULL
  for (i in seq_along(list2)) {
    name <- names(list2)[i]
    if (!identical(name, NULL) && !identical(name, "")) {
      list1[[name]] <- list2[[i]]
    } else {
      list1 <- append(list1, list(list2[[i]]))
    }
  }
  list1
}

is.simple_string <- function(obj) {
  is.character(obj) && length(obj) == 1 && !is.na(obj) && nzchar(obj)
}