#' Add a hook to a tundraContainer.
#'
#' Hooks are useful for defining additional checks that should be
#' performed prior to and during training and prediction. For example,
#' one might want to issue a warning if the user is predicting on
#' rows that were used for training, or a sanity check might be
#' present prior to training to ensure a dependent variable is present.
#'
#' The following hooks are available.
#'
#' \enumerate{
#' \item{train_pre_munge}{This hook runs during a call to the
#' container's \code{train} method, just prior to invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining pre-conditions on the dataset to ensure
#' it can be munged successfully.}
#' \item{train_post_munge}{This hook runs during a call to the
#' container's \code{train} method, just after invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining post-conditions on the dataset to ensure
#' it was munged successfully.}
#' \item{train_finalize}{This hook runs just after the \code{train}
#' method calls the \code{train_function}. It could be used to
#' verify presence or validate properties of the trained model.}
#' \item{predict_pre_munge}{This hook runs during a call to the
#' container's \code{predict} method, just prior to invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining pre-conditions on the dataset to ensure
#' it can be munged successfully.}
#' \item{predict_post_munge}{This hook runs during a call to the
#' container's \code{predict} method, just after invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining post-conditions on the dataset to ensure
#' it was munged successfully.}
#' }
#'
#' Each hook will be provided the \code{tundraContainer} as input
#' (unless it has no arguments, in which case it will simply be called).
#'
#' @name hooks
#' @param hook_name character. The hook to run. Must be one of the available
#' hooks.
run_hooks <- function(hook_name) {
for (hook in self$.hooks[[hook_name]]) {
if (length(formals(hook)) > 0) {
hook(self)
} else {
hook()
}
}
}
#' Add a hook to a tundraContainer.
#'
#' @param hook_function function. The hook to execute. It will be provided
#' the \code{tundraContainer} as its only argument.
#' @rdname hooks
add_hook <- function(hook_name, hook_function) {
stopifnot(is.simple_string(hook_name),
is.function(hook_function))
allowed_types <- c("train_pre_munge", "predict_pre_munge",
"train_post_munge", "predict_post_munge",
"train_finalize")
hook_name <- match.arg(hook_name, allowed_types)
self$.hooks[[hook_name]] <- c(self$.hooks[[hook_name]], hook_function)
}
#' Tundra is a standardized classifier container format for R.
#'
#' Deploying models in production systems is generally a cumbersome process.
#' If analysis is performed in a language like R or SAS, the coefficients of the
#' model are usually extracted and translated to a "production-ready" language like
#' R or Java.
#'
#' However, this approach is flawed. The translation process is time consuming
#' and error-prone. R is demonstrably capable of serving models
#' in production environments as long as submillisecond latency is not a
#' requirement. This means it should be possible to push analysis performed in
#' R to directly score records in production systems without an intermediary.
#' This significantly decreases the cost of iterating on machine learning
#' models.
#'
#' A tundraContainer is a simple bundling of the two critical components of
#' any machine learning model.
#'
#' \itemize{
#' \item{The data preparation required to convert raw production data to
#' a record that is acceptable to a trained classifier. For example,
#' a regression-based model may need discretization of non-categorical
#' variables or imputation of missing values.}
#' \item{The trained classifier, usually a native R S3 object with
#' a \code{train} method.}
#' }
#'
#' The former is provided by the \href{https://github.com/syberia/mungebits2}{mungebits2}
#' package, while the latter is fully customizable to any R function. This
#' approach allows arbitrary data preparation and statistical methods, unlike
#' attempts such as PMML (Predictive Modeling Markup Language) which constrain
#' the space of possible data preparation methodologies and statistical
#' methodologies to a very limited subset.
#'
#' @name tundra
#' @docType package
NULL
#' Initialize a tundraContainer object.
#'
#' @param keyword character. The name of the classifier; for example,
#' "lm" or "knn".
#' @param train_function function. The function used to train the model.
#' Its first argument will be a data.frame, and the second argument
#' a list of additional parameters used for training the model.
#' @param predict_function function. The function used to predict
#' on new datasets. Its first argument will be a data.frame,
#' the dataset to predict on, and its second (optional)
#' argument will be additional parameters used for prediction
#' output (such as whether to return a probabilistic or absolute
#' value).
#' @param munge_procedure list. A list of trained
#' \code{\link[mungebits2]{mungepiece}}s to apply to data sets
#' during prediction.
#' @param default_args list. A list of default arguments to provide to
#' the second argument of the \code{train_function}. The additional
#' arguments provided to the \code{tundraContainer}'s \code{train}
#' method will be merged on top of these defaults.
#' @param internal list. Internal metadata that should accompany the
#' model. Usually this is domain/organization specific, and can
#' include things such as a list of primary keys used for training
#' the model, identifiers or names of data sources used for
#' training the model, etc. It is a playground entirely under
#' your control, and can be used by other packages or a production
#' server hosting the model to achieve additional behavior.
initialize <- function(keyword, train_function = identity,
predict_function = identity, munge_procedure = list(),
default_args = list(), internal = list()) {
if (!(is.list(munge_procedure) || is(munge_procedure, "stageRunner"))) {
stop("The ", sQuote("munge_procedure"), " parameter must be a list or ",
"stageRunner object.")
}
self$.keyword <- keyword
self$.train_function <- train_function
self$.predict_function <- predict_function
self$.munge_procedure <- munge_procedure
self$.default_args <- default_args
self$.internal <- internal
self$.input <- list_to_env(list())
lockEnvironment(self$.input)
self$.output <- list_to_env(list())
self$.internal <- list_to_env(list())
self$.hooks <- list()
}
#' Predict on a dataset using a trained tundraContainer.
#'
#' @param dataframe data.frame. The dataset to generate predictions on
#' with the trained model. The data will be preprocessed with the
#' \code{tundraContainer}'s trained \code{munge_procedure} and
#' then passed as the first argument to the \code{tundraContainer}'s
#' \code{predict_function}.
#' @param predict_args list. A list of arguments to pass to the
#' \code{tundraContainer}'s \code{predict_function} as its second argument.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#' default the latter. If \code{TRUE}, then output produced by
#' running the \code{munge_procedure} or the \code{predict_function}
#' will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#' default the former. If \code{TRUE}, the \code{munge_procedure}
#' provided to the container during initialization will be used to
#' preprocess the given \code{dataframe}.
#' @return The value returned by the \code{tundraContainer}'s
#' \code{predict_function}, usually a numeric vector or
#' \code{data.frame} of predictions.
predict <- function(dataframe, predict_args = list(), verbose = FALSE, munge = TRUE) {
if (!isTRUE(self$.trained)) {
stop("Tundra model ", sQuote(self$.keyword), " has not been trained yet.")
}
force(verbose)
force(munge)
force(predict_args)
private$run_hooks("predict_pre_munge")
if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
initial_nrow <- NROW(dataframe)
dataframe <- mungebits2::munge(dataframe, self$.munge_procedure, verbose)
if (NROW(dataframe) != initial_nrow) {
warning("Some rows were removed during data preparation. ",
"Predictions will not match input dataframe.")
}
}
private$run_hooks("predict_post_munge")
if (length(formals(self$.predict_function)) < 2 || missing(predict_args)) {
args <- list(dataframe)
} else {
args <- list(dataframe, predict_args)
}
call_with(
self$.predict_function,
args,
list(input = self$.input, output = self$.output)
)
}
#' Train a model encapsulated within a tundraContainer.
#'
#' @param dataframe data.frame. The dataset to train the model on. This
#' will be preprocessed with the \code{tundraContainer}'s
#' \code{munge_procedure} and then passed as the first argument to
#' the \code{tundraContainer}'s \code{train_function}.
#' @param train_args list. A list of arguments to make available
#' to the \code{tundraContainer}'s \code{train_function} through
#' use of the \code{input} keyword. See the examples.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#' default the latter. If \code{TRUE}, then output produced by
#' running the \code{munge_procedure} or the \code{train_function}
#' will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#' default the former. If \code{FALSE}, the \code{munge_procedure}
#' provided to the container during initialization will be assumed
#' to have been trained, and the \code{dataframe} provided will not
#' be run through it.
#' @return The value returned by the \code{tundraContainer}'s
#' \code{train_function}. Since the \code{train_function} has side effects
#' on the container as its primary purpose, this can usually be
#' \code{invisible(NULL)}.
train <- function(dataframe, train_args = list(), verbose = FALSE, munge = TRUE) {
if (isTRUE(self$.trained)) {
stop("The tundra ", sQuote(self$.keyword), " model has already been trained.")
}
force(train_args)
force(verbose)
force(munge)
private$run_hooks("train_pre_munge")
if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
dataframe <- munge(dataframe, self$.munge_procedure, verbose)
attr(dataframe, "mungepieces") <- NULL
}
private$run_hooks("train_post_munge")
output <- call_with(
self$.train_function,
list(dataframe),
list(
input = list_to_env(list_merge(self$.default_args, train_args), self$.input),
output = self$.output
)
)
private$run_hooks("train_finalize")
self$.trained <<- TRUE
output
}
munge <- function(dataframe, munge_procedure, verbose) {
if (isTRUE(verbose)) {
capture.output(Recall(dataframe, munge_procedure, FALSE))
} else {
mungebits2::munge(dataframe, munge_procedure)
}
}
#' A standard container format for classifiers developed in R.
#'
#' @docType class
#' @name tundraContainer
#' @export
tundraContainer <- R6::R6Class("tundraContainer",
public = list(
.keyword = NULL, # character
.train_function = NULL, # function
.predict_function = NULL, # function
.munge_procedure = NULL, # list of mungepieces
.default_args = NULL, # list
.trained = FALSE, # logical
.input = NULL, # environment
.output = NULL, # environment
.internal = NULL, # environment
.hooks = NULL, # list
initialize = initialize,
train = train,
predict = predict,
add_hook = add_hook,
munge = function(dataframe, steps = TRUE) {
mungebits2::munge(dataframe, munge_procedure[steps])
},
show = function() {
cat(paste0("A tundraContainer of type ", sQuote(self$.keyword), "\n"))
invisible(self)
}
),
private = list(
run_hooks = run_hooks
)
)
#' @export
tundra_container <- tundraContainer
#' @export
print.tundraContainer <- function(x, ...) { x$show() }
#' @export
summary.tundraContainer <- function(x, ...) { summary(x$.output$model, ...) }
`%||%` <- function(x, y) if (is.null(x)) y else x
list_to_env <- function(obj, parent = emptyenv()) {
if (length(obj) == 0) {
new.env(parent = parent)
} else {
list2env(obj, parent = parent)
}
}
#' Evaluate a function while injecting some locals.
#'
#' Instead of modifying a closure's parent environment directly,
#' sometimes it may be desirable to do a one-time injection that
#' overrides what would normally be accessible through the closure.
#' \code{call_with} allows this by extending the usual \code{do.call}
#' to a third argument that is a list or environment temporarily
#' injected during the course of the call.
#'
#' @param fn function.
#' @param args list. The arguments to call the \code{fn} with.
#' @param with list or environment. Additional locals to make available
#' during the call.
#' @return The result of calling \code{fn} with the injection provided
#' by the \code{with} parameter.
#' @examples \dontrun{
#' fn <- local({ x <- 1; function(y) { x + y } })
#' stopifnot(fn(1) == 2)
#' stopifnot(call_with(fn, list(1), list(x = 2)) == 3)
#' }
call_with <- function(fn, args, with) {
stopifnot(is.list(with) || is.environment(with))
debugged <- isdebugged(fn)
copy_fn <- fn
if (debugged) debug(copy_fn)
env <- with
if (!is.environment(with)) {
with <- list_to_env(with, parent = environment(copy_fn))
}
environment(fn) <- with
do.call(fn, args)
}
#' Merge two lists and overwrite latter entries with former entries
#' if names are the same.
#'
#' For example, \code{list_merge(list(a = 1, b = 2), list(b = 3, c = 4))}
#' will be \code{list(a = 1, b = 3, c = 4)}.
#' @param list1 list
#' @param list2 list
#' @return the merged list.
#' @examples \dontrun{
#' stopifnot(identical(list_merge(list(a = 1, b = 2), list(b = 3, c = 4)),
#' list(a = 1, b = 3, c = 4)))
#' stopifnot(identical(list_merge(NULL, list(a = 1)), list(a = 1)))
#' }
list_merge <- function(list1, list2) {
list1 <- list1 %||% list()
# Pre-allocate memory to make this slightly faster.
list1[Filter(function(x) nchar(x) > 0, names(list2) %||% c())] <- NULL
for (i in seq_along(list2)) {
name <- names(list2)[i]
if (!identical(name, NULL) && !identical(name, "")) {
list1[[name]] <- list2[[i]]
} else {
list1 <- append(list1, list(list2[[i]]))
}
}
list1
}
is.simple_string <- function(obj) {
is.character(obj) && length(obj) == 1 && !is.na(obj) && nzchar(obj)
}