Estimating Shapley values with ShapML is computationally expensive. Because of this, the goal of this vignette is to (a) give a sense of the real-world clock time needed to explain datasets of various sizes and (b) compare these times to those from other popular open source packages implementing similar algorithms--here, R's fastshap (R with a C++ back end).

For all of the simulations examined, the Shapley value approximation algorithm in Julia's ShapML was consistently and noticeably faster than the R implementation in fastshap. These results may or may not hold for big data and parallel computation, so check back for additional simulations.
R
# Packages: fastshap (R Shapley implementation), JuliaCall (R <-> Julia bridge),
# ranger (Random Forest), plus data-wrangling and plotting helpers.
library(dplyr)
library(fastshap)
library(ggplot2)
library(JuliaCall)
library(lubridate)
library(ranger)
library(tidyr)
# Start the embedded Julia session used by JuliaCall.
JuliaCall::julia_setup()
# Prepend the local Julia binary directory to PATH (Windows ";" separator).
# NOTE(review): this runs *after* julia_setup(); presumably setup already
# locates Julia on its own, or this line belongs before it -- confirm order.
Sys.setenv(PATH = paste("C:/Users/nickr/AppData/Local/Julia-1.3.1/bin", Sys.getenv("PATH"), sep = ";"))
Julia
using DataFrames
using Random
using RCall
using ShapML
R
# Simulation grid: one condition per (n_features, n_instances) pair.
n_instances <- c(1, 100, 1000)  # dataset sizes to explain
n_features <- c(10, 50, 200)    # feature-set sizes (one trained model each)
conditions <- expand.grid("n_features" = n_features, "n_instances" = n_instances)
conditions$condition <- seq_len(nrow(conditions))  # condition ID used for joins
n_conditions <- nrow(conditions)
n_models <- length(n_features)
n_simulations <- 3   # repeated timings per condition
n_monte_carlo <- 20  # Monte Carlo samples per Shapley estimate
seed <- 1
conditions
R
Create a series of synthetic datasets, one for each set of features (10, 50, and 200). We'll down-sample each dataset to create new datasets with fewer instances for the various conditions in the simulation.

Train one Random Forest model for each set of features (10, 50, and 200), each with 1,000 instances.
# One synthetic Friedman dataset and one Random Forest per feature-set size.
# Each dataset has max(n_instances) rows; smaller conditions down-sample later.
data_train <- vector("list", n_models)
models <- vector("list", n_models)
for (i in seq_len(n_models)) {
  data_train[[i]] <- fastshap::gen_friedman(max(n_instances), n_features[i], seed = seed)
  models[[i]] <- ranger::ranger(y ~ ., data = data_train[[i]], seed = seed)
}
# Name both lists by feature count so conditions can look them up by name.
names(data_train) <- n_features
names(models) <- n_features
R
# For each condition, slice out the instances to explain and pick the model
# trained with the matching number of features.
data_explain <- vector("list", n_conditions)
models_condition <- vector("list", n_conditions)
for (i in seq_len(n_conditions)) {
  n_feat <- as.character(conditions$n_features[i])
  data_condition <- data_train[[n_feat]]  # lists are named by feature count
  # 1 dataset of features to be explained for each condition (target "y" dropped).
  data_explain[[i]] <- data_condition[seq_len(conditions$n_instances[i]), !names(data_condition) %in% "y"]
  # 1 model for each condition.
  models_condition[[i]] <- models[[n_feat]]
}
The predict() functions will be written in R and then converted to Julia to work with the trained Random Forest model--which is not explicitly converted.
# Prediction wrapper for fastshap::explain(): takes a fitted ranger model and
# new data, and returns the bare numeric vector of predictions.
predict_fun_r <- function(object, newdata) {
  predict(object, data = newdata)$predictions
}
# Prediction wrapper for ShapML.shap(): takes a fitted model and new data.
predict_fun_julia <- function(model, data) {
  preds <- predict(model, data)$predictions
  # ShapML requires the predictions wrapped in a single-column DataFrame.
  data.frame("y_pred" = preds)
}
Compute and time the Shapley values with R's fastshap.
# Time fastshap::explain() for every condition x simulation cell.
# Pre-allocate nested lists: outer index = condition, inner index = simulation.
data_shap_r <- vector("list", n_conditions)
data_shap_r <- lapply(data_shap_r, function(pass) vector("list", n_simulations))
runtime_r <- vector("list", n_conditions)
runtime_r <- lapply(runtime_r, function(pass) vector("list", n_simulations))
for (i in seq_len(n_conditions)) {
  for (j in seq_len(n_simulations)) {
    set.seed(seed)  # identical Monte Carlo draws for every repetition
    start_time <- Sys.time()
    data_shap_r[[i]][[j]] <- fastshap::explain(models_condition[[i]],
                                               X = data_explain[[i]],
                                               pred_wrapper = predict_fun_r,
                                               nsim = n_monte_carlo
    )
    stop_time <- Sys.time()
    runtime_r[[i]][[j]] <- data.frame("time" = as.numeric(difftime(stop_time, start_time, units = "secs")),
                                      "condition" = i,
                                      "simulation" = j)
    print(runtime_r[[i]][[j]])  # progress indicator for long runs
  }
}
# Flatten the nested list of 1-row data.frames into one timing data.frame.
runtime_r <- dplyr::bind_rows(unlist(runtime_r, recursive = FALSE))
Julia: pass the required R objects into the Julia environment.
# Pull the R simulation objects into Julia via RCall and convert scalars
# from RObjects to native Julia types.
n_conditions = RCall.reval("n_conditions")
n_conditions = convert(Integer, n_conditions)
n_simulations = RCall.reval("n_simulations")
n_simulations = convert(Integer, n_simulations)
models_condition = RCall.reval("models_condition")  # kept as RObjects; prediction runs in R
predict_fun_julia = RCall.reval("predict_fun_julia")
predict_fun_julia = convert(Function, predict_fun_julia)
data_explain = RCall.reval("data_explain")
n_monte_carlo = RCall.reval("n_monte_carlo")
n_monte_carlo = convert(Integer, n_monte_carlo)
seed = RCall.reval("seed")
seed = convert(Integer, seed)
Compute and time the Shapley values with Julia's ShapML.
# Time ShapML.shap() for every condition x simulation cell.
# Pre-allocate nested arrays: outer index = condition, inner index = simulation.
data_shap_julia = [Array{DataFrame}(undef, n_simulations) for i in 1:n_conditions]
runtime_julia = [Array{Any}(undef, n_simulations) for i in 1:n_conditions]
for i in 1:n_conditions
    # Convert once per condition; the R data.frame does not change across
    # simulations (the conversion was outside the timed region, so hoisting
    # it does not affect the measured runtimes).
    data_explain_temp = convert(DataFrame, data_explain[i])
    for j in 1:n_simulations
        start_time = time()
        data_shap_julia[i][j] = ShapML.shap(explain = data_explain_temp,
                                            reference = data_explain_temp,
                                            model = models_condition[i],
                                            predict_function = predict_fun_julia,
                                            sample_size = n_monte_carlo,
                                            seed = seed
                                            )
        stop_time = time()
        runtime_julia[i][j] = DataFrame(time = stop_time - start_time, condition = i, simulation = j)
    end
end
# Flatten the nested arrays of 1-row DataFrames into a single DataFrame.
runtime_julia = vcat(vcat(runtime_julia...)...)
# Label each timing table with its language, stack them, and attach the
# condition grid (n_features, n_instances) by condition ID.
runtime_r[["language"]] <- "R"
runtime_julia[["language"]] <- "Julia"
data_runtime <- dplyr::left_join(dplyr::bind_rows(runtime_r, runtime_julia),
                                 conditions, by = "condition")
Julia's ShapML is consistently and noticeably faster than the R implementation in fastshap.
# Shared ggplot2 theme: bold text throughout, legend along the bottom.
plot_theme <- theme(
  strip.text = element_text(size = 12, face = "bold"),
  plot.title = element_text(size = 14, face = "bold"),
  plot.subtitle = element_text(size = 14),
  axis.title = element_text(size = 12, face = "bold"),
  axis.text.x = element_text(size = 12, face = "bold"),
  axis.text.y = element_text(size = 12, face = "bold"),
  legend.text = element_text(size = 12, face = "bold"),
  legend.position = "bottom"
)
# Runtime as a function of dataset size, one panel per feature-set size.
p <- ggplot(data_runtime, aes(n_instances, time, color = language)) +
  geom_point(size = 5, alpha = .5) +
  stat_smooth(method = "lm", se = FALSE, show.legend = FALSE, size = 1.1, alpha = .5) +
  scale_color_manual(values = c("R" = "#276DC2", "Julia" = "#9558B2")) +
  facet_wrap(~ n_features, scales = "free") +
  theme_bw() +
  plot_theme +
  xlab("Number of instances in dataset") +
  ylab("Runtime in seconds") +
  labs(color = NULL) +
  ggtitle("Shapley Approximation Algorithm Runtime - Julia vs R (Non-Parallel)",
          subtitle = "Faceted by number of features")
p_1 <- p
p_1
# Runtime as a function of feature count (quadratic fit), one panel per
# number of instances explained.
p <- ggplot(data_runtime, aes(n_features, time, color = language)) +
  geom_point(size = 5, alpha = .5) +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2),
              se = FALSE, show.legend = FALSE, size = 1.1, alpha = .5) +
  scale_color_manual(values = c("R" = "#276DC2", "Julia" = "#9558B2")) +
  scale_x_continuous(limits = c(0, 200)) +
  facet_wrap(~ n_instances, scales = "free") +
  theme_bw() +
  plot_theme +
  xlab("Number of features in dataset") +
  ylab("Runtime in seconds") +
  labs(color = NULL) +
  ggtitle("Shapley Approximation Algorithm Runtime - Julia vs R (Non-Parallel)",
          subtitle = "Faceted by number of instances")
p_2 <- p
p_2