This report uses realistically, randomly generated data to predict which investment type a user is likely to take out in the future.
library(feather)
library(tidyverse)
library(tidymodels)
library(themis)
# library(vip)
library(DALEX)
library(DALEXtra)
library(waterfalls)
This model will be used in marketing aimed at individual users.
The data included for this report is:
Outcome variable: investment_type. (Money market, Bonds, Equity, Property)
Predictors:
age
family_size
risk_appetite
income
other_investments
# Load the modelling data and drop the identifier column, which carries
# no predictive signal.
model_data <- select(
  read_feather('investment_data.feather'),
  -user_id
)

# Inspect the class balance of the outcome variable.
count(model_data, investment_type)
## # A tibble: 4 × 2
## investment_type n
## <chr> <int>
## 1 Bonds 12309
## 2 Equity 60088
## 3 Money market 8335
## 4 Property 19268
Split data and create folds.
# Train/test split (80/20), stratified on the outcome so every
# investment type appears proportionally in both sets.
set.seed(123)
data_split <- initial_split(model_data, prop = 0.8, strata = investment_type)
train_data <- training(data_split)
test_data <- testing(data_split)

# Five stratified cross-validation folds built from the training set.
set.seed(234)
train_folds <- vfold_cv(train_data, v = 5, strata = investment_type)
Create a recipe for the model. Before training the model, use a recipe to create a few new predictors and conduct some preprocessing required by the model.
Never transform the outcome variable in a recipe, rather do it in the data feature engineering step.
Imbalanced data: To correct the imbalanced data either upsample or downsample.
# Preprocessing recipe:
# * dummy-encode the nominal predictors (ranger does not strictly need
#   this, but it keeps the workflow portable across engines)
# * centre and scale the two continuous predictors
model_recipe <- recipe(formula = investment_type ~ . , data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(income, age)

# Sanity-check the recipe. prep() estimates the preprocessing parameters
# from the recipe's training data (it has no `data` argument — the
# previous `prep(data = NULL)` was silently swallowed by `...`), and
# bake(new_data = NULL) returns the processed training set (the modern
# replacement for the superseded juice()).
model_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 79,999
## Columns: 7
## $ age <dbl> 1.051602517, -1.591853126, 1.106674510, -0.…
## $ family_size <int> 4, 1, 2, 2, 3, 1, 1, 3, 4, 2, 1, 5, 4, 3, 5…
## $ income <dbl> -1.1882883, -1.6883103, -0.9069257, -0.5129…
## $ investment_type <fct> Bonds, Bonds, Bonds, Bonds, Bonds, Bonds, B…
## $ other_investments_Yes <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0…
## $ risk_appetite_Balanced <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ risk_appetite_Conservative <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
# Visualise the distribution of the outcome variable in the processed
# training data — the imbalance noted earlier is clearly visible here.
prepped_recipe <- prep(model_recipe)
baked_train <- bake(prepped_recipe, new_data = NULL)
ggplot(baked_train, aes(investment_type)) +
  geom_bar()
Set the model to be used.
# Model specification: a random forest classifier fitted with the
# ranger engine, using the engine's default hyperparameters.
model_spec <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")

model_spec
## Random Forest Model Specification (classification)
##
## Computational engine: ranger
Create a model workflow, which pairs a model and recipe together.
The model contains hyperparameters that could be tuned; in this report the engine defaults are used.
# Bundle the model specification and the preprocessing recipe into a
# single workflow object so they are always fitted and applied together.
model_workflow <- workflow() %>%
  add_model(model_spec) %>%
  add_recipe(model_recipe)

model_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Computational engine: ranger
When we do resampling, tidymodels doesn’t train a final model for us until we tell it to. This makes sense: if we have lots of models to try, with different recipes or hyperparameters, we usually want to inspect how each model performs on the resamples before selecting the final model to build on all of the training data. It also means, though, that we have no feature-importance scores available until we train that final model.
Because we pass it the split object created at the start of the code, we get two processes for the price of one: it trains/fits the selected model on 100% of the training data and then automatically scores the test set with the newly created final model.
# Optional parallel backend (left disabled):
# cl <- makePSOCKcluster(detectCores() - 1)
# registerDoParallel(cl)

# last_fit() refits the workflow on the full training set and evaluates
# it once on the held-out test set defined by data_split.
final_fit <- model_workflow %>%
  last_fit(data_split)
# final_fit
# collect_metrics(final_fit)

# Confusion matrix of the test-set predictions.
collect_predictions(final_fit) %>%
  conf_mat(investment_type, .pred_class) %>%
  autoplot()

# final_fit$.predictions

# One-vs-all ROC curves, one per outcome class.
collect_predictions(final_fit) %>%
  roc_curve(truth = investment_type, .pred_Equity, .pred_Property, .pred_Bonds, `.pred_Money market`) %>%
  ggplot(aes(1 - specificity, sensitivity, color = .level)) +
  geom_abline(slope = 1, color = "gray50", lty = 2, alpha = 0.8) +
  # `size` is deprecated for line widths since ggplot2 3.4.0; use `linewidth`.
  geom_path(linewidth = 1.5, alpha = 0.7) +
  labs(color = NULL) +
  coord_fixed()

# stopCluster(cl)
# registerDoSEQ()
Create an explainer. The global and local explanations below are specific to the model and data transformations above; they can change significantly if, e.g., a different model is chosen.
# Predictor-only view of the training data, as required by the explainer.
vip_train <- select(train_data, -investment_type)

# Wrap the fitted workflow in a DALEX explainer so that model-agnostic
# global and local explanation methods can be applied to it.
model_explainer <- explain_tidymodels(
  extract_workflow(final_fit),
  data = vip_train,
  y = train_data$investment_type,
  label = "tidymodel",
  verbose = FALSE
)
Below are the variables that are most important in the model, i.e. they describe most of the variance in the outcome variable. These are the global explanations.
# Global explanation: permutation-based variable importance.
vi_rf <- model_explainer %>% model_parts()
vi_rf %>% plot()
Below are details on the impact of individual variables.
# Model profiles (partial-dependence-style plots) for each predictor of
# interest; print() is needed inside the loop for the plots to render.
for (profile_var in c("income", "risk_appetite", "family_size", "age")) {
  vr_eff <- model_profile(model_explainer, variables = profile_var)
  print(plot(vr_eff))
}
Models are deployed as APIs. This means models can be called via the web and provide real-time results.
# Persist the fitted workflow and read it back, simulating how a
# deployed API would load the model artefact from disk.
set.seed(246)
final_model <- extract_workflow(final_fit)
saveRDS(final_model, 'final_model.RDS')
# Use the deserialised object from here on — previously it was
# immediately overwritten with the in-memory model, which defeated the
# purpose of the save/load round-trip.
model_deploy <- readRDS('final_model.RDS')

# Upload data and predict: score a single randomly drawn test row with
# the deserialised model. augment() appends the predicted class and the
# per-class probabilities to the input row.
input_data <- test_data %>%
  sample_n(1)
api_output <- augment(model_deploy, input_data)
api_output %>% glimpse()
## Rows: 1
## Columns: 11
## $ .pred_class <fct> Equity
## $ .pred_Bonds <dbl> 0.04995448
## $ .pred_Equity <dbl> 0.8914781
## $ `.pred_Money market` <dbl> 0
## $ .pred_Property <dbl> 0.0585674
## $ age <int> 42
## $ family_size <int> 2
## $ income <dbl> 80565
## $ other_investments <chr> "Yes"
## $ risk_appetite <chr> "Conservative"
## $ investment_type <chr> "Equity"
Below is a local explanation for the prediction made by the model.
# Predicted class as a plain string; used to pick out the matching
# break-down rows and, later, to title the plot.
mod_output_pred <- as.character(api_output$.pred_class)

# Local explanation: break-down contributions for this one observation.
local_expl <- predict_parts(explainer = model_explainer, new_observation = input_data)
local_expl_tbl <- local_expl %>%
  as_tibble() %>%
  filter(str_detect(label, mod_output_pred))

# Prepare the waterfall input: rounded contributions, one colour per
# sign, with the summary "prediction" row dropped (the waterfall chart
# draws its own total).
plot_tbl <- local_expl_tbl %>%
  filter(variable != 'prediction') %>%
  transmute(
    values = round(contribution, 2),
    group = variable,
    plot_col = if_else(contribution > 0, "#94a2b1", "#e58b84")
  )
# plot
# Waterfall chart of the local break-down contributions.
# calc_total = TRUE appends a bar for the summed contribution; bar
# colours come from plot_tbl$plot_col (fill_by_sign is disabled so the
# precomputed colours are used as-is).
plot_tbl %>%
waterfall(calc_total = TRUE,
fill_by_sign = FALSE,
fill_colours = plot_tbl$plot_col,
rect_text_size = 2) +
# Horizontal bars read better with long predictor labels.
coord_flip() +
# theme
# Minimal look: drop axis lines, grid and panel background; mute the
# axis text/titles in grey so the bars carry the visual weight.
theme(text = element_text(size=18),
plot.title = ggtext::element_markdown(),
axis.title.y = element_text(hjust = 1, color = 'grey'),
# axis.text.y = element_blank(),
axis.title.x = element_text(hjust = 1, color = 'grey'),
axis.text.x = element_text(color="grey"),
legend.position = "none",
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()
# axis.ticks = element_blank()
) +
# colours
# scale_fill_manual(values = my_colors) +
# headers
# Title the plot with the predicted class computed above.
labs(title = str_glue("Prediction: {mod_output_pred}"),
x = "Predictors",
y = "Probability")