Introduction

This report uses realistically, randomly generated data to predict which investment type a user is likely to take out in the future.

library(feather)

library(tidyverse)
library(tidymodels)
library(themis)

# library(vip)
library(DALEX)
library(DALEXtra)
library(waterfalls)

Objective

This model will be used in marketing to individual users.

Data

The data included for this report is:

Outcome variable: investment_type. (Money market, Bonds, Equity, Property)

Predictors:

  • age

  • family_size

  • risk_appetite

  • income

  • other_investments

# Load the investment data and drop the identifier column, which
# carries no predictive information.
model_data <- read_feather('investment_data.feather') %>%
  select(!user_id)

# Inspect the class distribution of the outcome variable.
model_data %>%
  count(investment_type)
## # A tibble: 4 × 2
##   investment_type     n
##   <chr>           <int>
## 1 Bonds           12309
## 2 Equity          60088
## 3 Money market     8335
## 4 Property        19268

Data budget

Split data and create folds.

# Reproducible 80/20 train/test split, stratified on the outcome so
# both sets keep the same class proportions.
set.seed(123)

data_split <- initial_split(model_data, prop = 0.8, strata = investment_type)

train_data <- training(data_split)
test_data  <- testing(data_split)


# Five stratified cross-validation folds built from the training set.
set.seed(234)

train_folds <- vfold_cv(train_data, v = 5, strata = investment_type)

Recipe

Create a recipe for the model. Before training the model, use a recipe to create a few new predictors and conduct some preprocessing required by the model.

Never transform the outcome variable in a recipe, rather do it in the data feature engineering step.

Imbalanced data: to correct for the class imbalance shown above, either upsample the minority classes or downsample the majority class.

# Preprocessing recipe:
#  * dummy-encode the nominal predictors,
#  * centre and scale the continuous predictors (income, age),
#  * downsample the majority classes (themis) to address the class
#    imbalance noted above. The themis package was loaded but no
#    sampling step was actually applied; step_downsample() is skipped
#    at bake/predict time by default, so it only affects training.
model_recipe <- recipe(formula = investment_type ~ ., data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(income, age) %>%
  step_downsample(investment_type)


# Sanity-check the prepared recipe on its own training data.
# NOTE: the original call was `prep(data = NULL)`, but prep() has no
# `data` argument (its argument is `training`); the value was silently
# swallowed by `...`, so the explicit argument is dropped here.
model_recipe %>%
  prep() %>%
  juice() %>%
  glimpse()
## Rows: 79,999
## Columns: 7
## $ age                        <dbl> 1.051602517, -1.591853126, 1.106674510, -0.…
## $ family_size                <int> 4, 1, 2, 2, 3, 1, 1, 3, 4, 2, 1, 5, 4, 3, 5…
## $ income                     <dbl> -1.1882883, -1.6883103, -0.9069257, -0.5129…
## $ investment_type            <fct> Bonds, Bonds, Bonds, Bonds, Bonds, Bonds, B…
## $ other_investments_Yes      <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0…
## $ risk_appetite_Balanced     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ risk_appetite_Conservative <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
# Visualise the class distribution the model will actually see after
# all preprocessing has been applied to the training data.
ggplot(bake(prep(model_recipe), new_data = NULL), aes(investment_type)) +
  geom_bar()

Model

Set the model to be used.

# Random forest with default hyperparameters, fitted via the ranger
# engine in classification mode.
model_spec <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")

model_spec
## Random Forest Model Specification (classification)
## 
## Computational engine: ranger

Workflow

Create a model workflow, which pairs a model and recipe together.

Model contains hyper parameter that needs to be tuned.

# Bundle the preprocessing recipe and the model specification into a
# single workflow object.
model_workflow <- workflow() %>%
  add_model(model_spec) %>%
  add_recipe(model_recipe)

model_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_dummy()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Computational engine: ranger

Fit the model

When we do resampling, tidymodels doesn’t train a final model for us until we tell it to. This makes sense: if we have lots of models to try with different recipes or hyperparameters, we usually want to inspect how the models perform on the resamples before selecting our final model to build on all of Train. What this also means, though, is that we don’t have any feature importance scores available to us until we train our final model.

As we pass it the split object from the start of our code we get two processes for the price of one. We train/fit our selected model on 100% of Train and then it automatically scores up the Test set with the newly created, final model.

# Optional parallel backend (left disabled):
# cl <- makePSOCKcluster(detectCores() - 1)
# registerDoParallel(cl)

# last_fit() trains the workflow on the full training set and then
# scores the held-out test set with the resulting final model.
final_fit <- model_workflow %>%
  last_fit(data_split)


# final_fit
# collect_metrics(final_fit)

# Confusion matrix of the test-set predictions.
final_fit %>%
  collect_predictions() %>%
  conf_mat(investment_type, .pred_class) %>%
  autoplot()

# final_fit$.predictions

# One-vs-all ROC curves for each investment type on the test set.
collect_predictions(final_fit) %>%
  roc_curve(truth = investment_type, .pred_Equity, .pred_Property, .pred_Bonds, `.pred_Money market`) %>%
  ggplot(aes(1 - specificity, sensitivity, color = .level)) +
  geom_abline(slope = 1, color = "gray50", lty = 2, alpha = 0.8) +
  # `linewidth` replaces the deprecated `size` aesthetic for line
  # geoms (ggplot2 >= 3.4.0).
  geom_path(linewidth = 1.5, alpha = 0.7) +
  labs(color = NULL) +
  coord_fixed()

# stopCluster(cl)
# registerDoSEQ()

Variable importance

Create an explainer. The global and local explanations below are specific to the model and data transformations above. They can change significantly if, e.g., a different model is chosen.

# Predictors only -- the outcome is passed to the explainer separately
# via the `y` argument.
vip_train <- select(train_data, !investment_type)

# Wrap the fitted workflow in a DALEX explainer; the same object
# drives both the global and the local explanations below.
model_explainer <-
  explain_tidymodels(
    extract_workflow(final_fit),
    data = vip_train,
    y = train_data$investment_type,
    label = "tidymodel",
    verbose = FALSE
  )

Below are the variables that are most important in the model, i.e. they describe most of the variance in the outcome variable. These are the global explainers.

# Permutation-based variable importance (global explanation).
vi_rf <- model_parts(model_explainer)
vi_rf %>% plot()

Variable effects

Below are details on the impact of individual variables.

Income

# Model profile (partial-dependence) for income.
vr_eff <- model_profile(model_explainer, variables = "income")
vr_eff %>% plot()

Risk appetite

# Model profile (partial-dependence) for risk appetite.
vr_eff <- model_profile(model_explainer, variables = "risk_appetite")
vr_eff %>% plot()

Family size

# Model profile (partial-dependence) for family size.
vr_eff <- model_profile(model_explainer, variables = "family_size")
vr_eff %>% plot()

Age

# Model profile (partial-dependence) for age.
vr_eff <- model_profile(model_explainer, variables = "age")
vr_eff %>% plot()

Deployment

Models are deployed as APIs. This means models can be called via the web and provide real-time results.

set.seed(246)

# Extract the finalised workflow (recipe + trained model).
final_model <- extract_workflow(final_fit)

# Persist the model and read it back, mimicking what a deployment
# environment (e.g. a plumber API) would do at startup. readRDS() is
# used to match the base-R saveRDS() call. The original code then
# overwrote `model_deploy` with the in-memory `final_model`, which
# defeated the save/load round-trip -- that overwrite is removed so
# predictions really come from the deserialised model.
saveRDS(final_model, 'final_model.RDS')
model_deploy <- readRDS('final_model.RDS')

# Simulate an API call: score a single random row from the test set.
input_data <- test_data %>%
  sample_n(1)

api_output <- augment(model_deploy, input_data)

api_output %>% glimpse()
## Rows: 1
## Columns: 11
## $ .pred_class          <fct> Equity
## $ .pred_Bonds          <dbl> 0.04995448
## $ .pred_Equity         <dbl> 0.8914781
## $ `.pred_Money market` <dbl> 0
## $ .pred_Property       <dbl> 0.0585674
## $ age                  <int> 42
## $ family_size          <int> 2
## $ income               <dbl> 80565
## $ other_investments    <chr> "Yes"
## $ risk_appetite        <chr> "Conservative"
## $ investment_type      <chr> "Equity"

Local explainer

Below is a local explanation for the prediction made by the model.

# Predicted class for the sampled observation, as a plain string so it
# can be matched against the DALEX label below.
mod_output_pred <- as.character(api_output$.pred_class)


# Local explanation: per-variable break-down contributions for this
# single observation (one set of rows per outcome class).
local_expl <- predict_parts(explainer = model_explainer, new_observation = input_data)

# Keep only the contributions for the predicted class.
# NOTE(review): str_detect() does substring matching on the label --
# safe here because none of the four class names is a substring of
# another; confirm if class names ever change.
local_expl_tbl <- local_expl %>%
  as_tibble() %>% 
  filter(str_detect(label, mod_output_pred))

# plot prep: colour positive contributions grey-blue and negative ones
# red, drop the final "prediction" row (the waterfall adds its own
# total), and round for display.
plot_tbl <- local_expl_tbl %>%
  mutate(plot_col = if_else(contribution > 0, "#94a2b1", "#e58b84")) %>%
  select(values = contribution, group = variable, plot_col) %>%
  filter(group != 'prediction') %>%
  mutate(values = round(values, 2))

# plot: waterfall chart of the contributions, flipped so the
# predictors read top-to-bottom.
plot_tbl %>%

  waterfall(calc_total = TRUE,
            fill_by_sign = FALSE,
            fill_colours = plot_tbl$plot_col,
            rect_text_size = 2)  +
  coord_flip() +

  # theme: strip the default panel chrome so only the bars and the
  # grey axis annotations remain
  theme(text = element_text(size=18),

        plot.title = ggtext::element_markdown(),

        axis.title.y = element_text(hjust = 1, color = 'grey'),
        # axis.text.y = element_blank(),

        axis.title.x = element_text(hjust = 1, color = 'grey'),
        axis.text.x = element_text(color="grey"),

        legend.position = "none",

        axis.line = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()
        # axis.ticks = element_blank()

  ) +

  # colours
  # scale_fill_manual(values = my_colors) +

  # headers
  labs(title    = str_glue("Prediction: {mod_output_pred}"),
       x        = "Predictors",
       y        = "Probability")