Random Forests

Author

Arvind Venkatadri

Published

January 6, 2021

References:

  1. Machine Learning Basics - Random Forest at Shirin’s Playground

Penguin Random Forest Model with randomForest

Using the penguins dataset and Random Forest Classification.

# Print the raw penguins data, then a per-column summary of it.
penguins
summary(penguins)
      species          island    bill_length_mm  bill_depth_mm  
 Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
 Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
 Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
                                 Mean   :43.92   Mean   :17.15  
                                 3rd Qu.:48.50   3rd Qu.:18.70  
                                 Max.   :59.60   Max.   :21.50  
                                 NAs    :2       NAs    :2      
 flipper_length_mm  body_mass_g       sex           year     
 Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
 1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
 Median :197.0     Median :4050   NAs   : 11   Median :2008  
 Mean   :200.9     Mean   :4202                Mean   :2008  
 3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
 Max.   :231.0     Max.   :6300                Max.   :2009  
 NAs    :2         NAs    :2                                 
# Overview of the data set (types, missing counts, distributions) via skimr.
penguins %>% skimr::skim()
Data summary
Name Piped data
Number of rows 344
Number of columns 8
_______________________
Column type frequency:
factor 3
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1.00 FALSE 3 Ade: 152, Gen: 124, Chi: 68
island 0 1.00 FALSE 3 Bis: 168, Dre: 124, Tor: 52
sex 11 0.97 FALSE 2 mal: 168, fem: 165

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
bill_length_mm 2 0.99 43.92 5.46 32.1 39.23 44.45 48.5 59.6 ▃▇▇▆▁
bill_depth_mm 2 0.99 17.15 1.97 13.1 15.60 17.30 18.7 21.5 ▅▅▇▇▂
flipper_length_mm 2 0.99 200.92 14.06 172.0 190.00 197.00 213.0 231.0 ▂▇▃▅▂
body_mass_g 2 0.99 4201.75 801.95 2700.0 3550.00 4050.00 4750.0 6300.0 ▃▇▆▃▂
year 0 1.00 2008.03 0.82 2007.0 2007.00 2008.00 2009.0 2009.0 ▇▁▇▁▇
# Drop every row that contains a missing value.
penguins <- penguins %>% tidyr::drop_na()
# Spent one hour trying to find `drop-na()` ( 14 June 2020)
# library(corrplot)

# Correlation matrix of the numeric columns. A predicate function has to be
# wrapped in `where()`; passing a bare `is.numeric` to `select()` is
# deprecated.
cor <- penguins %>%
  select(where(is.numeric)) %>%
  cor()

# Ellipse plot of the correlations, variables ordered by hierarchical
# clustering.
cor %>% corrplot(., method = "ellipse", order = "hclust", tl.cex = 0.5)

# try these too:
# cor %>% corrplot(., method = "square", order = "hclust",tl.cex = 0.5)
# cor %>% corrplot(., method = "color", order = "hclust",tl.cex = 0.5)
# cor %>% corrplot(., method = "shade", order = "hclust",tl.cex = 0.5)

Notes: - flipper_length_mm and bill_depth_mm are negatively correlated at approx (-0.7) - flipper_length_mm and body_mass_g are positively correlated at approx 0.8

So we will use steps in the recipe to remove correlated variables.

Penguin Data Sampling and Recipe

# Data Split: 60% of the rows go to training, the remainder to testing.
penguin_split <- penguins %>% initial_split(prop = 0.6)
penguin_train <- penguin_split %>% training()
penguin_test <- penguin_split %>% testing()
penguin_split
<Training/Testing/Total>
<199/134/333>
head(penguin_train)
# Recipe
# Estimate the preprocessing (centering/scaling statistics and the
# correlation filter) from the training set only, so that nothing about the
# test set leaks into the model.
penguin_recipe <- penguin_train %>%
  recipe(species ~ .) %>%
  step_normalize(all_numeric()) %>% # Scaling and Centering
  step_corr(all_numeric()) %>% # Handling correlated variables
  prep()

# Baking the data: apply the prepped recipe to each partition.
penguin_train_baked <- bake(penguin_recipe, new_data = penguin_train)

penguin_test_baked <- bake(penguin_recipe, new_data = penguin_test)

head(penguin_train_baked)

Penguin Random Forest Model

# Model specification: a 100-tree random forest classifier backed by the
# randomForest engine. Nothing is fitted yet.
penguin_model <- set_mode(
  set_engine(rand_forest(trees = 100), "randomForest"),
  "classification"
)
penguin_model
Random Forest Model Specification (classification)

Main Arguments:
  trees = 100

Computational engine: randomForest 
# Fit the specification on the baked training data, species as the outcome.
penguin_fit <- fit(penguin_model, species ~ ., data = penguin_train_baked)
penguin_fit
parsnip model object


Call:
 randomForest(x = maybe_data_frame(x), y = y, ntree = ~100) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 2

        OOB estimate of  error rate: 2.01%
Confusion matrix:
          Adelie Chinstrap Gentoo class.error
Adelie        97         1      0  0.01020408
Chinstrap      3        32      0  0.08571429
Gentoo         0         0     66  0.00000000
# iris_ranger <-
#   rand_forest(trees = 100) %>%
#   set_mode("classification") %>%
#   set_engine("ranger") %>%
#   fit(Species ~ ., data = iris_training_baked)

Metrics for the Penguin Random Forest Model

# Predictions: predicted class placed alongside the baked test columns.
glimpse(
  dplyr::bind_cols(
    predict(object = penguin_fit, new_data = penguin_test_baked),
    penguin_test_baked
  )
)
Rows: 134
Columns: 9
$ .pred_class       <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> -0.8215515, -1.3335592, -0.5289757, -0.9861254, -1.3…
$ bill_depth_mm     <dbl> 0.11940428, 1.08424573, 0.22096653, 2.04908718, 0.32…
$ flipper_length_mm <dbl> -1.0678666, -0.5684290, -1.3532595, -0.7111254, -1.1…
$ body_mass_g       <dbl> -0.50552542, -0.94019151, -1.25066728, -0.50552542, …
$ sex               <fct> female, female, female, male, female, female, male, …
$ year              <dbl> -1.2818130, -1.2818130, -1.2818130, -1.2818130, -1.2…
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
# Prediction Accuracy Metrics
yardstick::metrics(
  dplyr::bind_cols(
    predict(object = penguin_fit, new_data = penguin_test_baked),
    penguin_test_baked
  ),
  truth = species,
  estimate = .pred_class
)
# Prediction Probabilities: one `.pred_<class>` column per species.
penguin_fit_probs <- dplyr::bind_cols(
  predict(penguin_fit, penguin_test_baked, type = "prob"),
  penguin_test_baked
)
glimpse(penguin_fit_probs)
Rows: 134
Columns: 11
$ .pred_Adelie      <dbl> 0.95, 0.98, 0.98, 1.00, 0.99, 0.97, 0.92, 1.00, 1.00…
$ .pred_Chinstrap   <dbl> 0.05, 0.02, 0.02, 0.00, 0.01, 0.03, 0.07, 0.00, 0.00…
$ .pred_Gentoo      <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> -0.8215515, -1.3335592, -0.5289757, -0.9861254, -1.3…
$ bill_depth_mm     <dbl> 0.11940428, 1.08424573, 0.22096653, 2.04908718, 0.32…
$ flipper_length_mm <dbl> -1.0678666, -0.5684290, -1.3532595, -0.7111254, -1.1…
$ body_mass_g       <dbl> -0.50552542, -0.94019151, -1.25066728, -0.50552542, …
$ sex               <fct> female, female, female, male, female, female, male, …
$ year              <dbl> -1.2818130, -1.2818130, -1.2818130, -1.2818130, -1.2…
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
# Confusion Matrix
# Reads the confusion matrix stored on the underlying engine fit object.
# NOTE(review): `tidy()` on a plain matrix may be deprecated in current broom
# releases -- confirm, and consider `tibble::as_tibble()` instead.
penguin_fit$fit$confusion %>% tidy()
# Gain Curves (one panel per species, from the class probabilities)
autoplot(
  yardstick::gain_curve(penguin_fit_probs, species, .pred_Adelie:.pred_Gentoo)
)

# ROC Plot
autoplot(
  roc_curve(penguin_fit_probs, species, .pred_Adelie:.pred_Gentoo)
)

Using broom on the penguin model

# Print the split object (training / testing / total row counts).
penguin_split
<Training/Testing/Total>
<199/134/333>
# Tidy summaries of the split and of the prepped recipe.
broom::tidy(penguin_split)
broom::tidy(penguin_recipe)
# Following do not work for `random forest models` !! ;-()
# penguin_model %>% tidy()
# penguin_fit %>% tidy()
# Inspect the raw structure of the model specification instead.
str(penguin_model)
List of 7
 $ args                 :List of 3
  ..$ mtry : language ~NULL
  .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
  ..$ trees: language ~100
  .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
  ..$ min_n: language ~NULL
  .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
 $ eng_args             : Named list()
  ..- attr(*, "class")= chr [1:2] "quosures" "list"
 $ mode                 : chr "classification"
 $ user_specified_mode  : logi TRUE
 $ method               : NULL
 $ engine               : chr "randomForest"
 $ user_specified_engine: logi TRUE
 - attr(*, "class")= chr [1:2] "rand_forest" "model_spec"
# Print the baked (preprocessed) test set.
penguin_test_baked

Iris Random Forest Model with ranger

Using the iris dataset and Random Forest Classification. This part uses rsample to split the data and the recipes to prep the data for model making.

# set.seed(100)
# 60/40 train/test split of iris.
iris_split <- iris %>% rsample::initial_split(prop = 0.6)
iris_split
<Training/Testing/Total>
<90/60/150>
# Peek at the training partition.
glimpse(training(iris_split))
Rows: 90
Columns: 5
$ Sepal.Length <dbl> 5.6, 7.7, 5.5, 6.6, 4.7, 5.9, 6.7, 4.4, 5.8, 6.3, 6.3, 5.…
$ Sepal.Width  <dbl> 2.5, 3.8, 3.5, 3.0, 3.2, 3.0, 3.1, 3.2, 2.6, 2.9, 2.8, 3.…
$ Petal.Length <dbl> 3.9, 6.7, 1.3, 4.4, 1.6, 4.2, 4.4, 1.3, 4.0, 5.6, 5.1, 1.…
$ Petal.Width  <dbl> 1.1, 2.2, 0.2, 1.4, 0.2, 1.5, 1.4, 0.2, 1.2, 1.8, 1.5, 0.…
$ Species      <fct> versicolor, virginica, setosa, versicolor, setosa, versic…
# Peek at the testing partition.
glimpse(testing(iris_split))
Rows: 60
Columns: 5
$ Sepal.Length <dbl> 5.1, 4.9, 4.6, 5.0, 4.4, 4.8, 5.8, 5.7, 5.1, 5.7, 5.1, 4.…
$ Sepal.Width  <dbl> 3.5, 3.0, 3.1, 3.4, 2.9, 3.4, 4.0, 4.4, 3.5, 3.8, 3.8, 3.…
$ Petal.Length <dbl> 1.4, 1.4, 1.5, 1.5, 1.4, 1.6, 1.2, 1.5, 1.4, 1.7, 1.5, 1.…
$ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.3, 0.3, 0.…
$ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…

Iris Data Pre-Processing: Creating the Recipe

The recipes package provides an interface that specializes in data pre-processing. Within the package, the functions that start, or execute, the data transformations are named after cooking actions. That makes the interface more user-friendly. For example:

  • recipe() - Starts a new set of transformations to be applied, similar to the ggplot() command. Its main argument is the model’s formula.

  • prep() - Executes the transformations on top of the data that is supplied (typically, the training data). Each data transformation is a step() function. ( Recall what we did with the caret package: Centering, Scaling, Removing Correlated variables…)

Note that in order to avoid data leakage (i.e. information from the test set influencing how the model is trained), the recipe should be “prepped” using the training set (train_tbl) only. Source: https://towardsdatascience.com/modelling-with-tidymodels-and-parsnip-bae2c01c131c. From the CRAN documentation: “The idea is that the preprocessing operations will all be created using the training set and then these steps will be applied to both the training and test set.”

# Pre Processing the Training Data

# Note: Using TRAINING data !!
# Note: Outcomes ~ Predictors !!
iris_recipe <- recipe(Species ~ ., data = training(iris_split))

# The data contained in the `data` argument need not be the training set; this data is only used to catalog the names of the variables and their types (e.g. numeric, etc.).

Q: How does the recipe “figure” out which are the outcomes and which are the predictors? A.The recipe command defines Outcomes and Predictors using the formula interface. Not clear how this recipe “figures” out which are the outcomes and which are the predictors, when we have not yet specified them…

Q. Why is the recipe not agnostic to data set? Is that a meaningful question? A. The use of the training set in the recipe command is just to declare the variables and specify the roles of the data, nothing else. Roles are open-ended and extensible. From https://cran.r-project.org/web/packages/recipes/vignettes/Simple_Example.html :

This document demonstrates some basic uses of recipes. First, some definitions are required: - variables are the original (raw) data columns in a data frame or tibble. For example, in a traditional formula Y ~ A + B + A:B, the variables are A, B, and Y. - roles define how variables will be used in the model. Examples are: predictor (independent variables), response, and case weight. This is meant to be open-ended and extensible. - terms are columns in a design matrix such as A, B, and A:B. These can be other derived entities that are grouped, such as a set of principal components or a set of columns, that define a basis function for a variable. These are synonymous with features in machine learning. Variables that have predictor roles would automatically be main effect terms.

# Apply the transformation steps, innermost call first:
#   1. drop highly correlated predictors
#   2. center the predictors
#   3. scale the predictors
#   4. prep() the recipe so the steps are estimated
iris_recipe <- prep(
  step_scale(
    step_center(
      step_corr(iris_recipe, all_predictors()),
      all_predictors(), -all_outcomes()
    ),
    all_predictors(), -all_outcomes()
  )
)

This has created the recipe() and prepped it too. We now need to apply it to our datasets:

  • Take training data and bake() it to prepare it for modelling.
  • Do the same for the testing set.
# Apply the prepped recipe to the training partition.
iris_training_baked <- bake(iris_recipe, training(iris_split))
iris_training_baked
# Apply the same prepped recipe to the testing partition.
iris_testing_baked <- bake(iris_recipe, testing(iris_split))
iris_testing_baked

Iris Model Training using parsnip

Different ML packages provide different interfaces (APIs ) to do the same thing (e.g random forests). The tidymodels package provides a consistent interface to invoke a wide variety of packages supporting a wide variety of models.

The parsnip package is a successor to caret.

To model with parsnip: 1. Pick a model : 2. Set the engine 3. Set the mode (if needed): Classification or Regression

Check here for models available in parsnip.

  • Mode: classification and regression in parsnip, each using a variety of models. ( Which Way). This defines the form of the output.

  • Engine: The engine is the R package that is invoked by parsnip to execute the model, e.g. glm, glmnet, keras (How). parsnip provides wrappers for models from these packages.

  • Model: is the specific technique used for the modelling task, e.g. linear_reg(), logistic_reg(), mars(), decision_tree(), nearest_neighbor()… (What model).

and models have: - hyperparameters: that are numerical or factor variables that tune the model ( Like the alpha beta parameters for Bayesian priors)

We can use the random forest model to classify the iris into species. Here Species is the Outcome variable and the rest are predictor variables. The random forest model is provided by the ranger package, to which tidymodels/parsnip provides a simple and consistent interface.

library(ranger)
# 100-tree random forest classifier on the ranger engine, fitted to the baked
# training data with Species as the outcome.
iris_ranger <- fit(
  set_engine(
    set_mode(rand_forest(trees = 100), "classification"),
    "ranger"
  ),
  Species ~ .,
  data = iris_training_baked
)

ranger can generate random forest models for classification, regression, and survival analysis (time-to-event data). Extremely randomized trees (“Extreme Forests”) are also supported, wherein all points in the dataset are used (instead of bootstrap samples) along with feature bagging. We can also run the same model using the randomForest package:

library(randomForest, quietly = TRUE)
# Same specification as the ranger model, but on the randomForest engine.
iris_rf <- fit(
  set_engine(
    set_mode(rand_forest(trees = 100), "classification"),
    "randomForest"
  ),
  Species ~ .,
  data = iris_training_baked
)

Iris Predictions

The predict() function run against a parsnip model returns a prediction tibble. By default, the prediction variable is called .pred_class.

# Predicted class next to the baked test columns.
glimpse(
  dplyr::bind_cols(
    predict(object = iris_ranger, new_data = iris_testing_baked),
    iris_testing_baked
  )
)
Rows: 60
Columns: 5
$ .pred_class  <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…
$ Sepal.Length <dbl> -0.87694148, -1.10873218, -1.45641823, -0.99283683, -1.68…
$ Sepal.Width  <dbl> 1.04348343, -0.15439296, 0.08518232, 0.80390816, -0.39396…
$ Petal.Width  <dbl> -1.2513140, -1.2513140, -1.2513140, -1.2513140, -1.251314…
$ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…

Iris Classification Model Validation

We use the metrics() function from the yardstick package to evaluate how good the model is.

# Class-prediction metrics for the ranger model on the test set.
yardstick::metrics(
  dplyr::bind_cols(
    predict(iris_ranger, iris_testing_baked),
    iris_testing_baked
  ),
  truth = Species,
  estimate = .pred_class
)

We can also check the metrics for randomForest model:

# Class-prediction metrics for the randomForest model on the test set.
yardstick::metrics(
  dplyr::bind_cols(
    predict(iris_rf, iris_testing_baked),
    iris_testing_baked
  ),
  truth = Species,
  estimate = .pred_class
)

Iris Per-Classifier Metrics

We can use the parameter type = "prob" in the predict() function to obtain a probability score on each prediction. TBD: How is this prob calculated? Possible answer: the Random Forest model outputs its answer by majority voting across n trees. Each of the possible answers (i.e. predictions) for a particular test datum gets a share of the vote, which represents its probability. Hence each datum in the test vector can show a probability for the “winning” answer. (Quite possibly we can get the probabilities for all possible outcomes for each test datum.)

# Per-class probabilities from the ranger model, joined to the test columns.
iris_ranger_probs <- dplyr::bind_cols(
  predict(iris_ranger, iris_testing_baked, type = "prob"),
  iris_testing_baked
)
glimpse(iris_ranger_probs)
Rows: 60
Columns: 7
$ .pred_setosa     <dbl> 0.996250000, 0.931075758, 0.975238095, 0.993535714, 0…
$ .pred_versicolor <dbl> 0.003750000, 0.060696970, 0.023095238, 0.005464286, 0…
$ .pred_virginica  <dbl> 0.000000000, 0.008227273, 0.001666667, 0.001000000, 0…
$ Sepal.Length     <dbl> -0.87694148, -1.10873218, -1.45641823, -0.99283683, -…
$ Sepal.Width      <dbl> 1.04348343, -0.15439296, 0.08518232, 0.80390816, -0.3…
$ Petal.Width      <dbl> -1.2513140, -1.2513140, -1.2513140, -1.2513140, -1.25…
$ Species          <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
# Per-class probabilities from the randomForest model, joined to the test
# columns.
iris_rf_probs <- dplyr::bind_cols(
  predict(iris_rf, iris_testing_baked, type = "prob"),
  iris_testing_baked
)
glimpse(iris_rf_probs)
Rows: 60
Columns: 7
$ .pred_setosa     <dbl> 1.00, 0.96, 0.98, 1.00, 0.93, 1.00, 0.67, 0.72, 1.00,…
$ .pred_versicolor <dbl> 0.00, 0.03, 0.02, 0.00, 0.07, 0.00, 0.27, 0.25, 0.00,…
$ .pred_virginica  <dbl> 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.06, 0.03, 0.00,…
$ Sepal.Length     <dbl> -0.87694148, -1.10873218, -1.45641823, -0.99283683, -…
$ Sepal.Width      <dbl> 1.04348343, -0.15439296, 0.08518232, 0.80390816, -0.3…
$ Petal.Width      <dbl> -1.2513140, -1.2513140, -1.2513140, -1.2513140, -1.25…
$ Species          <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
# Tabulating the probabilities
# Frequency of each distinct predicted probability for versicolor.
ftable(iris_rf_probs$.pred_versicolor)
  0 0.01 0.02 0.03 0.07 0.1 0.13 0.18 0.21 0.23 0.24 0.25 0.27 0.29 0.32 0.35 0.47 0.49 0.59 0.62 0.65 0.7 0.78 0.83 0.87 0.9 0.91 0.92 0.93 0.95 0.96 0.97 0.98 0.99
                                                                                                                                                                     
 13    1    3    2    1   1    3    1    2    1    2    2    2    1    1    1    1    1    1    1    1   1    1    1    1   2    2    1    2    1    1    2    2    1
# Frequency of each distinct predicted probability for virginica.
ftable(iris_rf_probs$.pred_virginica)
  0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 0.19 0.22 0.25 0.38 0.41 0.51 0.53 0.65 0.67 0.73 0.76 0.77 0.79 0.82 0.85 0.93 0.97 0.98  1
                                                                                                                                                 
 14    5    1    4    1    1    2    1    1    2   2    1    1    1    1    1    1    1    1    1    1    2    1    2    4    1    1    1    2  2
# Frequency of each distinct predicted probability for setosa.
ftable(iris_rf_probs$.pred_setosa)
  0 0.01 0.02 0.05 0.07 0.11 0.13 0.16 0.67 0.71 0.72 0.93 0.96 0.98  1
                                                                       
 27    4    2    5    1    1    1    1    1    1    2    1    1    2 10

### Iris Classifier: Gain and ROC Curves

We can plot gain and ROC curves for each of these models


::: {.cell}

```{.r .cell-code}
# Gain-curve data for the ranger model (one set of rows per class level).
glimpse(
  yardstick::gain_curve(
    iris_ranger_probs, Species, .pred_setosa:.pred_virginica
  )
)
Rows: 133
Columns: 5
$ .level          <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "set…
$ .n              <dbl> 0, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19…
$ .n_events       <dbl> 0, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 18…
$ .percent_tested <dbl> 0.000000, 3.333333, 6.666667, 8.333333, 10.000000, 11.…
$ .percent_found  <dbl> 0.00000, 11.11111, 22.22222, 27.77778, 33.33333, 38.88…
# Plot the ranger gain curves.
autoplot(
  yardstick::gain_curve(
    iris_ranger_probs, Species, .pred_setosa:.pred_virginica
  )
)

# ROC-curve data for the ranger model.
glimpse(
  yardstick::roc_curve(
    iris_ranger_probs, Species, .pred_setosa:.pred_virginica
  )
)
Rows: 136
Columns: 4
$ .level      <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "setosa"…
$ .threshold  <dbl> -Inf, 0.000000000, 0.001000000, 0.002250000, 0.004444444, …
$ specificity <dbl> 0.0000000, 0.0000000, 0.2619048, 0.3571429, 0.4523810, 0.5…
$ sensitivity <dbl> 1.0000000, 1.0000000, 1.0000000, 1.0000000, 1.0000000, 1.0…
# Plot the ranger ROC curves.
autoplot(
  yardstick::roc_curve(
    iris_ranger_probs, Species, .pred_setosa:.pred_virginica
  )
)

:::

# Gain-curve data for the randomForest model.
glimpse(
  yardstick::gain_curve(
    iris_rf_probs, Species, .pred_setosa:.pred_virginica
  )
)
Rows: 82
Columns: 5
$ .level          <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "set…
$ .n              <dbl> 0, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 27, 29,…
$ .n_events       <dbl> 0, 10, 12, 13, 14, 16, 17, 18, 18, 18, 18, 18, 18, 18,…
$ .percent_tested <dbl> 0.000000, 16.666667, 20.000000, 21.666667, 23.333333, …
$ .percent_found  <dbl> 0.000000, 55.555556, 66.666667, 72.222222, 77.777778, …
# Plot the randomForest gain curves.
autoplot(
  yardstick::gain_curve(
    iris_rf_probs, Species, .pred_setosa:.pred_virginica
  )
)

# ROC-curve data for the randomForest model.
glimpse(
  yardstick::roc_curve(
    iris_rf_probs, Species, .pred_setosa:.pred_virginica
  )
)
Rows: 85
Columns: 4
$ .level      <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "setosa"…
$ .threshold  <dbl> -Inf, 0.00, 0.01, 0.02, 0.05, 0.07, 0.11, 0.13, 0.16, 0.67…
$ specificity <dbl> 0.0000000, 0.0000000, 0.6428571, 0.7380952, 0.7857143, 0.9…
$ sensitivity <dbl> 1.0000000, 1.0000000, 1.0000000, 1.0000000, 1.0000000, 1.0…
# Plot the randomForest ROC curves.
autoplot(
  yardstick::roc_curve(
    iris_rf_probs, Species, .pred_setosa:.pred_virginica
  )
)

Iris Classifier: Metrics

# Class probabilities, predicted class and true Species side by side for the
# ranger model.
glimpse(
  bind_cols(
    predict(iris_ranger, iris_testing_baked, type = "prob"),
    predict(iris_ranger, iris_testing_baked),
    select(iris_testing_baked, Species)
  )
)
Rows: 60
Columns: 5
$ .pred_setosa     <dbl> 0.996250000, 0.931075758, 0.975238095, 0.993535714, 0…
$ .pred_versicolor <dbl> 0.003750000, 0.060696970, 0.023095238, 0.005464286, 0…
$ .pred_virginica  <dbl> 0.000000000, 0.008227273, 0.001666667, 0.001000000, 0…
$ .pred_class      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
$ Species          <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
# Class and probability metrics for the ranger model.
# The class-probability columns are passed to `metrics()` through `...` as an
# unnamed selection; writing `... = <cols>` is not valid and makes the
# probability metric fail with "Can't select columns that don't exist".
predict(iris_ranger, iris_testing_baked, type = "prob") %>%
  bind_cols(predict(iris_ranger, iris_testing_baked)) %>%
  bind_cols(select(iris_testing_baked, Species)) %>%
  yardstick::metrics(
    truth = Species,
    estimate = .pred_class,
    .pred_setosa:.pred_virginica
  )
Error in `mn_log_loss()`:
! Can't select columns that don't exist.
✖ Columns `...1`, `...2`, and `...3` don't exist.
# And for the `randomForest` method
# Both the probabilities and the predicted class must come from `iris_rf`;
# taking the class from `iris_ranger` would mix the two models.

predict(iris_rf, iris_testing_baked, type = "prob") %>%
  bind_cols(predict(iris_rf, iris_testing_baked)) %>%
  bind_cols(select(iris_testing_baked, Species)) %>%
  glimpse()
Rows: 60
Columns: 5
$ .pred_setosa     <dbl> 1.00, 0.96, 0.98, 1.00, 0.93, 1.00, 0.67, 0.72, 1.00,…
$ .pred_versicolor <dbl> 0.00, 0.03, 0.02, 0.00, 0.07, 0.00, 0.27, 0.25, 0.00,…
$ .pred_virginica  <dbl> 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.06, 0.03, 0.00,…
$ .pred_class      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
$ Species          <fct> setosa, setosa, setosa, setosa, setosa, setosa, setos…
# Class and probability metrics for the randomForest model.
# - The predicted class comes from `iris_rf`, the same model that supplies
#   the probabilities (not from `iris_ranger`).
# - The class-probability columns are passed to `metrics()` through `...` as
#   an unnamed selection; writing `... = <cols>` is not valid and makes the
#   probability metric fail with "Can't select columns that don't exist".
predict(iris_rf, iris_testing_baked, type = "prob") %>%
  bind_cols(predict(iris_rf, iris_testing_baked)) %>%
  bind_cols(select(iris_testing_baked, Species)) %>%
  yardstick::metrics(
    truth = Species,
    estimate = .pred_class,
    .pred_setosa:.pred_virginica
  )
Error in `mn_log_loss()`:
! Can't select columns that don't exist.
✖ Columns `...1`, `...2`, and `...3` don't exist.
Back to top