# import packages
suppressPackageStartupMessages({
  library(dplyr)
  library(caret)
  library(recipes)
  library(randomForest)
})
# load data
diabetes_raw <- MLDataR::diabetes_data

# clean data
df <-
  diabetes_raw %>%
  janitor::clean_names() %>%
  mutate(diabetic_class = as.factor(diabetic_class))
The two main approaches to building machine learning models in R are caret and tidymodels. Having tried both, I struggled to pick a favorite. There are elements of each that made more intuitive sense to me than the other. I think it’s a product of having become very familiar with the tidyverse, particularly dplyr, for data wrangling, while still using a lot of base R functions for statistical modeling.
The process for prepping the data for a machine learning model seems to make a ton of sense to me when done in tidymodels (using recipes and rsample), but the equivalent process using caret felt a little clunky. However, specifying and training models using caret made a lot of sense to my broken brain.
Anyway, I recently discovered something that is probably entirely unremarkable to everyone else, and that probably shouldn’t have taken me by surprise… You can just combine the two! You can split and preprocess your data using the tidymodels framework before defecting to caret for the next steps. What a time to be alive.
Predicting Diabetes Using Random Forest
Because I’m not a savage, I won’t leave you without a simple worked example. We’ll use Gary Hutson’s really useful MLDataR package to grab a toy diabetes dataset, clean the variable names using janitor, and convert the target variable, diabetic_class, to a factor.
Train/Test Splits
Having done this, we can use rsample to split the data into a train and test set.
# set random seed
set.seed(456)
# split train/test data
train_test_split <-
  rsample::initial_split(df,
    strata = diabetic_class,
    prop = 0.7
  )

# create train/test sets
train_df <- rsample::training(train_test_split)
test_df <- rsample::testing(train_test_split)
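If you want to reassure yourself that the stratified split has kept the class balance roughly the same in both sets, a quick check along these lines (just base R, nothing fancy) will do it.
# optional sanity check: the outcome proportions should be similar
prop.table(table(train_df$diabetic_class))
prop.table(table(test_df$diabetic_class))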
Data Preprocessing
The next step is a little more involved, and is where I think tidymodels really excels. Using the recipes package, we can specify all the preprocessing steps needed for the dataset, such that the data will then be ready for training a machine learning model.
# preprocessing
model_recipe <-
  recipe(diabetic_class ~ ., data = train_df) %>%
  # combine low frequency factor levels
  step_other(all_nominal(), threshold = 0.05) %>%
  # remove predictors with zero variance
  step_nzv(all_predictors()) %>%
  # normalize numeric variables (sigma = 1, mu = 0)
  step_normalize(all_numeric()) %>%
  # convert nominal variables to numeric binary variables
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
You can check that all the preprocessing steps are working as expected by using prep() and juice().
# check preprocessing results
model_recipe %>%
  prep() %>%
  juice() %>%
  glimpse()
Rows: 364
Columns: 32
$ age <dbl> -0.67696788, -1.18732268, -1.52755921, 1.61962872, 1.02421478, 0.85409652, 0.51385998, …
$ diabetic_class <fct> Negative, Negative, Negative, Negative, Negative, Negative, Negative, Negative, Negativ…
$ gender_Female <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gender_Male <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ excess_urination_No <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ excess_urination_Yes <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ polydipsia_No <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, …
$ polydipsia_Yes <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ weight_loss_sudden_No <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
$ weight_loss_sudden_Yes <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
$ fatigue_No <dbl> 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
$ fatigue_Yes <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, …
$ polyphagia_No <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, …
$ polyphagia_Yes <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
$ genital_thrush_No <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, …
$ genital_thrush_Yes <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
$ blurred_vision_No <dbl> 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, …
$ blurred_vision_Yes <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, …
$ itching_No <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
$ itching_Yes <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, …
$ irritability_No <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ irritability_Yes <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ delay_healing_No <dbl> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
$ delay_healing_Yes <dbl> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, …
$ partial_psoriasis_No <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, …
$ partial_psoriasis_Yes <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
$ muscle_stiffness_No <dbl> 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, …
$ muscle_stiffness_Yes <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, …
$ alopecia_No <dbl> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
$ alopecia_Yes <dbl> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, …
$ obesity_No <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ obesity_Yes <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
If everything looks alright, you can take the model_recipe object that you’ve created and use it as the model formula that you would otherwise have to specify in the caret train() function.
Model Training
For the rest of the process, you can switch over to caret, first using the trainControl() function to specify the training parameters and then the train() function for the model training.
# set random seed
set.seed(456)
# control parameters for model training
ctrl <-
  trainControl(
    method = "cv",
    number = 5,
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
# train random forest model
rf_mod <-
  train(
    model_recipe,
    data = train_df,
    method = "rf",
    tuneLength = 10,
    metric = "ROC",
    trControl = ctrl,
    importance = TRUE
  )
Having trained the random forest model, you can check the performance, and see what parameters were chosen in the tuning process.
# check results
print(rf_mod)
Random Forest
364 samples
16 predictor
2 classes: 'Negative', 'Positive'
Recipe steps: other, nzv, normalize, dummy
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 292, 291, 291, 291, 291
Resampling results across tuning parameters:
  mtry  ROC        Sens       Spec
   2    0.9947186  0.9500000  0.9372727
  16    0.9873918  0.9428571  0.9281818
  31    0.9850108  0.9428571  0.9235354
ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
Not bad! The best performing model has an ROC AUC of 0.995, with sensitivity around 0.95 and specificity around 0.94. Pretty solid for a quick and easy model.
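Because we passed importance = TRUE through to the underlying random forest, it’s also worth a quick look at which predictors are doing the heavy lifting. Something along these lines, using caret’s varImp(), should do the trick.
# variable importance from the fitted random forest
rf_imp <- varImp(rf_mod)
print(rf_imp)

# plot the ten most important predictors
plot(rf_imp, top = 10)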
Test Predictions
To really test the model’s performance, we want to see how it copes with the test data that it hasn’t seen.
# make predictions on test data
rf_predict <- predict(rf_mod, newdata = test_df, type = "prob")
rf_class <- predict(rf_mod, newdata = test_df, type = "raw")

preds <-
  cbind(rf_predict, rf_class) %>%
  mutate(
    Positive = round(Positive, digits = 2),
    Negative = round(Negative, digits = 2)
  )
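If you want to eyeball what that produces, a quick head() shows the rounded class probabilities alongside the predicted class.
# peek at the first few test-set predictions
head(preds)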
Finally, we can produce a confusion matrix for a more intuitive look at how the model is performing on the test set.
# pull out the true classes from the test set
cm_class <- test_df[, names(test_df) %in% c("diabetic_class")]

confusionMatrix(
  rf_class,
  as.factor(cm_class$diabetic_class),
  positive = "Positive"
)
Confusion Matrix and Statistics
          Reference
Prediction Negative Positive
  Negative       57        1
  Positive        3       95
Accuracy : 0.9744
95% CI : (0.9357, 0.993)
No Information Rate : 0.6154
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9455
Mcnemar's Test P-Value : 0.6171
Sensitivity : 0.9896
Specificity : 0.9500
Pos Pred Value : 0.9694
Neg Pred Value : 0.9828
Prevalence : 0.6154
Detection Rate : 0.6090
Detection Prevalence : 0.6282
Balanced Accuracy : 0.9698
'Positive' Class : Positive
The results are pretty good for a very quick model. How exciting. Let’s pretend that it’s because I’m a brilliant data scientist rather than it being due to the very clean, balanced toy dataset we used.
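If you’d also like a test-set ROC AUC to sit alongside the cross-validated value, yardstick (another tidymodels package, so very much in the spirit of this post) can compute it from the predicted probabilities. Note that event_level = "second" is needed here because "Positive" is the second factor level.
# test-set ROC AUC from the predicted Positive-class probabilities
# (assumes yardstick is installed)
library(yardstick)

roc_auc_vec(
  truth = test_df$diabetic_class,
  estimate = rf_predict$Positive,
  event_level = "second"
)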
Wrapping Up
So there you have it: if you’re in the same position as me, struggling to pick between tidymodels and caret because both frameworks offer something you like, you can just combine the two and make Frankenstein’s framework.
Ultimately, despite this blog post, I’m probably going to stick with tidymodels (why am I like this?). I think that I’m going to force myself to get used to the tidymodels framework end-to-end because a) it is receiving tons of development so it’s probably going to continue to get better and bigger, and will be leading the way for the foreseeable future, and b) because in reality I think the explicit way that you structure each step is probably sensible, even if it confuses me a bit.
But it’s nice to know that I’ve got options.
Acknowledgments
Preview image by Rock’n Roll Monkey on Unsplash.
Support
If you enjoyed this blog post and would like to support my work, you can buy me a coffee or a beer or give me a tip as a thank you.