diff --git a/.DS_Store b/.DS_Store
index c2579a6..fbc356d 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.Rbuildignore b/.Rbuildignore
index e840dee..c75e4c9 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -6,3 +6,4 @@
docs
^README\.Rmd$
^\.github$
+^vignettes/articles$
diff --git a/DESCRIPTION b/DESCRIPTION
index 7f38cc5..f3623e9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
Package: tidyrules
Type: Package
Title: Utilities to Retrieve Rulelists from Model Fits, Filter, Prune, Reorder and Predict on unseen data
-Version: 0.2.6
+Version: 0.2.7
Authors@R: c(
person("Srikanth", "Komala Sheshachala", email = "sri.teach@gmail.com", role = c("aut", "cre")),
person("Amith Kumar", "Ullur Raghavendra", email = "amith54@gmail.com", role = c("aut"))
@@ -24,6 +24,7 @@ Imports:
glue (>= 1.7.0),
pheatmap (>= 1.0.12),
proxy (>= 0.4.27),
+ tibble (>= 3.2.1),
Suggests:
AmesHousing (>= 0.0.3),
dplyr (>= 0.8),
@@ -35,16 +36,14 @@ Suggests:
testthat (>= 2.0.1),
MASS (>= 7.3.50),
mlbench (>= 2.1.1),
- knitr (>= 1.23),
rmarkdown (>= 1.13),
palmerpenguins (>= 0.1.1),
Description: Provides a framework to work with decision rules. Rules can be extracted from supported models, augmented with (custom) metrics using validation data, manipulated using standard dataframe operations, reordered and pruned based on a metric, predict on unseen (test) data. Utilities include; Creating a rulelist manually, Exporting a rulelist as a SQL case statement and so on. The package offers two classes; rulelist and ruleset based on dataframe.
-URL: https://github.com/talegari/tidyrules
+URL: https://github.com/talegari/tidyrules, https://talegari.github.io/tidyrules/
BugReports: https://github.com/talegari/tidyrules/issues
License: GPL-3
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
-VignetteBuilder: knitr
Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
index b39b6d1..93f001d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -40,6 +40,7 @@ importFrom(magrittr,"%>%")
importFrom(rlang,"%||%")
importFrom(stats,IQR)
importFrom(stats,predict)
+importFrom(stats,reorder)
importFrom(stats,runif)
importFrom(stats,weighted.mean)
importFrom(tidytable,across)
@@ -64,6 +65,7 @@ importFrom(tidytable,select)
importFrom(tidytable,slice)
importFrom(tidytable,summarise)
importFrom(tidytable,unnest)
+importFrom(utils,capture.output)
importFrom(utils,data)
importFrom(utils,head)
importFrom(utils,tail)
diff --git a/NEWS.md b/NEWS.md
index 35b105b..582981b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# tidyrules 0.2.7
+
+- Major rewrite of tidyrules
+ - rulelist class introduced with many methods, mainly `predict`
+ - breaking change: `tidyRules` function no longer exists!
+  - Support added for `party` models
+
# tidyrules 0.1.5
- Maintenance release (replace package rsample with modeldata)
diff --git a/R/dev_mindmap.R b/R/dev_mindmap.R
deleted file mode 100644
index e710b6f..0000000
--- a/R/dev_mindmap.R
+++ /dev/null
@@ -1,44 +0,0 @@
-################################################################################
-# This is the part of the 'tidyrules' R package hosted at
-# https://github.com/talegari/tidyrules with GPL-3 license.
-################################################################################
-
-# Structure
-#
-# Model/fitted object to rules should happens via 'tidy' call
-# We get the generic from generics::tidy
-# Rules object will be one among: ruleset/rulelist.
-# This is a wrapper over tidytable/dataframe.
-#
-# Methods for rulelist/set: print, predict, augment
-# At high level, predict returns the rule_nbr for a row_nbr in new_data
-# At high level, augment (TODO) returns some metrics on new_data as new column
-#
-# Models:
-#
-# C5
-# - (rulelist when fitted with rules = TRUE) -- implemented
-# - (ruleset when fitted with rules = FALSE) -- NOT implemented
-#
-# rpart
-# - (ruleset with classification aka class) -- implemented
-# - (ruleset with regression aka anova) -- implemented
-# - (ruleset with poisson) -- NOT implemented
-# - (ruleset with survival) -- NOT implemented
-# - (ruleset with exp) -- NOT implemented
-# - (ruleset with used defined split) -- NOT implemented
-#
-# party
-# - (ruleset with classification) -- NOT implemented
-# - (ruleset with regression) -- NOT implemented
-# - (ruleset with survival) -- NOT implemented
-# - (ruleset with used defined split) -- NOT implemented
-#
-# cubist
-# - (ruleset with regression) -- implemented
-#
-# ranger
-# - (rulelist) -- NOT implemented
-#
-# sirus
-# - (ruleset ??) -- NOT implemented
\ No newline at end of file
diff --git a/R/package.R b/R/package.R
index 2ba6858..caacc5e 100644
--- a/R/package.R
+++ b/R/package.R
@@ -51,6 +51,7 @@
#' @importFrom stats runif
#' @importFrom utils head
#' @importFrom utils tail
+#' @importFrom utils capture.output
#'
"_PACKAGE"
diff --git a/R/rulelist.R b/R/rulelist.R
index 141f203..02486e8 100644
--- a/R/rulelist.R
+++ b/R/rulelist.R
@@ -343,8 +343,12 @@ set_validation_data = function(x, validation_data, y_name, weight = 1){
res = rlang::duplicate(x)
- checkmate::assert_data_frame(validation_data)
- attr(res, "validation_data") = data.table::as.data.table(validation_data)
+ checkmate::assert_data_frame(validation_data, null.ok = TRUE)
+ if (!is.null(validation_data)) {
+ attr(res, "validation_data") =
+ data.table::as.data.table(validation_data)
+ }
+
attr(res, "y_name") = y_name
attr(res, "weight") = weight
@@ -376,48 +380,86 @@ print.rulelist = function(x, banner = TRUE, ...){
model_type = attr(rulelist, "model_type")
validation_data = attr(rulelist, "validation_data")
+ text = character(0)
if (banner) {
- cli::cli_rule(left = "Rulelist")
- cli::cli_text("")
+ text = c(text, "---- Rulelist --------------------------------")
}
if (is.null(keys)) {
- cli::cli_alert_info("{.emph Keys}: {.strong NULL}")
+ text = c(text,
+ paste(cli::symbol$play,
+ "Keys: NULL"
+ )
+ )
} else {
- cli::cli_alert_info("{.emph keys}: {.val {keys}}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Keys: {keys}")
+ )
+ )
n_combo = nrow(distinct(select(x, all_of(keys))))
- cli::cli_alert_info("{.emph Number of distinct keys}: {.val {n_combo}}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Number of distinct keys: {n_combo}")
+ )
+ )
}
- cli::cli_alert_info("{.emph Number of rules}: {.val {nrow(x)}}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Number of rules: {nrow(x)}")
+ )
+ )
if (is.null(model_type)){
- cli::cli_alert_info("{.emph Model type}: {.strong NULL}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Model Type: NULL")
+ )
+ )
} else {
- cli::cli_alert_info("{.emph Model type}: {.val {model_type}}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Model type: {model_type}")
+ )
+ )
}
- if (is.null(estimation_type)){
- cli::cli_alert_info("{.emph Estimation type}: {.strong NULL}")
+ if (is.null(estimation_type)) {
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Estimation type: NULL")
+ )
+ )
} else {
- cli::cli_alert_info("{.emph Estimation type}: {.val {estimation_type}}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Estimation type: {estimation_type}")
+ )
+ )
}
- if (is.null(validation_data)){
- cli::cli_alert_warning("{.emph Is validation data set}: {.strong FALSE}")
+ if (is.null(validation_data)) {
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Is validation data set: FALSE")
+ )
+ )
} else {
- cli::cli_alert_success("{.emph Is validation data set}: {.strong TRUE}")
+ text = c(text,
+ paste(cli::symbol$play,
+ stringr::str_glue("Is validation data set: TRUE")
+ )
+ )
}
- cli::cli_text("")
-
- class(rulelist) = setdiff(class(rulelist), "rulelist")
- # now 'rulelist' is a dataframe and not a 'rulelist'
- print(rulelist, ...)
+ print_output = capture.output(print(tibble::as_tibble(x), ...), file = NULL)
+ text = c(text, "\n", utils::tail(print_output, -1))
if (banner) {
- cli::cli_rule()
+ text = c(text, "----------------------------------------------")
}
+ cat(paste(text, collapse = "\n"))
return(invisible(x))
}
@@ -592,7 +634,7 @@ predict_all_rulelist = function(rulelist, new_data){
res =
rulelist %>%
as.data.frame() %>%
- nest(data__ = tidytable::everything(), .by = keys) %>%
+ nest(data__ = tidytable::everything(), .by = all_of(keys)) %>%
mutate(rn_df__ =
purrr::map(data__,
~ predict_all_nokeys_rulelist(.x, new_data)
@@ -603,7 +645,7 @@ predict_all_rulelist = function(rulelist, new_data){
drop_na(row_nbr) %>%
select(all_of(c("row_nbr", keys, "rule_nbr"))) %>%
arrange(!!!rlang::syms(c("row_nbr", keys, "rule_nbr"))) %>%
- nest(.by = c("row_nbr", keys), .key = "rule_nbr") %>%
+ nest(.by = all_of(c("row_nbr", keys)), .key = "rule_nbr") %>%
mutate(rule_nbr = purrr::map(rule_nbr, ~ .x[[1]]))
}
@@ -685,7 +727,7 @@ predict_rulelist = function(rulelist, new_data){
res =
rulelist %>%
as.data.frame() %>%
- nest(data__ = tidytable::everything(), .by = keys) %>%
+ nest(data__ = tidytable::everything(), .by = all_of(keys)) %>%
mutate(rn_df__ =
purrr::map(data__, ~ predict_nokeys_rulelist(.x, new_data))
) %>%
@@ -1759,10 +1801,9 @@ plot.prune_rulelist = function(x, ...) {
#' @seealso [rulelist], [tidy], [augment][augment.rulelist],
#' [predict][predict.rulelist], [calculate][calculate.rulelist],
#' [prune][prune.rulelist], [reorder][reorder.rulelist]
+#' @importFrom stats reorder
#' @export
-reorder = function(x, ...){
- UseMethod("reorder", x)
-}
+stats::reorder
#' @name reorder.rulelist
#' @title Reorder the rules/rows of a [rulelist]
@@ -1891,7 +1932,7 @@ reorder.rulelist = function(x,
rule_metrics = purrr::map_dfr(splitted, wrapper_metric_fun)
ord = do.call(base::order,
c(rule_metrics,
- list(decreasing = minimize)
+ list(decreasing = !minimize)
)
)
pos = which(ord == 1)
diff --git a/R/ruleset.R b/R/ruleset.R
index c5f2097..cbfa73b 100644
--- a/R/ruleset.R
+++ b/R/ruleset.R
@@ -53,18 +53,22 @@ print.ruleset = function(x, banner = TRUE, ...){
ruleset = rlang::duplicate(x)
+ text = character(0)
if (banner) {
- cli::cli_rule(left = "Ruleset")
- cli::cli_text("")
+ text = c(text, "---- Ruleset -------------------------------")
}
class(ruleset) = setdiff(class(ruleset), "ruleset")
- # now 'ruleset' is a rulelist
- print(ruleset, banner = FALSE, ...)
+ text = c(text,
+ capture.output(print(ruleset, banner = FALSE, ...),
+ file = NULL
+ )
+ )
if (banner) {
- cli::cli_rule()
+ text = c(text, "--------------------------------------------")
}
+ cat(paste(text, collapse = "\n"))
return(invisible(x))
}
diff --git a/R/utils.R b/R/utils.R
index f48474b..3dc0919 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -312,12 +312,14 @@ convert_rule_flavor = function(rule, flavor){
} else if (flavor == "sql"){
res =
rule %>%
+ stringr::str_replace_all("==", "=") %>%
+
stringr::str_replace_all("\\( ", "") %>%
stringr::str_replace_all(" \\)", "") %>%
stringr::str_replace_all("%in%", "IN") %>%
- stringr::str_replace_all("c\\(", "[") %>%
- stringr::str_replace_all("\\)", "]") %>%
+ stringr::str_replace_all("c\\(", "(") %>%
+ stringr::str_replace_all("\\)", ")") %>%
stringr::str_replace_all("&", " ) AND (") %>%
diff --git a/README.Rmd b/README.Rmd
index 9e80aca..a04421d 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -11,6 +11,7 @@ knitr::opts_chunk$set(
fig.path = "man/figures/README-",
out.width = "100%"
)
+devtools::load_all() #todo
```
# tidyrules
@@ -20,9 +21,13 @@ knitr::opts_chunk$set(
[![R-CMD-check](https://github.com/talegari/tidyrules/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/talegari/tidyrules/actions/workflows/R-CMD-check.yaml)
-`tidyrules` converts textual rules from models to dataframes with parseable rules. Supported models are: `C5`, `cubist` and `rpart`.
+> [tidyrules](https://cran.r-project.org/package=tidyrules) [R](https://www.r-project.org/) [package](https://cran.r-project.org/) provides a framework to work with decision rules. Rules can be extracted from supported models, augmented with (custom) metrics using validation data, manipulated using standard dataframe operations, reordered and pruned based on a metric, predict on unseen (test) data. Utilities include; Creating a rulelist manually, Exporting a rulelist as a SQL case statement and so on. The package offers two classes; rulelist and ruleset based on dataframe.
+
+![](man/figures/tidyrules_schematic.png)
## Example
+expand/collapse
```{r example}
library(tidyrules)
@@ -30,26 +35,25 @@ library(tidyrules)
```{r basic C5 example}
model_c5 = C50::C5.0(Species ~ ., data = iris, rules = TRUE)
-summary(model_c5)
-```
-
-Tidy the rules:
-
-```{r tidyrules}
-pander::pandoc.table(tidyRules(model_c5), split.tables = 120)
+pander::pandoc.table(tidy(model_c5), split.tables = 120)
```
+expand/collapse
You can install the released version of tidyrules from [CRAN](https://CRAN.R-project.org) with:
-``` r
+```{r, eval = FALSE}
install.packages("tidyrules")
```
And the development version from [GitHub](https://github.com/) with:
-``` r
+```{r, eval = FALSE}
# install.packages("devtools")
devtools::install_github("talegari/tidyrules")
```
+
+expand/collapse
+
+
``` r
library(tidyrules)
```
``` r
model_c5 = C50::C5.0(Species ~ ., data = iris, rules = TRUE)
-summary(model_c5)
-#>
-#> Call:
-#> C5.0.formula(formula = Species ~ ., data = iris, rules = TRUE)
-#>
-#>
-#> C5.0 [Release 2.07 GPL Edition] Tue Dec 10 14:47:18 2019
-#> -------------------------------
-#>
-#> Class specified by attribute `outcome'
-#>
-#> Read 150 cases (5 attributes) from undefined.data
-#>
-#> Rules:
-#>
-#> Rule 1: (50, lift 2.9)
-#> Petal.Length <= 1.9
-#> -> class setosa [0.981]
-#>
-#> Rule 2: (48/1, lift 2.9)
-#> Petal.Length > 1.9
-#> Petal.Length <= 4.9
-#> Petal.Width <= 1.7
-#> -> class versicolor [0.960]
-#>
-#> Rule 3: (46/1, lift 2.9)
-#> Petal.Width > 1.7
-#> -> class virginica [0.958]
-#>
-#> Rule 4: (46/2, lift 2.8)
-#> Petal.Length > 4.9
-#> -> class virginica [0.938]
-#>
-#> Default class: setosa
-#>
-#>
-#> Evaluation on training data (150 cases):
+pander::pandoc.table(tidy(model_c5), split.tables = 120)
#>
-#> Rules
-#> ----------------
-#> No Errors
+#> ----------------------------------------------------------------------------------------------
+#> rule_nbr trial_nbr LHS RHS support confidence lift
+#> ---------- ----------- ---------------------------- ------------ --------- ------------ ------
+#> 1 1 ( Petal.Length <= 1.9 ) setosa 50 0.9808 2.9
#>
-#> 4 4( 2.7%) <<
+#> 2 1 ( Petal.Length > 1.9 ) & ( versicolor 48 0.96 2.9
+#> Petal.Length <= 4.9 ) & (
+#> Petal.Width <= 1.7 )
#>
+#> 3 1 ( Petal.Width > 1.7 ) virginica 46 0.9583 2.9
#>
-#> (a) (b) (c) <-classified as
-#> ---- ---- ----
-#> 50 (a): class setosa
-#> 47 3 (b): class versicolor
-#> 1 49 (c): class virginica
-#>
-#>
-#> Attribute usage:
-#>
-#> 96.00% Petal.Length
-#> 62.67% Petal.Width
-#>
-#>
-#> Time: 0.0 secs
+#> 4 1 ( Petal.Length > 4.9 ) virginica 46 0.9375 2.8
+#> ----------------------------------------------------------------------------------------------
```
-Tidy the rules:
-
-``` r
-pander::pandoc.table(tidyRules(model_c5), split.tables = 120)
-#>
-#> ----------------------------------------------------------------------------------------------------
-#> id LHS RHS support confidence lift rule_number trial_number
-#> ---- ----------------------- ------------ --------- ------------ ------ ------------- --------------
-#> 1 Petal.Length <= 1.9 setosa 50 0.9808 2.9 1 1
-#>
-#> 2 Petal.Length > 1.9 & versicolor 48 0.96 2.9 2 1
-#> Petal.Length <= 4.9 &
-#> Petal.Width <= 1.7
-#>
-#> 3 Petal.Width > 1.7 virginica 46 0.9583 2.9 3 1
-#>
-#> 4 Petal.Length > 4.9 virginica 46 0.9375 2.8 4 1
-#> ----------------------------------------------------------------------------------------------------
-```
+
+expand/collapse
+
+
You can install the released version of tidyrules from
[CRAN](https://CRAN.R-project.org) with:
@@ -117,3 +75,5 @@ And the development version from [GitHub](https://github.com/) with:
# install.packages("devtools")
devtools::install_github("talegari/tidyrules")
```
+
+
vignettes/articles/using_tidyrules.Rmd
+ using_tidyrules.Rmd
++tidyrules +R package provides a framework to +work with decision rules. Rules can be extracted from supported models, +augmented with (custom) metrics using validation data, manipulated using +standard dataframe operations, reordered and pruned based on a metric, +predict on unseen (test) data. Utilities include; Creating a rulelist +manually, Exporting a rulelist as a SQL case statement and so on. The +package offers two classes; rulelist and ruleset based on +dataframe.
+
This document provides a working example of a classification problem
+where the functionality of package is showcased. We use
+modeldata::attrition
dataset where Attrition
+column is the binary dependent variable.
+att = modeldata::attrition
+set.seed(1)
+valid_index = sample(c(TRUE, FALSE), nrow(att), replace = TRUE)
+att_train = att[!valid_index, ] # nrow: 742
+att_valid = att[valid_index, ] # nrow: 728
+glimpse(att)
## Rows: 1,470
+## Columns: 31
+## $ Age <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
+## $ Attrition <fct> Yes, No, Yes, No, No, No, No, No, No, No, No,…
+## $ BusinessTravel <fct> Travel_Rarely, Travel_Frequently, Travel_Rare…
+## $ DailyRate <int> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
+## $ Department <fct> Sales, Research_Development, Research_Develop…
+## $ DistanceFromHome <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
+## $ Education <ord> College, Below_College, College, Master, Belo…
+## $ EducationField <fct> Life_Sciences, Life_Sciences, Other, Life_Sci…
+## $ EnvironmentSatisfaction <ord> Medium, High, Very_High, Very_High, Low, Very…
+## $ Gender <fct> Female, Male, Male, Female, Male, Male, Femal…
+## $ HourlyRate <int> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
+## $ JobInvolvement <ord> High, Medium, Medium, High, High, High, Very_…
+## $ JobLevel <int> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
+## $ JobRole <fct> Sales_Executive, Research_Scientist, Laborato…
+## $ JobSatisfaction <ord> Very_High, Medium, High, High, Medium, Very_H…
+## $ MaritalStatus <fct> Single, Married, Single, Married, Married, Si…
+## $ MonthlyIncome <int> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
+## $ MonthlyRate <int> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
+## $ NumCompaniesWorked <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
+## $ OverTime <fct> Yes, No, Yes, Yes, No, No, Yes, No, No, No, N…
+## $ PercentSalaryHike <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
+## $ PerformanceRating <ord> Excellent, Outstanding, Excellent, Excellent,…
+## $ RelationshipSatisfaction <ord> Low, Very_High, Medium, High, Very_High, High…
+## $ StockOptionLevel <int> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
+## $ TotalWorkingYears <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
+## $ TrainingTimesLastYear <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
+## $ WorkLifeBalance <ord> Bad, Better, Better, Better, Better, Good, Go…
+## $ YearsAtCompany <int> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
+## $ YearsInCurrentRole <int> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
+## $ YearsSinceLastPromotion <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
+## $ YearsWithCurrManager <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
+tidy
generic creates rulelist
from a
+supported model fit. rulelist
class is fundamental data
+structure which offers many methods such as predict
,
+augment
and so on. A rulelist
is a dataframe
+with some extra attributes. The order of rows of the dataframe defines
+the order of preference of rules.
tidy
supports these model fits:
C5
rule-based model (classification)rpart
tree (classification / regression)party
tree (classification / regression)cubist
tree (regression)Lets build a C5 model and then extract a rulelist:
+
+model_c5 = C50::C5.0(Attrition ~., data = att_train, rules = TRUE)
+model_c5
##
+## Call:
+## C5.0.formula(formula = Attrition ~ ., data = att_train, rules = TRUE)
+##
+## Rule-Based Model
+## Number of samples: 742
+## Number of predictors: 30
+##
+## Number of Rules: 19
+##
+## Non-standard options: attempt to group attributes
+
+tidy_c5 = tidy(model_c5)
+tidy_c5
+## ---- Rulelist --------------------------------
+## ▶ Keys: trial_nbr
+## ▶ Number of distinct keys: 1
+## ▶ Number of rules: 19
+## ▶ Model type: C5
+## ▶ Estimation type: classification
+## ▶ Is validation data set: FALSE
+##
+##
+## rule_nbr trial_nbr LHS RHS support confidence lift
+## <int> <int> <chr> <fct> <int> <dbl> <dbl>
+## 1 1 1 ( Age > 26 ) & ( Environme… No 189 0.963 1.2
+## 2 2 1 ( Age > 26 ) & ( Environme… No 244 0.951 1.1
+## 3 3 1 ( BusinessTravel == 'Non-T… No 74 0.947 1.1
+## 4 4 1 ( Age <= 31 ) & ( Educatio… Yes 12 0.929 5.4
+## 5 5 1 ( JobSatisfaction %in% c('… No 157 0.924 1.1
+## 6 6 1 ( Age > 26 ) & ( Environme… No 351 0.924 1.1
+## 7 7 1 ( EnvironmentSatisfaction … Yes 8 0.9 5.3
+## 8 8 1 ( OverTime == 'Yes' ) & ( … Yes 8 0.9 5.3
+## 9 9 1 ( BusinessTravel %in% c('T… Yes 8 0.9 5.3
+## 10 10 1 ( EnvironmentSatisfaction … Yes 7 0.889 5.2
+## 11 11 1 ( JobInvolvement == 'Low' … Yes 7 0.889 5.2
+## 12 12 1 ( OverTime == 'No' ) No 516 0.888 1.1
+## 13 13 1 ( EnvironmentSatisfaction … Yes 5 0.857 5
+## 14 14 1 ( MaritalStatus %in% c('Ma… Yes 17 0.842 4.9
+## 15 15 1 ( NumCompaniesWorked > 6 )… Yes 10 0.833 4.9
+## 16 16 1 ( EnvironmentSatisfaction … Yes 8 0.8 4.7
+## 17 17 1 ( Age <= 26 ) & ( Environm… Yes 22 0.75 4.4
+## 18 18 1 ( EnvironmentSatisfaction … Yes 9 0.636 3.7
+## 19 19 1 ( EnvironmentSatisfaction … Yes 28 0.633 3.7
+## ----------------------------------------------
A rulelist is expected to have these mandatory columns:
+rule_nbr
: Something that identifies a rule uniquely per
+keys
. Typically, an integer vector starting from 1.LHS
: A character vector of R-parsable stringsRHS
: factor (for classification), numeric (for
+regression) or character vector of R-parsable strings (to be
+evaluated)trial_nbr
is a key. C5
model builds
+multiple boosting iterations indexed by trial_nbr
(default
+is set to 1). rule_nbr
’s start from 1 for each
+trial_nbr
. In general, keys
columns along with
+rule_nbr
column should be unique.
Attribute estimation_type
is central to further methods
+where metrics get computed. At this moment, the package supports these:
+classification
, regression
.
The rulelist (obtained from C5
model) ordered by
+confidence
column, by default.
A rulelist can be either created using tidy
on a
+supported model or a from a dataframe using
+as_rulelist
.
++☺☺☺ rulelist is simply a dataframe with some attributes. Manipulate +them with standard dataframe operations (
+dplyr
, +data.table
…).tibble::as_tibble
or +as.data.frame
will convert to a tibble/dataframe (with +attributes).as_rulelist
can be used to convert to a +rulelist.
The mainstay of package is the predict
method of the
+rulelist class. predict
provides the first rule (in the
+order as per the rulelist) that is applicable for a observation/row in
+the test data. If a row is not covered by any rule, then
+rule_nbr
is missing.
+predict(tidy_c5, att_valid)
## # A tibble: 728 × 3
+## row_nbr trial_nbr rule_nbr
+## <int> <int> <int>
+## 1 1 1 8
+## 2 2 1 6
+## 3 3 1 6
+## 4 4 1 6
+## 5 5 1 NA
+## 6 6 1 12
+## 7 7 1 12
+## 8 8 1 5
+## 9 9 1 12
+## 10 10 1 1
+## # ℹ 718 more rows
+++☺☺☺ To know all rules applicable for a row, use argument +
+multiple = TRUE
. Alternately,predict
on a +ruleset always yields all rules applicable per row.
+predict(tidy_c5, att_valid, multiple = TRUE)
## # A tibble: 728 × 3
+## row_nbr trial_nbr rule_nbr
+## <int> <int> <list>
+## 1 1 1 <int [2]>
+## 2 2 1 <int [1]>
+## 3 3 1 <int [1]>
+## 4 4 1 <int [2]>
+## 5 5 1 <int [1]>
+## 6 6 1 <int [1]>
+## 7 7 1 <int [1]>
+## 8 8 1 <int [2]>
+## 9 9 1 <int [1]>
+## 10 10 1 <int [3]>
+## # ℹ 718 more rows
+set_validation_data
: Setting (or removing)
+validation data adds a validation data to a rulelist which gets used for
+augment
, calculate
and other methods.
set_keys
: Sets (or removes) keys.
+tidy_c5 =
+ tidy_c5 %>%
+ set_validation_data(att_valid, y_name = "Attrition", weight = 1) %>%
+ set_keys(NULL)
+
+tidy_c5
## ---- Rulelist --------------------------------
+## ▶ Keys: NULL
+## ▶ Number of rules: 19
+## ▶ Model type: C5
+## ▶ Estimation type: classification
+## ▶ Is validation data set: TRUE
+##
+##
+## rule_nbr trial_nbr LHS RHS support confidence lift
+## <int> <int> <chr> <fct> <int> <dbl> <dbl>
+## 1 1 1 ( Age > 26 ) & ( Environme… No 189 0.963 1.2
+## 2 2 1 ( Age > 26 ) & ( Environme… No 244 0.951 1.1
+## 3 3 1 ( BusinessTravel == 'Non-T… No 74 0.947 1.1
+## 4 4 1 ( Age <= 31 ) & ( Educatio… Yes 12 0.929 5.4
+## 5 5 1 ( JobSatisfaction %in% c('… No 157 0.924 1.1
+## 6 6 1 ( Age > 26 ) & ( Environme… No 351 0.924 1.1
+## 7 7 1 ( EnvironmentSatisfaction … Yes 8 0.9 5.3
+## 8 8 1 ( OverTime == 'Yes' ) & ( … Yes 8 0.9 5.3
+## 9 9 1 ( BusinessTravel %in% c('T… Yes 8 0.9 5.3
+## 10 10 1 ( EnvironmentSatisfaction … Yes 7 0.889 5.2
+## 11 11 1 ( JobInvolvement == 'Low' … Yes 7 0.889 5.2
+## 12 12 1 ( OverTime == 'No' ) No 516 0.888 1.1
+## 13 13 1 ( EnvironmentSatisfaction … Yes 5 0.857 5
+## 14 14 1 ( MaritalStatus %in% c('Ma… Yes 17 0.842 4.9
+## 15 15 1 ( NumCompaniesWorked > 6 )… Yes 10 0.833 4.9
+## 16 16 1 ( EnvironmentSatisfaction … Yes 8 0.8 4.7
+## 17 17 1 ( Age <= 26 ) & ( Environm… Yes 22 0.75 4.4
+## 18 18 1 ( EnvironmentSatisfaction … Yes 9 0.636 3.7
+## 19 19 1 ( EnvironmentSatisfaction … Yes 28 0.633 3.7
+## ----------------------------------------------
+++☺☺☺ Setting weight argument (other than 1 which means equal weight) +leads to calculating weighted metrics.
+
augment
adds metrics related to validation data in a new
+column ‘augmented_stats’.
+tidy_c5 %>%
+ augment() %>%
+ tibble::as_tibble() %>%
+ tidytable::unnest(names_sep = "__") %>%
+ glimpse()
## Rows: 19
+## Columns: 10
+## $ rule_nbr <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
+## $ trial_nbr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
+## $ LHS <chr> "( Age > 26 ) & ( EnvironmentSatisfaction …
+## $ RHS <fct> No, No, No, Yes, No, No, Yes, Yes, Yes, Ye…
+## $ support <int> 189, 244, 74, 12, 157, 351, 8, 8, 8, 7, 7,…
+## $ confidence <dbl> 0.9634000, 0.9512000, 0.9474000, 0.9285714…
+## $ lift <dbl> 1.2, 1.1, 1.1, 5.4, 1.1, 1.1, 5.3, 5.3, 5.…
+## $ augmented_stats__support <dbl> 212, 236, 76, 13, 153, 306, 10, 17, 10, 6,…
+## $ augmented_stats__confidence <dbl> 0.89622642, 0.91101695, 0.88157895, 0.4615…
+## $ augmented_stats__lift <dbl> 1.0285265, 1.0455004, 1.0117168, 3.5880893…
+++☺☺☺ If augmented metrics differ from train data metrics, then it +could indicate drift in the data!
+
++☺☺☺
+augment
also supports custom metrics in +dplyr::summarise
syntax!
Plotting a rulelist as a heatmap helps in understanding these +things:
+
+plot(tidy_c5)
++☺☺☺ distance metric for rules is
+jaccard
and distance +metric for row clusters iseuclidean
. Former can be changed +to any distance supported byproxy
package or a custom +distance function for custom insight!
++☺☺☺ When you have a rulelist which is a combination of multiple +classifiers, rule clusters quickly reveal ‘correlated’ rules! The ones +which cover almost same rows, but LHS of each reads different!
+
calculate
computes cumulative metrics (as rules are
+applied in the row order) depending on attribute
+estimation_type
.
+calculate(tidy_c5)
## # A tibble: 19 × 4
+## rule_nbr cumulative_coverage cumulative_overlap cumulative_accuracy
+## <int> <dbl> <dbl> <dbl>
+## 1 1 212 0 0.896
+## 2 2 277 171 0.910
+## 3 3 323 187 0.895
+## 4 4 336 187 0.878
+## 5 5 413 217 0.877
+## 6 6 495 301 0.875
+## 7 7 500 306 0.866
+## 8 8 515 308 0.852
+## 9 9 525 308 0.844
+## 10 10 527 312 0.843
+## 11 11 529 314 0.841
+## 12 12 686 456 0.848
+## 13 13 687 460 0.849
+## 14 14 697 462 0.845
+## 15 15 702 466 0.840
+## 16 16 702 470 0.840
+## 17 17 710 480 0.835
+## 18 18 710 484 0.835
+## 19 19 725 495 0.826
+++☺☺☺
+calculate
allows a custom metric of your choice!
reorder
intends to reorder the order of rules. At the
+moment, the greedy implementation adds one rule at a time to a new
+rulelist (from the input rulelist) such that a metric (see
+calculate
) is maximixed/minimized.
Suppose, you wanted to find a smaller ruleset with least overlap that +would still cover 80% of the validation_data. Then,
+
+reorder(tidy_c5,
+ metric = c("cumulative_overlap",
+ "cumulative_coverage",
+ "cumulative_accuracy"
+ ),
+ minimize = TRUE
+ ) %>%
+ mutate(rel_cum_overlap =
+ cumulative_overlap / max(cumulative_overlap),
+ rel_cum_coverage =
+ cumulative_coverage / max(cumulative_coverage)
+ ) %>%
+ select(rule_nbr, LHS, RHS,
+ rel_cum_overlap, rel_cum_coverage,
+ cumulative_accuracy
+ )
## ⠙ Reordering ... 4 done (1.4/s) | 2.8s
+## ⠹ Reordering ... 5 done (1.3/s) | 3.7s
+## ⠸ Reordering ... 8 done (1.1/s) | 7s
+## ⠼ Reordering ... 11 done (1.1/s) | 10.4s
+## ⠴ Reordering ... 14 done (1/s) | 13.4s
+## ⠦ Reordering ... 18 done (1.1/s) | 15.9s
+## ⠦ Reordering ... 19 done (1.2/s) | 16.1s
+##
+## ---- Rulelist --------------------------------
+## ▶ Keys: NULL
+## ▶ Number of rules: 19
+## ▶ Model type: C5
+## ▶ Estimation type: classification
+## ▶ Is validation data set: TRUE
+##
+##
+## rule_nbr LHS RHS rel_cum_overlap rel_cum_coverage cumulative_accuracy
+## <int> <chr> <fct> <dbl> <dbl> <dbl>
+## 1 16 ( Enviro… Yes 0 0.0124 0.556
+## 2 9 ( Busine… Yes 0 0.0262 0.474
+## 3 8 ( OverTi… Yes 0 0.0497 0.444
+## 4 4 ( Age <=… Yes 0 0.0676 0.449
+## 5 3 ( Busine… No 0.0101 0.166 0.708
+## 6 15 ( NumCom… Yes 0.0141 0.175 0.677
+## 7 14 ( Marita… Yes 0.0222 0.189 0.672
+## 8 13 ( Enviro… Yes 0.0242 0.196 0.655
+## 9 12 ( OverTi… No 0.190 0.819 0.854
+## 10 17 ( Age <=… Yes 0.208 0.832 0.846
+## 11 11 ( JobInv… Yes 0.214 0.833 0.844
+## 12 10 ( Enviro… Yes 0.220 0.836 0.843
+## 13 7 ( Enviro… Yes 0.232 0.836 0.843
+## 14 6 ( Age > … No 0.638 0.942 0.837
+## 15 18 ( Enviro… Yes 0.646 0.943 0.836
+## 16 5 ( JobSat… No 0.780 0.950 0.837
+## 17 2 ( Age > … No 0.927 0.972 0.833
+## 18 19 ( Enviro… Yes 0.949 0.993 0.824
+## 19 1 ( Age > … No 1 1 0.822
+## ----------------------------------------------
+we infer that the first 9 rules (~ 20% overlap) in the reordered rulelist +would suffice, while still ensuring an accuracy of 85%!
+++☺☺☺ In the above code, 2nd metric onwards are used to break ties! +(similar to
+base::order
)
++☺☺☺ Reordering changes the decision bourdaries of your fit! It is a +post-hoc method to overlap the learnt rules to optimize for the +metric you need! But remember, greedy optimization method does guarantee +the global minima (maxima)!
+
++☺☺☺
+reorder
comes with ainit = k
argument +which leaves a predecided top k rules in their order and reorders only +bottom ones. This might be required when policy layer needs to be +incorporated into the rule engine!
prune
suggests k th rule to stop at based on some
+stopping criteria.
Suppose, we seek to find a smaller rulelist with maximum possible +accuracy with a minimum (relative) coverage of 70% and (relative) +overlap not more than half the number of rows. Then,
+
+prune_suggestion =
+ reorder(tidy_c5, "cumulative_accuracy", minimize = FALSE) %>%
+ prune(stop_expr_string = "relative__cumulative_coverage >= 0.7 & cumulative_overlap <= 728/2")
## ⠙ Reordering ... 4 done (1.5/s) | 2.6s
+## ⠹ Reordering ... 7 done (1.2/s) | 5.8s
+## ⠸ Reordering ... 10 done (1.1/s) | 9.2s
+## ⠼ Reordering ... 13 done (1/s) | 12.4s
+## ⠴ Reordering ... 16 done (1.1/s) | 15s
+## ⠴ Reordering ... 19 done (1.2/s) | 16.1s
+##
+
+prune_suggestion
## ── Prune Suggestion ────────────────────────────────────────────────────────────
+## ✔ Keep first 4 out of 19
+##
+## ℹ Metrics after 4 rules:
+##
+## ℹ Run `plot(x)` for details; `x$pruned` to get pruned rulelist
+## ────────────────────────────────────────────────────────────────────────────────
+
+plot(prune_suggestion)
+prune_suggestion$pruned
## ---- Rulelist --------------------------------
+## ▶ Keys: NULL
+## ▶ Number of rules: 4
+## ▶ Model type: C5
+## ▶ Estimation type: classification
+## ▶ Is validation data set: TRUE
+##
+##
+## rule_nbr trial_nbr LHS RHS support confidence lift cumulative_accuracy
+## <int> <int> <chr> <fct> <int> <dbl> <dbl> <dbl>
+## 1 4 1 ( Age <… Yes 12 0.929 5.4 0.462
+## 2 5 1 ( JobSa… No 157 0.924 1.1 0.865
+## 3 6 1 ( Age >… No 351 0.924 1.1 0.886
+## 4 12 1 ( OverT… No 516 0.888 1.1 0.883
+## ----------------------------------------------
+++☺☺☺
+prune
is powerful when combined with +reorder
! Whilereorder
chases a metric, +prune
takes care of constraints! This might lead to small +rulelists, very good for explainability!
Use to_sql_case
to get SQL case when code chunk from a
+rulelist.
+to_sql_case(head(tidy_c5, 5))
CASE
+WHEN (Age > 26)
+ AND (EnvironmentSatisfaction IN ( 'Medium', 'High', 'Very_High' ))
+ AND (PercentSalaryHike <= 17)
+ AND (StockOptionLevel > 0)
+ AND (StockOptionLevel <= 2)
+ AND (TotalWorkingYears > 2) THEN
+ 'No'
+WHEN (Age > 26)
+ AND (EnvironmentSatisfaction IN ( 'Medium', 'High', 'Very_High' ))
+ AND (StockOptionLevel > 0)
+ AND (YearsAtCompany > 3) THEN
+ 'No'
+WHEN (BusinessTravel = 'Non-Travel') THEN
+ 'No'
+WHEN (Age <= 31)
+ AND (EducationField = 'Technical_Degree')
+ AND (StockOptionLevel <= 0) THEN
+ 'Yes'
+WHEN (JobSatisfaction IN ( 'Low', 'Medium', 'High' ))
+ AND (MonthlyIncome > 3210)
+ AND (RelationshipSatisfaction IN ( 'Medium', 'High', 'Very_High' ))
+ AND (TrainingTimesLastYear > 2) THEN
+ 'No'
+ELSE
+ NULL
+END AS output
+tidy
support to more models. Your
+contributions are welcome!✔ For dev and issues, reach us at https://github.com/talegari/tidyrules
+✔ ‘master’ branch always holds the ‘tested’ dev code!
+✔ Get the latest stable version from CRAN!
+Yours truly,
+Amith (ಅಮಿತ್) and Srikanth (ಶ್ರೀಕಾಂತ)
Developed by Srikanth Komala Sheshachala, Amith Kumar Ullur Raghavendra.
+Site built with pkgdown 2.0.9.
+