diff --git a/.DS_Store b/.DS_Store
index c98a046..3aa9b2a 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/DESCRIPTION b/DESCRIPTION
index c96b871..8ef24e5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tidyrules
 Type: Package
 Title: Obtain Rules from Rule Based Models as Tidy Dataframe
-Version: 0.1.4
+Version: 0.1.5
 Authors@R: c(
     person("Srikanth", "Komala Sheshachala", email = "sri.teach@gmail.com", role = c("aut", "cre")),
     person("Amith Kumar", "Ullur Raghavendra", email = "amith54@gmail.com", role = c("aut"))
@@ -22,7 +22,7 @@ Suggests:
     Cubist (>= 0.2.2),
     rpart (>= 1.2.2),
     rpart.plot (>= 3.0.7),
-    rsample (>= 0.0.2),
+    modeldata (>= 0.0.1),
     testthat (>= 2.0.1),
     MASS (>= 7.3.50),
     mlbench (>= 2.1.1),
@@ -35,5 +35,5 @@ BugReports: https://github.com/talegari/tidyrules/issues
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.0.2
+RoxygenNote: 7.1.0
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
index 5872ab5..35b105b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# tidyrules 0.1.5
+
+- Maintenance release (replace package rsample with modeldata)
+
 # tidyrules 0.1.4
 
 - Added rules parsable in python and SQL (default: R)
diff --git a/R/c5.R b/R/c5.R
index 7ffc06e..4888a5e 100644
--- a/R/c5.R
+++ b/R/c5.R
@@ -28,7 +28,7 @@
 #' }
 #'
 #' @examples
-#' data("attrition", package = "rsample")
+#' data("attrition", package = "modeldata")
 #' attrition <- tibble::as_tibble(attrition)
 #' c5_model <- C50::C5.0(Attrition ~., data = attrition, rules = TRUE)
 #' summary(c5_model)
diff --git a/R/cubist.R b/R/cubist.R
index 101f1c5..ff7fba9 100644
--- a/R/cubist.R
+++ b/R/cubist.R
@@ -26,7 +26,7 @@
 #' }
 #'
 #' @examples
-#' data("attrition", package = "rsample")
+#' data("attrition", package = "modeldata")
 #' attrition <- tibble::as_tibble(attrition)
 #' cols_att <- setdiff(colnames(attrition), c("MonthlyIncome", "Attrition"))
 #'
diff --git a/R/varSpec.R b/R/varSpec.R
index bc4a8ad..7709462 100644
--- a/R/varSpec.R
+++ b/R/varSpec.R
@@ -12,7 +12,7 @@
 #' @return A tibble with three columns: variable(character), type(character) and
 #'   levels(a list-column). For numeric variables, levels are set to NA.
 #' @examples
-#' data("attrition", package = "rsample")
+#' data("attrition", package = "modeldata")
 #' attrition <- tibble::as_tibble(attrition)
 #' cols_att <- setdiff(colnames(attrition), c("MonthlyIncome", "Attrition"))
 #'
diff --git a/README.Rmd b/README.Rmd
index d520f45..3583852 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -19,7 +19,7 @@ knitr::opts_chunk$set(
 
 [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/tidyrules)](https://cran.r-project.org/package=tidyrules)
 
-`tidyrules` converts texual rules from models to dataframes with parseable rules. Supported models are: `C5`, `cubist` and `rpart`.
+`tidyrules` converts textual rules from models to dataframes with parseable rules. Supported models are: `C5`, `cubist` and `rpart`.
 
 ## Example
diff --git a/docs/404.html b/docs/404.html
index a1c3137..ec0611c 100644
--- a/docs/404.html
+++ b/docs/404.html
vignettes/tidyrules_vignette.Rmd
library("tidyrules")
-library("dplyr")
-library("C50")
-library("pander")
-
-# build model
-c5_model <- C5.0(Species ~ ., data = iris, rules = TRUE)
-
-# extract rules in a tidy tibble
-tidy_rules <- tidyRules(c5_model)
-
-# View tidy_rules
-tidy_rules %>%
- select(-c(rule_number,trial_number)) %>%
- pandoc.table()
library("tidyrules") +library("dplyr") +library("C50") +library("pander") + +# build model +c5_model <- C5.0(Species ~ ., data = iris, rules = TRUE) + +# extract rules in a tidy tibble +tidy_rules <- tidyRules(c5_model) + +# View tidy_rules +tidy_rules %>% + select(-c(rule_number,trial_number)) %>% + pandoc.table()
## 
## -----------------------------------------------------------------------
##  id            LHS                RHS       support   confidence   lift 
## ---- ------------------------ ------------ --------- ------------ ------
##  1     Petal.Length <= 1.9       setosa       50        0.9808      2.9 
## 
##  2     Petal.Length > 1.9 &    versicolor     48         0.96       2.9 
##       Petal.Length <= 4.9 &                                             
##        Petal.Width <= 1.7                                               
## 
##  3      Petal.Width > 1.7      virginica      46        0.9583      2.9 
## 
##  4      Petal.Length > 4.9     virginica      46        0.9375      2.8 
## -----------------------------------------------------------------------
Filter rules based on RHS or support or confidence or lift:
# Example 1, filter rules based on support
tidy_rules %>% 
  filter(support >= 48) %>% 
  select(LHS, RHS)
## # A tibble: 2 x 2
## LHS RHS
## <chr> <chr>
## 1 Petal.Length <= 1.9 setosa
## 2 Petal.Length > 1.9 & Petal.Length <= 4.9 & Petal.Width <= 1.7 versicolor
# Example 2, filter rules based on RHS
tidy_rules %>% 
  filter(RHS == "virginica") %>% 
  select(LHS, support, confidence, lift)
## # A tibble: 2 x 4
## LHS support confidence lift
## <chr> <int> <dbl> <dbl>
## 1 Petal.Width > 1.7 46 0.958 2.9
## 2 Petal.Length > 4.9 46 0.938 2.8
Use a tidyrule in a filter() function:
iris %>% 
  filter(eval(parse(text = tidy_rules[3, "LHS"]))) %>% # filter using a C5 rule
  count(Species)
## # A tibble: 2 x 2
## Species n
## <fct> <int>
Example: Classification using C5.0
In this example we use the attrition data from the modeldata package. This illustration shows how to extract rules from a C5.0 model and apply filter() on the data using tidyrules.
# loading packages
library("tidyrules")
library("C50")
library("dplyr")

# attrition data load
data("attrition", package = "modeldata")
attrition <- as_tibble(attrition)

glimpse(attrition)
## Rows: 1,470
## Columns: 31
## $ Age <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, …
## $ Attrition <fct> Yes, No, Yes, No, No, No, No, No, No, No, No…
## $ BusinessTravel <fct> Travel_Rarely, Travel_Frequently, Travel_Rar…
## $ YearsSinceLastPromotion <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0,…
## $ YearsWithCurrManager <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3,…
As you can see, there are 31 variables and 1,470 observations in this data set. Our aim is to predict Attrition using the rest of the variables. Let us build a C5.0 model first.
# our C5 model
c5_att <- C5.0(Attrition ~ ., data = attrition, rules = TRUE)

# sample rules from C5
c5_att$output %>% 
  stringr::str_sub(start = 194L, end = 578L) %>% 
  writeLines()
##
## Rule 1: (521/30, lift 1.1)
## EnvironmentSatisfaction in [Medium-Very_High]
## -> class No [0.864]
We get nice, human readable rules. The problem with the C5.0 summary is that you can only read it to get a feel for how the predictions are made from the rules. The hard part comes when you want to explore your data further and dig deeper: say you want to know which rules have high lift and confidence, or which rules cover a major sub-population. If your model produces too many rules, going through each and every rule in the summary to identify the best ones is tedious.
What if we had all the rules in a tidy table format, so that we could easily use them on the data? Let’s get that done using tidyRules.
# Extract rules to a tidy tibble
tr_att <- tidyRules(c5_att)

tr_att
## # A tibble: 24 x 8
## id LHS RHS support confidence lift rule_number trial_number
## <int> <chr> <chr> <int> <dbl> <dbl> <int> <int>
lift: The result of dividing the rule’s estimated accuracy by the relative frequency of the predicted class in the training set.
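As a quick sanity check of this definition (a sketch added here, not part of the original vignette), the lift of the first iris rule shown earlier can be recomputed from its confidence and the class frequency of setosa:
# lift = rule's estimated accuracy / relative frequency of the predicted class
rule_confidence <- 0.9808          # confidence of Rule 1 (setosa) from the table above
class_frequency <- 50 / 150        # setosa makes up a third of the iris training data
rule_confidence / class_frequency  # matches the reported lift of 2.9
## [1] 2.9424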
Let’s have a look at the first five rules:
tr_att %>% 
  head(5) %>% 
  select(LHS, RHS) %>% 
  pandoc.table(split.cells = 60)
##
## -------------------------------------------------------------------
## LHS RHS
## OverTime == 'Yes' & TotalWorkingYears > 2
## -------------------------------------------------------------------
Now, all the rules are in a tibble (a tidy form of a dataframe). Let us look at the rules which favor Attrition equal to “No” and arrange them by support, as sketched below.
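The code chunk for this step is not preserved on this page; a minimal sketch using dplyr verbs on tr_att (the head(3) matches the three rows in the output below, but the original chunk may have differed):
# rules predicting "No", ordered by support (sketch)
tr_att %>% 
  filter(RHS == "No") %>% 
  arrange(desc(support)) %>% 
  head(3)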
## # A tibble: 3 x 8
## id LHS RHS support confidence lift rule_number trial_number
## <int> <chr> <chr> <int> <dbl> <dbl> <int> <int>
Use rules inside the filter() function.
Let’s use a rule within filter(). Say one needs to pick the rule which has the largest support for predicted Attrition “Yes”.
# filter a rule with conditions
large_support_rule <- tr_att %>% 
  filter(RHS == "Yes") %>% 
  top_n(1, wt = support) %>% 
  pull(LHS)

# parseable rule
parseable_rule <- parse(text = large_support_rule)

# apply filter on data frame using parseable rule
attrition %>% 
  filter(eval(parseable_rule))
## # A tibble: 57 x 31
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome
## <int> <fct> <fct> <int> <fct> <int>
Rules parsable by python and SQL
tr_att_python <- tidyRules(c5_att, language = "python")
tr_att_sql <- tidyRules(c5_att, language = "sql")

head(tr_att_python$LHS)
## [1] "EnvironmentSatisfaction in ['Medium', 'High', 'Very_High'] and JobInvolvement in ['Medium', 'High', 'Very_High'] and OverTime == 'No' and TrainingTimesLastYear > 1 and WorkLifeBalance in ['Better', 'Best']"
## [2] "JobRole == 'Research_Scientist' and OverTime == 'No'"
## [3] "TotalWorkingYears > 2"
## [4] "JobLevel <= 1 and MonthlyIncome <= 2468 and OverTime == 'Yes' and TotalWorkingYears > 2 and YearsAtCompany <= 3"
## [5] "DailyRate <= 722 and JobLevel <= 1 and MonthlyIncome <= 2468 and OverTime == 'Yes' and TotalWorkingYears > 2"
## [6] "EnvironmentSatisfaction in ['Low', 'Medium'] and MaritalStatus in ['Divorced', 'Married'] and NumCompaniesWorked > 4 and OverTime == 'Yes' and PerformanceRating == 'Excellent' and RelationshipSatisfaction in ['Low', 'Medium', 'High']"
head(tr_att_sql$LHS)
## [1] "EnvironmentSatisfaction IN ('Medium', 'High', 'Very_High') AND JobInvolvement IN ('Medium', 'High', 'Very_High') AND OverTime = 'No' AND TrainingTimesLastYear > 1 AND WorkLifeBalance IN ('Better', 'Best')"
## [2] "JobRole = 'Research_Scientist' AND OverTime = 'No'"
## [3] "TotalWorkingYears > 2"
Example: Classification using rpart
In this example we will be using the BreastCancer data from the mlbench package.
-library("tidyrules")
-library("dplyr")
-library("rpart")
-# BreastCancer
-data(BreastCancer, package = "mlbench")
-bc_train <- BreastCancer %>%
- select(-Id) %>%
- mutate_if(is.ordered, function(x) x <- factor(x,ordered = F))
-
-rpart_bc <- rpart(Class ~ ., data = bc_train)
+library("tidyrules")
+library("dplyr")
+library("rpart")
+# BreastCancer
+data(BreastCancer, package = "mlbench")
+bc_train <- BreastCancer %>%
+ select(-Id) %>%
+ mutate_if(is.ordered, function(x) x <- factor(x,ordered = F))
+
+rpart_bc <- rpart(Class ~ ., data = bc_train)
NOTE: Do not forget to convert all ordered features to factor type before training the model.
One could visualize the rpart decision tree using the prp function from the rpart.plot package, as sketched below.
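A minimal sketch of that plotting call (the original chunk and the rendered tree image are not preserved on this page):
# visualise the fitted rpart tree
library("rpart.plot")
prp(rpart_bc)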
Such a tree visual is nice for getting a hang of how the decision tree splits at each node. But if you want to pick out a terminal node, it is tedious, since one has to enter the corresponding filter manually (imagine a situation where you have hundreds of features and a huge tree!). To get rid of this problem, one could use tidyrules to make life easier.
Let’s extract rules from the rpart object and use those rules to extract the terminal nodes.
# tidyrule extract
rules_bc <- tidyRules(rpart_bc)

rules_bc
## # A tibble: 7 x 6
## id LHS RHS support confidence lift
## <int> <chr> <chr> <int> <dbl> <dbl>
## 5 5 Cell.size %in% c('3', '4', '5', '6', '7… benign 15 0.647 0.988
## 6 6 Cell.size %in% c('3', '4', '5', '6', '7… malig… 61 0.841 2.44
## 7 7 Cell.size %in% c('3', '4', '5', '6', '7… malig… 171 0.971 2.82
# filter the data using a rule
bc_train %>% 
  filter(eval(parse(text = rules_bc[5, "LHS"]))) %>% 
  as_tibble()
## # A tibble: 15 x 10
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## <fct> <fct> <fct> <fct> <fct> <fct>
Example: Regression using Cubist
In this example, rule extraction from a regression model (a cubist model) is illustrated. We will be using the AmesHousing dataset.
-library("tidyrules")
-library("dplyr")
-library("Cubist")
-# ames housing data set
-ames <- AmesHousing::make_ames()
-cubist_ames <- cubist(x = ames[, setdiff(colnames(ames), c("Sale_Price"))],
- y = log10(ames[["Sale_Price"]]),
- committees = 3
- )
-
-# rule extract
-rules_ames <- tidyRules(cubist_ames)
-
-rules_ames
+library("tidyrules")
+library("dplyr")
+library("Cubist")
+# ames housing data set
+ames <- AmesHousing::make_ames()
+cubist_ames <- cubist(x = ames[, setdiff(colnames(ames), c("Sale_Price"))],
+ y = log10(ames[["Sale_Price"]]),
+ committees = 3
+ )
+
+# rule extract
+rules_ames <- tidyRules(cubist_ames)
+
+rules_ames
## # A tibble: 43 x 9
## id LHS RHS support mean min max error committee
## <int> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <int>
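Because each cubist rule carries support and mean columns, one can cross-check a rule against the training data. A small sketch (this check is an addition, not part of the original vignette):
# apply the first cubist rule and compare with its reported support and mean
ames %>% 
  filter(eval(parse(text = rules_ames$LHS[1]))) %>% 
  summarise(n = n(), mean_log10_price = mean(log10(Sale_Price)))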
library(tidyrules)

model_c5 = C50::C5.0(Species ~ ., data = iris, rules = TRUE)
summary(model_c5)
#> 
#> Call:
#> C5.0.formula(formula = Species ~ ., data = iris, rules = TRUE)
#> 
#> 
#> C5.0 [Release 2.07 GPL Edition]    Tue Dec 10 14:47:18 2019
#> -------------------------------
#> 
#> Class specified by attribute `outcome'
#> 
#> Read 150 cases (5 attributes) from undefined.data
#> 
#> Rules:
#> 
#> Rule 1: (50, lift 2.9)
#>   Petal.Length <= 1.9
#>   ->  class setosa  [0.981]
#> 
#> Rule 2: (48/1, lift 2.9)
#>   Petal.Length > 1.9
#>   Petal.Length <= 4.9
#>   Petal.Width <= 1.7
#>   ->  class versicolor  [0.960]
#> 
#> Rule 3: (46/1, lift 2.9)
#>   Petal.Width > 1.7
#>   ->  class virginica  [0.958]
#> 
#> Rule 4: (46/2, lift 2.8)
#>   Petal.Length > 4.9
#>   ->  class virginica  [0.938]
#> 
#> Default class: setosa
#> 
#> 
#> Evaluation on training data (150 cases):
#> 
#>           Rules
#>     ----------------
#>       No      Errors
#> 
#>        4    4( 2.7%)   <<
#> 
#> 
#>      (a)   (b)   (c)    <-classified as
#>     ----  ----  ----
#>       50               (a): class setosa
#>             47     3   (b): class versicolor
#>              1    49   (c): class virginica
#> 
#> 
#>   Attribute usage:
#> 
#>    96.00% Petal.Length
#>    62.67% Petal.Width
#> 
#> 
#> Time: 0.0 secs
Tidy the rules:
pander::pandoc.table(tidyRules(model_c5), split.tables = 120)
#> 
#> ----------------------------------------------------------------------------------------------------
#>  id            LHS                RHS       support   confidence   lift   rule_number   trial_number 
#> ---- ----------------------- ------------ --------- ------------ ------ ------------- --------------
#>  1     Petal.Length <= 1.9       setosa       50        0.9808     2.9         1             1       
#> 
#>  2     Petal.Length > 1.9 &    versicolor     48         0.96      2.9         2             1       
#>       Petal.Length <= 4.9 &                                                                          
#>        Petal.Width <= 1.7                                                                            
#> 
#>  3      Petal.Width > 1.7      virginica      46        0.9583     2.9         3             1       
#> 
#>  4      Petal.Length > 4.9     virginica      46        0.9375     2.8         4             1       
#> ----------------------------------------------------------------------------------------------------
You can install the released version of tidyrules from CRAN with:
install.packages("tidyrules")
And the development version from GitHub with:
# install.packages("devtools")
devtools::install_github("talegari/tidyrules")
#> [1] "ab" "`a b`"#> [1] "ab" "`a b`"# }
Maintainer: Srikanth Komala Sheshachala sri.teach@gmail.com
Authors: Amith Kumar Ullur Raghavendra amith54@gmail.com
#> [1] "abc" "d"#> [1] "abc" "d"# }
#> [1] "st" "st"#> [1] "strin" "string"#> [1] "st" "st"#> [1] "strin" "string"# }
#> [1] "cabd"#> [1] "cabd"# }
#> [1] "abc" "d"# } +tidyrules:::strSplitSingle("abc,d", ",")#> [1] "abc" "d"# }
#> [1] "ng" "g2"#> [1] "tring" "tring2"#> [1] "ng" "g2"#> [1] "tring" "tring2"# }
data("attrition", package = "rsample") +- @@ -428,7 +424,7 @@data("attrition", package = "modeldata") attrition <- tibble::as_tibble(attrition) c5_model <- C50::C5.0(Attrition ~., data = attrition, rules = TRUE) summary(c5_model)#> @@ -172,7 +175,7 @@Examp #> C5.0.formula(formula = Attrition ~ ., data = attrition, rules = TRUE) #> #> -#> C5.0 [Release 2.07 GPL Edition] Mon Mar 9 14:11:47 2020 +#> C5.0 [Release 2.07 GPL Edition] Thu Jun 4 17:35:29 2020 #> ------------------------------- #> #> Class specified by attribute `outcome' @@ -407,17 +410,10 @@
Examp #> 10 10 Department == … Yes 13 0.867 5.4 10 1 #> # … with 14 more rows
Author
data("attrition", package = "rsample") +- @@ -204,7 +200,7 @@data("attrition", package = "modeldata") attrition <- tibble::as_tibble(attrition) cols_att <- setdiff(colnames(attrition), c("MonthlyIncome", "Attrition")) @@ -183,17 +186,10 @@Examp #> 7 7 JobLevel <= 4… (4166.4) + (3… 87 15824 12061 17924 694. 1 #> 8 8 JobLevel > 4 (13633) + (10… 69 19192. 18041 19999 416 1
Author
tidyRules supports these rule based models: C5, Cubist and rpart.
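tidyRules dispatches on the class of the fitted model object; a small sketch using the objects from the examples above (the comments map classes to the corresponding methods as an illustration):
# tidyRules() picks the appropriate method from the model class
class(model_c5)    # "C5.0"   -> C5 method
class(rpart_bc)    # "rpart"  -> rpart method
class(cubist_ames) # "cubist" -> cubist method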