Skip to content

Commit

Permalink
[R] Fix parsing decision stump. (#7689)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Mar 16, 2022
1 parent e78a38b commit da35162
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 28 deletions.
50 changes: 34 additions & 16 deletions R-package/R/xgb.model.dt.tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
}

if (length(text) < 2 ||
sum(grepl('yes=(\\d+),no=(\\d+)', text)) < 1) {
sum(grepl('leaf=(\\d+)', text)) < 1) {
stop("Non-tree model detected! This function can only be used with tree models.")
}

Expand Down Expand Up @@ -116,16 +116,28 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
td[isLeaf == FALSE,
(branch_cols) := {
matches <- regmatches(t, regexec(branch_rx, t))
# skip some indices with spurious capture groups from anynumber_regex
xtr <- do.call(rbind, matches)[, c(2, 3, 5, 6, 7, 8, 10), drop = FALSE]
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
as.data.table(xtr)
}]
td[
isLeaf == FALSE,
(branch_cols) := {
matches <- regmatches(t, regexec(branch_rx, t))
# skip some indices with spurious capture groups from anynumber_regex
xtr <- do.call(rbind, matches)[, c(2, 3, 5, 6, 7, 8, 10), drop = FALSE]
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
if (length(xtr) == 0) {
as.data.table(
list(Feature = "NA", Split = "NA", Yes = "NA", No = "NA", Missing = "NA", Quality = "NA", Cover = "NA")
)
} else {
as.data.table(xtr)
}
}
]

# assign feature_names when available
if (!is.null(feature_names)) {
is_stump <- function() {
return(length(td$Feature) == 1 && is.na(td$Feature))
}
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
Expand All @@ -134,12 +146,18 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
# parse leaf lines
leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
leaf_cols <- c("Feature", "Quality", "Cover")
td[isLeaf == TRUE,
(leaf_cols) := {
matches <- regmatches(t, regexec(leaf_rx, t))
xtr <- do.call(rbind, matches)[, c(2, 4)]
c("Leaf", as.data.table(xtr))
}]
td[
isLeaf == TRUE,
(leaf_cols) := {
matches <- regmatches(t, regexec(leaf_rx, t))
xtr <- do.call(rbind, matches)[, c(2, 4)]
if (length(xtr) == 2) {
c("Leaf", as.data.table(xtr[1]), as.data.table(xtr[2]))
} else {
c("Leaf", as.data.table(xtr))
}
}
]

# convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover")
Expand Down
28 changes: 16 additions & 12 deletions R-package/R/xgb.plot.tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,22 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
data = dt$Feature,
fontcolor = "black")

edges <- DiagrammeR::create_edge_df(
from = match(rep(dt[Feature != "Leaf", c(ID)], 2), dt$ID),
to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID),
label = c(
dt[Feature != "Leaf", paste("<", Split)],
rep("", nrow(dt[Feature != "Leaf"]))
),
style = c(
dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")],
dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]
),
rel = "leading_to")
if (nrow(dt[Feature != "Leaf"]) != 0) {
edges <- DiagrammeR::create_edge_df(
from = match(rep(dt[Feature != "Leaf", c(ID)], 2), dt$ID),
to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID),
label = c(
dt[Feature != "Leaf", paste("<", Split)],
rep("", nrow(dt[Feature != "Leaf"]))
),
style = c(
dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")],
dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]
),
rel = "leading_to")
} else {
edges <- NULL
}

graph <- DiagrammeR::create_graph(
nodes_df = nodes,
Expand Down
10 changes: 10 additions & 0 deletions R-package/tests/testthat/test_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,16 @@ test_that("xgb.importance works with and without feature names", {
imp
}
expect_equal(importance_from_dump(), importance, tolerance = 1e-6)

## decision stump
m <- xgboost::xgboost(
data = as.matrix(data.frame(x = c(0, 1))),
label = c(1, 2),
nrounds = 1
)
df <- xgb.model.dt.tree(model = m)
expect_equal(df$Feature, "Leaf")
expect_equal(df$Cover, 2)
})

test_that("xgb.importance works with GLM model", {
Expand Down

0 comments on commit da35162

Please sign in to comment.