Commit

fix combiner n_jobs
itlubber committed Sep 12, 2024
1 parent 5be6d49 commit 0374b7b
Showing 2 changed files with 16 additions and 12 deletions.
1 change: 0 additions & 1 deletion scorecardpipeline/feature_selection.py
@@ -349,7 +349,6 @@ class LiftSelector(SelectorMixin):
:param scores_ : array-like of shape (n_features,). Lift scores of features.
:param select_columns : array-like
:param dropped : DataFrame
"""
def __init__(self, target="target", threshold=3.0, n_jobs=None, methods=None, **kwargs):
"""
27 changes: 16 additions & 11 deletions scorecardpipeline/processing.py
@@ -393,7 +393,7 @@ def catboost_selector(self, x, y, cat_features=None):

class Combiner(TransformerMixin, BaseEstimator):

- def __init__(self, target="target", method='chi', empty_separate=True, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1):
+ def __init__(self, target="target", method='chi', empty_separate=True, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1, **kwargs):
"""特征分箱封装方法
:param target: 数据集中标签名称,默认 target
@@ -424,6 +424,7 @@ def __init__(self, target="target", method='chi', empty_separate=True, min_n_bin
self.monotonic_trend = monotonic_trend
self.adj_rules = adj_rules
self.n_jobs = n_jobs
+ self.kwargs = kwargs
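A minimal sketch of the call pattern this enables (not from the repository; it assumes "solver" is a valid optbinning.OptimalBinning option): any extra keyword argument passed to Combiner is kept in self.kwargs and splatted into the underlying binning call during fit().

    from scorecardpipeline.processing import Combiner

    # Extra keyword arguments are stored in self.kwargs and forwarded later,
    # e.g. to OptimalBinning(..., **kwargs) in optbinning_bins below.
    combiner = Combiner(target="target", method="cart", max_n_bins=5, solver="cp")
    assert combiner.kwargs == {"solver": "cp"}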

def update(self, rules):
"""更新 Combiner 中特征的分箱规则
@@ -436,7 +437,8 @@ def update(self, rules):
for feature in rules.keys():
self.check_rules(feature=feature)

def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"):
@staticmethod
def optbinning_bins(feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", **kwargs):
"""基于 optbinning.OptimalBinning 的特征分箱方法,使用 optbinning.OptimalBinning 分箱失败时,使用 toad.transform.Combiner 的卡方分箱处理
:param feature: 需要进行分箱的特征名称
@@ -472,18 +474,18 @@ def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max
dtype = "numerical"
x = data[feature].values

- _combiner = OptimalBinning(feature, dtype=dtype, min_n_bins=min_n_bins, max_n_bins=max_n_bins, max_n_prebins=max_n_prebins, min_prebin_size=min_prebin_size, min_bin_size=min_bin_size, max_bin_size=max_bin_size, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y)
+ _combiner = OptimalBinning(feature, dtype=dtype, min_n_bins=min_n_bins, max_n_bins=max_n_bins, max_n_prebins=max_n_prebins, min_prebin_size=min_prebin_size, min_bin_size=min_bin_size, max_bin_size=max_bin_size, monotonic_trend=monotonic_trend, gamma=gamma, **kwargs).fit(x, y)
if _combiner.status == "OPTIMAL":
rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[None] if dtype == "categorical" else np.nan]}
else:
raise Exception("optimalBinning error")

except Exception as e:
_combiner = toad.transform.Combiner()
- _combiner.fit(data[[feature, target]].dropna(), target, method="chi", min_samples=self.min_bin_size, n_bins=self.max_n_bins, empty_separate=False)
+ _combiner.fit(data[[feature, target]].dropna(), target, method="chi", min_samples=min_bin_size, n_bins=max_n_bins, empty_separate=False)
rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.export()[feature]] + [[None] if dtype == "categorical" else np.nan]}

- self.combiner.update(rule)
+ return rule
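Why the method now returns the rule instead of updating self.combiner in place: with joblib's default process-based backend, each worker operates on its own copy of the object, so updates made inside the workers never reach the parent Combiner. A minimal sketch of the collect-and-merge pattern (feature names and splits are made up; only joblib's Parallel and delayed are real APIs):

    from joblib import Parallel, delayed

    def bin_one_feature(feature):
        # stand-in for optbinning_bins: compute splits for a single feature
        return {feature: [0.5, 1.5]}

    # run per-feature binning in worker processes, then merge in the parent
    rules = Parallel(n_jobs=2)(delayed(bin_one_feature)(f) for f in ["age", "income"])
    merged = {}
    for r in rules:
        merged.update(r)   # mirrors [self.combiner.update(r) for r in rules] below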

def fit(self, x: pd.DataFrame, y=None):
"""特征分箱训练
@@ -499,21 +501,24 @@ def fit(self, x: pd.DataFrame, y=None):
# x[cat_cols] = x[cat_cols].replace(np.nan, None)

if self.method in ["cart", "mdlp", "uniform"]:
- feature_optbinning_bins = partial(self.optbinning_bins, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend)
+ feature_optbinning_bins = partial(self.optbinning_bins, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend, **self.kwargs)
if self.n_jobs > 1:
- Parallel(n_jobs=self.n_jobs)(delayed(feature_optbinning_bins)(feature) for feature in x.columns.drop(self.target))
+ rules = Parallel(n_jobs=self.n_jobs)(delayed(feature_optbinning_bins)(feature) for feature in x.columns.drop(self.target))
+ [self.combiner.update(r) for r in rules]
# with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
# [executor.submit(feature_optbinning_bins(feature)) for feature in x.columns.drop(self.target)]
else:
for feature in x.drop(columns=[self.target]):
- feature_optbinning_bins(feature)
+ rule = feature_optbinning_bins(feature)
+ self.combiner.update(rule)
else:
if self.method in ["step", "quantile"]:
- self.combiner.fit(x, y=self.target, method=self.method, n_bins=self.max_n_bins, empty_separate=self.empty_separate)
+ self.combiner.fit(x, y=self.target, method=self.method, n_bins=self.max_n_bins, empty_separate=self.empty_separate, **self.kwargs)
else:
- self.combiner.fit(x, y=self.target, method=self.method, min_samples=self.min_bin_size, n_bins=self.max_n_bins, empty_separate=self.empty_separate)
+ self.combiner.fit(x, y=self.target, method=self.method, min_samples=self.min_bin_size, n_bins=self.max_n_bins, empty_separate=self.empty_separate, **self.kwargs)

- self.update(self.adj_rules)
+ if self.adj_rules is not None and len(self.adj_rules) > 0:
+     self.update(self.adj_rules)

# Check whether missing values of categorical variables were converted to strings; if so, force them back to NaN, and also check the bin order and adjust it if needed
self.check_rules()
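With the per-feature rules now returned from the workers and merged in the parent, fitting with n_jobs > 1 yields the same rules as the sequential path. A hedged usage sketch (the DataFrame and column names are invented; transform is assumed to apply the fitted bins, as in toad):

    import numpy as np
    import pandas as pd
    from scorecardpipeline.processing import Combiner

    df = pd.DataFrame({
        "age": np.random.randint(18, 70, 1000),
        "income": np.random.rand(1000) * 10000,
        "target": np.random.randint(0, 2, 1000),
    })

    # method="cart" takes the optbinning path, which is the parallelised one
    combiner = Combiner(target="target", method="cart", max_n_bins=5, n_jobs=2)
    combiner.fit(df)
    binned = combiner.transform(df)   # assumed to map each feature to its bin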