From b4dd5b58f34beb2280162f8d2a71007fd8e056be Mon Sep 17 00:00:00 2001 From: sunjunyi01 Date: Mon, 28 Aug 2017 21:10:50 +0800 Subject: [PATCH] bug fix, issue: #511, #512 --- jieba/__init__.py | 4 +++- jieba/finalseg/__init__.py | 14 +++++++++++--- test/test.py | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index b7ad8d53..75092219 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -40,7 +40,7 @@ # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \r\n|\s : whitespace characters. Will not be handled. -re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U) +re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U) re_skip_default = re.compile("(\r\n|\s)", re.U) re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U) re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U) @@ -409,6 +409,8 @@ def add_word(self, word, freq=None, tag=None): wfrag = word[:ch + 1] if wfrag not in self.FREQ: self.FREQ[wfrag] = 0 + if freq == 0: + finalseg.add_force_split(word) def del_word(self, word): """ diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index 5d4a968c..d7600f77 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -19,7 +19,7 @@ 'E': 'BM' } - +Force_Split_Words = set([]) def load_model(): start_p = pickle.load(get_module_res("finalseg", PROB_START_P)) trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P)) @@ -75,8 +75,12 @@ def __cut(sentence): yield sentence[nexti:] re_han = re.compile("([\u4E00-\u9FD5]+)") -re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") +re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)") + +def add_force_split(word): + global Force_Split_Words + Force_Split_Words.add(word) def cut(sentence): sentence = strdecode(sentence) @@ -84,7 +88,11 @@ def cut(sentence): for blk in blocks: if re_han.match(blk): for word in __cut(blk): - yield word + if word not in Force_Split_Words: + yield word + else: + for c in word: + yield c else: tmp = re_skip.split(blk) for x in tmp: diff --git a/test/test.py b/test/test.py index 635b0457..6653136a 100644 --- a/test/test.py +++ b/test/test.py @@ -98,3 +98,5 @@ def cuttest(test_sent): cuttest('张三风同学走上了不归路') cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。') cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。') + jieba.del_word('很赞') + cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')