Skip to content

Commit

Permalink
bug fix, issue: #511, #512
Browse files Browse the repository at this point in the history
  • Loading branch information
sunjunyi01 committed Aug 28, 2017
1 parent 4eef868 commit b4dd5b5
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 4 deletions.
4 changes: 3 additions & 1 deletion jieba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

# \u4E00-\u9FD5a-zA-Z0-9+#&\._% : CJK ideographs, ASCII letters/digits and the symbols + # & . _ %. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
Expand Down Expand Up @@ -409,6 +409,8 @@ def add_word(self, word, freq=None, tag=None):
wfrag = word[:ch + 1]
if wfrag not in self.FREQ:
self.FREQ[wfrag] = 0
if freq == 0:
finalseg.add_force_split(word)

def del_word(self, word):
"""
Expand Down
14 changes: 11 additions & 3 deletions jieba/finalseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
'E': 'BM'
}


# Words that cut() must emit one character at a time instead of as a single
# token. Populated at runtime through add_force_split().
Force_Split_Words = set()
def load_model():
start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
Expand Down Expand Up @@ -75,16 +75,24 @@ def __cut(sentence):
yield sentence[nexti:]

re_han = re.compile("([\u4E00-\u9FD5]+)")
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")


def add_force_split(word):
    """Register *word* as one that must always be split into characters.

    The word is added to the module-level ``Force_Split_Words`` set. When
    ``cut`` gets such a word back from the HMM segmenter it yields the
    word's individual characters instead of the word itself.

    Adding the same word more than once has no further effect, because
    the registry is a set.
    """
    # No ``global`` statement is needed: the set is mutated in place and the
    # module-level name is never rebound.
    Force_Split_Words.add(word)

def cut(sentence):
sentence = strdecode(sentence)
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
yield word
if word not in Force_Split_Words:
yield word
else:
for c in word:
yield c
else:
tmp = re_skip.split(blk)
for x in tmp:
Expand Down
2 changes: 2 additions & 0 deletions test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,5 @@ def cuttest(test_sent):
cuttest('张三风同学走上了不归路')
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
jieba.del_word('很赞')
cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')

0 comments on commit b4dd5b5

Please sign in to comment.