-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
54 lines (42 loc) · 1.34 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from string import punctuation
from pymorphy2 import MorphAnalyzer
from razdel import tokenize as razdel_tokenize
class RSG_MorphAnalyzer:
    """Tokenize text and reduce its words to lemmas, with a per-instance cache.

    Tokenization is delegated to ``razdel`` and lemmatization to a
    ``pymorphy2`` ``MorphAnalyzer``. Every lemma that has been computed is
    stored in ``self.cashe`` so each distinct token is analyzed only once
    per instance.
    """

    # Characters stripped from both ends of every token: ASCII punctuation
    # plus a few typographic quotes, dashes and the ellipsis. Built once
    # here instead of on every ``tokenize`` call.
    _PUNCT = punctuation + '«»—…–“”'

    def __init__(self):
        """Create the morphological analyzer and an empty lemma cache."""
        self.morpho = MorphAnalyzer()
        # Maps a lower-cased, punctuation-stripped token to its lemma.
        # The cache is never evicted, so it grows with the vocabulary seen.
        self.cashe = {}

    def lemmantize_sentences(self, sentences):
        """Lemmatize every sentence of an iterable.

        Each element of ``sentences`` is passed to :meth:`lemmantize`, which
        tokenizes it itself, so the elements are expected to be raw sentence
        strings rather than pre-tokenized lists.

        Returns a list with one list of lemmas per input sentence, in the
        same order. An empty input yields an empty list.
        """
        return [self.lemmantize(sentence) for sentence in sentences]

    def lemmantize(self, txt) -> list:
        """Return the lemmas of the words in ``txt``, in order of appearance.

        ``txt`` is tokenized with :meth:`tokenize`; each resulting token is
        replaced by the ``normal_form`` of the first parse returned by the
        analyzer. Results are memoized in ``self.cashe``.
        """
        cache = self.cashe
        lemmas = []
        for word in self.tokenize(txt):
            try:
                lemma = cache[word]
            except KeyError:
                # First time this token is seen: analyze it and remember
                # the result for subsequent occurrences.
                lemma = self.morpho.parse(word)[0].normal_form
                cache[word] = lemma
            lemmas.append(lemma)
        return lemmas

    def tokenize(self, txt) -> list:
        """Split ``txt`` into lower-cased tokens without surrounding punctuation.

        Punctuation is stripped only from the ends of each token, so
        characters inside a token (for example an inner hyphen) are kept.
        Tokens that become empty after stripping, i.e. tokens made up
        entirely of punctuation, are dropped.
        """
        tokens = []
        for item in razdel_tokenize(txt):
            token = item.text.strip(self._PUNCT).lower()
            if token:
                tokens.append(token)
        return tokens