-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_output.py
68 lines (48 loc) · 1.72 KB
/
demo_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import itertools
import pickle
from crf_entity_extractor import CrfEntityExtractor
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
# Load the pre-trained CRF address-NER model once at import time; the
# `output` / `output_2` functions below capture it as their default `model`
# argument, so it is shared across all calls.
crf_loaded = CrfEntityExtractor()
crf_loaded.load_model('CRF_address_ner')
def tokenizer(text):
    """Split *text* into unique sentences, preserving first-seen order.

    Builds a Punkt sentence tokenizer whose abbreviation list is loaded from
    ``abbrev_list.pkl`` so abbreviations (e.g. "Av.", "Dr.") are not treated
    as sentence boundaries, then deduplicates the resulting sentences.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str]
        Sentences in order of first appearance, duplicates removed.
    """
    punkt_param = PunktParameters()
    # NOTE(review): pickle.load on an external file is unsafe for untrusted
    # input; abbrev_list.pkl is assumed to be a trusted project asset.
    with open("abbrev_list.pkl", "rb") as fp:
        abbrev_list = pickle.load(fp)
    punkt_param.abbrev_types = set(abbrev_list)
    # Local renamed from `tokenizer` so it no longer shadows this function.
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
    # Unsupervised Punkt re-training on the input text itself, as before.
    sentence_tokenizer.train(text)
    all_sentences = sentence_tokenizer.tokenize(text)
    # dict.fromkeys keeps insertion order (Python 3.7+): an order-preserving
    # dedup that replaces the manual seen-set/append loop.
    return list(dict.fromkeys(all_sentences))
def output(text, model=crf_loaded):
    """Predict an entity label for every token of *text*.

    Parameters
    ----------
    text : str
        Raw input text; it is sentence-tokenized internally via `tokenizer`.
    model : CrfEntityExtractor, optional
        Fitted extractor used for prediction (defaults to the module-level
        loaded model).

    Returns
    -------
    dict
        Mapping word -> predicted label.

    Warning
    -------
    Because the result is a dict keyed by word, repeated words collapse to a
    single entry and the later label overwrites the earlier one, which can
    also desynchronize the word/label pairing for everything that follows.
    Return a list of (word, label) pairs instead if duplicates matter.
    """
    sentences = tokenizer(text)
    # Predict per sentence, then flatten the per-sentence label lists into
    # one flat sequence aligned with the token stream below.
    predicted_labels = list(
        itertools.chain.from_iterable(model.predict(s) for s in sentences)
    )
    # Flatten all words in sentence order (was a manual append loop).
    words = [
        word
        for sentence in sentences
        for word in word_tokenize(sentence, language='portuguese')
    ]
    # Local renamed from `output` so it no longer shadows this function.
    return dict(zip(words, predicted_labels))
def output_2(text, model=crf_loaded):
    """Predict entity labels per sentence of *text*.

    Parameters
    ----------
    text : str
        Raw input text; it is sentence-tokenized internally via `tokenizer`.
    model : CrfEntityExtractor, optional
        Fitted extractor used for prediction (defaults to the module-level
        loaded model).

    Returns
    -------
    dict
        Mapping sentence -> list of predicted labels for that sentence.
        Keys are unique because `tokenizer` already deduplicates sentences.
    """
    sentences = tokenizer(text)
    predicted_labels = [model.predict(sentence) for sentence in sentences]
    # The original built `a = [sentences for sentences in text]` — a
    # pointless element-wise copy; zip the sentence list directly.
    return dict(zip(sentences, predicted_labels))