forked from zhuli8805/CBT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_npl_tag.py
121 lines (109 loc) · 4.3 KB
/
read_npl_tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 29 15:35:24 2016
@author: Yixuan Li
"""
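# Requires a running Stanford CoreNLP server (the client below connects to localhost:9000); start it with:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators tokenize,ssplit,pos,ner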
import re
from pycorenlp import StanfordCoreNLP
# Client for the CoreNLP server; used as the default `nlp` argument of
# get_all_nlp_tag, get_nlp_annotate and identify_target_tag.
nlp = StanfordCoreNLP('http://localhost:9000')
# Sample sentence containing the placeholder, used by demo() and the script entry point.
text = 'Then the runner awoke , jumped up , and saw that his pitcher was empty and the King \'s XXXXX far ahead . \n'
# Sample candidate words that identify_target_tag substitutes for the placeholder, one at a time.
candidates = ('daughter', 'husband', 'man', 'manners', 'past', 'skull', 'sky', 'spectator', 'stream', 'town')
# Placeholder token that marks the missing word inside a text.
missing_word = 'XXXXX'
# Properties passed to every nlp.annotate() call; the helpers parse the "text" output format line by line.
nlp_settings = {'annotators': 'tokenize,ssplit,pos,ner', "outputFormat": "text"}
def get_all_nlp_tag(text, paired_output=False, nlp=nlp):
    """Return the part-of-speech tag of every word-like token in ``text``.

    ``text`` is annotated through ``nlp.annotate`` with the module-level
    ``nlp_settings``. Every output line that starts with
    ``[Text=<token> ... PartOfSpeech=<tag>`` contributes one entry; tokens
    that contain no ASCII letter at all are skipped.

    Args:
        text: The text to annotate.
        paired_output: When true, each entry is a ``[token, tag]`` list;
            otherwise each entry is just the tag string.
        nlp: Client object exposing ``annotate(text, properties=...)``.

    Returns:
        A list of tags, or of ``[token, tag]`` pairs, in output order.
    """
    annotation = nlp.annotate(text, properties=nlp_settings)
    # Raw strings: "\[" and "\S" are regex escapes, not Python string escapes
    # (non-raw literals with these sequences trigger invalid-escape warnings).
    token_line = re.compile(r'\[Text=(\S+).*PartOfSpeech=(\S+)')
    no_letters = re.compile(r'^[^a-zA-Z]+$')

    pos_list = []
    for line in annotation.splitlines():
        found = token_line.match(line)
        if found is None:
            # Not a token line (e.g. a header or the echoed sentence).
            continue
        token, tag = found.group(1), found.group(2)
        if no_letters.match(token):
            continue
        pos_list.append([token, tag] if paired_output else tag)
    return pos_list
def get_nlp_annotate(text, candidates=None, nlp=nlp):
    """Collect POS tags around the placeholder and, optionally, per candidate.

    Args:
        text: Text containing the module-level ``missing_word`` placeholder.
        candidates: Optional iterable of words to substitute for the
            placeholder. When empty or ``None`` only the context tags are
            returned.
        nlp: Client object exposing ``annotate(text, properties=...)``.

    Returns:
        ``pos_around`` when no candidates are given, otherwise the tuple
        ``(pos_around, pos_of_target)`` where ``pos_around`` is the result of
        ``identify_annotate_tag`` and ``pos_of_target`` the result of
        ``identify_target_tag``.
    """
    annotation = nlp.annotate(text, properties=nlp_settings)
    # Raw string: "\[" and "\S" are regex escapes, not Python string escapes.
    token_line = re.compile(r'\[Text=(\S+).*PartOfSpeech=(\S+)')
    pos_around_missing_word = identify_annotate_tag(annotation, token_line, missing_word)
    if not candidates:
        return pos_around_missing_word
    pos_of_target = identify_target_tag(text, candidates, token_line, nlp)
    return pos_around_missing_word, pos_of_target
def identify_annotate_tag(nlp_output, regex, target):
    """Return the POS tags of the two tokens before and after ``target``.

    Args:
        nlp_output: Multi-line annotation text, one token per line.
        regex: Compiled pattern whose group 1 is the token text and group 2
            its POS tag; lines it does not match are ignored.
        target: Token text to centre the window on. If it occurs several
            times the last occurrence wins; if it never occurs, index 0 is
            used as the centre.

    Returns:
        A list of exactly four entries ``[prev2, prev1, next1, next2]``.
        Positions that fall outside the token list are ``None``. Tokens made
        only of non-letter characters are dropped before the window is taken.
    """
    no_letters = re.compile(r'^[^a-zA-Z]+$')

    tags = []
    target_index = 0
    for line in nlp_output.splitlines():
        found = regex.match(line)
        if not found:
            continue
        token = found.group(1)
        if no_letters.match(token):
            continue
        if token == target:
            target_index = len(tags)
        tags.append(found.group(2))

    context = []
    for i in range(target_index - 2, target_index + 3):
        # Skip the centre first so the window always has four entries, even
        # when the centre itself lies outside the tag list (empty output).
        if i == target_index:
            continue
        context.append(tags[i] if 0 <= i < len(tags) else None)
    return context
def identify_target_tag(text, candidates, regex, nlp=nlp):
    """Return the POS tag each candidate receives when it fills the placeholder.

    For every candidate the module-level ``missing_word`` placeholder in
    ``text`` is replaced by the candidate, the result is annotated, and the
    output line expected to describe the substituted token is parsed.

    Args:
        text: Text containing the placeholder.
        candidates: Iterable of replacement words.
        regex: Compiled pattern whose group 1 is the token text and group 2
            its POS tag.
        nlp: Client object exposing ``annotate(text, properties=...)``.

    Returns:
        A list of POS tags, in candidate order.

        NOTE(review): a candidate is silently omitted when its expected
        output line is missing, does not match ``regex``, or names a
        different token, so the result can be shorter than ``candidates``.
        Callers that zip the two should be aware of this.

    Raises:
        ValueError: If the placeholder cannot be located in ``text``.
    """
    placeholder_idx = identify_missing_word_pos(text)
    if placeholder_idx is None:
        # Previously this surfaced as an opaque ``None + 2`` TypeError.
        raise ValueError('placeholder %r not found in text' % (missing_word,))
    # The token lines are offset by two in the text output -- presumably the
    # "Sentence #..." header and the echoed sentence; verify against the
    # CoreNLP version in use.
    line_idx = placeholder_idx + 2

    pos_list = []
    for word in candidates:
        annotation = nlp.annotate(text.replace(missing_word, word), properties=nlp_settings)
        lines = annotation.splitlines()
        if line_idx >= len(lines):
            # Output shorter than expected: treat like any other non-match.
            continue
        found = regex.match(lines[line_idx])
        if found and found.group(1) == word:
            pos_list.append(found.group(2))
    return pos_list
def identify_missing_word_pos(text, placeholder=None):
    """Return the index of the placeholder among the whitespace-separated tokens.

    Args:
        text: Text to search.
        placeholder: Token to look for. Defaults to the module-level
            ``missing_word``.

    Returns:
        The zero-based index of the first token equal to the placeholder, or
        ``None`` when no token matches.
    """
    if placeholder is None:
        placeholder = missing_word
    # split() with no argument ignores runs of whitespace and a trailing
    # newline, so the index counts real tokens only, and the final token is
    # examined too (the previous loop stopped one element early).
    for index, token in enumerate(text.split()):
        if token == placeholder:
            return index
    return None
def demo():
    """Print the candidate/POS pairing and the tagged tokens of the sample text."""
    # POS info around XXXXX and for each candidate; only the latter is printed.
    _, candidate_tags = get_nlp_annotate(text=text, candidates=candidates, nlp=nlp)
    print(list(zip(candidates, candidate_tags)))

    # POS info for the whole sentence, as [token, tag] pairs.
    tagged_tokens = get_all_nlp_tag(text, nlp=nlp, paired_output=True)
    print(tagged_tokens)
def getToken_wordpos(line):
    """Lazily yield ``(word, pos)`` tuples for the tagged tokens of ``line``.

    The tokens come from ``get_all_nlp_tag(line, True)``, which is only called
    once iteration starts.
    """
    for word, pos in get_all_nlp_tag(line, True):
        yield word, pos
if __name__ == '__main__':
    # Manual smoke run; needs the CoreNLP server to be reachable.
    # demo()
    sample = 'this is a fucking apple'

    # Context tags around the placeholder plus one tag per candidate.
    around, per_candidate = get_nlp_annotate(text, candidates)
    print(dict(zip(candidates, per_candidate)))
    print('a===', around)
    print('b===', per_candidate)

    # Context tags only.
    print('c===', get_nlp_annotate(text))

    # Plain tags, then [token, tag] pairs, then the pairs reversed in place.
    print('d===', get_all_nlp_tag(sample))
    tagged = get_all_nlp_tag(sample, True)
    print('e===', tagged)
    tagged.reverse()
    print('f===', tagged)

    # (word, pos) tuples from the generator, padded with two START and two
    # END markers on either side.
    pairs = list(getToken_wordpos(sample))
    print('g===', pairs)
    start_marker = [['START', 'START']]
    end_marker = [['END', 'END']]
    pairs = start_marker + start_marker + pairs + end_marker + end_marker
    print('123', pairs)
    for word, pos in pairs:
        print('h===', word, '-----', pos)