Skip to content

Commit

Permalink
Fix formulae
Browse files Browse the repository at this point in the history
  • Loading branch information
pecorarista committed Nov 7, 2017
1 parent 18c8572 commit 2a9e7b0
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 35 deletions.
62 changes: 28 additions & 34 deletions trf/acceptability.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from janome.tokenizer import Tokenizer

import trf.constant as const
from trf.analyser import Tree
from trf.util import split_text


Expand All @@ -18,7 +17,7 @@ def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):

self.text = text
self.sentences = split_text(text, delimiter)
self.tss = tokenize_by_janome(self.sentences)
self.lengths, self.split_texts = tokenize(self.sentences)

if not os.path.isfile(rnnlm_model_path):
raise FileNotFoundError(errno.ENOENT,
Expand All @@ -28,26 +27,13 @@ def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):

self.word_freq, self.n_total_words = self._load_word_freq(threshold=1)

self.rnnlm_scores = self.get_rnnlm_scores()
self.log_prob_scores = self.calc_log_prob_scores()
self.unigram_scores = self.calc_unigram_scores()
self.mean_lp_scores = self.calc_mean_lp_scores()
self.norm_lp_div_scores = self.calc_log_prob_scores()
self.norm_lp_sub_scores = self.calc_unigram_scores()

self.mean_unigram_scores = self.calc_mean_unigram_scores()

# self.normalized_scores_div = \
# self.calc_normalized_scores('div')

# self.normalized_scores_sub = \
# self.calc_normalized_scores('sub')

# self.normalized_scores_len = \
# self.calc_normalized_scores('len')

self.mean_loglikelihood = \
None \
if None in self.rnnlm_scores \
else numpy.mean(self.rnnlm_scores)

def get_rnnlm_scores(self) -> List[Union[None, float]]:
def calc_log_prob_scores(self) -> List[Union[None, float]]:
"""Get log likelihood scores by calling RNNLM
"""

Expand All @@ -62,7 +48,7 @@ def get_rnnlm_scores(self) -> List[Union[None, float]]:
'-test',
textfile.name]
process = Popen(command, stdout=PIPE, stderr=PIPE)
output , err = process.communicate()
output, err = process.communicate()
lines = [line.strip() for line in output.decode('UTF-8').split('\n')
if line.strip() != '']
scores = []
Expand Down Expand Up @@ -95,6 +81,9 @@ def _load_word_freq(self, threshold: int) -> Tuple[Dict[str, int], int]:

return (word_freq, n_total_words)

def average(xs: List[Union[None, float]]) -> float:
return 0.0

def calc_unigram_scores(self) -> List[float]:

unigram_scores = []
Expand All @@ -110,15 +99,14 @@ def calc_unigram_scores(self) -> List[float]:

return unigram_scores

def calc_mean_unigram_scores(self) -> List[Union[None, float]]:
mean_unigram_scores = []
for score, sentence in zip(self.unigram_scores, self.sentences):
n = len(self.sentences)
def calc_mean_lp_scores(self) -> List[Union[None, float]]:
mean_lp_scores = []
for score, length in zip(self.log_prob_scores, self.lengths):
x = None \
if score is None or n == 0 \
else float(score) / float(len(self.sentences))
mean_unigram_scores.append(x)
return mean_unigram_scores
if score is None or length == 0 \
else float(score) / float(length)
mean_lp_scores.append(x)
return mean_lp_scores

def calc_normalized_scores(self, method: str) -> List[Union[None, float]]:

Expand Down Expand Up @@ -146,11 +134,17 @@ def _f(score: float, unigram_score: float, length: int, method: str) -> float:
raise ValueError


def tokenize_by_janome(sentences: List[str]) -> List[List[str]]:
def tokenize(sentences: List[str]) -> Tuple[List[int], List[List[str]]]:

tokenizer = Tokenizer()
tss = []
lengths = []
texts = []
for s in sentences:
result = tokenizer.tokenize(s)
ts = ' '.join([t.surface for t in result])
tss.append(ts)
return tss

surfaces = [t.surface for t in result]
lengths.append(len(surfaces))

text = ' '.join(surfaces)
texts.append(text)
return lengths, texts
11 changes: 10 additions & 1 deletion trf/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,13 @@ def translate(en: str):
elif en == 'r_conditional':
return '仮定節'
elif en == 'mean_loglikelihood':
return '言語モデルの尤度'
return '言語モデルの対数尤度'
elif en == 'acceptability_div':
return '容認度 (Norm LP (Div))'
elif en == 'acceptability_sub':
return '容認度 (Norm LP (Sub))'
elif en == 'acceptability_slor':
return '容認度 (SLOR)'
else:
return en

Expand Down Expand Up @@ -64,6 +70,7 @@ def show(self, lang: str='ja'):
print('Unsupported language')
sys.exit(1)


def main():

executables = ['juman', 'knp', 'rnnlm']
Expand Down Expand Up @@ -135,6 +142,8 @@ def main():
score = acceptability.mean_loglikelihood
score = 'None' if score is None else '{:.2f}'.format(score)
metrics.append(Metric('mean_loglikelihood', score))
normalized_score = acceptability.normalized_scores_len
metrics.append(Metric('norm_len', normalized_score))
Section('language_model', metrics).show()


Expand Down

0 comments on commit 2a9e7b0

Please sign in to comment.