Merge pull request #23 from aistairc/fix-formulae
Fix formulae
pecorarista authored Nov 8, 2017
2 parents 18c8572 + dcda036 commit b2300e8
Showing 6 changed files with 178 additions and 97 deletions.
10 changes: 3 additions & 7 deletions .gitignore
@@ -1,11 +1,7 @@
 __pycache__
-build
 *.pyc
 trf.egg-info
-tests/faster-rnnlm
-tests/uniq.dat
-tests/test.input
-tests/rnnlm.output
-tests/__init__.pyc
-tests/test_acceptability.pyc
 
+build
+tools
+data
32 changes: 15 additions & 17 deletions tests/test_acceptability.py
@@ -1,6 +1,4 @@
 import unittest
-import warnings
-import tempfile
 
 from trf.acceptability import Acceptability
 from trf.util import check_executable
@@ -20,34 +18,34 @@ def setUp(self):
                                            self.delimiter,
                                            self.rnnlm_model_path)
 
-    def test_rnnlm_scores(self):
-        scores = self.acceptability.rnnlm_scores
+    def test_log_prob(self):
+        scores = self.acceptability._calc_log_prob_scores()
         self.assertAlmostEqual(scores[0], -11.571, places=2)
 
     def test_unigram_scores(self):
 
-        scores = self.acceptability.unigram_scores
+        scores = self.acceptability._calc_unigram_scores()
         self.assertAlmostEqual(scores[0], -31.457, places=2)
 
-    # def test_mean_unigram_scores(self):
+    def test_mean_lp_scores(self):
 
-    #     scores = self.acceptability.mean_unigram_scores
-    #     self.assertAlmostEqual(scores[0], -2.12, places=2)
+        score = self.acceptability.mean_lp
+        self.assertAlmostEqual(score, -2.892, places=2)
 
-    # def test_normalized_scores_div(self):
+    def test_norm_lp_div(self):
 
-    #     scores = self.acceptability.normalized_scores_div
-    #     self.assertAlmostEqual(scores[0], -5.446, places=2)
+        score = self.acceptability.norm_lp_div
+        self.assertAlmostEqual(score, -0.3678, places=2)
 
-    # def test_normalized_scores_sub(self):
+    def test_norm_lp_sub(self):
 
-    #     scores = self.acceptability.normalized_scores_sub
-    #     self.assertAlmostEqual(scores[0], -9.447, places=2)
+        score = self.acceptability.norm_lp_sub
+        self.assertAlmostEqual(score, 19.885, places=2)
 
-    # def test_normalized_scores_len(self):
+    def test_slor(self):
 
-    #     scores = self.acceptability.normalized_scores_len
-    #     self.assertAlmostEqual(scores[0], -0.9447, places=2)
+        score = self.acceptability.slor
+        self.assertAlmostEqual(score, 4.9713, places=2)
 
     def tearDown(self):
         pass
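
For reference, the four acceptability scores exercised by these tests are, for a sentence \xi whose length is counted in tokens:

    \mathrm{MeanLP}(\xi) = \frac{\log P_\text{model}(\xi)}{\text{length}(\xi)}
    \qquad
    \mathrm{NormLP_{div}}(\xi) = -\frac{\log P_\text{model}(\xi)}{\log P_\text{unigram}(\xi)}

    \mathrm{NormLP_{sub}}(\xi) = \log P_\text{model}(\xi) - \log P_\text{unigram}(\xi)
    \qquad
    \mathrm{SLOR}(\xi) = \frac{\log P_\text{model}(\xi) - \log P_\text{unigram}(\xi)}{\text{length}(\xi)}

matching the docstrings in trf/acceptability.py below; each attribute under test is the average of these per-sentence scores over the document, skipping sentences the RNNLM could not score.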
Binary file added trf/__init__.pyc
205 changes: 139 additions & 66 deletions trf/acceptability.py
@@ -8,7 +8,6 @@
 from janome.tokenizer import Tokenizer
 
 import trf.constant as const
-from trf.analyser import Tree
 from trf.util import split_text
 
 
@@ -17,8 +16,8 @@ class Acceptability:
     def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):
 
         self.text = text
-        self.sentences = split_text(text, delimiter)
-        self.tss = tokenize_by_janome(self.sentences)
+        self.sentences = split_text(text, delimiter)  # type: List[str]
+        lengths, self.tss = tokenize(self.sentences)
 
         if not os.path.isfile(rnnlm_model_path):
             raise FileNotFoundError(errno.ENOENT,
@@ -28,26 +27,27 @@ def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):
 
         self.word_freq, self.n_total_words = self._load_word_freq(threshold=1)
 
-        self.rnnlm_scores = self.get_rnnlm_scores()
-        self.unigram_scores = self.calc_unigram_scores()
-
-        self.mean_unigram_scores = self.calc_mean_unigram_scores()
-
-        # self.normalized_scores_div = \
-        #     self.calc_normalized_scores('div')
-
-        # self.normalized_scores_sub = \
-        #     self.calc_normalized_scores('sub')
-
-        # self.normalized_scores_len = \
-        #     self.calc_normalized_scores('len')
-
-        self.mean_loglikelihood = \
-            None \
-            if None in self.rnnlm_scores \
-            else numpy.mean(self.rnnlm_scores)
-
-    def get_rnnlm_scores(self) -> List[Union[None, float]]:
+        log_prob_scores = \
+            self._calc_log_prob_scores()
+        unigram_scores = \
+            self._calc_unigram_scores()
+
+        mean_lp_scores = \
+            calc_mean_lp_scores(log_prob_scores, lengths)
+        norm_lp_div_scores = \
+            calc_norm_lp_div_scores(log_prob_scores, unigram_scores)
+        norm_lp_sub_scores = \
+            calc_norm_lp_sub_scores(log_prob_scores, unigram_scores)
+        slor_scores = \
+            calc_slor_scores(norm_lp_sub_scores, lengths)
+
+        self.log_prob = average(log_prob_scores)
+        self.mean_lp = average(mean_lp_scores)
+        self.norm_lp_div = average(norm_lp_div_scores)
+        self.norm_lp_sub = average(norm_lp_sub_scores)
+        self.slor = average(slor_scores)
+
+    def _calc_log_prob_scores(self) -> List[Union[None, float]]:
         """Get log likelihood scores by calling RNNLM
         """
 
@@ -62,7 +62,7 @@ def get_rnnlm_scores(self) -> List[Union[None, float]]:
                    '-test',
                    textfile.name]
         process = Popen(command, stdout=PIPE, stderr=PIPE)
-        output , err = process.communicate()
+        output, err = process.communicate()
         lines = [line.strip() for line in output.decode('UTF-8').split('\n')
                  if line.strip() != '']
         scores = []
@@ -95,7 +95,7 @@ def _load_word_freq(self, threshold: int) -> Tuple[Dict[str, int], int]:
 
         return (word_freq, n_total_words)
 
-    def calc_unigram_scores(self) -> List[float]:
+    def _calc_unigram_scores(self) -> List[float]:
 
         unigram_scores = []
         for ts in self.tss:
@@ -110,47 +110,120 @@ def calc_unigram_scores(self) -> List[float]:
 
         return unigram_scores
 
-    def calc_mean_unigram_scores(self) -> List[Union[None, float]]:
-        mean_unigram_scores = []
-        for score, sentence in zip(self.unigram_scores, self.sentences):
-            n = len(self.sentences)
-            x = None \
-                if score is None or n == 0 \
-                else float(score) / float(len(self.sentences))
-            mean_unigram_scores.append(x)
-        return mean_unigram_scores
-
-    def calc_normalized_scores(self, method: str) -> List[Union[None, float]]:
-
-        normalized_scores = []
-        for score, unigram_score, s in zip(self.rnnlm_scores,
-                                           self.unigram_scores,
-                                           self.sentences):
-            x = None \
-                if score is None or numpy.isclose(unigram_score,
-                                                  0.0, rtol=1e-05) \
-                else _f(score, unigram_score, len(s), method)
-            normalized_scores.append(x)
-        return normalized_scores
-
-
-def _f(score: float, unigram_score: float, length: int, method: str) -> float:
-
-    if method == 'div':
-        return (-1) * float(score) / float(unigram_score)
-    elif method == 'sub':
-        return float(score) - float(unigram_score)
-    elif method == 'len':
-        return (float(score) - float(unigram_score)) / length
-    else:
-        raise ValueError
-
-
-def tokenize_by_janome(sentences: List[str]) -> List[List[str]]:
+
+def average(xs: List[Union[None, float]]) -> float:
+    """Calculate the arithmetic mean of the given values (possibly None)
+    >>> '{:.2f}'.format(average([None, 1.0, 2.0]))
+    '1.50'
+    """
+    return numpy.mean([x for x in xs if x is not None])
+
+
+def calc_mean_lp_scores(log_prob_scores: List[float],
+                        lengths: List[int]) -> List[Union[None, float]]:
+    r"""
+    .. math:
+        \frac{%
+            \log P_\text{model}\left(\xi\right)
+        }{%
+            \text{length}\left(\xi\right)
+        }
+    >>> '{:.3f}'.format(calc_mean_lp_scores([-14.7579], [4])[0])
+    '-3.689'
+    """
+    mean_lp_scores = []
+    for score, length in zip(log_prob_scores, lengths):
+        x = None \
+            if score is None or length == 0 \
+            else float(score) / float(length)
+        mean_lp_scores.append(x)
+    return mean_lp_scores
+
+
+def calc_norm_lp_div_scores(
+        log_prob_scores: List[float],
+        unigram_scores: List[float]) -> List[Union[None, float]]:
+    r"""
+    .. math:
+        \frac{%
+            \log P_\text{model}\left(\xi\right)
+        }{%
+            \log P_\text{unigram}\left(\xi\right)
+        }
+    >>> '{:.3f}'.format(calc_norm_lp_div_scores([-14.7579], [-35.6325])[0])
+    '-0.414'
+    """
+    results = []
+    for log_prob, unigram_score in zip(log_prob_scores, unigram_scores):
+        if log_prob is None or numpy.isclose(unigram_score, 0.0, rtol=1e-05):
+            x = None
+        else:
+            x = (-1.0) * float(log_prob) / float(unigram_score)
+        results.append(x)
+    return results
+
+
+def calc_norm_lp_sub_scores(
+        log_prob_scores: List[float],
+        unigram_scores: List[float]) -> List[Union[None, float]]:
+    r"""
+    .. math:
+        \log P_\text{model}\left(\xi\right)
+        - \log P_\text{unigram}\left(\xi\right)
+    >>> '{:.3f}'.format(calc_norm_lp_sub_scores([-14.7579], [-35.6325])[0])
+    '20.875'
+    """
+
+    results = []
+    for log_prob, unigram_score in zip(log_prob_scores, unigram_scores):
+        if log_prob is None or numpy.isclose(unigram_score, 0.0, rtol=1e-05):
+            x = None
+        else:
+            x = float(log_prob) - float(unigram_score)
+        results.append(x)
+    return results
+
+
+def calc_slor_scores(norm_lp_sub_scores: List[float],
+                     lengths: List[int]) -> List[Union[None, float]]:
+    r"""Calculate SLOR (Syntactic Log-Odds Ratio)
+    .. math:
+        \frac{%
+            \log P_\text{model}\left(\xi\right)
+            - \log P_\text{unigram}\left(\xi\right)
+        }{%
+            \text{length}\left(\xi\right)
+        }
+    >>> '{:.3f}'.format(calc_slor_scores([20.8746], [4])[0])
+    '5.219'
+    """
+
+    results = []
+    for norm_lp_sub_score, length in zip(norm_lp_sub_scores, lengths):
+        if (norm_lp_sub_score is None) or length == 0:
+            x = None
+        else:
+            x = norm_lp_sub_score / length
+        results.append(x)
+    return results
+
+
+def tokenize(sentences: List[str]) -> Tuple[List[int], List[List[str]]]:
+
     tokenizer = Tokenizer()
-    tss = []
+    lengths = []
+    texts = []
     for s in sentences:
         result = tokenizer.tokenize(s)
-        ts = ' '.join([t.surface for t in result])
-        tss.append(ts)
-    return tss
+        surfaces = [t.surface for t in result]
+        lengths.append(len(surfaces))
+
+        text = ' '.join(surfaces)
+        texts.append(text)
+    return lengths, texts
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod(verbose=True)
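
As a sanity check on the fixed formulae, here is a minimal standalone sketch (not part of the commit) that reproduces the doctest values above for a single 4-token sentence:

    # Doctest values from trf/acceptability.py: one sentence, 4 tokens.
    log_prob = -14.7579   # log P_model(xi), from the RNNLM
    unigram = -35.6325    # log P_unigram(xi)
    length = 4            # sentence length in tokens

    mean_lp = log_prob / length          # -3.689
    norm_lp_div = -log_prob / unigram    # -0.414
    norm_lp_sub = log_prob - unigram     # 20.875
    slor = norm_lp_sub / length          # 5.219

    print('{:.3f} {:.3f} {:.3f} {:.3f}'.format(
        mean_lp, norm_lp_div, norm_lp_sub, slor))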
28 changes: 21 additions & 7 deletions trf/cmdline.py
@@ -21,8 +21,16 @@ def translate(en: str):
         return '係り受け木の深さ'
     elif en == 'r_conditional':
         return '仮定節'
-    elif en == 'mean_loglikelihood':
-        return '言語モデルの尤度'
+    elif en == 'log_prob':
+        return '容認度 (LogProb)'
+    elif en == 'mean_lp':
+        return '容認度 (Mean LP)'
+    elif en == 'norm_lp_div':
+        return '容認度 (Norm LP (Div))'
+    elif en == 'norm_lp_sub':
+        return '容認度 (Norm LP (Sub))'
+    elif en == 'slor':
+        return '容認度 (SLOR)'
     else:
         return en

@@ -64,6 +72,11 @@ def show(self, lang: str='ja'):
         print('Unsupported language')
         sys.exit(1)
 
+
+def _f(score: float) -> str:
+    return 'None' if score is None else '{:.2f}'.format(score)
+
+
 def main():
 
     executables = ['juman', 'knp', 'rnnlm']
@@ -128,13 +141,14 @@ def main():
     Section('syntax', metrics).show()
 
     metrics = []
-    acceptability = \
-        Acceptability(text,
+    a = Acceptability(text,
                       args.delimiter,
                       args.rnnlm_model_path)
-    score = acceptability.mean_loglikelihood
-    score = 'None' if score is None else '{:.2f}'.format(score)
-    metrics.append(Metric('mean_loglikelihood', score))
+    metrics.append(Metric('log_prob', _f(a.log_prob)))
+    metrics.append(Metric('mean_lp', _f(a.mean_lp)))
+    metrics.append(Metric('norm_lp_div', _f(a.norm_lp_div)))
+    metrics.append(Metric('norm_lp_sub', _f(a.norm_lp_sub)))
+    metrics.append(Metric('slor', _f(a.slor)))
     Section('language_model', metrics).show()


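The CLI now prints one line per acceptability metric: translate() supplies the Japanese display labels (容認度 means "acceptability"), and the new _f helper keeps the report printable when a sentence could not be scored. A standalone illustration of that formatting rule (example inputs, not part of the commit):

    def _f(score):
        # As in the diff: unavailable scores print as the string 'None',
        # everything else is rounded to two decimals.
        return 'None' if score is None else '{:.2f}'.format(score)

    print(_f(None))    # None
    print(_f(4.9713))  # 4.97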
Binary file added trf/constant.pyc
