From 2a9e7b0af87881dcaaa93fd3ae2a090524d0e9f5 Mon Sep 17 00:00:00 2001
From: Miyazawa Akira <miyazawa-a@nii.ac.jp>
Date: Tue, 7 Nov 2017 18:21:31 +0900
Subject: [PATCH] Fix formulae

---
 trf/acceptability.py | 62 ++++++++++++++++++++------------------------
 trf/cmdline.py       | 11 +++++++-
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/trf/acceptability.py b/trf/acceptability.py
index 2328616..8da21f6 100644
--- a/trf/acceptability.py
+++ b/trf/acceptability.py
@@ -8,7 +8,6 @@
 from janome.tokenizer import Tokenizer
 
 import trf.constant as const
-from trf.analyser import Tree
 from trf.util import split_text
 
 
@@ -18,7 +17,7 @@ def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):
 
         self.text = text
         self.sentences = split_text(text, delimiter)
-        self.tss = tokenize_by_janome(self.sentences)
+        self.lengths, self.split_texts = tokenize(self.sentences)
 
         if not os.path.isfile(rnnlm_model_path):
             raise FileNotFoundError(errno.ENOENT,
@@ -28,26 +27,13 @@ def __init__(self, text: str, delimiter: str, rnnlm_model_path: str):
 
         self.word_freq, self.n_total_words = self._load_word_freq(threshold=1)
 
-        self.rnnlm_scores = self.get_rnnlm_scores()
+        self.log_prob_scores = self.calc_log_prob_scores()
         self.unigram_scores = self.calc_unigram_scores()
+        self.mean_lp_scores = self.calc_mean_lp_scores()
+        self.norm_lp_div_scores = self.calc_log_prob_scores()
+        self.norm_lp_sub_scores = self.calc_unigram_scores()
 
-        self.mean_unigram_scores = self.calc_mean_unigram_scores()
-
-        # self.normalized_scores_div = \
-        #     self.calc_normalized_scores('div')
-
-        # self.normalized_scores_sub = \
-        #     self.calc_normalized_scores('sub')
-
-        # self.normalized_scores_len = \
-        #     self.calc_normalized_scores('len')
-
-        self.mean_loglikelihood = \
-            None \
-            if None in self.rnnlm_scores \
-            else numpy.mean(self.rnnlm_scores)
-
-    def get_rnnlm_scores(self) -> List[Union[None, float]]:
+    def calc_log_prob_scores(self) -> List[Union[None, float]]:
         """Get log likelihood scores by calling RNNLM
         """
 
@@ -62,7 +48,7 @@ def get_rnnlm_scores(self) -> List[Union[None, float]]:
                    '-test',
                    textfile.name]
         process = Popen(command, stdout=PIPE, stderr=PIPE)
-        output , err = process.communicate()
+        output, err = process.communicate()
         lines = [line.strip() for line in output.decode('UTF-8').split('\n')
                  if line.strip() != '']
         scores = []
@@ -95,6 +81,9 @@ def _load_word_freq(self, threshold: int) -> Tuple[Dict[str, int], int]:
 
         return (word_freq, n_total_words)
 
+    def average(xs: List[Union[None, float]]) -> float:
+        return 0.0
+
     def calc_unigram_scores(self) -> List[float]:
 
         unigram_scores = []
@@ -110,15 +99,14 @@ def calc_unigram_scores(self) -> List[float]:
 
         return unigram_scores
 
-    def calc_mean_unigram_scores(self) -> List[Union[None, float]]:
-        mean_unigram_scores = []
-        for score, sentence in zip(self.unigram_scores, self.sentences):
-            n = len(self.sentences)
+    def calc_mean_lp_scores(self) -> List[Union[None, float]]:
+        mean_lp_scores = []
+        for score, length in zip(self.log_prob_scores, self.lenghts):
             x = None \
-                if score is None or n == 0 \
-                else float(score) / float(len(self.sentences))
-            mean_unigram_scores.append(x)
-        return mean_unigram_scores
+                if score is None or length == 0 \
+                else float(score) / float(length)
+            mean_lp_scores.append(x)
+        return mean_lp_scores
 
     def calc_normalized_scores(self, method: str) -> List[Union[None, float]]:
 
@@ -146,11 +134,17 @@ def _f(score: float, unigram_score: float, length: int, method: str) -> float:
         raise ValueError
 
 
-def tokenize_by_janome(sentences: List[str]) -> List[List[str]]:
+def tokenize(sentences: List[str]) -> Tuple[List[int], List[List[str]]]:
+
     tokenizer = Tokenizer()
-    tss = []
+    lengths = []
+    texts = []
     for s in sentences:
         result = tokenizer.tokenize(s)
-        ts = ' '.join([t.surface for t in result])
-        tss.append(ts)
-    return tss
+
+        surfaces = [t.surface for t in result]
+        lengths.append(len(surfaces))
+
+        text = ' '.join(surfaces)
+        texts.append(text)
+    return lengths, texts
diff --git a/trf/cmdline.py b/trf/cmdline.py
index 7280e14..9a0d5a8 100644
--- a/trf/cmdline.py
+++ b/trf/cmdline.py
@@ -22,7 +22,13 @@ def translate(en: str):
     elif en == 'r_conditional':
         return '仮定節'
     elif en == 'mean_loglikelihood':
-        return '言語モデルの尤度'
+        return '言語モデルの対数尤度'
+    elif en == 'acceptability_div':
+        return '容認度 (Norm LP (Div))'
+    elif en == 'acceptability_sub':
+        return '容認度 (Norm LP (Sub))'
+    elif en == 'acceptability_slor (SLOR)':
+        return '容認度'
     else:
         return en
 
@@ -64,6 +70,7 @@ def show(self, lang: str='ja'):
             print('Unsupported language')
             sys.exit(1)
 
+
 def main():
 
     executables = ['juman', 'knp', 'rnnlm']
@@ -135,6 +142,8 @@ def main():
     score = acceptability.mean_loglikelihood
     score = 'None' if score is None else '{:.2f}'.format(score)
     metrics.append(Metric('mean_loglikelihood', score))
+    normalized_score = acceptability.normalized_scores_len
+    metrics.append(Metric('norm_len', normalized_score))
     Section('language_model', metrics).show()