Merge branch 'main' of https://github.com/recski/tuw-nlp into main
adaamko committed Apr 12, 2022
2 parents 8decbbe + 31e1527 commit 714ef5a
Showing 6 changed files with 26 additions and 13 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -74,7 +74,7 @@ python services/text_to_4lang/backend/service.py
Then run the frontend with this command:

```
streamlit run services/text_to_4lang/frontend/extract.py
streamlit run services/text_to_4lang/frontend/demo.py
```

In the demo you can parse English and German sentences and also try out several algorithms implemented on our graphs, such as `expand`, `substitute` and `append_zero_paths`.
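
The `expand` and `substitute` options mentioned above can also be tried outside the demo UI. The sketch below is an assumption about the programmatic API: the `TextTo4lang` constructor arguments and the exact return value of a call (assumed here to yield one 4lang graph per sentence) are not shown in this commit.

```
# Sketch only: the constructor arguments ("en", "en_nlp_cache") are assumed,
# not taken from this commit.
from tuw_nlp.grammar.text_to_4lang import TextTo4lang

tfl = TextTo4lang("en", "en_nlp_cache")

# depth controls how many rounds of definition expansion are applied;
# substitute=True is assumed to replace defined nodes with their definitions
# instead of attaching them.
for fourlang in tfl("Dogs bark loudly.", depth=1, substitute=False):
    print(fourlang)
```
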
tuw_nlp/grammar/lexicon.py: 4 changes (4 additions, 0 deletions)
@@ -508,6 +508,10 @@ def get_mod_edges(self):
("VERB", "ADVMOD", "ADV"),
# nicht staffeln, sample 10
("VERB", "ADVMOD", "PART"),
# nicht mehr als
("CCONJ", "ADVMOD", "ADV"),
# nicht mehr als
("ADV", "ADVMOD", "PART"),
# sample 112 of sample_10
("VERB", "ADVCL", "VERB"),
# nicht gewaehlt... , weil er gegen die Homo-Ehe... (Germeval '18)
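
The new lexicon entries are (head POS, dependency relation, dependent POS) triples. As a tiny illustration of what the second entry covers: in "nicht mehr als", "nicht" (PART) attaches to "mehr" (ADV) via an ADVMOD edge. The lookup function below is a sketch for illustration only, not code from the repository.

```
# Sketch only: shows what the new (head POS, relation, dependent POS) entries encode.
mod_edges = {
    ("CCONJ", "ADVMOD", "ADV"),  # nicht mehr als
    ("ADV", "ADVMOD", "PART"),   # nicht mehr als
}

def is_mod_edge(head_pos, deprel, dep_pos):
    # Dependency relations from the parser are lowercase; the lexicon stores them uppercase.
    return (head_pos, deprel.upper(), dep_pos) in mod_edges

print(is_mod_edge("ADV", "advmod", "PART"))  # True: "nicht" modifying "mehr"
```
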
tuw_nlp/grammar/text_to_4lang.py: 6 changes (4 additions, 2 deletions)
@@ -74,6 +74,8 @@ def expand(self, graph, depth=1, substitute=False, expand_set=set(), strategy="N
graph, d_node, definition, substitute, strategy)
if expand_set:
expand_set |= set(definition_nodes)
else:
print('no definition for ' + node)

self.expand(graph, depth-1, substitute=substitute,
expand_set=expand_set, strategy=strategy)
@@ -88,8 +90,8 @@ def parse(self, sen):
return relabeled_graph, self.graph_lexical.vocab.get_id(
graph.nodes[root]["name"])

def __call__(self, text, depth=0, substitute=False, expand_set=set(), strategy="None"):
for sen in self.nlp(text).sentences:
def __call__(self, text, depth=0, substitute=False, expand_set=set(), strategy="None", ssplit=True):
for sen in self.nlp(text, ssplit=ssplit).sentences:
graph, root = self.parse(sen)

fourlang = FourLang(graph, root, self.graph_lexical)
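
The new `ssplit` flag on `__call__` is forwarded to the cached NLP pipeline (see `tuw_nlp/text/pipeline.py` below). A brief sketch of the intended call follows; the constructor arguments are assumed, as in the earlier sketch.

```
# Sketch only: constructor arguments are assumed, not taken from this commit.
from tuw_nlp.grammar.text_to_4lang import TextTo4lang

tfl = TextTo4lang("en", "en_nlp_cache")

# Default: the pipeline splits the input into sentences before parsing.
graphs = list(tfl("First sentence. Second sentence.", depth=0))

# ssplit=False is passed through to the pipeline, which is then expected
# to process the text without re-splitting it (see the pipeline change below).
graphs_single = list(tfl("First sentence. Second sentence.", depth=0, ssplit=False))
```
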
tuw_nlp/graph/utils.py: 15 changes (10 additions, 5 deletions)
@@ -1,3 +1,4 @@
import logging
import re
from copy import deepcopy
from itertools import chain
@@ -156,7 +157,7 @@ def match(self, graph, return_subgraphs=False):
for p in patt:
matcher = DiGraphMatcher(
graph, p, node_match=GraphFormulaMatcher.node_matcher, edge_match=GraphFormulaMatcher.edge_matcher)

monomorphic_subgraphs = list(matcher.subgraph_monomorphisms_iter())
if not len(monomorphic_subgraphs) == 0:
mapping = monomorphic_subgraphs[0]
@@ -170,9 +171,9 @@ if pos_match:
if pos_match:
if return_subgraphs:
yield key, i, subgraphs
else:
else:
yield key, i


def gen_subgraphs(M, no_edges):
"""M must be dict of dicts, see networkx.convert.to_dict_of_dicts.
@@ -263,9 +264,13 @@ def graph_to_pn(graph):

G = pn.Graph(pn_nodes + pn_edges)

try:
# two spaces before edge name, because alto does it :)
return pn.encode(G, indent=0).replace('\n', ' ')

return pn.encode(G, indent=0).replace('\n', ' ')
except pn.exceptions.LayoutError as e:
words = [graph.nodes[node]['name'] for node in graph.nodes()]
logging.error(f'pn.encode failed on this graph: {words}')
raise e

def read_alto_output(raw_dl):
id_to_word = {}
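
The change above wraps `pn.encode` in a try/except so that graphs the penman library cannot lay out are logged before the error is re-raised. Below is a standalone sketch of the same pattern; the example triples are invented, and the log message uses the graph's triples instead of the networkx node names used in the repository code.

```
# Standalone sketch of the error-handling pattern; example triples are invented.
import logging

import penman as pn

pn_nodes = [("u_1", ":instance", "bark"), ("u_2", ":instance", "dog")]
pn_edges = [("u_1", ":1", "u_2")]
G = pn.Graph(pn_nodes + pn_edges)

try:
    # two spaces before edge name, because alto does it :)
    print(pn.encode(G, indent=0).replace("\n", " "))
except pn.exceptions.LayoutError as e:
    logging.error(f"pn.encode failed on this graph: {G.triples}")
    raise e
```
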
tuw_nlp/text/patterns/de.py: 4 changes (3 additions, 1 deletion)
@@ -6,7 +6,9 @@
"Kat.G.",
"lit.",
"ONr.",
'bzw.'
'bzw.',
'Pkt.',
"Dipl.-Ing."
]

MONTH = [
tuw_nlp/text/pipeline.py: 8 changes (4 additions, 4 deletions)
@@ -46,15 +46,15 @@ def __init__(self, stanza_pipeline, cache_path, init=None):

self.changed = False

def parse(self, text):
def parse(self, text, ssplit):
if self.nlp is None:
self.nlp = self.init()

return self.nlp(text)
return self.nlp(text) if ssplit else self.nlp.additional(text)

def __call__(self, text):
def __call__(self, text, ssplit=True):
if text not in self.parsed:
self.parsed[text] = self.parse(text)
self.parsed[text] = self.parse(text, ssplit=ssplit)
self.changed = True

return self.parsed[text]
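
The cache's `__call__` now accepts an `ssplit` flag and routes `ssplit=False` through the wrapped pipeline's `additional` method instead of calling it directly. The sketch below exercises that routing with a stand-in pipeline object; the `CachedStanzaPipeline` name, its cache-file handling, and the stand-in class are assumptions made for illustration.

```
# Sketch only: DummyPipeline and the cache path are invented; the class name
# CachedStanzaPipeline and its constructor arguments are assumed from this diff's context.
from tuw_nlp.text.pipeline import CachedStanzaPipeline

class DummyPipeline:
    """Stand-in for the stanza pipeline that the cache normally wraps."""

    def __call__(self, text):
        return f"parsed with sentence splitting: {text}"

    def additional(self, text):
        return f"parsed as a single unit: {text}"

nlp = CachedStanzaPipeline(DummyPipeline(), "cache/dummy.json")

print(nlp("Erster Satz. Zweiter Satz."))                   # default ssplit=True, uses __call__
print(nlp("Noch ein Satz ohne Splitting.", ssplit=False))  # routed through .additional()

# Note: the cache in __call__ is keyed on the text alone, so repeating the same
# text with a different ssplit value would be served from the cache.
```
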
