From 90a729133ccb33dec70a2160fb7f2dd13d8ce53c Mon Sep 17 00:00:00 2001
From: Gabor Recski
Date: Mon, 7 Mar 2022 13:06:30 +0100
Subject: [PATCH 1/2] resolved conflict, new rules in text_to_4lang lexicon

---
 README.md                        | 2 +-
 tuw_nlp/grammar/lexicon.py       | 4 ++++
 tuw_nlp/grammar/text_to_4lang.py | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 24fe0a5..4136576 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ python services/text_to_4lang/backend/service.py
 Then run the frontend with this command:
 
 ```
-streamlit run services/text_to_4lang/frontend/extract.py
+streamlit run services/text_to_4lang/frontend/demo.py
 ```
 
 In the demo you can parse English and German sentences and you can also try out the algorithms our graphs implement, such as `expand`, `substitute` and `append_zero_paths`.
diff --git a/tuw_nlp/grammar/lexicon.py b/tuw_nlp/grammar/lexicon.py
index 9f6114e..e64cd77 100644
--- a/tuw_nlp/grammar/lexicon.py
+++ b/tuw_nlp/grammar/lexicon.py
@@ -508,6 +508,10 @@ def get_mod_edges(self):
             ("VERB", "ADVMOD", "ADV"),
             # nicht staffeln, sample 10
             ("VERB", "ADVMOD", "PART"),
+            # nicht mehr als
+            ("CCONJ", "ADVMOD", "ADV"),
+            # nicht mehr als
+            ("ADV", "ADVMOD", "PART"),
             # sample 112 of sample_10
             ("VERB", "ADVCL", "VERB"),
             # nicht gewaehlt... , weil er gegen die Homo-Ehe... (Germeval '18)
diff --git a/tuw_nlp/grammar/text_to_4lang.py b/tuw_nlp/grammar/text_to_4lang.py
index 600a3a9..d730bde 100644
--- a/tuw_nlp/grammar/text_to_4lang.py
+++ b/tuw_nlp/grammar/text_to_4lang.py
@@ -74,6 +74,8 @@ def expand(self, graph, depth=1, substitute=False, expand_set=set(), strategy="N
                     graph, d_node, definition, substitute, strategy)
                 if expand_set:
                     expand_set |= set(definition_nodes)
+            else:
+                print('no definition for ' + node)
 
         self.expand(graph, depth-1, substitute=substitute,
                     expand_set=expand_set, strategy=strategy)

From 32e9c9d08f6a2383ecb74149f570df98c9f62df6 Mon Sep 17 00:00:00 2001
From: Gabor Recski
Date: Fri, 1 Apr 2022 12:08:28 +0200
Subject: [PATCH 2/2] option to disable ssplit in text_to_4lang, raise error if 4lang graph is not connected, some more DE abbrev patterns

---
 tuw_nlp/grammar/text_to_4lang.py |  4 ++--
 tuw_nlp/graph/utils.py           | 15 ++++++++++-----
 tuw_nlp/text/patterns/de.py      |  4 +++-
 tuw_nlp/text/pipeline.py         |  8 ++++----
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/tuw_nlp/grammar/text_to_4lang.py b/tuw_nlp/grammar/text_to_4lang.py
index d730bde..96ebb6a 100644
--- a/tuw_nlp/grammar/text_to_4lang.py
+++ b/tuw_nlp/grammar/text_to_4lang.py
@@ -90,8 +90,8 @@ def parse(self, sen):
         return relabeled_graph, self.graph_lexical.vocab.get_id(
             graph.nodes[root]["name"])
 
-    def __call__(self, text, depth=0, substitute=False, expand_set=set(), strategy="None"):
-        for sen in self.nlp(text).sentences:
+    def __call__(self, text, depth=0, substitute=False, expand_set=set(), strategy="None", ssplit=True):
+        for sen in self.nlp(text, ssplit=ssplit).sentences:
             graph, root = self.parse(sen)
 
             fourlang = FourLang(graph, root, self.graph_lexical)
diff --git a/tuw_nlp/graph/utils.py b/tuw_nlp/graph/utils.py
index a02fc51..2007583 100644
--- a/tuw_nlp/graph/utils.py
+++ b/tuw_nlp/graph/utils.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from copy import deepcopy
 from itertools import chain
@@ -156,7 +157,7 @@ def match(self, graph, return_subgraphs=False):
             for p in patt:
                 matcher = DiGraphMatcher(
                     graph, p, node_match=GraphFormulaMatcher.node_matcher, edge_match=GraphFormulaMatcher.edge_matcher)
-
+                monomorphic_subgraphs = list(matcher.subgraph_monomorphisms_iter())
                 if not len(monomorphic_subgraphs) == 0:
                     mapping = monomorphic_subgraphs[0]
 
@@ -170,9 +171,9 @@ def match(self, graph, return_subgraphs=False):
             if pos_match:
                 if return_subgraphs:
                     yield key, i, subgraphs
-                else: 
+                else:
                     yield key, i
-    
+
 
 def gen_subgraphs(M, no_edges):
     """M must be dict of dicts, see networkx.convert.to_dict_of_dicts.
@@ -263,9 +264,13 @@ def graph_to_pn(graph):
 
     G = pn.Graph(pn_nodes + pn_edges)
 
+    try:
     # two spaces before edge name, because alto does it :)
-    return pn.encode(G, indent=0).replace('\n', '  ')
-
+        return pn.encode(G, indent=0).replace('\n', '  ')
+    except pn.exceptions.LayoutError as e:
+        words = [graph.nodes[node]['name'] for node in graph.nodes()]
+        logging.error(f'pn.encode failed on this graph: {words}')
+        raise e
 
 def read_alto_output(raw_dl):
     id_to_word = {}
diff --git a/tuw_nlp/text/patterns/de.py b/tuw_nlp/text/patterns/de.py
index b9acad7..1aa22aa 100644
--- a/tuw_nlp/text/patterns/de.py
+++ b/tuw_nlp/text/patterns/de.py
@@ -6,7 +6,9 @@
     "Kat.G.",
     "lit.",
     "ONr.",
-    'bzw.'
+    'bzw.',
+    'Pkt.',
+    "Dipl.-Ing."
 ]
 
 MONTH = [
diff --git a/tuw_nlp/text/pipeline.py b/tuw_nlp/text/pipeline.py
index cd1bcae..355d20d 100644
--- a/tuw_nlp/text/pipeline.py
+++ b/tuw_nlp/text/pipeline.py
@@ -46,15 +46,15 @@ def __init__(self, stanza_pipeline, cache_path, init=None):
 
         self.changed = False
 
-    def parse(self, text):
+    def parse(self, text, ssplit):
         if self.nlp is None:
             self.nlp = self.init()
-        return self.nlp(text)
+        return self.nlp(text) if ssplit else self.nlp.additional(text)
 
-    def __call__(self, text):
+    def __call__(self, text, ssplit=True):
         if text not in self.parsed:
-            self.parsed[text] = self.parse(text)
+            self.parsed[text] = self.parse(text, ssplit=ssplit)
             self.changed = True
 
         return self.parsed[text]
 
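
The ssplit flag added in patch 2 is threaded from `TextTo4lang.__call__` through `CachedStanzaPipeline.__call__` into `parse`, which selects between the wrapped pipeline's two entry points (`self.nlp(text)` vs. `self.nlp.additional(text)`). Below is a minimal, self-contained sketch of that pattern; `DummyPipeline` and `CachedPipeline` are hypothetical stand-ins for illustration, not tuw_nlp code:

```python
class DummyPipeline:
    """Hypothetical stand-in for the wrapped stanza pipeline."""

    def __call__(self, text):
        # normal mode: split the text into sentences
        return [s.strip() for s in text.split('.') if s.strip()]

    def additional(self, text):
        # no-ssplit mode: treat the whole input as one sentence
        return [text.strip()]


class CachedPipeline:
    """Sketch of the caching pattern the patch extends."""

    def __init__(self, nlp):
        self.nlp = nlp
        self.parsed = {}

    def parse(self, text, ssplit):
        return self.nlp(text) if ssplit else self.nlp.additional(text)

    def __call__(self, text, ssplit=True):
        # As in the patch, the cache key is the text alone, so a text
        # first parsed with one ssplit setting is served from the cache
        # even if later requested with the other.
        if text not in self.parsed:
            self.parsed[text] = self.parse(text, ssplit=ssplit)
        return self.parsed[text]


nlp = CachedPipeline(DummyPipeline())
print(nlp("Dogs bark. Cats meow."))                # ['Dogs bark', 'Cats meow']
print(nlp("Dogs bark. Cats meow.", ssplit=False))  # cache hit: same two sentences
```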
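
The try/except added to `graph_to_pn` in `tuw_nlp/graph/utils.py` guards the penman encoding step: `pn.encode` raises a `LayoutError` when asked to serialize a disconnected graph, and the patch logs the graph's node names before re-raising. A minimal sketch of that failure mode, assuming only networkx and penman; the `u_{i}` variable naming and node names are illustrative, not the exact triples `graph_to_pn` builds:

```python
import logging

import networkx as nx
import penman as pn

# Two nodes with no edge between them: a disconnected graph.
g = nx.DiGraph()
g.add_node(0, name='dog')
g.add_node(1, name='bark')

pn_nodes = [(f'u_{i}', ':instance', data['name'])
            for i, data in g.nodes(data=True)]
pn_edges = []  # no edges, so 'u_1' is unreachable from the top node

G = pn.Graph(pn_nodes + pn_edges)
try:
    print(pn.encode(G, indent=0))
except pn.exceptions.LayoutError as e:
    # same logging-then-reraise pattern as the patched graph_to_pn
    words = [g.nodes[node]['name'] for node in g.nodes()]
    logging.error(f'pn.encode failed on this graph: {words}')
    raise e
```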