From 1a283240038ff43de50fae7690c523c8de2eaf89 Mon Sep 17 00:00:00 2001 From: Bram Buitendijk Date: Thu, 22 Feb 2018 15:17:38 +0100 Subject: [PATCH] Bugfix for "non-matching root node" bug --- input_xml/w1.xml | 1 + input_xml/w2.xml | 1 + .../AbstractSegmenter.java | 29 +- .../ContentSegmenterUnitTest.java | 278 +++++++++++------- 4 files changed, 186 insertions(+), 123 deletions(-) create mode 100644 input_xml/w1.xml create mode 100644 input_xml/w2.xml diff --git a/input_xml/w1.xml b/input_xml/w1.xml new file mode 100644 index 0000000..5c7a367 --- /dev/null +++ b/input_xml/w1.xml @@ -0,0 +1 @@ +<a>text</a> \ No newline at end of file diff --git a/input_xml/w2.xml b/input_xml/w2.xml new file mode 100644 index 0000000..eaea9cb --- /dev/null +++ b/input_xml/w2.xml @@ -0,0 +1 @@ +<b>text</b> \ No newline at end of file diff --git a/src/main/java/prioritised_xml_collation/AbstractSegmenter.java b/src/main/java/prioritised_xml_collation/AbstractSegmenter.java index 518e1b2..d54a6e7 100644 --- a/src/main/java/prioritised_xml_collation/AbstractSegmenter.java +++ b/src/main/java/prioritised_xml_collation/AbstractSegmenter.java @@ -22,7 +22,7 @@ public List calculateSegmentation(Score[][] editTable, List t Boolean stateChange = lastCell.match != currentCell.match; if (stateChange) { System.out.println(lastCell.match + ", " + currentCell.match); - addCelltoSuperwitness(currentCell, tokensA, tokensB, lastX, lastY, superwitness); + addLastCellToSuperwitness(lastCell, tokensA, tokensB, x, y, superwitness); // System.out.println(String.format("%d %d %d %d", lastX, lastY, x, y)); // change the pointer lastY = y; @@ -31,18 +31,23 @@ public List calculateSegmentation(Score[][] editTable, List t } } // process the final cell in de EditGraphTable (additions/omissions at the beginning of the witnesses - Score currentCell = editTable[0][0]; - addCelltoSuperwitness(currentCell, tokensA, tokensB, lastX, lastY, superwitness); +// Score currentCell = editTable[0][0]; + addLastCellToSuperwitness(lastCell, 
tokensA, tokensB, 0, 0, superwitness); // System.out.println(String.format("%d %d %d %d", lastX, lastY, 0, 0)); return superwitness; } - private void addCelltoSuperwitness(Score currentCell, List tokensA, List tokensB, int lastX, int lastY, List superwitness) { - int x = currentCell.x; - int y = currentCell.y; - List segmentTokensA = tokensA.subList(x, lastX); - List segmentTokensB = tokensB.subList(y, lastY); - if (currentCell.match == Boolean.TRUE) { + private void addLastCellToSuperwitness(Score lastCell, List tokensA, List tokensB, int currentX, int currentY, List superwitness) { + int lastX = lastCell.x; + int lastY = lastCell.y; + List segmentTokensA = tokensA.subList(currentX, lastX); + List segmentTokensB = tokensB.subList(currentY, lastY); + if (lastCell.match) { + // aligned + Segment segment = new Segment(segmentTokensA, segmentTokensB, Score.Type.aligned); + superwitness.add(0, segment); + } + else { // if currentCell has tokens of type "match", look at lastcell // if lastCell is addition/omission/replacement stateChange occured and a new segment can be made // if cell contains tokens from both witnesses its a replacement @@ -64,10 +69,6 @@ else if (segmentTokensB.isEmpty()) { superwitness.add(0, segment); } } - // aligned - else { - Segment segment = new Segment(segmentTokensA, segmentTokensB, Score.Type.aligned); - superwitness.add(0, segment); - } + } } diff --git a/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java b/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java index bb41405..319f360 100644 --- a/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java +++ b/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java @@ -1,126 +1,186 @@ package prioritised_xml_collation; +import static org.hamcrest.collection.IsIterableContainingInOrder.contains; +import static org.hamcrest.core.Is.is; +import static org.junit.Assert.assertThat; import org.junit.Ignore; import org.junit.Test; +import 
static prioritised_xml_collation.Score.Type.*; +import static prioritised_xml_collation.SegmentMatcher.sM; +import static prioritised_xml_collation.XMLTokenContentMatcher.t; import java.io.File; import java.util.Arrays; import java.util.List; -import static org.hamcrest.core.Is.is; -import static org.junit.Assert.assertThat; -import static prioritised_xml_collation.SegmentMatcher.sM; -import static prioritised_xml_collation.XMLTokenContentMatcher.t; -import static org.hamcrest.collection.IsIterableContainingInOrder.contains; - /** * Created by ellibleeker on 08/02/2017. */ public class ContentSegmenterUnitTest { - @Test - public void testSegmentMatcher() throws Exception { - List tokensWa = Arrays.asList(new XMLToken("a"), new XMLToken("b")); - List tokensWb = Arrays.asList(new XMLToken("a"), new XMLToken("b")); - Score.Type type = Score.Type.aligned; - - Segment segment = new Segment(tokensWa, tokensWb, type); - assertThat(segment, is(sM(Score.Type.aligned).tokensWa(t("a"), t("b")).tokensWb(t("a"), t("b")))); - } - - @Test - public void testSegmentFactory() throws Exception { - Segment segment = Segment.s(Score.Type.aligned).tokensWa("a").tokensWb("a"); - assertThat(segment, is(sM(Score.Type.aligned).tokensWa(t("a")).tokensWb(t("a")))); - } - - @Test - public void testSegmentAligned() throws Exception { - File input_tokensA = new File("input_xml/witA-simple.xml"); - File input_tokensB = new File("input_xml/witB-simple.xml"); - Tokenizer tokenizer = new Tokenizer(); - List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); - List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); - AbstractScorer contentScorer = new ContentScorer(); - AbstractSegmenter contentSegmenter = new ContentSegmenter(); - EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); - // take that output - List segments = aligner.align(tokensWa, tokensWb); - // actualSegment = one segment object with two lists of token(s) and a type - Segment 
actualSegment = segments.get(0); - SegmentMatcher expectedSegment = sM(Score.Type.aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")); - // assert that the segment contains the tokens and the type we want it to have - assertThat(actualSegment, is(expectedSegment)); - } - - @Test - public void testSegmentReplaced() throws Exception { - File input_tokensA = new File("input_xml/witA-simple.xml"); - File input_tokensB = new File("input_xml/witB-simple.xml"); - Tokenizer tokenizer = new Tokenizer(); - List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); - List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); - AbstractScorer contentScorer = new ContentScorer(); - AbstractSegmenter contentSegmenter = new ContentSegmenter(); - EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); - // take that output - List segments = aligner.align(tokensWa, tokensWb); - // actualSegment = one segment object with two lists of token(s) and a type - Segment actualSegment = segments.get(1); - System.out.println(segments); - SegmentMatcher expectedSegment = sM(Score.Type.replacement).tokensWa(t("c")).tokensWb(t("a")); - // assert that the segment contains the tokens and the type we want it to have - assertThat(actualSegment, is(expectedSegment)); - } - - @Test - public void testAllSegments() throws Exception { - File input_tokensA = new File("input_xml/witA-simple.xml"); - File input_tokensB = new File("input_xml/witB-simple.xml"); - Tokenizer tokenizer = new Tokenizer(); - List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); - List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); - AbstractScorer contentScorer = new ContentScorer(); - AbstractSegmenter contentSegmenter = new ContentSegmenter(); - EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); - // take that output - List segments = aligner.align(tokensWa, tokensWb); - assertThat(segments, 
contains(sM(Score.Type.aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")), sM(Score.Type.replacement).tokensWa(t("c")).tokensWb(t("a")), sM(Score.Type.aligned).tokensWa(t("/s"), t("/TEI")).tokensWb(t("/s"), t("/TEI")))); - } - - @Test - public void testSegmentS21() throws Exception { - File input_tokensA = new File("input_xml/s21-A.xml"); - File input_tokensB = new File("input_xml/s21-B.xml"); - Tokenizer tokenizer = new Tokenizer(); - List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); - List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); - AbstractScorer contentScorer = new ContentScorer(); - AbstractSegmenter contentSegmenter = new ContentSegmenter(); - EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); - // take that output and align - List segments = aligner.align(tokensWa, tokensWb); - System.out.println(segments); - assertThat(segments, contains(sM(Score.Type.aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"),t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(Score.Type.omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(Score.Type.aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(Score.Type.replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(Score.Type.aligned).tokensWa(t("voor"), t("de")).tokensWb(t("voor"), t("de")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), 
sM(Score.Type.aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(Score.Type.replacement).tokensWa(t("!")).tokensWb(t(".")), sM(Score.Type.aligned).tokensWa(t("/s"), t("/div"), t("/body"), t("/text")).tokensWb(t("/s"), t("/div"), t("/body"), t("/text")))); - } - - @Ignore("Test fails when root node is not a match") - @Test - public void testLastSegment() throws Exception { - File input_tokensA = new File("input_xml/witA-simple2.xml"); - File input_tokensB = new File("input_xml/witB-simple.xml"); - Tokenizer tokenizer = new Tokenizer(); - List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); - List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); - AbstractScorer contentScorer = new ContentScorer(); - AbstractSegmenter contentSegmenter = new ContentSegmenter(); - EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); - // take that output and align - List segments = aligner.align(tokensWa, tokensWb); - assertThat(segments, contains(sM(Score.Type.aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"),t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(Score.Type.omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(Score.Type.aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(Score.Type.replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(Score.Type.aligned).tokensWa(t("vóór"), t("de")).tokensWb(t("vóór"), 
t("de")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(Score.Type.replacement).tokensWa(t("!")).tokensWb(t(".")))); - } - // TODO When root node is not a match the test fails: adjust typing - } + @Test + public void testSegmentMatcher() throws Exception { + List tokensWa = Arrays.asList(new XMLToken("a"), new XMLToken("b")); + List tokensWb = Arrays.asList(new XMLToken("a"), new XMLToken("b")); + Score.Type type = aligned; + + Segment segment = new Segment(tokensWa, tokensWb, type); + assertThat(segment, is(sM(aligned).tokensWa(t("a"), t("b")).tokensWb(t("a"), t("b")))); + } + + @Test + public void testSegmentFactory() throws Exception { + Segment segment = Segment.s(aligned).tokensWa("a").tokensWb("a"); + assertThat(segment, is(sM(aligned).tokensWa(t("a")).tokensWb(t("a")))); + } + + @Test + public void testSegmentAligned() throws Exception { + File input_tokensA = new File("input_xml/witA-simple.xml"); + File input_tokensB = new File("input_xml/witB-simple.xml"); + Tokenizer tokenizer = new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output + List segments = aligner.align(tokensWa, tokensWb); + // actualSegment = one segment object with two lists of token(s) and a type + Segment actualSegment = segments.get(0); + SegmentMatcher expectedSegment = sM(aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")); + // assert that the segment contains the tokens and the type we want it to have + assertThat(actualSegment, is(expectedSegment)); + } + + @Test + public void testSegmentReplaced() throws Exception { + 
File input_tokensA = new File("input_xml/witA-simple.xml"); + File input_tokensB = new File("input_xml/witB-simple.xml"); + Tokenizer tokenizer = new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output + List segments = aligner.align(tokensWa, tokensWb); + // actualSegment = one segment object with two lists of token(s) and a type + Segment actualSegment = segments.get(1); + System.out.println(segments); + SegmentMatcher expectedSegment = sM(replacement).tokensWa(t("c")).tokensWb(t("a")); + // assert that the segment contains the tokens and the type we want it to have + assertThat(actualSegment, is(expectedSegment)); + } + + @Test + public void testAllSegments() throws Exception { + File input_tokensA = new File("input_xml/witA-simple.xml"); + File input_tokensB = new File("input_xml/witB-simple.xml"); + Tokenizer tokenizer = new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output + List segments = aligner.align(tokensWa, tokensWb); + assertThat(segments, contains(sM(aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")), sM(replacement).tokensWa(t("c")).tokensWb(t("a")), sM(aligned).tokensWa(t("/s"), t("/TEI")).tokensWb(t("/s"), t("/TEI")))); + } + + @Test + public void testSegmentS21() throws Exception { + File input_tokensA = new File("input_xml/s21-A.xml"); + File input_tokensB = new File("input_xml/s21-B.xml"); + 
Tokenizer tokenizer = new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output and align + List segments = aligner.align(tokensWa, tokensWb); + System.out.println(segments); + assertThat(segments, contains(sM(aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(aligned).tokensWa(t("voor"), t("de")).tokensWb(t("voor"), t("de")), sM(addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(replacement).tokensWa(t("!")).tokensWb(t(".")), sM(aligned).tokensWa(t("/s"), t("/div"), t("/body"), t("/text")).tokensWb(t("/s"), t("/div"), t("/body"), t("/text")))); + } + + @Ignore("Test fails when root node is not a match") + @Test + public void testLastSegment() throws Exception { + File input_tokensA = new File("input_xml/witA-simple2.xml"); + File input_tokensB = new File("input_xml/witB-simple.xml"); + Tokenizer 
tokenizer = new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output and align + List segments = aligner.align(tokensWa, tokensWb); + assertThat(segments, contains( + sM(aligned)// + .tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit"))// + .tokensWb(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")),// + sM(omission)// + .tokensWa(t("lb"), t("/lb"))// + .tokensWb(t("")),// + sM(aligned)// + .tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een"))// + .tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")),// + sM(addition)// + .tokensWa(t(""))// + .tokensWb(t("lb"), t("/lb")),// + sM(aligned)// + .tokensWa(t("vrouw"))// + .tokensWb(t("vrouw")),// + sM(replacement)// + .tokensWa(t(","), t("de"), t("ongewisheid"))// + .tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")),// + sM(aligned)// + .tokensWa(t("vóór"), t("de"))// + .tokensWb(t("vóór"), t("de")),// + sM(addition)// + .tokensWa(t(""))// + .tokensWb(t("lb"), t("/lb")),// + sM(aligned)// + .tokensWa(t("liefelijke"), t("toestemming"))// + .tokensWb(t("liefelijke"), t("toestemming")),// + sM(replacement)// + .tokensWa(t("!"))// + .tokensWb(t("."))// + )// + ); + } + // TODO When root node is not a match the test fails: adjust typing + + @Test + public void testStartTagReplacement() throws Exception { + File input_tokensA = new File("input_xml/w1.xml"); + File input_tokensB = new File("input_xml/w2.xml"); + Tokenizer tokenizer = 
new Tokenizer(); + List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA); + List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB); + AbstractScorer contentScorer = new ContentScorer(); + AbstractSegmenter contentSegmenter = new ContentSegmenter(); + EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter); + // take that output and align + List segments = aligner.align(tokensWa, tokensWb); + System.out.println(segments); + assertThat(segments, contains( + sM(replacement)// + .tokensWa(t("a"))// + .tokensWb(t("b")),// + sM(aligned)// + .tokensWa(t("text"))// + .tokensWb(t("text")),// + sM(replacement)// + .tokensWa(t("/a"))// + .tokensWb(t("/b"))// + ) + ); + } + +}