diff --git a/input_xml/w1.xml b/input_xml/w1.xml
new file mode 100644
index 0000000..5c7a367
--- /dev/null
+++ b/input_xml/w1.xml
@@ -0,0 +1 @@
+<a>text</a>
\ No newline at end of file
diff --git a/input_xml/w2.xml b/input_xml/w2.xml
new file mode 100644
index 0000000..eaea9cb
--- /dev/null
+++ b/input_xml/w2.xml
@@ -0,0 +1 @@
+<b>text</b>
\ No newline at end of file
diff --git a/src/main/java/prioritised_xml_collation/AbstractSegmenter.java b/src/main/java/prioritised_xml_collation/AbstractSegmenter.java
index 518e1b2..d54a6e7 100644
--- a/src/main/java/prioritised_xml_collation/AbstractSegmenter.java
+++ b/src/main/java/prioritised_xml_collation/AbstractSegmenter.java
@@ -22,7 +22,7 @@ public List calculateSegmentation(Score[][] editTable, List t
Boolean stateChange = lastCell.match != currentCell.match;
if (stateChange) {
System.out.println(lastCell.match + ", " + currentCell.match);
- addCelltoSuperwitness(currentCell, tokensA, tokensB, lastX, lastY, superwitness);
+ addLastCellToSuperwitness(lastCell, tokensA, tokensB, x, y, superwitness);
// System.out.println(String.format("%d %d %d %d", lastX, lastY, x, y));
// change the pointer
lastY = y;
@@ -31,18 +31,23 @@ public List calculateSegmentation(Score[][] editTable, List t
}
}
// process the final cell in de EditGraphTable (additions/omissions at the beginning of the witnesses
- Score currentCell = editTable[0][0];
- addCelltoSuperwitness(currentCell, tokensA, tokensB, lastX, lastY, superwitness);
+// Score currentCell = editTable[0][0];
+ addLastCellToSuperwitness(lastCell, tokensA, tokensB, 0, 0, superwitness);
// System.out.println(String.format("%d %d %d %d", lastX, lastY, 0, 0));
return superwitness;
}
- private void addCelltoSuperwitness(Score currentCell, List tokensA, List tokensB, int lastX, int lastY, List superwitness) {
- int x = currentCell.x;
- int y = currentCell.y;
- List segmentTokensA = tokensA.subList(x, lastX);
- List segmentTokensB = tokensB.subList(y, lastY);
- if (currentCell.match == Boolean.TRUE) {
+ private void addLastCellToSuperwitness(Score lastCell, List tokensA, List tokensB, int currentX, int currentY, List superwitness) {
+ int lastX = lastCell.x;
+ int lastY = lastCell.y;
+ List segmentTokensA = tokensA.subList(currentX, lastX);
+ List segmentTokensB = tokensB.subList(currentY, lastY);
+ if (lastCell.match) {
+ // aligned
+ Segment segment = new Segment(segmentTokensA, segmentTokensB, Score.Type.aligned);
+ superwitness.add(0, segment);
+ }
+ else {
// if currentCell has tokens of type "match", look at lastcell
// if lastCell is addition/omission/replacement stateChange occured and a new segment can be made
// if cell contains tokens from both witnesses its a replacement
@@ -64,10 +69,6 @@ else if (segmentTokensB.isEmpty()) {
superwitness.add(0, segment);
}
}
- // aligned
- else {
- Segment segment = new Segment(segmentTokensA, segmentTokensB, Score.Type.aligned);
- superwitness.add(0, segment);
- }
+
}
}
diff --git a/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java b/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java
index bb41405..319f360 100644
--- a/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java
+++ b/src/test/java/prioritised_xml_collation/ContentSegmenterUnitTest.java
@@ -1,126 +1,186 @@
package prioritised_xml_collation;
+import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
+import static org.hamcrest.core.Is.is;
+import static org.junit.Assert.assertThat;
import org.junit.Ignore;
import org.junit.Test;
+import static prioritised_xml_collation.Score.Type.*;
+import static prioritised_xml_collation.SegmentMatcher.sM;
+import static prioritised_xml_collation.XMLTokenContentMatcher.t;
import java.io.File;
import java.util.Arrays;
import java.util.List;
-import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.assertThat;
-import static prioritised_xml_collation.SegmentMatcher.sM;
-import static prioritised_xml_collation.XMLTokenContentMatcher.t;
-import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
-
/**
* Created by ellibleeker on 08/02/2017.
*/
public class ContentSegmenterUnitTest {
- @Test
- public void testSegmentMatcher() throws Exception {
- List tokensWa = Arrays.asList(new XMLToken("a"), new XMLToken("b"));
- List tokensWb = Arrays.asList(new XMLToken("a"), new XMLToken("b"));
- Score.Type type = Score.Type.aligned;
-
- Segment segment = new Segment(tokensWa, tokensWb, type);
- assertThat(segment, is(sM(Score.Type.aligned).tokensWa(t("a"), t("b")).tokensWb(t("a"), t("b"))));
- }
-
- @Test
- public void testSegmentFactory() throws Exception {
- Segment segment = Segment.s(Score.Type.aligned).tokensWa("a").tokensWb("a");
- assertThat(segment, is(sM(Score.Type.aligned).tokensWa(t("a")).tokensWb(t("a"))));
- }
-
- @Test
- public void testSegmentAligned() throws Exception {
- File input_tokensA = new File("input_xml/witA-simple.xml");
- File input_tokensB = new File("input_xml/witB-simple.xml");
- Tokenizer tokenizer = new Tokenizer();
- List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
- List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
- AbstractScorer contentScorer = new ContentScorer();
- AbstractSegmenter contentSegmenter = new ContentSegmenter();
- EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
- // take that output
- List segments = aligner.align(tokensWa, tokensWb);
- // actualSegment = one segment object with two lists of token(s) and a type
- Segment actualSegment = segments.get(0);
- SegmentMatcher expectedSegment = sM(Score.Type.aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s"));
- // assert that the segment contains the tokens and the type we want it to have
- assertThat(actualSegment, is(expectedSegment));
- }
-
- @Test
- public void testSegmentReplaced() throws Exception {
- File input_tokensA = new File("input_xml/witA-simple.xml");
- File input_tokensB = new File("input_xml/witB-simple.xml");
- Tokenizer tokenizer = new Tokenizer();
- List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
- List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
- AbstractScorer contentScorer = new ContentScorer();
- AbstractSegmenter contentSegmenter = new ContentSegmenter();
- EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
- // take that output
- List segments = aligner.align(tokensWa, tokensWb);
- // actualSegment = one segment object with two lists of token(s) and a type
- Segment actualSegment = segments.get(1);
- System.out.println(segments);
- SegmentMatcher expectedSegment = sM(Score.Type.replacement).tokensWa(t("c")).tokensWb(t("a"));
- // assert that the segment contains the tokens and the type we want it to have
- assertThat(actualSegment, is(expectedSegment));
- }
-
- @Test
- public void testAllSegments() throws Exception {
- File input_tokensA = new File("input_xml/witA-simple.xml");
- File input_tokensB = new File("input_xml/witB-simple.xml");
- Tokenizer tokenizer = new Tokenizer();
- List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
- List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
- AbstractScorer contentScorer = new ContentScorer();
- AbstractSegmenter contentSegmenter = new ContentSegmenter();
- EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
- // take that output
- List segments = aligner.align(tokensWa, tokensWb);
- assertThat(segments, contains(sM(Score.Type.aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")), sM(Score.Type.replacement).tokensWa(t("c")).tokensWb(t("a")), sM(Score.Type.aligned).tokensWa(t("/s"), t("/TEI")).tokensWb(t("/s"), t("/TEI"))));
- }
-
- @Test
- public void testSegmentS21() throws Exception {
- File input_tokensA = new File("input_xml/s21-A.xml");
- File input_tokensB = new File("input_xml/s21-B.xml");
- Tokenizer tokenizer = new Tokenizer();
- List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
- List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
- AbstractScorer contentScorer = new ContentScorer();
- AbstractSegmenter contentSegmenter = new ContentSegmenter();
- EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
- // take that output and align
- List segments = aligner.align(tokensWa, tokensWb);
- System.out.println(segments);
- assertThat(segments, contains(sM(Score.Type.aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"),t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(Score.Type.omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(Score.Type.aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(Score.Type.replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(Score.Type.aligned).tokensWa(t("voor"), t("de")).tokensWb(t("voor"), t("de")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(Score.Type.replacement).tokensWa(t("!")).tokensWb(t(".")), sM(Score.Type.aligned).tokensWa(t("/s"), t("/div"), t("/body"), t("/text")).tokensWb(t("/s"), t("/div"), t("/body"), t("/text"))));
- }
-
- @Ignore("Test fails when root node is not a match")
- @Test
- public void testLastSegment() throws Exception {
- File input_tokensA = new File("input_xml/witA-simple2.xml");
- File input_tokensB = new File("input_xml/witB-simple.xml");
- Tokenizer tokenizer = new Tokenizer();
- List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
- List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
- AbstractScorer contentScorer = new ContentScorer();
- AbstractSegmenter contentSegmenter = new ContentSegmenter();
- EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
- // take that output and align
- List segments = aligner.align(tokensWa, tokensWb);
- assertThat(segments, contains(sM(Score.Type.aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"),t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(Score.Type.omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(Score.Type.aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(Score.Type.replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(Score.Type.aligned).tokensWa(t("vóór"), t("de")).tokensWb(t("vóór"), t("de")), sM(Score.Type.addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(Score.Type.aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(Score.Type.replacement).tokensWa(t("!")).tokensWb(t("."))));
- }
- // TODO When root node is not a match the test fails: adjust typing
- }
+ @Test
+ public void testSegmentMatcher() throws Exception {
+ List tokensWa = Arrays.asList(new XMLToken("a"), new XMLToken("b"));
+ List tokensWb = Arrays.asList(new XMLToken("a"), new XMLToken("b"));
+ Score.Type type = aligned;
+
+ Segment segment = new Segment(tokensWa, tokensWb, type);
+ assertThat(segment, is(sM(aligned).tokensWa(t("a"), t("b")).tokensWb(t("a"), t("b"))));
+ }
+
+ @Test
+ public void testSegmentFactory() throws Exception {
+ Segment segment = Segment.s(aligned).tokensWa("a").tokensWb("a");
+ assertThat(segment, is(sM(aligned).tokensWa(t("a")).tokensWb(t("a"))));
+ }
+
+ @Test
+ public void testSegmentAligned() throws Exception {
+ File input_tokensA = new File("input_xml/witA-simple.xml");
+ File input_tokensB = new File("input_xml/witB-simple.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output
+ List segments = aligner.align(tokensWa, tokensWb);
+ // actualSegment = one segment object with two lists of token(s) and a type
+ Segment actualSegment = segments.get(0);
+ SegmentMatcher expectedSegment = sM(aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s"));
+ // assert that the segment contains the tokens and the type we want it to have
+ assertThat(actualSegment, is(expectedSegment));
+ }
+
+ @Test
+ public void testSegmentReplaced() throws Exception {
+ File input_tokensA = new File("input_xml/witA-simple.xml");
+ File input_tokensB = new File("input_xml/witB-simple.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output
+ List segments = aligner.align(tokensWa, tokensWb);
+ // actualSegment = one segment object with two lists of token(s) and a type
+ Segment actualSegment = segments.get(1);
+ System.out.println(segments);
+ SegmentMatcher expectedSegment = sM(replacement).tokensWa(t("c")).tokensWb(t("a"));
+ // assert that the segment contains the tokens and the type we want it to have
+ assertThat(actualSegment, is(expectedSegment));
+ }
+
+ @Test
+ public void testAllSegments() throws Exception {
+ File input_tokensA = new File("input_xml/witA-simple.xml");
+ File input_tokensB = new File("input_xml/witB-simple.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output
+ List segments = aligner.align(tokensWa, tokensWb);
+ assertThat(segments, contains(sM(aligned).tokensWa(t("TEI"), t("s")).tokensWb(t("TEI"), t("s")), sM(replacement).tokensWa(t("c")).tokensWb(t("a")), sM(aligned).tokensWa(t("/s"), t("/TEI")).tokensWb(t("/s"), t("/TEI"))));
+ }
+
+ @Test
+ public void testSegmentS21() throws Exception {
+ File input_tokensA = new File("input_xml/s21-A.xml");
+ File input_tokensB = new File("input_xml/s21-B.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output and align
+ List segments = aligner.align(tokensWa, tokensWb);
+ System.out.println(segments);
+ assertThat(segments, contains(sM(aligned).tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")).tokensWb(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")), sM(omission).tokensWa(t("lb"), t("/lb")).tokensWb(t("")), sM(aligned).tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")).tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")), sM(addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(aligned).tokensWa(t("vrouw")).tokensWb(t("vrouw")), sM(replacement).tokensWa(t(","), t("de"), t("ongewisheid")).tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")), sM(aligned).tokensWa(t("voor"), t("de")).tokensWb(t("voor"), t("de")), sM(addition).tokensWa(t("")).tokensWb(t("lb"), t("/lb")), sM(aligned).tokensWa(t("liefelijke"), t("toestemming")).tokensWb(t("liefelijke"), t("toestemming")), sM(replacement).tokensWa(t("!")).tokensWb(t(".")), sM(aligned).tokensWa(t("/s"), t("/div"), t("/body"), t("/text")).tokensWb(t("/s"), t("/div"), t("/body"), t("/text"))));
+ }
+
+ @Ignore("Test fails when root node is not a match")
+ @Test
+ public void testLastSegment() throws Exception {
+ File input_tokensA = new File("input_xml/witA-simple2.xml");
+ File input_tokensB = new File("input_xml/witB-simple.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output and align
+ List segments = aligner.align(tokensWa, tokensWb);
+ assertThat(segments, contains(
+ sM(aligned)//
+ .tokensWa(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit"))//
+ .tokensWb(t("text"), t("body"), t("div"), t("s"), t("Hoe"), t("zoet"), t("moet"), t("nochtans"), t("zijn"), t("dit")),//
+ sM(omission)//
+ .tokensWa(t("lb"), t("/lb"))//
+ .tokensWb(t("")),//
+ sM(aligned)//
+ .tokensWa(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een"))//
+ .tokensWb(t("del"), t("werven"), t("om"), t("/del"), t("add"), t("trachten"), t("naar"), t("/add"), t("een")),//
+ sM(addition)//
+ .tokensWa(t(""))//
+ .tokensWb(t("lb"), t("/lb")),//
+ sM(aligned)//
+ .tokensWa(t("vrouw"))//
+ .tokensWb(t("vrouw")),//
+ sM(replacement)//
+ .tokensWa(t(","), t("de"), t("ongewisheid"))//
+ .tokensWb(t("!"), t("/s"), t("s"), t("Die"), t("dagen"), t("van"), t("nerveuze"), t("verwachting")),//
+ sM(aligned)//
+ .tokensWa(t("vóór"), t("de"))//
+ .tokensWb(t("vóór"), t("de")),//
+ sM(addition)//
+ .tokensWa(t(""))//
+ .tokensWb(t("lb"), t("/lb")),//
+ sM(aligned)//
+ .tokensWa(t("liefelijke"), t("toestemming"))//
+ .tokensWb(t("liefelijke"), t("toestemming")),//
+ sM(replacement)//
+ .tokensWa(t("!"))//
+ .tokensWb(t("."))//
+ )//
+ );
+ }
+ // TODO When root node is not a match the test fails: adjust typing
+
+ @Test
+ public void testStartTagReplacement() throws Exception {
+ File input_tokensA = new File("input_xml/w1.xml");
+ File input_tokensB = new File("input_xml/w2.xml");
+ Tokenizer tokenizer = new Tokenizer();
+ List tokensWa = tokenizer.convertXMLFileIntoTokens(input_tokensA);
+ List tokensWb = tokenizer.convertXMLFileIntoTokens(input_tokensB);
+ AbstractScorer contentScorer = new ContentScorer();
+ AbstractSegmenter contentSegmenter = new ContentSegmenter();
+ EditGraphAligner aligner = new EditGraphAligner(contentScorer, contentSegmenter);
+ // take that output and align
+ List segments = aligner.align(tokensWa, tokensWb);
+ System.out.println(segments);
+ assertThat(segments, contains(
+ sM(replacement)//
+ .tokensWa(t("a"))//
+ .tokensWb(t("b")),//
+ sM(aligned)//
+ .tokensWa(t("text"))//
+ .tokensWb(t("text")),//
+ sM(replacement)//
+ .tokensWa(t("/a"))//
+ .tokensWb(t("/b"))//
+ )
+ );
+ }
+
+}