diff --git a/README.md b/README.md
index e43c3a4..e3cea20 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-[![Build Status](https://staging.travis-ci.com/HumanBehaviourChangeProject/Info-extract.svg?branch=master)](https://staging.travis-ci.com/HumanBehaviourChangeProject/Info-extract)
-
# Human Behaviour Change Project (HBCP)
The Human Behaviour-Change Project (HBCP) is a collaboration between behavioral scientists, computer scientists and
diff --git a/core/pom.xml b/core/pom.xml
index aaffd3d..f83d032 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -3,9 +3,9 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 <modelVersion>4.0.0</modelVersion>
- <groupId>ibm.research.drl</groupId>
+ <groupId>com.ibm.drl.hbcp</groupId>
 <artifactId>hbcp-core</artifactId>
- <version>0.0.1-SNAPSHOT</version>
+ <version>0.0.1</version>
 <packaging>war</packaging>
 <name>hbcp-core</name>
@@ -108,8 +108,8 @@
 <artifactId>slf4j-jdk14</artifactId>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
@@ -144,10 +144,11 @@
 <artifactId>lucene-classification</artifactId>
 <version>8.1.1</version>
+
 <groupId>com.google.guava</groupId>
 <artifactId>guava</artifactId>
- <version>27.1-jre</version>
+ <version>30.1.1-jre</version>
org.apache.opennlp
@@ -174,64 +175,64 @@
 <version>1.18.16</version>
 <scope>provided</scope>
-
+
-
-
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
+
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
 <version>1.24</version>
-
-
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-langdetect</artifactId>
+
+
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-langdetect</artifactId>
 <version>1.24</version>
-
-
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-serialization</artifactId>
+
+
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-serialization</artifactId>
 <version>1.24</version>
-
-
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-xmp</artifactId>
+
+
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-xmp</artifactId>
 <version>1.24</version>
-
-
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-batch</artifactId>
+
+
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-batch</artifactId>
 <version>1.24</version>
-
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
-
+
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+
-
+
@@ -240,20 +241,16 @@
 <artifactId>lucene-highlighter</artifactId>
 <version>8.1.1</version>
- <dependency>
- <groupId>net.minidev</groupId>
- <artifactId>json-smart</artifactId>
- <version>2.2.1</version>
- </dependency>
 <groupId>com.fasterxml.jackson.core</groupId>
 <artifactId>jackson-core</artifactId>
+
 <groupId>com.thoughtworks.xstream</groupId>
 <artifactId>xstream</artifactId>
- <version>1.4.11.1</version>
+ <version>1.4.17</version>
 <groupId>org.glassfish</groupId>
@@ -312,17 +309,6 @@
 <artifactId>springfox-swagger-ui</artifactId>
 <version>2.7.0</version>
- <dependency>
- <groupId>org.grobid</groupId>
- <artifactId>grobid-core</artifactId>
- <version>0.5.3</version>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
 <groupId>uk.com.robust-it</groupId>
 <artifactId>cloning</artifactId>
@@ -358,16 +344,16 @@
 <artifactId>glove</artifactId>
 <version>0.3</version>
-
-
-
- <groupId>ch.qos.logback</groupId>
- <artifactId>logback-classic</artifactId>
-
+
+
+
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+
@@ -577,7 +563,7 @@
false
-
+
diff --git a/core/src/main/java/com/ibm/drl/hbcp/api/ExtractorController.java b/core/src/main/java/com/ibm/drl/hbcp/api/ExtractorController.java
index 47b9318..f2f4c41 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/api/ExtractorController.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/api/ExtractorController.java
@@ -200,11 +200,11 @@ protected String extractAll(
protected String extractAllMulti(
@ApiParam("Number of top passages to retrieve for aggregating the confidences of BCT presence")
@RequestParam(value="ntoppassages", required= false, defaultValue = "5") Integer numTopPassagesToRetrieve,
- @ApiParam("A comma separated list of window sizes to use, e.g. '10,20'")
+ @ApiParam("A comma separated list of window sizes to use, e.g. '10,20', only useful if using the unsupervised baseline")
@RequestParam(value="wsizes", required= false, defaultValue = "10,20") String wsizes,
- @ApiParam("A threshold value within the range of [0, 1] (e.g. 0.25)")
+ @ApiParam("A threshold value within the range of [0, 1] (e.g. 0.25), only useful if using the unsupervised baseline")
@RequestParam(value="threshold", required= false, defaultValue = "0.2") Float threshold,
- @ApiParam("Whether to use an ABBYY XML file as input instead of a PDF (true to use ABBYY, false to use raw PDF")
+ @ApiParam("Whether to use an ABBYY XML file as input instead of a PDF (true to use ABBYY .pdf.xml, false to use raw PDF")
@RequestParam(value = "useAbbyy", required = false, defaultValue = "false") boolean useAbbyy,
@ApiParam("Whether to use the faster, less accurate unsupervised extraction algorithm")
@RequestParam(value = "useUnsupervisedBaseline", required = false, defaultValue = "false") boolean useUnsupervisedBaseline,
diff --git a/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/Evaluation_NameAsCategory_NewFlairVersion.java b/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/Evaluation_NameAsCategory_NewFlairVersion.java
index 611a178..b7f6666 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/Evaluation_NameAsCategory_NewFlairVersion.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/Evaluation_NameAsCategory_NewFlairVersion.java
@@ -5,6 +5,7 @@
*/
package com.ibm.drl.hbcp.experiments.flair;
+import com.beust.jcommander.internal.Lists;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -47,11 +48,14 @@
import com.ibm.drl.hbcp.extraction.indexing.IndexedDocument;
import com.ibm.drl.hbcp.extraction.indexing.SentenceBasedIndexManager;
import com.ibm.drl.hbcp.extraction.indexing.SlidingWindowIndexManager;
+import com.ibm.drl.hbcp.inforetrieval.indexer.BaseDirInfo;
import com.ibm.drl.hbcp.parser.AnnotatedAttributeNameNumberTriple;
import com.ibm.drl.hbcp.parser.AnnotatedAttributeValuePair;
import com.ibm.drl.hbcp.parser.JSONRefParser;
import com.ibm.drl.hbcp.parser.cleaning.Cleaners;
import com.ibm.drl.hbcp.util.Props;
+import java.io.FileReader;
+import java.util.Arrays;
/**
*
@@ -68,7 +72,7 @@ public class Evaluation_NameAsCategory_NewFlairVersion {
public Evaluation_NameAsCategory_NewFlairVersion() {
gson = new Gson();
}
-
+
public Map> extractPrediction_old(String jsonfile) throws IOException, Exception {
Map> entitiesPerDoc = new HashMap<>();
Type type = new TypeToken>() {
@@ -92,8 +96,6 @@ public Map> extractPrediction_old(String jsonfile) throws I
}
return entitiesPerDoc;
}
-
-
public Map> extractPrediction(String jsonfile) throws IOException, Exception {
Map> entitiesPerDoc = new HashMap<>();
@@ -119,36 +121,139 @@ public Map> extractPrediction(String jsonfile) throws IOExc
return entitiesPerDoc;
}
+ public Map<String, Map<String, List<String>>> extractPrediction_MentionTag() throws IOException, Exception{
+ Map<String, Map<String, List<String>>> results = new HashMap<>();
+ BufferedReader textFileReader = new BufferedReader(new FileReader("/Users/yhou/git/hbcp-tableqa/mentionExp/mentionTagPredictions"));
+ String line = "";
+ while ((line = textFileReader.readLine()) != null) {
+ String filename = line.split("\t")[0];
+ String mention = line.split("\t")[2];
+ String category = line.split("\t")[3];
+ if(results.containsKey(filename)){
+ if(results.get(filename).containsKey(category)){
+ results.get(filename).get(category).add(mention);
+ }else{
+ List<String> values = new ArrayList<>();
+ values.add(mention);
+ results.get(filename).put(category, values);
+ }
+ }else{
+ Map<String, List<String>> mention4category = new HashMap<>();
+ List<String> values = new ArrayList<>();
+ values.add(mention);
+ mention4category.put(category, values);
+ results.put(filename, mention4category);
+ }
+ }
+
+ return results;
+ }
+
+ public void extractPrediction_table(String filename, Map<String, List<String>> prediction) throws IOException, Exception {
+ File dir_table = new File(BaseDirInfo.getBaseDir() + "../flairExp_tableqa/test_prediction");
+ Type type = new TypeToken<List<SentenceEntityNew>>() {
+ }.getType();
+ for (File file : dir_table.listFiles()) {
+ if (file.getName().contains(filename)) {
+ InputStream inputStream = new FileInputStream(file);
+ Reader reader = new BufferedReader(new InputStreamReader(inputStream));
+ List<SentenceEntityNew> result = gson.fromJson(reader, type);
+ for (SentenceEntityNew predict : result) {
+ if (!predict.entities.isEmpty()) {
+ for (SentenceEntityNew.Entity entity : predict.entities) {
+ if (prediction.containsKey(entity.labels.get(0)._value.replace("_", " "))) {
+ prediction.get(entity.labels.get(0)._value.replace("_", " ")).add(entity.text);
+ } else {
+ List<String> entities = new ArrayList<>();
+ entities.add(entity.text);
+ prediction.put(entity.labels.get(0)._value.replace("_", " "), entities);
+ }
+ }
+ }
+ }
+
+ }
+ }
+ }
+
+ public Map<String, Map<String, String>> extractPrediction_NameValue_old(String jsonfile) throws IOException, Exception {
+ List<String> namevalueAttri = Lists.newArrayList(
+ "Proportion identifying as belonging to a specific ethnic group",
+ "Proportion belonging to specified family or household income category",
+ "Proportion belonging to specified individual income category",
+ "Aggregate relationship status",
+ "Nicotine dependence",
+ "Individual reasons for attrition"
+ );
+ Map<String, Map<String, String>> entitiesPerDoc = new HashMap<>();
+ for (String attr : namevalueAttri) {
+ entitiesPerDoc.put(attr, new HashMap<>());
+ }
+ Type type = new TypeToken<List<SentenceEntity>>() {
+ }.getType();
+ InputStream inputStream = new FileInputStream(new File(jsonfile));
+ Reader reader = new BufferedReader(new InputStreamReader(inputStream));
+ List<SentenceEntity> result = gson.fromJson(reader, type);
+ for (SentenceEntity predict : result) {
+ if (!predict.entities.isEmpty()) {
+ List<SentenceEntity.Entity> nameEntitiesPerSent = new ArrayList<>();
+ Map<SentenceEntity.Entity, String> valueEntitiesPerSent = new LinkedHashMap<>();
+ for (SentenceEntity.Entity entity : predict.entities) {
+ if (namevalueAttri.contains(entity.type.replace("_", " ").replace("-name", ""))) {
+ nameEntitiesPerSent.add(entity);
+ }
+ if (namevalueAttri.contains(entity.type.replace("_", " ").replace("-value", ""))) {
+ valueEntitiesPerSent.put(entity, "not-paired");
+ }
+ }
+ //find paired name-value entities
+ for (SentenceEntity.Entity nameentity : nameEntitiesPerSent) {
+ for (SentenceEntity.Entity valueentity : valueEntitiesPerSent.keySet()) {
+ if (valueEntitiesPerSent.get(valueentity).equalsIgnoreCase("not-paired")
+ && nameentity.type.replace("-name", "").equalsIgnoreCase(valueentity.type.replace("-value", ""))) {
+ entitiesPerDoc.get(nameentity.type.replace("-name", "").replace("_", " ")).put(nameentity.text, valueentity.text);
+ valueEntitiesPerSent.put(valueentity, "paired");
+ }
+
+ }
+ }
+ }
+ }
+ return entitiesPerDoc;
+ }
+
public Map> extractPrediction_NameValue(String jsonfile) throws IOException, Exception {
Map> entitiesPerDoc = new HashMap<>();
- for(String attr: ValueType.COMPLEX_TYPE){
+ for (String attr : ValueType.COMPLEX_TYPE) {
entitiesPerDoc.put(attr, new HashMap<>());
}
- Type type = new TypeToken>() {}.getType();
+ Type type = new TypeToken<List<SentenceEntityNew>>() {
+ }.getType();
InputStream inputStream = new FileInputStream(new File(jsonfile));
Reader reader = new BufferedReader(new InputStreamReader(inputStream));
List result = gson.fromJson(reader, type);
for (SentenceEntityNew predict : result) {
+// if(predict.text.contains("has a value of")) continue;
if (!predict.entities.isEmpty()) {
List nameEntitiesPerSent = new ArrayList<>();
Map valueEntitiesPerSent = new LinkedHashMap<>();
for (SentenceEntityNew.Entity entity : predict.entities) {
- if(ValueType.COMPLEX_TYPE.contains(entity.labels.get(0)._value.replace("_", " ").replace("-name", ""))){
+ if (ValueType.COMPLEX_TYPE.contains(entity.labels.get(0)._value.replace("_", " ").replace("-name", ""))) {
nameEntitiesPerSent.add(entity);
}
- if(ValueType.COMPLEX_TYPE.contains(entity.labels.get(0)._value.replace("_", " ").replace("-value", ""))){
+ if (ValueType.COMPLEX_TYPE.contains(entity.labels.get(0)._value.replace("_", " ").replace("-value", ""))) {
valueEntitiesPerSent.put(entity, "not-paired");
}
}
//find paired name-value entities
- for(SentenceEntityNew.Entity nameentity: nameEntitiesPerSent){
- for(SentenceEntityNew.Entity valueentity: valueEntitiesPerSent.keySet()){
- if(valueEntitiesPerSent.get(valueentity).equalsIgnoreCase("not-paired")
- && nameentity.labels.get(0)._value.replace("-name", "").equalsIgnoreCase(valueentity.labels.get(0)._value.replace("-value", ""))){
+ for (SentenceEntityNew.Entity nameentity : nameEntitiesPerSent) {
+ for (SentenceEntityNew.Entity valueentity : valueEntitiesPerSent.keySet()) {
+ if (valueEntitiesPerSent.get(valueentity).equalsIgnoreCase("not-paired")
+ && nameentity.labels.get(0)._value.replace("-name", "").equalsIgnoreCase(valueentity.labels.get(0)._value.replace("-value", ""))) {
entitiesPerDoc.get(nameentity.labels.get(0)._value.replace("-name", "").replace("_", " ")).put(nameentity.text, valueentity.text);
valueEntitiesPerSent.put(valueentity, "paired");
}
-
+
}
}
}
@@ -156,10 +261,6 @@ public Map> extractPrediction_NameValue(String jsonf
return entitiesPerDoc;
}
-
-
-
-
/**
* Get an index manager as defined by the properties. Can fail if properties
* are missing or wrongly defined
@@ -210,9 +311,7 @@ public void extractPrediction() throws IOException, Exception {
}
}
}
-
-
public void evaluate() throws Exception {
Map evalcount_value = new LinkedHashMap<>();
Map evalcount_present = new LinkedHashMap<>();
@@ -220,16 +319,17 @@ public void evaluate() throws Exception {
for (String s : ValueType.VALUE_TYPE) {
evalcount_value.put(s, new int[3]);
}
- for(String s: ValueType.PRESENCE_TYPE)
+ for (String s : ValueType.PRESENCE_TYPE) {
evalcount_present.put(s, new int[3]);
- for(String s: ValueType.COMPLEX_TYPE)
+ }
+ for (String s : ValueType.COMPLEX_TYPE) {
evalcount_namevalue.put(s, new int[3]);
+ }
//gold
Properties props = Props.loadProperties();
JSONRefParser refParser = new JSONRefParser(props);
// IndexManager index = getDefaultIndexManager(props);
Map> groundTruthPerDoc = new HashMap<>();
-
Cleaners cleaners = new Cleaners(Props.loadProperties());
// AttributeValueCollection annotations = refParser.getAttributeValuePairs();
@@ -250,17 +350,16 @@ public void evaluate() throws Exception {
// AnnotationOutcomesMiner outcomeMiner = new AnnotationOutcomesMiner(props);
// AttributeValueCollection annotations1 = outcomeMiner.withOtherOutcomeAndFollowupSeparate(new AttributeValueCollection<>(annotations));
// AttributeValueCollection cleaned1 = cleaners.clean(new AttributeValueCollection<>(annotations1));
-
-// groundTruthPerDoc.put(docname, cleaned1);
-
+// groundTruthPerDoc.put(docname, cleaned1);
}
-
//prediction_baseline
// Map>> prediction_bl = extract_baseline();
File dir_armname = new File("../flairExp/rank123Exp/testfile_rank123_armname_entityPrediction/");
- File dir = new File("../flairExp/rank123Exp/testfile_rank123_dataAugment1_pubmed_entityPrediction/");
+// File dir_armname = new File("../flairExp/rank123Exp/testfile_rank123_augment1_wotablesent_entityPrediction/");
+ File dir = new File("../flairExp/rank123Exp/testfile_rank123_augment1_wotablesent_entityPrediction/");
+// File dir = new File("../flairExp/rank123Exp/testfile_rank123_dataAugment1_pubmed_entityPrediction/");
// File dir = new File("../flairExp/rank123Exp/testfile_rank123_dataAugment1_lm2_entityPrediction/");
// File dir = new File("../flairExp/rank123Exp/oneentity/testfile_rank123_feedbackonbehaviour_augment1_lm2_entityPrediction/");
// File dir = new File("../flairExp/rank123Exp/testfile_rank123_dataAugment1_newtablesent_entityPrediction/");
@@ -276,8 +375,6 @@ public void evaluate() throws Exception {
// File dir_armname = new File("../flairExp/rank123Exp/testfile_rank123_entityPrediction/");
// File dir = new File("../flairExp/rank123Exp/testfile_rank123_entityPrediction/");
-
-
// File dir = new File("./flairExp/testfile_lrec_entityPrediction/");
// File dir = new File("./flairExp/testfile_entityPrediction_wotable/");
for (File jsonfile : dir.listFiles()) {
@@ -286,43 +383,44 @@ public void evaluate() throws Exception {
Map> gold = new HashMap<>();
Map> gold_namevalue = new HashMap<>();
for (ArmifiedAttributeValuePair cap : groundTruthPerDoc.get(docname)) {
- if(ValueType.COMPLEX_TYPE.contains(cap.getAttribute().getName().trim())){
- AnnotatedAttributeNameNumberTriple nameNumber = (AnnotatedAttributeNameNumberTriple)cap;
- if(gold_namevalue.containsKey(cap.getAttribute().getName().trim())){
+ if (ValueType.COMPLEX_TYPE.contains(cap.getAttribute().getName().trim())) {
+ AnnotatedAttributeNameNumberTriple nameNumber = (AnnotatedAttributeNameNumberTriple) cap;
+ if (gold_namevalue.containsKey(cap.getAttribute().getName().trim())) {
gold_namevalue.get(cap.getAttribute().getName().trim()).put(nameNumber.getValueName(), nameNumber.getValueNumber());
- }else{
+ } else {
Map map = new HashMap<>();
map.put(nameNumber.getValueName(), nameNumber.getValueNumber());
gold_namevalue.put(cap.getAttribute().getName().trim(), map);
}
- }else{
- if (gold.containsKey(cap.getAttribute().getName().trim())) {
- if(cap.getAttribute().getId().equalsIgnoreCase("5730447")){
- gold.get(cap.getAttribute().getName().trim()).addAll(cap.getArm().getAllNames());
- }else{
- gold.get(cap.getAttribute().getName().trim()).add(cap.getValue());
- }
- } else {
- List values = new ArrayList<>();
- if(cap.getAttribute().getName().trim().equalsIgnoreCase("5730447")){
- values.addAll(cap.getArm().getAllNames());
- }else{
- values.add(cap.getValue());
- }
- gold.put(cap.getAttribute().getName().trim(), values);
- }
+ } else {
+ if (gold.containsKey(cap.getAttribute().getName().trim())) {
+ if (cap.getAttribute().getId().equalsIgnoreCase("5730447")) {
+ gold.get(cap.getAttribute().getName().trim()).addAll(cap.getArm().getAllNames());
+ } else {
+ gold.get(cap.getAttribute().getName().trim()).add(cap.getValue());
+ }
+ } else {
+ List values = new ArrayList<>();
+ if (cap.getAttribute().getName().trim().equalsIgnoreCase("5730447")) {
+ values.addAll(cap.getArm().getAllNames());
+ } else {
+ values.add(cap.getValue());
+ }
+ gold.put(cap.getAttribute().getName().trim(), values);
+ }
}
}
-
-
+
Map> prediction = extractPrediction(jsonfile.getAbsolutePath());
+// Map> prediction_armname = extractPrediction(dir_armname.getAbsolutePath() + "/" + jsonfile.getName());
Map> prediction_armname = extractPrediction_old(dir_armname.getAbsolutePath() + "/" + jsonfile.getName());
Map> prediction_nventity = extractPrediction_NameValue(jsonfile.getAbsolutePath());
// Map> prediction = prediction_bl.get(docname);
// for (String att : result.keySet()) {
// System.err.println(att + "--" + result.get(att));
// }
- for(String armname: prediction_armname.keySet()){
+ extractPrediction_table(docname, prediction);
+ for (String armname : prediction_armname.keySet()) {
prediction.put(armname, prediction_armname.get(armname));
}
for (String att : evalcount_present.keySet()) {
@@ -344,14 +442,14 @@ public void evaluate() throws Exception {
} else {
predictStr = "0";
}
- if(!goldStr_source.isEmpty() || !predictStr_source.isEmpty()){
+ if (!goldStr_source.isEmpty() || !predictStr_source.isEmpty()) {
System.err.println(att);
System.err.println(goldStr_source);
System.err.println(predictStr_source);
}
// if (goldStr.equalsIgnoreCase(predictStr)) {
- if(goldStr.equalsIgnoreCase(predictStr)&&goldStr.equalsIgnoreCase("1")){
+ if (goldStr.equalsIgnoreCase(predictStr) && goldStr.equalsIgnoreCase("1")) {
evalcount_present.get(att)[TP]++;
} else if (goldStr.equalsIgnoreCase("1") && predictStr.equalsIgnoreCase("0")) {
evalcount_present.get(att)[FN]++;
@@ -361,8 +459,219 @@ public void evaluate() throws Exception {
}
}
+
+ for (String att : evalcount_value.keySet()) {
+ //value attribute evaluation
+ Set<String> goldValues = new HashSet<>();
+ Set<String> predictValues = new HashSet<>();
+ if (ValueType.VALUE_TYPE.contains(att)) {
+ if (gold.containsKey(att)) {
+ goldValues = Sets.newHashSet(gold.get(att));
+ }
+ if (prediction.containsKey(att)) {
+ predictValues = Sets.newHashSet(prediction.get(att));
+ }
+ System.err.println("gold:" + goldValues);
+ System.err.println("predict:" + predictValues);
+ //
+ if (!goldValues.isEmpty() && predictValues.isEmpty()) {
+ evalcount_value.get(att)[TP] = evalcount_value.get(att)[TP] + 0;
+ evalcount_value.get(att)[FP] = evalcount_value.get(att)[FP] + 0;
+ evalcount_value.get(att)[FN] = evalcount_value.get(att)[FN] + goldValues.size();
+ } else if (goldValues.isEmpty() && !predictValues.isEmpty()) {
+ evalcount_value.get(att)[TP] = evalcount_value.get(att)[TP] + 0;
+ evalcount_value.get(att)[FP] = evalcount_value.get(att)[FP] + predictValues.size();
+ evalcount_value.get(att)[FN] = evalcount_value.get(att)[FN] + 0;
+ } else if (!goldValues.isEmpty() && !predictValues.isEmpty()) {
+ //strict evaluation
+// Set intersection = new HashSet(goldValues); // use the copy constructor
+// intersection.retainAll(predictValues);
+// //relaxed evaluation
+ Set<String> intersection = calculateRelaxedTP(goldValues, predictValues);
+ System.err.println("intersect:" + intersection);
+ System.err.println("tp:" + intersection.size());
+ evalcount_value.get(att)[TP] = evalcount_value.get(att)[TP] + intersection.size();
+ evalcount_value.get(att)[FP] = evalcount_value.get(att)[FP] + predictValues.size() - intersection.size();
+ evalcount_value.get(att)[FN] = evalcount_value.get(att)[FN] + goldValues.size() - intersection.size();
+ int fnsize = goldValues.size() - intersection.size();
+ System.err.println("fn:" + fnsize);
+ }
+
+ }
+
+ }
+ //name-value pairs evaluation
+ for (String att : evalcount_namevalue.keySet()) {
+ //value attribute evaluation
+ Map<String, String> goldValues = new HashMap<>();
+ Map<String, String> predictValues = new HashMap<>();
+ if (ValueType.COMPLEX_TYPE.contains(att)) {
+ if (gold_namevalue.containsKey(att)) {
+ goldValues = Maps.newHashMap(gold_namevalue.get(att));
+ }
+ if (prediction_nventity.containsKey(att)) {
+ predictValues = Maps.newHashMap(prediction_nventity.get(att));
+ }
+ System.err.println(att);
+ System.err.println("gold:" + goldValues);
+ System.err.println("predict:" + predictValues);
+ //
+ if (!goldValues.isEmpty() && predictValues.isEmpty()) {
+ evalcount_namevalue.get(att)[TP] = evalcount_namevalue.get(att)[TP] + 0;
+ evalcount_namevalue.get(att)[FP] = evalcount_namevalue.get(att)[FP] + 0;
+ evalcount_namevalue.get(att)[FN] = evalcount_namevalue.get(att)[FN] + goldValues.size();
+ } else if (goldValues.isEmpty() && !predictValues.isEmpty()) {
+ evalcount_namevalue.get(att)[TP] = evalcount_namevalue.get(att)[TP] + 0;
+ evalcount_namevalue.get(att)[FP] = evalcount_namevalue.get(att)[FP] + predictValues.size();
+ evalcount_namevalue.get(att)[FN] = evalcount_namevalue.get(att)[FN] + 0;
+ } else if (!goldValues.isEmpty() && !predictValues.isEmpty()) {
+// //relaxed evaluation
+ Map<String, String> intersection = calculateRelaxedTP_namevalue(goldValues, predictValues);
+ System.err.println("intersect:" + intersection);
+ System.err.println("tp:" + intersection.size());
+ evalcount_namevalue.get(att)[TP] = evalcount_namevalue.get(att)[TP] + intersection.size();
+ evalcount_namevalue.get(att)[FP] = evalcount_namevalue.get(att)[FP] + predictValues.size() - intersection.size();
+ evalcount_namevalue.get(att)[FN] = evalcount_namevalue.get(att)[FN] + goldValues.size() - intersection.size();
+ int fnsize = goldValues.size() - intersection.size();
+ System.err.println("fn:" + fnsize);
+
+ }
+
+ }
+
+ } //name-value pairs evaluation
+ }
+ //
+ System.err.println("attribute present evaluation:");
+ calEvalMetric(evalcount_present);
+ System.err.println("attribute value evaluation:");
+ calEvalMetric(evalcount_value);
+ System.err.println("attribute name-value evaluation:");
+ calEvalMetric(evalcount_namevalue);
+ }
+
+ public void evaluate_mentionTagExp() throws Exception {
+ Map<String, int[]> evalcount_value = new LinkedHashMap<>();
+ Map<String, int[]> evalcount_present = new LinkedHashMap<>();
+ Map<String, int[]> evalcount_namevalue = new LinkedHashMap<>();
+ for (String s : ValueType.VALUE_TYPE) {
+ evalcount_value.put(s, new int[3]);
+ }
+ for (String s : ValueType.PRESENCE_TYPE) {
+ evalcount_present.put(s, new int[3]);
+ }
+ for (String s : ValueType.COMPLEX_TYPE) {
+ evalcount_namevalue.put(s, new int[3]);
+ }
+ //gold
+ Properties props = Props.loadProperties();
+ JSONRefParser refParser = new JSONRefParser(props);
+ Map> groundTruthPerDoc = new HashMap<>();
+
+ Cleaners cleaners = new Cleaners(Props.loadProperties());
+ for (String docname : refParser.getAttributeValuePairs().byDoc().keySet()) {
+ Collection<? extends AnnotatedAttributeValuePair> annotations = refParser.getAttributeValuePairs().byDoc().get(docname);
+ // copy the annotations to a new list to make them true ArmifiedAttributeValuePair (remove the "Annotated" part)
+ if (annotations == null) {
+ System.err.println(docname + " NOT FOUND in the JSON " + refParser.getFile().getName());
+ continue;
+ }
+ AttributeValueCollection cleaned = cleaners.clean(new AttributeValueCollection<>(annotations));
+ groundTruthPerDoc.put(docname, cleaned);
+ }
+
+ Map<String, Map<String, List<String>>> predictedResults_allDoc = extractPrediction_MentionTag();
+
+ File dir_armname = new File("../flairExp/rank123Exp/testfile_rank123_armname_entityPrediction/");
+ File dir = new File("../flairExp/rank123Exp/testfile_rank123_augment1_wotablesent_entityPrediction/");
+ for (File jsonfile : dir.listFiles()) {
+ String docname = jsonfile.getName().split(".txt")[0];
+ System.err.println(docname);
+ Map<String, List<String>> gold = new HashMap<>();
+ Map<String, Map<String, String>> gold_namevalue = new HashMap<>();
+ for (ArmifiedAttributeValuePair cap : groundTruthPerDoc.get(docname)) {
+ if (ValueType.COMPLEX_TYPE.contains(cap.getAttribute().getName().trim())) {
+ AnnotatedAttributeNameNumberTriple nameNumber = (AnnotatedAttributeNameNumberTriple) cap;
+ if (gold_namevalue.containsKey(cap.getAttribute().getName().trim())) {
+ gold_namevalue.get(cap.getAttribute().getName().trim()).put(nameNumber.getValueName(), nameNumber.getValueNumber());
+ } else {
+ Map<String, String> map = new HashMap<>();
+ map.put(nameNumber.getValueName(), nameNumber.getValueNumber());
+ gold_namevalue.put(cap.getAttribute().getName().trim(), map);
+ }
+ } else {
+ if (gold.containsKey(cap.getAttribute().getName().trim())) {
+ if (cap.getAttribute().getId().equalsIgnoreCase("5730447")) {
+ gold.get(cap.getAttribute().getName().trim()).addAll(cap.getArm().getAllNames());
+ } else {
+ gold.get(cap.getAttribute().getName().trim()).add(cap.getValue());
+ }
+ } else {
+ List<String> values = new ArrayList<>();
+ if (cap.getAttribute().getName().trim().equalsIgnoreCase("5730447")) {
+ values.addAll(cap.getArm().getAllNames());
+ } else {
+ values.add(cap.getValue());
+ }
+ gold.put(cap.getAttribute().getName().trim(), values);
+ }
+ }
+ }
- for(String att: evalcount_value.keySet()){
+ Map<String, List<String>> prediction = new HashMap<>();
+ if(predictedResults_allDoc.containsKey(docname.replace(" ", "_"))){
+ prediction = predictedResults_allDoc.get(docname.replace(" ", "_"));
+ }
+// Map> prediction = extractPrediction(jsonfile.getAbsolutePath());
+// Map> prediction_armname = extractPrediction(dir_armname.getAbsolutePath() + "/" + jsonfile.getName());
+ Map<String, List<String>> prediction_armname = extractPrediction_old(dir_armname.getAbsolutePath() + "/" + jsonfile.getName());
+ Map<String, Map<String, String>> prediction_nventity = extractPrediction_NameValue(jsonfile.getAbsolutePath());
+// Map> prediction = prediction_bl.get(docname);
+// for (String att : result.keySet()) {
+// System.err.println(att + "--" + result.get(att));
+// }
+// extractPrediction_table(docname, prediction);
+// for (String armname : prediction_armname.keySet()) {
+// prediction.put(armname, prediction_armname.get(armname));
+// }
+ for (String att : evalcount_present.keySet()) {
+ String goldStr = "";
+ String predictStr = "";
+ List<String> goldStr_source = new ArrayList<>();
+ List<String> predictStr_source = new ArrayList<>();
+ //present attribute evaluation
+ if (ValueType.PRESENCE_TYPE.contains(att)) {
+ if (gold.containsKey(att)) {
+ goldStr = "1";
+ goldStr_source = gold.get(att);
+ } else {
+ goldStr = "0";
+ }
+ if (prediction.containsKey(att)) {
+ predictStr = "1";
+ predictStr_source = prediction.get(att);
+ } else {
+ predictStr = "0";
+ }
+ if (!goldStr_source.isEmpty() || !predictStr_source.isEmpty()) {
+ System.err.println(att);
+ System.err.println(goldStr_source);
+ System.err.println(predictStr_source);
+ }
+
+// if (goldStr.equalsIgnoreCase(predictStr)) {
+ if (goldStr.equalsIgnoreCase(predictStr) && goldStr.equalsIgnoreCase("1")) {
+ evalcount_present.get(att)[TP]++;
+ } else if (goldStr.equalsIgnoreCase("1") && predictStr.equalsIgnoreCase("0")) {
+ evalcount_present.get(att)[FN]++;
+ } else if (goldStr.equalsIgnoreCase("0") && predictStr.equalsIgnoreCase("1")) {
+ evalcount_present.get(att)[FP]++;
+ }
+
+ }
+ }
+
+ for (String att : evalcount_value.keySet()) {
//value attribute evaluation
Set goldValues = new HashSet<>();
Set predictValues = new HashSet<>();
@@ -403,7 +712,7 @@ public void evaluate() throws Exception {
}
 //name-value pairs evaluation
- for(String att: evalcount_namevalue.keySet()){
+ for (String att : evalcount_namevalue.keySet()) {
//value attribute evaluation
Map goldValues = new HashMap<>();
Map predictValues = new HashMap<>();
@@ -451,8 +760,10 @@ public void evaluate() throws Exception {
System.err.println("attribute name-value evaluation:");
calEvalMetric(evalcount_namevalue);
}
+
- private void calEvalMetric(Map evalcount){
+
+ private void calEvalMetric(Map evalcount) {
System.err.println("att \t precision \t recall \t fscore");
double macro_precision = 0.0;
double macro_recall = 0.0;
@@ -465,18 +776,18 @@ private void calEvalMetric(Map evalcount){
macro_precision = macro_precision + precision;
macro_recall = macro_recall + recall;
macro_fscore = macro_fscore + fscore;
- System.err.println(att + "\t" + precision + "\t" + recall + "\t" + fscore + "\t (" + "tp:" + evalcount.get(att)[TP] + "-fp:" + evalcount.get(att)[FP] + "-fn:" + evalcount.get(att)[FN] + ")");
+ System.err.println(att + "\t" + precision + "\t" + recall + "\t" + fscore + "\t (" + "tp:" + evalcount.get(att)[TP] + "-fp:" + evalcount.get(att)[FP] + "-fn:" + evalcount.get(att)[FN] + ")");
}
- System.err.println("Macro:\t" + macro_precision/evalcount.size() + "\t" + macro_recall/evalcount.size() + "\t" + macro_fscore/evalcount.size());
+ System.err.println("Macro:\t" + macro_precision / evalcount.size() + "\t" + macro_recall / evalcount.size() + "\t" + macro_fscore / evalcount.size());
System.err.println("\n");
}
-
- private Map calculateRelaxedTP_namevalue(Map goldValues, Map predictedValues){
+
+ private Map<String, String> calculateRelaxedTP_namevalue(Map<String, String> goldValues, Map<String, String> predictedValues) {
 Map<String, String> intersection = new HashMap<>();
- for(Entry predict: predictedValues.entrySet()){
- for(Entry gold: goldValues.entrySet()){
- if((gold.getKey().contains(predict.getKey())&&gold.getValue().contains(predict.getValue()))
- ||(predict.getKey().contains(gold.getKey())&&predict.getValue().contains(gold.getValue()))){
+ for (Entry<String, String> predict : predictedValues.entrySet()) {
+ for (Entry<String, String> gold : goldValues.entrySet()) {
+ if ((gold.getKey().contains(predict.getKey()) && gold.getValue().contains(predict.getValue()))
+ || (predict.getKey().contains(gold.getKey()) && predict.getValue().contains(gold.getValue()))) {
intersection.put(predict.getKey(), predict.getValue());
break;
}
@@ -484,33 +795,33 @@ private Map calculateRelaxedTP_namevalue(Map gol
}
return intersection;
}
-
- private Set calculateRelaxedTP(Set goldValues, Set predictedValues){
- //enforce one gold value only mapped to one predicted value for the case like:
- //gold: =0.055, prediction[0.055, 0.05]
- //we only map 0.055 which has the longest overlap with the gold annotation
- Map gold2PredictionMap = new HashMap<>();
- Set intersection = new HashSet<>();
- for(String predict: predictedValues){
- for(String gold: goldValues){
- if(gold.contains(predict)){
- if(gold2PredictionMap.containsKey(gold)){
- String oldPrediction = gold2PredictionMap.get(gold);
- if(predict.contains(oldPrediction)){
- gold2PredictionMap.put(gold, predict);
- }
-
- }else{
- gold2PredictionMap.put(gold, predict);
- }
- break;
- }
- }
- }
- for(String gold: gold2PredictionMap.keySet()){
- intersection.add(gold2PredictionMap.get(gold));
- }
- return intersection;
+
+ private Set<String> calculateRelaxedTP(Set<String> goldValues, Set<String> predictedValues) {
+ //enforce that each gold value is mapped to at most one predicted value, for cases like:
+ //gold: =0.055, prediction[0.055, 0.05]
+ //we only keep 0.055, which has the longest overlap with the gold annotation
+ Map<String, String> gold2PredictionMap = new HashMap<>();
+ Set<String> intersection = new HashSet<>();
+ for (String predict : predictedValues) {
+ for (String gold : goldValues) {
+ if (gold.contains(predict)) {
+ if (gold2PredictionMap.containsKey(gold)) {
+ String oldPrediction = gold2PredictionMap.get(gold);
+ if (predict.contains(oldPrediction)) {
+ gold2PredictionMap.put(gold, predict);
+ }
+
+ } else {
+ gold2PredictionMap.put(gold, predict);
+ }
+ break;
+ }
+ }
+ }
+ for (String gold : gold2PredictionMap.keySet()) {
+ intersection.add(gold2PredictionMap.get(gold));
+ }
+ return intersection;
}
private double recall(int tp, int fn) {
@@ -532,7 +843,7 @@ private double f1Score(double prec, double recall) {
return 2 * prec * recall / (prec + recall);
}
}
-
+
public Map>> extract_baseline() throws IOException, ParseException {
Set testfilename = new HashSet<>();
File dir = new File("./flairExp/testfile_new/");
@@ -540,30 +851,32 @@ public Map>> extract_baseline() throws IOExcept
String docname = jsonfile.getName().split(".txt")[0];
testfilename.add(docname);
}
-
+
Map>> resultsAllDoc = new HashMap<>();
Properties props = Props.loadProperties();
try (InformationExtractor extractor = new InformationExtractor(props)) {
IndexManager index = extractor.getDefaultIndexManager(props);
for (IndexedDocument doc : index.getAllDocuments()) {
- if(!testfilename.contains(doc.getDocName())) continue;
-
+ if (!testfilename.contains(doc.getDocName())) {
+ continue;
+ }
+
HashMap> res = new HashMap>();
-
+
Collection> candidateArms = extractor.getArmExtractor().extract(doc);
Collection arms = candidateArms.stream().map(x -> x.getAnswer()).collect(Collectors.toSet());
Collection> results = extractor.extract(doc);
List armNames = new ArrayList<>();
- for(Arm arm: arms){
+ for (Arm arm : arms) {
armNames.addAll(arm.getAllNames());
}
res.put("5730447", armNames);
- for(CandidateInPassage candi: results){
+ for (CandidateInPassage candi : results) {
String value = candi.getAnswer().getValue();
String attribute = candi.getAnswer().getAttribute().getId();
- if(res.containsKey(attribute)){
+ if (res.containsKey(attribute)) {
res.get(attribute).add(value);
- }else{
+ } else {
List values = new ArrayList<>();
values.add(value);
res.put(attribute, values);
@@ -574,15 +887,13 @@ public Map>> extract_baseline() throws IOExcept
}
return resultsAllDoc;
}
-
-
-
public static void main(String[] args) throws Exception {
Evaluation_NameAsCategory_NewFlairVersion extractor = new Evaluation_NameAsCategory_NewFlairVersion();
// extractor.extractPrediction();
- extractor.evaluate();
+ extractor.evaluate_mentionTagExp();
+// extractor.evaluate();
// extractor.extract_baseline();
}
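A standalone sketch of the "relaxed" true-positive matching implemented by `calculateRelaxedTP` above: a predicted string counts as a hit when it is contained in a gold string, and each gold string keeps only its longest matching prediction, so the example from the method's own comment (gold "=0.055", predictions "0.055" and "0.05") yields exactly one true positive. This is an illustrative re-implementation, not code from the diff.

```java
import java.util.*;

public class RelaxedTpSketch {
    // Mirrors calculateRelaxedTP: substring match, longest prediction wins per gold value.
    static Set<String> relaxedTp(Set<String> gold, Set<String> predicted) {
        Map<String, String> goldToPrediction = new HashMap<>();
        for (String predict : predicted) {
            for (String g : gold) {
                if (g.contains(predict)) {
                    String old = goldToPrediction.get(g);
                    if (old == null || predict.contains(old)) {
                        goldToPrediction.put(g, predict); // keep the longer overlap
                    }
                    break;
                }
            }
        }
        return new HashSet<>(goldToPrediction.values());
    }

    public static void main(String[] args) {
        Set<String> gold = new HashSet<>(Collections.singletonList("=0.055"));
        Set<String> predicted = new HashSet<>(Arrays.asList("0.055", "0.05"));
        System.out.println(relaxedTp(gold, predicted)); // prints [0.055] -> one relaxed TP
    }
}
```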
diff --git a/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/GenerateTrainingData_NameAsCategory.java b/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/GenerateTrainingData_NameAsCategory.java
index 726937b..1d73206 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/GenerateTrainingData_NameAsCategory.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/experiments/flair/GenerateTrainingData_NameAsCategory.java
@@ -48,6 +48,9 @@
import edu.stanford.nlp.simple.Sentence;
import edu.stanford.nlp.simple.Token;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
/**
@@ -94,9 +97,8 @@ public List generateTestingSentence(File jsonPdfOutput){
}
return sentences;
}
-
-
- public List generateTestingSentence(String docName, String dir) {
+
+ public List generateTestingSentence(String docName, String dir) {
// System.err.println("generate testing sentence:" + docName);
List sentences = new ArrayList<>();
File jsonPdfOutput = new File(dir + docName + ".json");
@@ -131,6 +133,52 @@ public List generateTestingSentence(String docName, String dir) {
}
return sentences;
}
+
+
+ public List<String> generateTestingSentence_mention(String docName, String dir) {
+// System.err.println("generate testing sentence:" + docName);
+ List<String> sentences = new ArrayList<>();
+ File jsonPdfOutput = new File(dir + docName + ".json");
+ try {
+ Reparser parser = new Reparser(jsonPdfOutput);
+ for (String str : parser.toText().split("\n")) {
+// if(str.contains("has a value of")&&str.split(" ").length>30) continue;
+// if (str.equalsIgnoreCase("acknowledgements") || str.equalsIgnoreCase("references")) {
+// break;
+// }
+ if (str.equalsIgnoreCase("references")) { //part of rank3 entities are about funding infor
+ break;
+ }
+ if (str.matches(".*?http:.*?")) {
+ continue;
+ }
+ if (str.split(" ").length < 6) {
+ continue;
+ }
+ if(str.matches(".*?(\\d+|\\d+\\%|\\d+\\.\\d+|\\d+\\.\\d+\\%)-(\\d+|\\d+\\%|\\d+\\.\\d+|\\d+\\.\\d+\\%|[a-zA-Z].*?).*?"))
+ str = splitDashBetweenNumbers(str);
+ if(str.matches(".*?(\\d+|\\d+\\%|\\d+\\.\\d+|\\d+\\.\\d+\\%)/(\\d+|\\d+\\%|\\d+\\.\\d+|\\d+\\.\\d+\\%|[a-zA-Z].*?).*?"))
+ str = splitSlashBetweenNumbers(str);
+ if(str.matches(".*? (\\d+th) .*?")||str.contains("1st")||str.contains("2nd")||str.contains("3rd"))
+ str = splitNumberth(str);
+ if(str.matches(".*?(one|two|three|four|five|six|seven|eight|nine|twice)-(month|months|monthly|week|weeks|weekly|day|days|session|sessions|time).*?"))
+ str = splitDashBetweenTokens(str);
+ for(String str1: str.split("(\\. |\\.89 |\\.\\'°01 |\\, \\(|; \\(|\\n|\n|\\?)")){
+// if(str1.split("\\, ").length>10){
+ if(str1.contains("M.Sc.")){
+ System.err.println("filter:" + str1);
+ continue;
+ }
+ if(isContextFromTable(str1))
+ continue;
+ sentences.add(str1);
+ }
+ }
+ } catch (Exception e) {
+ System.err.println(e.toString());
+ }
+ return sentences;
+ }
public void generateTrainTestData_ArmAssociation() throws IOException, Exception{
int traintestsplit = 200;
@@ -520,22 +568,604 @@ public Map generateAnnotationPairsForNameValueEntity(String high
return pairs;
}
+
+ public void generateTestingData_mention_classification() throws IOException, Exception{
+ Map> traintest = generateTrainingTestingFiles();
+ String testdir = "../data/All_512Papers_04March20_extracted_fixname23/";
+ BufferedWriter writer3 = new BufferedWriter(new FileWriter(new File(BaseDirInfo.getBaseDir() + "../mentionExp/menExt/test.tsv")));
+ int filterSent = 0;
+ int senttokencount = 0;
+ String longestSent = "";
+ int sentcount = 0;
+ List<String> debug = new ArrayList<>();
+ List<String> addedDoc = new ArrayList<>();
+ for (String doc : traintest.get("test")) {
+ List<String> sents = generateTestingSentence_mention(doc, testdir);
+ if(sents.isEmpty()){
+ System.err.println(doc + ": no corresponding xml file");
+ continue;
+ }
+ addedDoc.add(doc);
+ int sentid = 0;
+ for (String block : sents) {
+ if(block.contains("has a value of")) continue;
+ List<String> tokens = new ArrayList<>();
+ for(String s: block.split(" ")){
+ tokens.add(s);
+ }
+ if(tokens.size()>150) {
+ filterSent ++;
+ System.err.println("filter:"+ tokens.size() + "--" + block);
+ continue;
+ }
+ if(tokens.size()>senttokencount){
+ senttokencount = tokens.size();
+ longestSent = block;
+ }
+ for(int i=0; itokens.size()) break;
+ String spanContent = "";
+ String newsent = "";
+ int originalStart = i;
+ int originalEnd = i + len-1;
+ for(int k = originalStart; k<=originalEnd; k++) {
+ spanContent = spanContent + " " + tokens.get(k);
+ }
+ spanContent = spanContent.trim();
+ for(int j = 0; j=1775008){
+// debug.add(newsent);
+// }
+ }
+ }
+ }
+ sentid = sentid + 1;
+ }
+ writer3.close();
+ System.err.println("filter sent:" + filterSent + ":" + senttokencount);
+ System.err.println("longest sent:" + longestSent);
+ for(String s: debug)
+ System.err.println(s);
+// for(String s: addedDoc)
+// System.err.print(s);
+
+ }
+
+ public class ExtractedMention{
+ public String mentionid;
+ public String docid;
+ public int sentid;
+ public int start;
+ public int end;
+ public String mentioncontent;
+ public String sentconent;
+ public String fullpremention;
+ public String partialfullpremention;
+
+ public ExtractedMention(String mentionid, String docid, int sentid, int start, int end, String mentioncontent, String sentconent) {
+ this.mentionid = mentionid;
+ this.docid = docid;
+ this.sentid = sentid;
+ this.start = start;
+ this.end = end;
+ this.mentioncontent = mentioncontent;
+ this.sentconent = sentconent;
+ this.fullpremention = "FALSE";
+ this.partialfullpremention = "UNKNOWN";
+ }
+ }
+
+
+ public Map<String, Map<Integer, List<ExtractedMention>>> getExtractedMentionPerDoc() throws IOException, Exception{
+ BufferedReader textFileReader = new BufferedReader(new FileReader("/Users/yhou/git/hbcp-tableqa/mentionExp/menExt/mentionPredictions"));
+ String line = "";
+ Map<String, Map<Integer, List<ExtractedMention>>> mentions_sent_doc = new HashMap<>();
+ Map<String, List<ExtractedMention>> mentionsdoc = new HashMap<>();
+ while ((line = textFileReader.readLine()) != null) {
+ String mentionid = line.split("\t")[0];
+ String docid = mentionid.split("_sent")[0].replace("_", " ");
+ int sentid = Integer.valueOf(mentionid.split("_sent")[1].split("_")[0]);
+ int start = Integer.valueOf(mentionid.split("_sent")[1].split("_")[1]);
+ int end = Integer.valueOf(mentionid.split("_sent")[1].split("_")[2]);
+ String sent = line.split("\t")[2];
+ if(!sent.contains("[unused2]")) {
+ System.err.println(mentionid + ":" + sent);
+ }
+ String mention_content = line.split("\t")[3];
+ ExtractedMention mention = new ExtractedMention(mentionid, docid, sentid, start, end, mention_content, sent);
+ if(mentions_sent_doc.containsKey(docid)) {
+ Map<Integer, List<ExtractedMention>> sent_mentions = mentions_sent_doc.get(docid);
+ if(sent_mentions.containsKey(sentid)) {
+ sent_mentions.get(sentid).add(mention);
+ }else {
+ List<ExtractedMention> sentMentions = new ArrayList<>();
+ sentMentions.add(mention);
+ sent_mentions.put(sentid, sentMentions);
+ }
+ }else {
+ List<ExtractedMention> sentMentions = new ArrayList<>();
+ sentMentions.add(mention);
+ Map<Integer, List<ExtractedMention>> sent_mentions = new HashMap<>();
+ sent_mentions.put(sentid, sentMentions);
+ mentions_sent_doc.put(docid, sent_mentions);
+ }
+ if(mentionsdoc.containsKey(docid)) {
+ mentionsdoc.get(docid).add(mention);
+ }else {
+ List<ExtractedMention> mentions = new ArrayList<>();
+ mentions.add(mention);
+ mentionsdoc.put(docid, mentions);
+ }
+ }
+ return mentions_sent_doc;
+ }
+
+
+
+ public void generateTrainingData_mention_classification() throws IOException, Exception{
+ Map> traintest = generateTrainingTestingFiles();
+ Map<String, Map<Integer, List<ExtractedMention>>> extractedMentions = getExtractedMentionPerDoc();
+ BufferedWriter writer1 = new BufferedWriter(new FileWriter(new File(BaseDirInfo.getBaseDir() + "../mentionExp/train.tsv")));
+
+ BufferedWriter writer2 = new BufferedWriter(new FileWriter(new File(BaseDirInfo.getBaseDir() + "../mentionExp/test.tsv")));
+ String testdir = "../data/All_512Papers_04March20_extracted_fixname23/";
+ Properties props = Props.loadProperties();
+ JSONRefParser refParser = new JSONRefParser(props);
+ Set<String> targetedAttri = Sets.newHashSet(
+ "1.1.Goal setting (behavior)",
+ "1.2 Problem solving",
+ "1.4 Action planning",
+ "2.2 Feedback on behaviour",
+ "2.3 Self-monitoring of behavior",
+ "3.1 Social support (unspecified)",
+ "5.1 Information about health consequences",
+ "5.3 Information about social and environmental consequences",
+ "11.1 Pharmacological support",
+ "11.2 Reduce negative emotions",
+
+ "Arm name",
+ "Outcome value",
+ "Mean age",
+ "Proportion identifying as female gender",
+ "Mean number of times tobacco used",
+ "Proportion identifying as male gender",
+ "Proportion identifying as belonging to a specific ethnic group",
+ "Lower-level geographical region",
+ "Smoking",
+ "4.1 Instruction on how to perform the behavior",
+
+ "Longest follow up",
+ "Effect size p value",
+ "Effect size estimate",
+ "Biochemical verification",
+ "Proportion employed",
+ "4.5. Advise to change behavior",
+ "Proportion achieved university or college education",
+ "Country of intervention",
+ "Self report",
+ "Odds Ratio",
+
+ "Aggregate patient role",
+ "Aggregate health status type",
+ "Aggregate relationship status",
+ "Proportion in a legal marriage or union",
+ "Mean number of years in education completed",
+ "Proportion belonging to specified family or household income category",
+ "Proportion belonging to specified individual income category",
+ "Healthcare facility",
+ "Doctor-led primary care facility",
+ "Hospital facility",
+
+ "Site",
+ "Individual-level allocated",
+ "Individual-level analysed",
+ "Face to face",
+ "Distance",
+ "Printed material",
+ "Digital content type",
+ "Website / Computer Program / App",
+ "Somatic",
+ "Patch",
+
+ "Pill",
+ "Individual",
+ "Group-based",
+ "Health Professional",
+ "Psychologist",
+ "Researcher not otherwise specified",
+ "Interventionist not otherwise specified",
+ "Expertise of Source",
+
+ //rank3
+ "Dose",
+ "Overall duration",
+ "Number of contacts",
+ "Contact frequency",
+ "Contact duration",
+ "Format",
+ "Nicotine dependence",
+ "Cognitive Behavioural Therapy",
+ "Mindfulness",
+ "Motivational Interviewing",
+ "Brief advice",
+ "Physical activity",
+ "Individual reasons for attrition",
+ "Encountered intervention",
+ "Completed intervention",
+ "Sessions delivered",
+ "Pharmaceutical company funding",
+ "Tobacco company funding",
+ "Research grant funding",
+ "No funding",
+ "Pharmaceutical company competing interest",
+ "Tobacco company competing interest",
+ "Research grant competing interest",
+ "No competing interest"
+ );
+ Set<String> nameValueAttri = Sets.newHashSet(
+ "Proportion identifying as belonging to a specific ethnic group",
+ "Proportion belonging to specified family or household income category",
+ "Proportion belonging to specified individual income category",
+ "Aggregate relationship status",
+ "Nicotine dependence",
+ "Individual reasons for attrition"
+ );
+ List>> groundTruth = getGroundTruthForEvaluation_fromJson(refParser);
+ for (Pair> pairsPerDoc : groundTruth) {
+ String doc = pairsPerDoc.getKey();
+ Set<String> matchedTableSent = new HashSet<>();
+ Map<String, Set<String>> annotationPerContext = new HashMap<>();
+ for (ArmifiedAttributeValuePair cap : pairsPerDoc.getValue()) {
+ String annotation = cap.getValue();
+ String highlightedText = ((AnnotatedAttributeValuePair)cap).getHighlightedText();
+ String context = cap.getContext();
+ String attrName = cap.getAttribute().getName();
+ if (!targetedAttri.contains(cap.getAttribute().getName().trim())) {
+ continue;
+ }
+ //annotated context
+ if (isContextFromTable(context)) {
+ //context is from table
+ continue;
+ }
+ if (context.isEmpty() || annotation.isEmpty()) {
+ //context is empty
+ continue;
+ }
+ if (cap.getAttribute().getId().equalsIgnoreCase("5730447")) {//arm name
+ List armNames = cap.getArm().getAllNames();
+ annotation = "";
+ for (String s : armNames) {
+ annotation = s + "##" + cap.getAttribute().getName().trim();
+ if (annotationPerContext.containsKey(context)) {
+ annotationPerContext.get(context).add(annotation + "##" + cap.getAttribute().getName().trim());
+ } else {
+ Set anno = Sets.newHashSet(annotation + "##" + cap.getAttribute().getName().trim());
+ annotationPerContext.put(context, anno);
+ }
+ }
+ }else if(nameValueAttri.contains(cap.getAttribute().getName().trim())){
+ AnnotatedAttributeNameNumberTriple nameNumber = (AnnotatedAttributeNameNumberTriple)cap;
+ String name = nameNumber.getValueName();
+ String value = nameNumber.getValueNumber();
+ if(annotationPerContext.containsKey(context)){
+ annotationPerContext.get(context).add(name + "##" + cap.getAttribute().getName().trim() + "-name");
+ annotationPerContext.get(context).add(value + "##" + cap.getAttribute().getName().trim() + "-value");
+ }else{
+ Set anno = Sets.newHashSet(name + "##" + cap.getAttribute().getName().trim() + "-name");
+ anno.add(value + "##" + cap.getAttribute().getName().trim() + "-value");
+ annotationPerContext.put(context, anno);
+ }
+ }else {
+ if (annotationPerContext.containsKey(context)) {
+ annotationPerContext.get(context).add(annotation + "##" + cap.getAttribute().getName().trim());
+ } else {
+ Set anno = Sets.newHashSet(annotation + "##" + cap.getAttribute().getName().trim());
+ annotationPerContext.put(context, anno);
+ }
+ }
+ }
+ //write to file per doc
+
+ for (String context : annotationPerContext.keySet()) {
+ boolean problematicAnnotation = true;
+ List<String> splitsent = new ArrayList<>();
+ for(String str: context.split("( ; |\\.; |\\. ;|\\.;|\\.;|;;;)")){
+ if(str.split(" ").length>100||(str.contains("\n")&&str.replaceAll("\n", " ").split(" ").length>=100)){
+ for(String str1: str.split("(\\. |\\, \\(|; \\(|\\n|\n|\\?)")){
+ splitsent.add(str1);
+ }
+ }else{
+ splitsent.add(str.replaceAll("\n", " "));
+ }
+ }
+ for(String str: splitsent){
+ if(str.contains("has a value of")) continue;
+ if (str.split(" ").length >= 2) {
+ str = splitDashBetweenNumbers(str);
+ str = splitSlashBetweenNumbers(str);
+ str = splitNumberth(str);
+ str = splitDashBetweenTokens(str);
+ Sentence sent = new Sentence(str);
+ List<NERToken> annotation = getAnnotationOnSent(sent, annotationPerContext.get(context));
+ if(annotation==null){
+ //try to fix the partial annotation problem here, using the fixed annotation to match the context again
+ String currentcontext = str;
+ Set<String> fixedAnnotation = new HashSet<>();
+ for(String annoplustype:annotationPerContext.get(context)){
+ String highlightedText = annoplustype.split("##")[0];
+ String type = annoplustype.split("##")[1];
+ String newhightlightedText = fixParticalAnnotation(currentcontext, highlightedText);
+ fixedAnnotation.add(newhightlightedText + "##" + type);
+ }
+ annotation = getAnnotationOnSent(sent, fixedAnnotation);
+ }
+ if (annotation != null) {
+ if(context.contains("has a value of")){
+ continue;
+ }
+
+
+ List<String> results = generateMentionClassifierTrainingData(doc.replace(" ", "_"), annotation);
+ if (traintest.get("train").contains(doc)) {
+ for(String s: results){
+ writer1.append(s).append("\n");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ //write testing data
+ for(String doc: traintest.get("test")){
+ Map<Integer, List<ExtractedMention>> mentions = extractedMentions.get(doc);
+ if(mentions==null) continue;
+ System.err.println(doc + ":" + mentions.get(0).size());
+ for(Integer sentid: mentions.keySet()){
+ for(ExtractedMention men: mentions.get(sentid)){
+ for(String attri: targetedAttri_mentionExp)
+ writer2.append(doc.replace(" ", "_") + "\t" + men.sentconent + "\t" + men.mentioncontent + "\t" + attri + "\t" + "0").append("\n");
+ }
+ }
+ }
+ writer1.close();
+ writer2.close();
+ }
+
+ Set<String> targetedAttri_mentionExp = Sets.newHashSet(
+ "1.1.Goal setting (behavior)",
+ "1.2 Problem solving",
+ "1.4 Action planning",
+ "2.2 Feedback on behaviour",
+ "2.3 Self-monitoring of behavior",
+ "3.1 Social support (unspecified)",
+ "5.1 Information about health consequences",
+ "5.3 Information about social and environmental consequences",
+ "11.1 Pharmacological support",
+ "11.2 Reduce negative emotions",
+
+ "Arm name",
+ "Outcome value",
+ "Mean age",
+ "Proportion identifying as female gender",
+ "Mean number of times tobacco used",
+ "Proportion identifying as male gender",
+ "Proportion identifying as belonging to a specific ethnic group",
+ "Lower-level geographical region",
+ "Smoking",
+ "4.1 Instruction on how to perform the behavior",
+
+ "Longest follow up",
+ "Effect size p value",
+ "Effect size estimate",
+ "Biochemical verification",
+ "Proportion employed",
+ "4.5. Advise to change behavior",
+ "Proportion achieved university or college education",
+ "Country of intervention",
+ "Self report",
+ "Odds Ratio",
+
+ "Aggregate patient role",
+ "Aggregate health status type",
+ "Aggregate relationship status",
+ "Proportion in a legal marriage or union",
+ "Mean number of years in education completed",
+ "Proportion belonging to specified family or household income category",
+ "Proportion belonging to specified individual income category",
+ "Healthcare facility",
+ "Doctor-led primary care facility",
+ "Hospital facility",
+
+ "Site",
+ "Individual-level allocated",
+ "Individual-level analysed",
+ "Face to face",
+ "Distance",
+ "Printed material",
+ "Digital content type",
+ "Website / Computer Program / App",
+ "Somatic",
+ "Patch",
+
+ "Pill",
+ "Individual",
+ "Group-based",
+ "Health Professional",
+ "Psychologist",
+ "Researcher not otherwise specified",
+ "Interventionist not otherwise specified",
+ "Expertise of Source",
+
+ //rank3
+ "Dose",
+ "Overall duration",
+ "Number of contacts",
+ "Contact frequency",
+ "Contact duration",
+ "Format",
+ "Nicotine dependence",
+ "Cognitive Behavioural Therapy",
+ "Mindfulness",
+ "Motivational Interviewing",
+ "Brief advice",
+ "Physical activity",
+ "Individual reasons for attrition",
+ "Encountered intervention",
+ "Completed intervention",
+ "Sessions delivered",
+ "Pharmaceutical company funding",
+ "Tobacco company funding",
+ "Research grant funding",
+ "No funding",
+ "Pharmaceutical company competing interest",
+ "Tobacco company competing interest",
+ "Research grant competing interest",
+ "No competing interest",
+
+ "a specific ethnic group",
+ "Proportion identifying as belonging to a specific ethnic group",
+ "specified family or household income category",
+ "Proportion belonging to specified family or household income category",
+ "specified individual income category",
+ "Proportion belonging to specified individual income category",
+ "Aggregate relationship status name",
+ "Aggregate relationship status value",
+ "Nicotine dependence name",
+ "Nicotine dependence value",
+ "Individual reasons for attrition name",
+ "Individual reasons for attrition value"
+ );
+
+
+ public List<String> generateMentionClassifierTrainingData(String doc, List<NERToken> annotation) {
+ List<String> results = new ArrayList<>();
+ List<String> mentionIndexWithType = new ArrayList<>();
+ Boolean mentionStart = false;
+ int index = 0;
+ int mentionStartIndex = 0;
+ int mentionEndIndex = 0;
+ String mentionType = "";
+ for(NERToken token: annotation){
+ if(!mentionStart&&token.nertag.contains("B-")){
+ mentionStart = true;
+ mentionStartIndex = index;
+ mentionType = token.nertag;
+ }else if(mentionStart && token.nertag.contains("B")) {
+ mentionEndIndex = index-1;
+ mentionIndexWithType.add(mentionStartIndex + "#" + mentionEndIndex + "#" + mentionType);
+ mentionStart = true;
+ mentionType = token.nertag;
+ mentionStartIndex = index;
+ mentionEndIndex = 0;
+ }else if(mentionStart && token.nertag.contains("O")) {
+ mentionEndIndex = index -1;
+ mentionIndexWithType.add(mentionStartIndex + "#" + mentionEndIndex + "#" + mentionType);
+ mentionStart = false;
+ mentionType = "";
+ mentionStartIndex = 0;
+ mentionEndIndex = 0;
+ }
+ index = index + 1;
+ }
+ for(String s: mentionIndexWithType){
+ int start = Integer.valueOf(s.split("#")[0]);
+ int end = Integer.valueOf(s.split("#")[1]);
+ String type = s.split("#")[2];
+ String sent = "";
+ String spanContent = "";
+ for(int k = start; k<=end; k++) {
+ spanContent = spanContent + " " + annotation.get(k).word;
+ }
+ spanContent = spanContent.trim();
+
+ for(int i=0; i< annotation.size(); i++){
+ NERToken token = annotation.get(i);
+ if(start==i && end==i){
+ sent = sent + " [unused1] " + token.word + " [unused2]";
+ }else if(start ==i){
+ sent = sent + " [unused1] " + token.word;
+ }else if(end ==i){
+ sent = sent + " " + token.word + " [unused2]";
+ }else{
+ sent = sent + " " + token.word;
+ }
+ }
+ type = type.replace("B-", "").replaceAll("_", "").replace("-", "").trim();
+ if(type.equalsIgnoreCase("Proportion identifying as belonging to a specific ethnic group name")){
+ type = "a specific ethnic group";
+ }
+ if(type.equalsIgnoreCase("Proportion identifying as belonging to a specific ethnic group value")){
+ type = "Proportion identifying as belonging to a specific ethnic group";
+ }
+ if(type.equalsIgnoreCase("Proportion belonging to specified family or household income category name")){
+ type = "specified family or household income category";
+ }
+ if(type.equalsIgnoreCase("Proportion belonging to specified family or household income category value")){
+ type = "Proportion belonging to specified family or household income category";
+ }
+ if(type.equalsIgnoreCase("Proportion belonging to specified individual income category name")){
+ type = "specified individual income category";
+ }
+ if(type.equalsIgnoreCase("Proportion belonging to specified individual income category value")){
+ type = "Proportion belonging to specified individual income category";
+ }
+ results.add(doc + "\t" + sent.trim() + "\t" +spanContent + "\t"+ type + "\t" + "1");
+ for(String attri: targetedAttri_mentionExp){
+ if(!type.equalsIgnoreCase(attri)){
+ results.add(doc + "\t" + sent.trim() + "\t" + spanContent + "\t"+ attri + "\t" + "0");
+ }
+ }
+ }
+ return results;
+ }
public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
Map> traintest = generateTrainingTestingFiles();
- FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_augment1_newtablesent.csv"));
+ FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_oddsratio_wotablesent.csv"));
+// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_augment1_wotablesent.csv"));
+// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_augment1_newtablesent.csv"));
// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_augment2.csv"));
// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_augment1.csv"));
// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/train_rank123_oldtablesent_moreoutcome.csv"));
// FileWriter writer1 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "flairExp/train_rank12_wotable.csv"));
StringBuffer sb1 = new StringBuffer();
- FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_augment1_newtablesent.csv"));
+// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_augment1_newtablesent.csv"));
+ FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_oddsratio_wotablesent.csv"));
+// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_augment1_wotablesent.csv"));
// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_augment2.csv"));
// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_augment1.csv"));
// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/test_rank123_oldtablesent_moreoutcome.csv"));
// FileWriter writer2 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "flairExp/test_rank12_wotable.csv"));
StringBuffer sb2 = new StringBuffer();
- FileWriter writer4 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/unmatch_020420.txt"));
+ FileWriter writer4 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/unmatch_050220.txt"));
StringBuffer sb4 = new StringBuffer();
String testdir = "../data/All_512Papers_04March20_extracted_fixname23/";
@@ -547,7 +1177,7 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
// IndexManager index = getDefaultIndexManager(props);
//priority rank1 + rank2 + rank3
- Set<String> augmentAttri = Sets.newHashSet(
+ Set<String> augmentAttri1 = Sets.newHashSet(
"2.2 Feedback on behaviour",
"Doctor-led primary care facility",
"Patch",
@@ -571,13 +1201,18 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
"Encountered intervention",
"Sessions delivered"
);
- Set<String> augmentAttri_namevalue = Sets.newHashSet(
+ Set<String> augmentAttri_namevalue1 = Sets.newHashSet(
"Proportion belonging to specified family or household income category-name",
"Proportion belonging to specified individual income category-name",
"Aggregate relationship status-name",
"Proportion identifying as belonging to a specific ethnic group-name",
"Nicotine dependence-name"
);
+ Set<String> augmentAttri_namevalue = Sets.newHashSet(
+ );
+ Set<String> augmentAttri = Sets.newHashSet(
+ );
+
Set<String> targetedAttri1 = Sets.newHashSet("Arm name");
Set<String> targetedAttri2 = Sets.newHashSet(
// "Proportion identifying as belonging to a specific ethnic group",
@@ -591,10 +1226,10 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
// "Proportion identifying as male gender"
// "2.2 Feedback on behaviour"
);
- Set<String> nameValueAttri1 = Sets.newHashSet(
+ Set<String> nameValueAttri = Sets.newHashSet(
// "Proportion identifying as belonging to a specific ethnic group"
);
- Set<String> nameValueAttri = Sets.newHashSet(
+ Set<String> nameValueAttri1 = Sets.newHashSet(
"Proportion identifying as belonging to a specific ethnic group",
"Proportion belonging to specified family or household income category",
"Proportion belonging to specified individual income category",
@@ -607,6 +1242,10 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
// "Proportion identifying as belonging to a specific ethnic group"
);
Set<String> targetedAttri = Sets.newHashSet(
+ "Odds Ratio"
+// "Proportion identifying as belonging to a specific ethnic group"
+ );
+ Set<String> targetedAttri4 = Sets.newHashSet(
// "Minimum age",
// "Maximum age",
// "All male",
@@ -623,7 +1262,7 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
"11.1 Pharmacological support",
"11.2 Reduce negative emotions",
-// "Arm name",
+ "Arm name",
"Outcome value",
"Mean age",
"Proportion identifying as female gender",
@@ -840,7 +1479,7 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
// for (String str : context.split("( ; |.; |. ;|.;|.;)")) {
// for (String str : context.split("(;|,)")) {
for(String str: splitsent){
-// if(str.contains("has a value of")) continue;
+ if(str.contains("has a value of")) continue;
if (str.split(" ").length >= 2) {
str = splitDashBetweenNumbers(str);
str = splitSlashBetweenNumbers(str);
@@ -955,28 +1594,28 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
}
// System.err.print(matchedTableSent);
//for table sentences, bring in all 'O' sentences
- List sents = generateTestingSentence(doc, testdir);
- int tableSentCount = 0;
- for(String sent: sents){
- if(sent.contains("has a value of")&&!matchedTableSent.contains(sent)){
- Sentence tableSent = new Sentence(sent);
- List tokens_original = tableSent.tokens();
- tableSentCount++;
- if(tableSentCount>10) break;
- if(traintest.get("train").contains(doc)){ //training
- for(Token t: tokens_original){
- sb1.append(doc.replace(" ", "_") + "\t" + t.originalText() + "\t" + t.posTag() + "\t" + "O" + "\n");
- }
- sb1.append("\n");
- }else{ //testing
- for(Token t: tokens_original){
- sb2.append(doc.replace(" ", "_") + "\t" + t.originalText() + "\t" + t.posTag() + "\t" + "O" + "\n");
- }
- sb2.append("\n");
- }
-
- }
- }// add table sentences which do not have any annotations
+// List sents = generateTestingSentence(doc, testdir);
+// int tableSentCount = 0;
+// for(String sent: sents){
+// if(sent.contains("has a value of")&&!matchedTableSent.contains(sent)){
+// Sentence tableSent = new Sentence(sent);
+// List tokens_original = tableSent.tokens();
+// tableSentCount++;
+// if(tableSentCount>10) break;
+// if(traintest.get("train").contains(doc)){ //training
+// for(Token t: tokens_original){
+// sb1.append(doc.replace(" ", "_") + "\t" + t.originalText() + "\t" + t.posTag() + "\t" + "O" + "\n");
+// }
+// sb1.append("\n");
+// }else{ //testing
+// for(Token t: tokens_original){
+// sb2.append(doc.replace(" ", "_") + "\t" + t.originalText() + "\t" + t.posTag() + "\t" + "O" + "\n");
+// }
+// sb2.append("\n");
+// }
+//
+// }
+// }// add table sentences which do not have any annotations
}
//add augmented annotations in the training file
// for(String attribute: augmentAttri){
@@ -1115,21 +1754,21 @@ public void generateTrainTestData_BIO_Tagging() throws IOException, Exception {
System.err.println("num of HBCP annotated instances:" + instanceNum_annotate);
System.err.println("num of problematic annotated instances:" + problematicAnnotationCount);
//generate real testing data
-// for (String doc : traintest.get("test")) {
-// List sents = generateTestingSentence(doc, testdir);
-// if(sents.isEmpty()){
-// System.err.println(doc + ": no corresponding xml file");
-// continue;
-// }
-//// FileWriter writer3 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "flairExp/testfile/" + doc + ".txt"));
-// FileWriter writer3 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/testfile_rank123/" + doc + ".txt"));
-// StringBuffer sb3 = new StringBuffer();
-// for (String sent : sents) {
-// sb3.append(sent).append("\n");
-// }
-// writer3.write(sb3.toString());
-// writer3.close();
-// }
+ for (String doc : traintest.get("test")) {
+ List<String> sents = generateTestingSentence(doc, testdir);
+ if(sents.isEmpty()){
+ System.err.println(doc + ": no corresponding xml file");
+ continue;
+ }
+// FileWriter writer3 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "flairExp/testfile/" + doc + ".txt"));
+ FileWriter writer3 = new FileWriter(new File(BaseDirInfo.getBaseDir() + "../flairExp/rank123Exp/testfile_rank123/" + doc + ".txt"));
+ StringBuffer sb3 = new StringBuffer();
+ for (String sent : sents) {
+ sb3.append(sent).append("\n");
+ }
+ writer3.write(sb3.toString());
+ writer3.close();
+ }
for(String attri: stat.keySet()){
System.err.println(attri + "\t" + stat.get(attri).get(0) + "\t" + stat.get(attri).get(1) + "\t" + stat.get(attri).get(2) + "\t" + stat.get(attri).get(3));
}
@@ -1506,7 +2145,15 @@ public static void main(String[] args) throws IOException, Exception {
// System.err.println(generatorTrainTest.isContextFromTable(s1));
// System.err.println(generatorTrainTest.isContextFromTable(s2));
- generatorTrainTest.generateTrainTestData_BIO_Tagging();
+ generatorTrainTest.generateTrainTestData_BIO_Tagging();
+
+//mention classification exp
+// generate candidate mentions
+// generatorTrainTest.generateTestingData_mention_classification();
+ //generate mention classification data
+// generatorTrainTest.generateTrainingData_mention_classification();
+
+
// generatorTrainTest.generateTestingFileForPhysicalActivity("/Users/yhou/git/hbcp/data/Physical_Activity_extracted");
// generatorTrainTest.count();
diff --git a/core/src/main/java/com/ibm/drl/hbcp/experiments/ie/parsing/AbbyyVsGrobidComparer.java b/core/src/main/java/com/ibm/drl/hbcp/experiments/ie/parsing/AbbyyVsGrobidComparer.java
index 5d100d9..b9e6fd3 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/experiments/ie/parsing/AbbyyVsGrobidComparer.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/experiments/ie/parsing/AbbyyVsGrobidComparer.java
@@ -9,7 +9,6 @@
import com.ibm.drl.hbcp.parser.pdf.Document;
import com.ibm.drl.hbcp.parser.pdf.manager.PdfToAbbyyParse;
import com.ibm.drl.hbcp.parser.pdf.manager.PdfToDocumentFunction;
-import com.ibm.drl.hbcp.parser.pdf.manager.PdfToGrobidParse;
import com.ibm.drl.hbcp.parser.pdf.reparsing.ReparsePdfToDocument;
import com.ibm.drl.hbcp.util.Props;
import com.opencsv.CSVWriter;
@@ -42,8 +41,7 @@ public class AbbyyVsGrobidComparer {
public AbbyyVsGrobidComparer() throws IOException {
props = Props.loadProperties();
parsers = Lists.newArrayList(
- new PdfToAbbyyParse(props),
- new PdfToGrobidParse()
+ new PdfToAbbyyParse(props)
);
}
diff --git a/core/src/main/java/com/ibm/drl/hbcp/experiments/lrec20/OpenAccessIndexer.java b/core/src/main/java/com/ibm/drl/hbcp/experiments/lrec20/OpenAccessIndexer.java
deleted file mode 100644
index c8500fa..0000000
--- a/core/src/main/java/com/ibm/drl/hbcp/experiments/lrec20/OpenAccessIndexer.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package com.ibm.drl.hbcp.experiments.lrec20;
-
-import com.ibm.drl.hbcp.core.attributes.ExtractedAttributeValuePair;
-import com.ibm.drl.hbcp.core.attributes.collection.AttributeValueCollection;
-import com.ibm.drl.hbcp.inforetrieval.indexer.PaperIndexer;
-import com.ibm.drl.hbcp.parser.AnnotatedAttributeValuePair;
-import com.ibm.drl.hbcp.parser.JSONRefParser;
-import com.ibm.drl.hbcp.parser.pdf.grobid.OpenAccessDatasetGeneratorForLREC2020;
-import org.apache.tika.exception.TikaException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.stream.Collectors;
-
-public class OpenAccessIndexer extends PaperIndexer {
-
- private static Logger logger = LoggerFactory.getLogger(OpenAccessIndexer.class);
- private final Collection<String> oaFilenames;
-
- /**
- * Constructs the object by taking as an argument the relative path of a
- * properties file.
- *
- * @param propFile Relative path (from the project base) to the a properties file.
- * @param oaFilenames Open access filenames
- */
- public OpenAccessIndexer(String propFile, Collection<String> oaFilenames) throws IOException {
- super(propFile);
- this.oaFilenames = oaFilenames;
- }
-
- @Override
- protected void indexDirectory(File dir) throws SAXException, IOException {
- logger.info("Indexing directory " + dir);
- File[] files = dir.listFiles();
- if (files == null) {
- return;
- }
- Arrays.sort(files);
-
- for (File f : files) {
- if (f.isDirectory()) {
- logger.info("Indexing directory " + f.getName());
- indexDirectory(f); // recurse
- }
- else
- if (oaFilenames.contains(f.getName())) {
- try {
- indexFile(f);
- } catch (TikaException e) {
-// e.printStackTrace();
- System.err.println(e.getMessage());
- System.err.println("Skipping " + f.getName() + " (couldn't parse and index PDF)...");
- }
- }
- }
- }
-
-
- public static void main(String[] args) {
- try {
- String openAccessJsonsPath = "data/lrec2020/openaccesspapers_extracted_humanreadable";
- JSONRefParser annotations = new JSONRefParser(new File("data/jsons/SmokingPapers407_19Nov19.json"));
- final AttributeValueCollection<AnnotatedAttributeValuePair> annotatedAttributeValuePairs = OpenAccessDatasetGeneratorForLREC2020.openAccessDataset(openAccessJsonsPath, annotations);
- final Collection<String> oaFilenames = annotatedAttributeValuePairs.stream().map(ExtractedAttributeValuePair::getDocName).collect(Collectors.toSet());
- final OpenAccessIndexer openAccessIndexer = new OpenAccessIndexer("src/main/java/com/ibm/drl/hbcp/experiments/lrec20/lrec.properties", oaFilenames);
- openAccessIndexer.processAll();
-
- } catch (IOException | TikaException | SAXException e) {
- e.printStackTrace();
- }
- }
-
-}
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/AnnotatedAttributeValuePair.java b/core/src/main/java/com/ibm/drl/hbcp/parser/AnnotatedAttributeValuePair.java
index c528d90..d4c4597 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/AnnotatedAttributeValuePair.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/AnnotatedAttributeValuePair.java
@@ -83,8 +83,10 @@ public boolean isNameNumberPair() {
@Override
public String toString() {
// value and highlighted text should be the same
- return "{attributeId:" + attribute.getId() + ", attributeName:" + attribute.getName() + ", value:" + getSingleLineValue() + ", context:" + context
- + ", docName:" + getDocName() + ", annotationPage:" + annotationPage + ", sprintNo:" + sprintNo + ", arm:" + arm + "}";
+ return "{attributeId:" + attribute.getId() + ", attributeName:" + attribute.getName() + ", value:" + getSingleLineValue()
+ + ", context:" + normalizeWhitespace(context)
+ + ", highlightedText: " + normalizeWhitespace(highlightedText)
+ + ", docName:" + getDocName() + ", arm:" + arm + "}";
}
protected String normalizeWhitespace(String s) { return s.replaceAll("\\s", " "); }
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/JSONRefParser.java b/core/src/main/java/com/ibm/drl/hbcp/parser/JSONRefParser.java
index 4c86d49..5b584be 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/JSONRefParser.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/JSONRefParser.java
@@ -12,10 +12,7 @@
import com.ibm.drl.hbcp.core.attributes.collection.AttributeValueCollection;
import com.ibm.drl.hbcp.inforetrieval.indexer.BaseDirInfo;
import com.ibm.drl.hbcp.parser.cleaning.typing.LineConsistencyChecker;
-import com.ibm.drl.hbcp.parser.jsonstructure.JsonAnnotationFile;
-import com.ibm.drl.hbcp.parser.jsonstructure.JsonCode;
-import com.ibm.drl.hbcp.parser.jsonstructure.JsonItemAttributeFullTextDetail;
-import com.ibm.drl.hbcp.parser.jsonstructure.JsonReference;
+import com.ibm.drl.hbcp.parser.jsonstructure.*;
import com.ibm.drl.hbcp.util.FileUtils;
import com.ibm.drl.hbcp.util.ParsingUtils;
import com.ibm.drl.hbcp.util.Props;
@@ -173,6 +170,12 @@ private List getInstances(JsonAnnotationFile json)
res.addAll(getAttributeValuePairs(code, reference.getItemId()));
}
}
+ // separate handling of outcomes (only used for physical activity)
+ if (reference.getOutcomes() != null) {
+ for (JsonOutcome outcome : reference.getOutcomes()) {
+ res.addAll(getAttributeValuePairs(outcome, reference));
+ }
+ }
}
return res;
}
@@ -246,18 +249,56 @@ private List<? extends AnnotatedAttributeValuePair> getAttributeValuePairs(JsonI
new AnnotatedAttributeValuePair(attribute, highlightedText, docName, finalArm, context, highlightedText, finalSprintNo, finalAnnotationPage)
);
}
+ }
+ private List<AnnotatedAttributeValuePair> getAttributeValuePairs(JsonOutcome outcome, JsonReference reference) {
+ Optional<String> docNameTrue = getFirstDocname(reference);
+ String docName = docNameTrue.orElse(outcome.getShortTitle());
+ // we use the highlighted text to put the table caption
+ String highlightedText = outcome.getOutcomeDescription();
+ Arm arm1 = getAssignedArm(outcome.getItemArmIdGrp1(), outcome.getGrp1ArmName());
+ Arm arm2 = getAssignedArm(outcome.getItemArmIdGrp2(), outcome.getGrp2ArmName());
+ // outcome values
+ Attribute ovAttribute = attributes.getFromName("Outcome value");
+ AnnotatedAttributeValuePair ov1 = new AnnotatedAttributeValuePair(ovAttribute, outcome.getData3(),
+ docName, arm1, "", highlightedText, "", 0);
+ AnnotatedAttributeValuePair ov2 = new AnnotatedAttributeValuePair(ovAttribute, outcome.getData4(),
+ docName, arm2, "", highlightedText, "", 0);
+ // timepoints
+ Attribute timepointAttribute = attributes.getFromName("Longest follow up");
+ AnnotatedAttributeValuePair tp1 = new AnnotatedAttributeValuePair(timepointAttribute, outcome.getTimepointString(),
+ docName, arm1, "", highlightedText, "", 0);
+ AnnotatedAttributeValuePair tp2 = new AnnotatedAttributeValuePair(timepointAttribute, outcome.getTimepointString(),
+ docName, arm2, "", highlightedText, "", 0);
+ // timepoint units
+ Attribute timepointUnitAttribute = attributes.getFromName("Longest follow up (metric)");
+ AnnotatedAttributeValuePair tpUnit1 = new AnnotatedAttributeValuePair(timepointUnitAttribute, outcome.getItemTimepointMetric(),
+ docName, arm1, "", highlightedText, "", 0);
+ AnnotatedAttributeValuePair tpUnit2 = new AnnotatedAttributeValuePair(timepointUnitAttribute, outcome.getItemTimepointMetric(),
+ docName, arm2, "", highlightedText, "", 0);
+ // sample size
+ Attribute samplesizeAttribute = attributes.getFromName("Individual-level analysed");
+ AnnotatedAttributeValuePair ss1 = new AnnotatedAttributeValuePair(samplesizeAttribute, outcome.getData1(),
+ docName, arm1, "", highlightedText, "", 0);
+ AnnotatedAttributeValuePair ss2 = new AnnotatedAttributeValuePair(samplesizeAttribute, outcome.getData2(),
+ docName, arm2, "", highlightedText, "", 0);
+ // TODO: anything else?
+ return Lists.newArrayList(ov1, ov2, tp1, tp2, tpUnit1, tpUnit2, ss1, ss2);
}
protected Arm getAssignedArm(JsonCode code, int itemId) {
int armId = isArmifiedBasedOnItemId ? itemId : (isArmified ? code.getArmId() : 0);
+ return getAssignedArm(armId, code.getArmTitle());
+ }
+
+ protected Arm getAssignedArm(int armId, String armTitle) {
Arm arm = arms.get(armId);
if (arm == null) {
// this means the arm is implicit in the document (not annotated/declared), we add it on-the-fly
// this is an expected behavior
- arm = new Arm(String.valueOf(code.getArmId()), code.getArmTitle());
- logger.debug("Arm {} was implicit (not declared/annotated).", code.getArmId());
- arms.put(code.getArmId(), arm);
+ arm = new Arm(String.valueOf(armId), armTitle);
+ logger.debug("Arm {} was implicit (not declared/annotated).", armId);
+ arms.put(armId, arm);
}
return arm;
}
@@ -497,14 +538,44 @@ public static void displayReachAttributes() throws IOException {
}
}
+ public static void outcomeValuesStats() throws IOException {
+ JSONRefParser parser = new JSONRefParser(new File("../data/jsons/All_annotations_512papers_05March20.json"));
+ double total = 0;
+ List<Double> values = new ArrayList<>();
+ // compute mean
+ double mean = 0.0;
+ for (AnnotatedAttributeValuePair outcome : parser.getAttributeValuePairs().byId().get(Attributes.get().getFromName("Outcome value").getId())) {
+ try {
+ double value = ParsingUtils.parseFirstDouble(outcome.getValue());
+ mean += value;
+ values.add(value);
+ total++;
+ } catch (NumberFormatException e) {
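+ // ignore outcome values that cannot be parsed as numbers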
+
+ }
+ }
+ mean /= total;
+ // compute SD
+ double sd = 0.0;
+ for (double value : values) {
+ sd += (value - mean) * (value - mean);
+ }
+ sd /= total;
+ sd = Math.sqrt(sd);
+ System.out.println("Mean = " + mean);
+ System.out.println("SD = " + sd);
+ }
+
public static void countAttributes() throws IOException {
JSONRefParser parser = new JSONRefParser(new File("../data/jsons/All_annotations_512papers_05March20.json"));
System.out.println("Attribute count: " + parser.getAttributeValuePairs().getAllAttributeIds().size());
}
public static void countAttributesPA() throws IOException {
- JSONRefParser parser = new JSONRefParser(new File("../data/jsons/PhysicalActivity Sprint1ArmsAnd Prioritised47Papers.json"));
- System.out.println("Attribute count for PA: " + parser.getAttributeValuePairs().getAllAttributeIds().size());
+ JSONRefParser parser = new JSONRefParser(new File("data/PhysicalActivity Sprint1ArmsAnd Prioritised47Papers.json"));
+ System.out.println("Docs: " + parser.getAttributeValuePairs().getDocNames().size());
+ parser = new JSONRefParser(new File("data/Batch2PhysicalActivityPrioritisedCodeset.json"));
+ System.out.println("Docs: " + parser.getAttributeValuePairs().getDocNames().size());
}
public static void mainTableGrammar() throws IOException {
@@ -537,7 +608,7 @@ public static void mainTableGrammar() throws IOException {
public static void main(String[] args) throws IOException {
//mainTableGrammar();
//countAttributes();
- //countAttributesPA();
- displayReachAttributes();
+ countAttributesPA();
+ //outcomeValuesStats();
}
}
\ No newline at end of file
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/Cleaners.java b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/Cleaners.java
index 749b74f..54bfde1 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/Cleaners.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/Cleaners.java
@@ -27,9 +27,11 @@ public Cleaners(Properties properties) {
List<String> predefinedNumericAttributeIds = Arrays.asList(properties.getProperty("prediction.attribtype.numerical").split(","));
cleaners = Lists.newArrayList(
new NEqualsArtifactCleaner(),
+ new ExoticCharacterCleaner(),
new ValueCompletionCleaner(),
new NumericValueCleaner(predefinedNumericAttributeIds),
new ContextCompletionCleaner(properties, predefinedNumericAttributeIds),
+ new EmptyContextStandardizingCleaner(),
new ContextInTableCleaner(properties)
);
}
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/EmptyContextStandardizingCleaner.java b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/EmptyContextStandardizingCleaner.java
new file mode 100644
index 0000000..22d7a1c
--- /dev/null
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/EmptyContextStandardizingCleaner.java
@@ -0,0 +1,92 @@
+package com.ibm.drl.hbcp.parser.cleaning;
+
+import com.google.common.collect.Multiset;
+import com.ibm.drl.hbcp.core.attributes.AttributeType;
+import com.ibm.drl.hbcp.core.attributes.collection.AttributeValueCollection;
+import com.ibm.drl.hbcp.parser.AnnotatedAttributeValuePair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * An empty context is a new convention appearing in Physical Activity annotations and indicates a value
+ * found in a table. Most of the time, only one of the in-table population characteristics or outcome has a non-empty
+ * context, and this will be the name/title/caption of the table in the PDF.
+ *
+ * This cleaner empties all contexts assumed to be associated with table values under this new convention, and
+ * systematically puts the name/title/caption of the table in the "highlighted text" field instead, so that further cleaners
+ * can consistently identify and locate table values.
+ *
+ * @author mgleize
+ */
+public class EmptyContextStandardizingCleaner implements Cleaner {
+
+ public static final Logger log = LoggerFactory.getLogger(EmptyContextStandardizingCleaner.class);
+
+ @Override
+ public List<AnnotatedAttributeValuePair> clean(Collection<AnnotatedAttributeValuePair> original) {
+ List<AnnotatedAttributeValuePair> res = new ArrayList<>();
+ // split across docnames
+ AttributeValueCollection<AnnotatedAttributeValuePair> collection = new AttributeValueCollection<>(original);
+ for (String docName : collection.byDoc().keySet()) {
+ Multiset<AnnotatedAttributeValuePair> avps = collection.byDoc().get(docName);
+ // only values to touch are of the population type, since the outcomes already respect the convention
+ // described in the class description
+ List<AnnotatedAttributeValuePair> toFix = avps.stream()
+ .filter(this::isContextInTable)
+ .collect(Collectors.toList());
+ List<AnnotatedAttributeValuePair> notToFix = avps.stream()
+ .filter(aavp -> !isContextInTable(aavp))
+ .collect(Collectors.toList());
+ // there should be a (hopefully unique) non-empty context in this collection to mark the table caption
+ List<String> nonEmptyContexts = toFix.stream()
+ .filter(aavp -> !aavp.getContext().isEmpty())
+ .map(AnnotatedAttributeValuePair::getContext)
+ .distinct()
+ .collect(Collectors.toList());
+ // if there are none or several, simply signal it and take the first (or don't touch anything)
+ if (nonEmptyContexts.isEmpty()) {
+ // haven't found any table name, just return the collection as is
+ res.addAll(new ArrayList<>(avps));
+ } else {
+ String tableCaption = nonEmptyContexts.get(0);
+ if (nonEmptyContexts.size() > 1) {
+ log.info("In {}: several potential table captions have been detected, taking the first", docName);
+ }
+ // fix all the toFix values (leave context empty, put the table caption as "highlighted text")
+ List<AnnotatedAttributeValuePair> fixed = toFix.stream()
+ .map(aavp -> getStandardizedTableValue(aavp, tableCaption))
+ .collect(Collectors.toList());
+ // add to the result the fixed values AND the ones to leave untouched
+ res.addAll(fixed);
+ res.addAll(notToFix);
+ }
+ }
+ return res;
+ }
+
+ private AnnotatedAttributeValuePair getStandardizedTableValue(AnnotatedAttributeValuePair tableOriginalValue,
+ String tableCaption) {
+ return new AnnotatedAttributeValuePair(tableOriginalValue.getAttribute(), tableOriginalValue.getValue(),
+ tableOriginalValue.getDocName(), tableOriginalValue.getArm(),
+ "",
+ tableCaption,
+ tableOriginalValue.getSprintNo(), tableOriginalValue.getAnnotationPage());
+ }
+
+ private boolean isContextInTable(AnnotatedAttributeValuePair original) {
+ // for a population attribute
+ // either an empty context, or a context starting with "Table"
+ return original.getAttribute().getType() == AttributeType.POPULATION
+ && (original.getContext().isEmpty()
+ || contextStartsWithTable(original));
+ }
+
+ private boolean contextStartsWithTable(AnnotatedAttributeValuePair original) {
+ return original.getContext().trim().toLowerCase().startsWith("table ");
+ }
+}
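
For illustration, a minimal sketch of the convention this cleaner standardizes (the attribute names, arm label, caption, and values below are invented, the attribute is assumed to be of the population type, the constructor argument order follows the calls above, and imports and the surrounding class are omitted):

Attribute meanAge = Attributes.get().getFromName("Mean age");
Attribute femaleProportion = Attributes.get().getFromName("Proportion identifying as female gender");
Arm arm = new Arm("1", "intervention");
// Physical Activity style annotation: only one of the in-table values carries the table caption as its context
AnnotatedAttributeValuePair withCaption = new AnnotatedAttributeValuePair(
        meanAge, "42.3", "paper1.pdf", arm,
        "Table 1. Baseline characteristics", "", "", 0);
// ... while the sibling table values carry an empty context
AnnotatedAttributeValuePair withoutCaption = new AnnotatedAttributeValuePair(
        femaleProportion, "61", "paper1.pdf", arm,
        "", "", "", 0);
List<AnnotatedAttributeValuePair> cleaned = new EmptyContextStandardizingCleaner()
        .clean(Arrays.asList(withCaption, withoutCaption));
// after cleaning, both values have an empty context and carry
// "Table 1. Baseline characteristics" in their highlighted-text field
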
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/ExoticCharacterCleaner.java b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/ExoticCharacterCleaner.java
new file mode 100644
index 0000000..340e002
--- /dev/null
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/ExoticCharacterCleaner.java
@@ -0,0 +1,58 @@
+package com.ibm.drl.hbcp.parser.cleaning;
+
+import com.google.common.collect.Lists;
+import com.ibm.drl.hbcp.parser.AnnotatedAttributeValuePair;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Attempts to get rid of weird exotic characters introduced during annotation,
+ * mostly ligatures like "ﬁ" instead of the 2 characters "fi"
+ *
+ * @author mgleize
+ */
+public class ExoticCharacterCleaner implements Cleaner {
+
+ private static final List<Pair<Character, String>> CHAR_REPLACEMENTS = Lists.newArrayList(
+ Pair.of('ﬁ', "fi"),
+ Pair.of('ﬂ', "fl"),
+ Pair.of('ﬄ', "ffl"),
+ Pair.of('ﬃ', "ffi"),
+ Pair.of('ﬀ', "ff")
+ );
+
+ private static final Set<Character> EXOTIC_CHARS = CHAR_REPLACEMENTS.stream()
+ .map(Pair::getKey).collect(Collectors.toSet());
+
+ @Override
+ public List<AnnotatedAttributeValuePair> clean(Collection<AnnotatedAttributeValuePair> original) {
+ return original.stream()
+ .map(this::clean)
+ .collect(Collectors.toList());
+ }
+
+ private AnnotatedAttributeValuePair clean(AnnotatedAttributeValuePair original) {
+ String cleanedValue = clean(original.getValue());
+ String cleanedContext = clean(original.getContext());
+ String cleanedHighlightedText = clean(original.getHighlightedText());
+ AnnotatedAttributeValuePair newAvp = new AnnotatedAttributeValuePair(original.getAttribute(),
+ cleanedValue, original.getDocName(), original.getArm(),
+ cleanedContext,
+ cleanedHighlightedText, original.getSprintNo(), original.getAnnotationPage()
+ );
+ // return it only if it has changed compared to the original
+ return newAvp.equals(original) ? original : newAvp;
+ }
+
+ private String clean(String s) {
+ for (Pair<Character, String> charAndReplacement : CHAR_REPLACEMENTS) {
+ s = StringUtils.replace(s, charAndReplacement.getKey().toString(), charAndReplacement.getValue());
+ }
+ return s;
+ }
+}
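
As a rough sketch of the replacement performed here (the input string is invented; inside the cleaner the same substitution is applied to the value, context, and highlighted text of each pair):

String raw = "a signi\uFB01cant e\uFB00ect";   // contains the U+FB01 ("fi") and U+FB00 ("ff") ligature characters
String cleaned = StringUtils.replace(StringUtils.replace(raw, "\uFB01", "fi"), "\uFB00", "ff");
// cleaned is now "a significant effect"
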
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/TableValueFinder.java b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/TableValueFinder.java
index d8ec922..71c2c80 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/TableValueFinder.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/cleaning/TableValueFinder.java
@@ -8,6 +8,7 @@
import com.ibm.drl.hbcp.util.ParsingUtils;
import lombok.Getter;
import org.apache.commons.lang3.tuple.Pair;
+import org.jetbrains.annotations.NotNull;
import java.util.*;
import java.util.regex.Pattern;
@@ -33,21 +34,33 @@ public TableValueFinder(Document document, boolean useStrictCellEquality) {
public Optional<TableValue> findTableValue(AnnotatedAttributeValuePair avp) {
// detect if this AVP was found in a table
- Optional<TableAnnotationAnalyzer.TableAnnotationAnalysis> tableAnalysis = tableAnnotationAnalyzer.analyze(avp);
+ Optional<TableAnnotationAnalyzer.TableAnnotationAnalysis> tableAnalysis = getTableAnnotationAnalysis(avp);
if (!tableAnalysis.isPresent())
return Optional.empty();
- if (tablePreprocessings.isEmpty()) // it means that the AVP actually wasn't in a table (false positive of the TableAnnotationAnalyzer)
+ if (tablePreprocessings.isEmpty()) // no tables detected in the document
return Optional.empty();
// find the table most likely to contain the value (by computing some kind of recall of the cells in the context)
List<String> numbersInTableContext = tableAnalysis.get().getNumericCellSequences().stream()
.flatMap(List::stream)
.map(ParsingUtils::parseFirstDoubleString)
.collect(Collectors.toList());
- List<TablePreprocessing> closestTables = findClosestTables(numbersInTableContext, tablePreprocessings);
+ List<TablePreprocessing> closestTables = findClosestTables(avp, numbersInTableContext, tablePreprocessings);
// find the value in the ABBYY table
return findCell(avp.getValue(), closestTables);
}
+ private Optional<TableAnnotationAnalyzer.TableAnnotationAnalysis> getTableAnnotationAnalysis(AnnotatedAttributeValuePair avp) {
+ Optional<TableAnnotationAnalyzer.TableAnnotationAnalysis> tableAnalysis = tableAnnotationAnalyzer.analyze(avp);
+ if (!tableAnalysis.isPresent() && isOtherTableValue(avp)) {
+ TableAnnotationAnalyzer.TableAnnotationAnalysis emptyAnalysis = new TableAnnotationAnalyzer.TableAnnotationAnalysis(
+ avp,
+ new ArrayList<>()
+ );
+ tableAnalysis = Optional.of(emptyAnalysis);
+ }
+ return tableAnalysis;
+ }
+
public Pair<List<TableValue>, List<TableValue>> getSameRowAndSameColumnValues(TableValue value) {
List<TableValue> fullTable = value.getTableBlock().getTable();
List<TableValue> sameRow = fullTable.stream()
@@ -61,7 +74,35 @@ public Pair<List<TableValue>, List<TableValue>> getSameRowAndSameColumnValues(Ta
return Pair.of(sameRow, sameColumn);
}
- private List<TablePreprocessing> findClosestTables(List<String> numbersInTableContext, Map<Block, TablePreprocessing> tables) {
+ private List<TablePreprocessing> findClosestTables(AnnotatedAttributeValuePair avp,
+ List<String> numbersInTableContext,
+ Map<Block, TablePreprocessing> tables) {
+ List<TablePreprocessing> closestTables;
+ if (numbersInTableContext.isEmpty() && isOtherTableValue(avp)) {
+ // first use the Physical Activity convention (the more recent one):
+ // a table value has empty context (and the table caption has been added in "highlighted text")
+ String tableCaption = avp.getHighlightedText();
+ closestTables = findClosestTablesWithTableCaption(tableCaption, tables);
+ } else {
+ // then use the Smoking Cessation convention (the older one):
+ // the table value has a context with 3 rows of the table to help locate the table
+ closestTables = findClosestTablesWithContextNumbers(numbersInTableContext, tables);
+ }
+ return closestTables;
+ }
+
+ private List<TablePreprocessing> findClosestTablesWithTableCaption(String tableCaption, Map<Block, TablePreprocessing> tables) {
+ List<TablePreprocessing> res = new ArrayList<>();
+ for (Block table : tables.keySet()) {
+ TablePreprocessing preprocessedTable = tables.get(table);
+ if (tableHasHeader(preprocessedTable, tableCaption)) {
+ res.add(preprocessedTable);
+ }
+ }
+ return res;
+ }
+
+ private List<TablePreprocessing> findClosestTablesWithContextNumbers(List<String> numbersInTableContext, Map<Block, TablePreprocessing> tables) {
List<TablePreprocessing> res = new ArrayList<>();
long max = Long.MIN_VALUE;
for (Block table : tables.keySet()) {
@@ -97,6 +138,14 @@ private Optional<TableValue> findCell(String value, List<TablePreprocessing> tab
}
}
+ /** Detects other types of table values that the TableAnnotationAnalyzer couldn't detect, such as:
+ * table values in Physical Activity for which the convention is that the context stays empty
+ * and the table caption is found somewhere else */
+ private boolean isOtherTableValue(AnnotatedAttributeValuePair avp) {
+ // highlighted text will carry the table caption
+ return avp.getContext().isEmpty() && !avp.getHighlightedText().isEmpty();
+ }
+
private Map<Block, TablePreprocessing> preprocessAbbyyTable(Document document) {
Map<Block, TablePreprocessing> res = new LinkedHashMap<>();
for (Block table : document.getTables()) {
@@ -133,6 +182,34 @@ private long countSharedCells(List contextNumbers, Set tableNumb
.count();
}
+ private boolean tableHasHeader(TablePreprocessing table, String searchString) {
+ Set<String> headers = getHeaders(table);
+ for (String header : headers) {
+ if (headerMatchesSearchString(header, searchString)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private Set<String> getHeaders(TablePreprocessing table) {
+ Set<String> res = new HashSet<>();
+ for (TableValue value : table.getTable()) {
+ res.addAll(value.getRowHeaders());
+ res.addAll(value.getColumnHeaders());
+ }
+ return res;
+ }
+
+ private boolean headerMatchesSearchString(@NotNull String header, String searchString) {
+ // soft match on the start of the string
+ if (header.length() < "Table X".length()) {
+ return header.equals(searchString);
+ } else {
+ return searchString.startsWith(header);
+ }
+ }
+
public static String escapeRegex(String pattern) {
return pattern.replaceAll("[-\\[\\]{}()*+?.,\\\\\\^$|#\\s]", "\\\\$0");
}
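
To make the caption matching above concrete, a standalone restatement of the soft-match rule used by headerMatchesSearchString (the example strings are invented):

static boolean headerMatchesCaption(String header, String caption) {
    // headers shorter than "Table X" must equal the caption exactly;
    // longer headers only need to be a prefix of the caption
    if (header.length() < "Table X".length()) {
        return header.equals(caption);
    }
    return caption.startsWith(header);
}
// headerMatchesCaption("Table 2. Outcome measures", "Table 2. Outcome measures at 12 months") -> true
// headerMatchesCaption("BMI", "Table 2. Outcome measures at 12 months") -> false
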
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonOutcome.java b/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonOutcome.java
new file mode 100644
index 0000000..4c4e2c4
--- /dev/null
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonOutcome.java
@@ -0,0 +1,58 @@
+package com.ibm.drl.hbcp.parser.jsonstructure;
+
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * The new Outcome JSON object produced by EPPI reviewer, and first used in Physical Activity annotations
+ * @author mgleize
+ */
+@Getter @Setter
+public class JsonOutcome {
+ private String outcomeId; // probably useless
+ private String outcomeText; // probably useless
+ private int outcomeTypeId; // probably useless
+ private String outcomeTypeName; // probably useless
+ private String title;
+ private String shortTitle; // docname (minus the trailing ".pdf")
+ // information on timepoint
+ private String itemTimepointDisplayValue; // value + unit (can be null :( )
+ private String itemTimepointMetric; // unit
+ private String itemTimepointValue; // value only
+ // information on arms
+ private int itemArmIdGrp1;
+ private int itemArmIdGrp2;
+ private String grp1ArmName;
+ private String grp2ArmName;
+ // context information: the table caption
+ private String outcomeDescription;
+ // data: N's
+ private String data1; // group 1 N
+ private String data1Desc;
+ private String data2; // group 2 N
+ private String data2Desc;
+ // data: means
+ private String data3; // group 1 mean
+ private String data3Desc;
+ private String data4; // group 2 mean
+ private String data4Desc;
+ // data: confidence intervals
+ private String data5; // group 1 CI lower
+ private String data5Desc;
+ private String data6; // group 2 CI lower
+ private String data6Desc;
+ private String data7; // group 1 CI upper
+ private String data7Desc;
+ private String data8; // group 2 CI upper
+ private String data8Desc;
+
+ public boolean hasAnyOfImportantFieldsNull() {
+ return data1 == null || data2 == null || data3 == null || data4 == null
+ || itemTimepointDisplayValue == null;
+ }
+
+ public String getTimepointString() {
+ return itemTimepointDisplayValue != null ? itemTimepointDisplayValue : itemTimepointValue;
+ }
+
+}
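
As a sketch of how these fields are consumed by JSONRefParser above (the sample values are invented; the setters come from Lombok's @Getter/@Setter):

JsonOutcome outcome = new JsonOutcome();
outcome.setGrp1ArmName("intervention");
outcome.setGrp2ArmName("control");
outcome.setData1("120");                   // group 1 N    -> "Individual-level analysed"
outcome.setData2("118");                   // group 2 N    -> "Individual-level analysed"
outcome.setData3("35.2");                  // group 1 mean -> "Outcome value"
outcome.setData4("28.9");                  // group 2 mean -> "Outcome value"
outcome.setItemTimepointValue("6");        // -> "Longest follow up" (via getTimepointString())
outcome.setItemTimepointMetric("months");  // -> "Longest follow up (metric)"
outcome.setOutcomeDescription("Table 3. Weekly minutes of activity");  // carried as the highlighted text
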
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonReference.java b/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonReference.java
index 2d1499e..8e4ce3d 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonReference.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/jsonstructure/JsonReference.java
@@ -10,5 +10,8 @@ public class JsonReference {
private String title;
private String shortTitle;
private String Abstract;
- private JsonCode[] codes = new JsonCode[0]; // empty codes can happen when the document has no annotation
+ // empty codes can happen when the document has no annotation
+ private JsonCode[] codes = new JsonCode[0];
+ // exclusively used in physical activity, not smoking cessation
+ private JsonOutcome[] outcomes = new JsonOutcome[0];
}
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/TableValue.java b/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/TableValue.java
index 187b9f9..00db3b1 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/TableValue.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/TableValue.java
@@ -36,9 +36,9 @@ public TableValue(String value, List<String> rowHeaders, List<String> columnHead
public String toText() {
return TEMPLATE
.replace("[row]",
- StringUtils.join(rowHeaders, ": "))
+ StringUtils.join(getFilteredHeadersForTableSentence(rowHeaders), ": "))
.replace("[column]",
- StringUtils.join(columnHeaders, ": "))
+ StringUtils.join(getFilteredHeadersForTableSentence(columnHeaders), ": "))
.replace("[value]", value.trim())
.replaceAll("[\\r\\n]+", "");
}
@@ -51,9 +51,18 @@ private List<String> getFilteredHeaders(List<String> headers) {
.collect(Collectors.toList());
}
+ private List<String> getFilteredHeadersForTableSentence(List<String> headers) {
+ return headers.stream()
+ .filter(this::isValidSingleHeaderForTableSentence)
+ .collect(Collectors.toList());
+ }
+
private boolean isValidSingleHeader(String header) {
- return CONTAINS_LETTER_REGEX.matcher(header).find() &&
- !header.toLowerCase().matches(".*table [0-9]+.*");
+ return CONTAINS_LETTER_REGEX.matcher(header).find();
+ }
+
+ private boolean isValidSingleHeaderForTableSentence(String header) {
+ return !header.toLowerCase().matches(".*table [0-9]+.*");
}
@Override
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/experiments/GrobidRecall.java b/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/experiments/GrobidRecall.java
index d9f5eea..6781d60 100644
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/experiments/GrobidRecall.java
+++ b/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/experiments/GrobidRecall.java
@@ -14,7 +14,6 @@
import com.ibm.drl.hbcp.parser.pdf.TableValue;
import com.ibm.drl.hbcp.parser.pdf.manager.PdfToAbbyyParse;
import com.ibm.drl.hbcp.parser.pdf.manager.PdfToDocumentFunction;
-import com.ibm.drl.hbcp.parser.pdf.manager.PdfToGrobidParse;
import com.ibm.drl.hbcp.parser.pdf.reparsing.ReparsePdfToDocument;
import com.ibm.drl.hbcp.util.ParsingUtils;
import com.ibm.drl.hbcp.util.Props;
@@ -41,7 +40,7 @@ public class GrobidRecall {
public GrobidRecall(Properties props) {
this.props = props;
abbyyParser = new PdfToAbbyyParse(props);
- grobidParser = new PdfToGrobidParse();
+ grobidParser = new PdfToAbbyyParse(props);
reparseParser = new ReparsePdfToDocument(ABBYY_PDF_BY_GROBID_FOLDER);
}
diff --git a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/grobid/OpenAccessDatasetGeneratorForLREC2020.java b/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/grobid/OpenAccessDatasetGeneratorForLREC2020.java
deleted file mode 100644
index 53f4d99..0000000
--- a/core/src/main/java/com/ibm/drl/hbcp/parser/pdf/grobid/OpenAccessDatasetGeneratorForLREC2020.java
+++ /dev/null
@@ -1,109 +0,0 @@
-package com.ibm.drl.hbcp.parser.pdf.grobid;
-
-import com.ibm.drl.hbcp.core.attributes.collection.AttributeValueCollection;
-import com.ibm.drl.hbcp.parser.AnnotatedAttributeValuePair;
-import com.ibm.drl.hbcp.parser.JSONRefParser;
-import com.ibm.drl.hbcp.parser.cleaning.Cleaners;
-import com.ibm.drl.hbcp.parser.pdf.Document;
-import com.ibm.drl.hbcp.parser.pdf.manager.PdfToDocumentFunction;
-import com.ibm.drl.hbcp.parser.pdf.manager.PdfToGrobidParse;
-import com.ibm.drl.hbcp.parser.pdf.reparsing.Reparser;
-import com.ibm.drl.hbcp.util.FileUtils;
-import com.ibm.drl.hbcp.util.Props;
-
-import java.io.*;
-import java.util.*;
-import java.util.stream.Collectors;
-
-/**
- * Script class to process and generate the dataset for our submission at LREC2020
- *
- * @author mgleize
- */
-public class OpenAccessDatasetGeneratorForLREC2020 {
-
- final static PdfToDocumentFunction grobidParser = new PdfToGrobidParse();
-
- public static void writeAllOpenAccessPapersJson(File tabSeparatedOpenAccessDataset, File folder, JSONRefParser annotations) throws Exception {
- try (BufferedReader br = new BufferedReader(new FileReader(tabSeparatedOpenAccessDataset))) {
- String line = br.readLine();
- while ((line = br.readLine()) != null) {
- String[] splits = line.split("\t");
- String shortTitle = splits[1];
- Optional<JSONRefParser.PdfInfo> pdfInfo = annotations.getDocInfoFromShortTitle(shortTitle);
- File pdf = new File("data/All_330_PDFs_renamed/" + pdfInfo.get().getFilename());
- try {
- grobidParser.getDocument(pdf).writeToFile(new File(folder, shortTitle + ".json"));
- } catch (IOException e) {
- System.err.println("Failed on " + shortTitle);
- }
- System.out.println("Done: " + shortTitle);
- }
- }
- }
-
- public static void writeAllHumanReadableJsons(File tabSeparatedOpenAccessDataset, File inputJsonFolder, JSONRefParser annotations, File outputJsonFolder) throws IOException {
- try (BufferedReader br = new BufferedReader(new FileReader(tabSeparatedOpenAccessDataset))) {
- String line = br.readLine();
- while ((line = br.readLine()) != null) {
- String[] splits = line.split("\t");
- String shortTitle = splits[1];
- JSONRefParser.PdfInfo pdfInfo = annotations.getDocInfoFromShortTitle(shortTitle).get();
- File extractionJson = new File(inputJsonFolder, shortTitle + ".json");
- Reparser reparsed = new Reparser(extractionJson);
- Document doc = reparsed.getDocument();
- FileUtils.writeJsonToFile(
- doc.toHumanReadableJson(pdfInfo.getTitle(), pdfInfo.getShortTitle(), pdfInfo.getFilename(), pdfInfo.getIntroduction()),
- new File(outputJsonFolder, shortTitle + ".json")
- );
- System.out.println("Done: " + shortTitle);
- }
- }
- }
-
- public static AttributeValueCollection<AnnotatedAttributeValuePair> openAccessDataset(String inputJsonFolder, JSONRefParser annotations) {
- AttributeValueCollection<AnnotatedAttributeValuePair> all = annotations.getAttributeValuePairs();
- System.out.println("All: " + all.size());
- Set<String> shortTitles = Arrays.stream(new File(inputJsonFolder).listFiles())
- .map(File::getName)
- .map(filename -> filename.replaceAll("\\.json", ""))
- .collect(Collectors.toSet());
- List