Skip to content

Commit

Permalink
Limit gathered phrase indexes to excerpt fields (#1641)
Browse files Browse the repository at this point in the history
Reduce the phrase indexes gathered to only the fields that we need to
gather excerpts for.

Resolves #1607
  • Loading branch information
lbschanno authored Jul 27, 2023
1 parent bd3fd6b commit 94ebd6f
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 28 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package datawave.query.function;

import java.util.Set;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.commons.jexl2.DatawaveJexlScript;
Expand Down Expand Up @@ -38,6 +40,9 @@ public class JexlEvaluation implements Predicate<Tuple3<Key,Document,DatawaveJex
// do we need to gather phrase offsets
private boolean gatherPhraseOffsets = false;

// The set of fields for which we should gather phrase offsets.
private Set<String> phraseOffsetFields;

/**
* Compiled and flattened jexl script
*/
Expand Down Expand Up @@ -81,6 +86,7 @@ public boolean apply(Tuple3<Key,Document,DatawaveJexlContext> input) {
TermOffsetMap termOffsetMap = (TermOffsetMap) input.third().get(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME);
if (termOffsetMap != null && isGatherPhraseOffsets() && arithmetic instanceof HitListArithmetic) {
termOffsetMap.setGatherPhraseOffsets(true);
termOffsetMap.setExcerptFields(phraseOffsetFields);
}

// now evaluate
Expand Down Expand Up @@ -163,4 +169,11 @@ public void setGatherPhraseOffsets(boolean gatherPhraseOffsets) {
this.gatherPhraseOffsets = gatherPhraseOffsets;
}

/**
* Return the set of fields for which phrase offsets should be gathered.
*
* @return the phrase offset fields exactly as stored (no copy is made); possibly null, since the backing field is declared without an initializer
*/
public Set<String> getPhraseOffsetFields() {
return phraseOffsetFields;
}

/**
* Set the fields for which phrase offsets should be gathered. The given set is stored by reference, not copied.
*
* @param phraseOffsetFields
* the fields to gather phrase offsets for
*/
public void setPhraseOffsetFields(Set<String> phraseOffsetFields) {
this.phraseOffsetFields = phraseOffsetFields;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
import datawave.query.DocumentSerialization.ReturnType;
import datawave.query.attributes.AttributeKeepFilter;
import datawave.query.attributes.Document;
import datawave.query.attributes.ExcerptFields;
import datawave.query.attributes.ValueTuple;
import datawave.query.composite.CompositeMetadata;
import datawave.query.function.Aggregation;
Expand Down Expand Up @@ -1058,8 +1059,10 @@ protected JexlEvaluation getJexlEvaluation(String query, NestedQueryIterator<Key
}

// update the jexl evaluation to gather phrase offsets if required for excerpts
if (getExcerptFields() != null && !getExcerptFields().isEmpty()) {
ExcerptFields excerptFields = getExcerptFields();
if (excerptFields != null && !excerptFields.isEmpty()) {
jexlEvaluationFunction.setGatherPhraseOffsets(true);
jexlEvaluationFunction.setPhraseOffsetFields(excerptFields.getFields());
}

return jexlEvaluationFunction;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,14 +255,18 @@ private boolean isConverged(String field, String eventId, List<NavigableSet<Eval
} else {
// nope

// Establish the end offset of the phrase.
endOffset = second.first().termWeightPosition.getOffset();
// Record the phrase offsets to fetch excerpts later if desired.
termOffsetMap.addPhraseIndexTriplet(field, eventId, startOffset, endOffset);
if (log.isTraceEnabled()) {
log.trace("Adding phrase indexes [" + startOffset + "," + endOffset + "] for field " + field + " for event " + eventId
+ " to jexl context");
// Only record the phrase index if this is a targeted excerpt field.
if (termOffsetMap.shouldRecordPhraseIndex(field)) {
// Establish the end offset of the phrase.
endOffset = second.first().termWeightPosition.getOffset();
// Record the phrase offsets to fetch excerpts later if desired.
termOffsetMap.addPhraseIndexTriplet(field, eventId, startOffset, endOffset);
if (log.isTraceEnabled()) {
log.trace("Adding phrase indexes [" + startOffset + "," + endOffset + "] for field " + field + " for event " + eventId
+ " to jexl context");
}
}

return true;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,15 @@ public boolean findMatch() {
OffsetList o = offsetQueue.remove();

if (maxOffset.get().getLowOffset() - o.getMinOffset().getOffset() <= distance) {
// Track the start and end offset for the phrase.
int startOffset = o.getMinOffset().getOffset();
int endOffset = maxOffset.get().getLowOffset();
termOffsetMap.addPhraseIndexTriplet(field, eventId, startOffset, endOffset);
if (log.isTraceEnabled()) {
log.trace("Adding phrase indexes [" + startOffset + "," + endOffset + "] for field " + field + " to jexl context");
// Only record the phrase index if this is a targeted excerpt field.
if (termOffsetMap.shouldRecordPhraseIndex(field)) {
// Track the start and end offset for the phrase.
int startOffset = o.getMinOffset().getOffset();
int endOffset = maxOffset.get().getLowOffset();
termOffsetMap.addPhraseIndexTriplet(field, eventId, startOffset, endOffset);
if (log.isTraceEnabled()) {
log.trace("Adding phrase indexes [" + startOffset + "," + endOffset + "] for field " + field + " to jexl context");
}
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.StringJoiner;

import org.javatuples.Triplet;
Expand All @@ -19,6 +20,8 @@ public class TermOffsetMap {
// should we gather phrase offsets
boolean gatherPhraseOffsets = false;

// The set of excerpt fields to gather phrase offsets for.
private Set<String> excerptFields;
/**
* The term frequencies, with their corresponding fields.
*/
Expand Down Expand Up @@ -60,6 +63,17 @@ public TermFrequencyList getTermFrequencyList(String field) {
return termFrequencies.get(field);
}

/**
 * Determine whether a phrase index should be recorded for the given field. This is the case only when phrase offsets are being gathered and the field is an
 * excerpt field.
 *
 * @param field
 *            the field
 * @return true if phrase indexes should be recorded for the field, or false otherwise
 */
public boolean shouldRecordPhraseIndex(String field) {
    // When phrase offsets are not being gathered at all, the field is never consulted.
    if (!gatherPhraseOffsets()) {
        return false;
    }
    return isExcerptField(field);
}

/**
* Add a new phrase index pair found for a hit for the specified field
*
Expand Down Expand Up @@ -101,7 +115,7 @@ public PhraseIndexes getPhraseIndexes() {
return phraseIndexes;
}

public boolean isGatherPhraseOffsets() {
public boolean gatherPhraseOffsets() {
return gatherPhraseOffsets;
}

Expand All @@ -113,9 +127,31 @@ public void setGatherPhraseOffsets(boolean gatherPhraseOffsets) {
}
} else {
this.phraseIndexes = null;
this.excerptFields = null;
}
}

/**
* Set the excerpt fields, i.e. the fields that {@link #isExcerptField(String)} will report as excerpt fields. The given set is stored by reference, not
* copied.
*
* @param excerptFields
* the fields; may be null, in which case no field is treated as an excerpt field
*/
public void setExcerptFields(Set<String> excerptFields) {
this.excerptFields = excerptFields;
}

/**
 * Check whether the given field is one of the configured excerpt fields.
 *
 * @param field
 *            the field
 * @return true if excerpt fields have been set and contain the field, or false otherwise
 */
public boolean isExcerptField(String field) {
    // No excerpt fields configured means nothing qualifies.
    if (excerptFields == null) {
        return false;
    }
    return excerptFields.contains(field);
}

@Override
public boolean equals(Object o) {
if (this == o) {
Expand All @@ -125,18 +161,18 @@ public boolean equals(Object o) {
return false;
}
TermOffsetMap that = (TermOffsetMap) o;
return Objects.equals(termFrequencies, that.termFrequencies) && Objects.equals(phraseIndexes, that.phraseIndexes);
return gatherPhraseOffsets == that.gatherPhraseOffsets && Objects.equals(excerptFields, that.excerptFields)
&& Objects.equals(termFrequencies, that.termFrequencies) && Objects.equals(phraseIndexes, that.phraseIndexes);
}

@Override
public int hashCode() {
return Objects.hash(termFrequencies, phraseIndexes);
return Objects.hash(gatherPhraseOffsets, excerptFields, termFrequencies, phraseIndexes);
}

@Override
public String toString() {
return new StringJoiner(", ", TermOffsetMap.class.getSimpleName() + "[", "]").add("termFrequencies=" + termFrequencies)
.add("phraseIndexes=" + phraseIndexes).toString();
return new StringJoiner(", ", TermOffsetMap.class.getSimpleName() + "[", "]").add("gatherPhraseOffsets=" + gatherPhraseOffsets)
.add("excerptFields=" + excerptFields).add("termFrequencies=" + termFrequencies).add("phraseIndexes=" + phraseIndexes).toString();
}

}
Loading

0 comments on commit 94ebd6f

Please sign in to comment.