From cdfbcc2a1aed5f3833b4c7430682f0d48fc00080 Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Fri, 15 Mar 2024 09:27:21 -0400 Subject: [PATCH] Implements the chained SSDeep Discovery Query (#2242) --- warehouse/assemble/datawave/pom.xml | 6 + .../src/main/resources/bin/ingest/findJars.sh | 2 + .../main/resources/bin/ingest/ingest-libs.sh | 2 + .../query/tables/ssdeep/DiscoveredSSDeep.java | 26 + .../FullSSDeepDiscoveryChainStrategy.java | 113 ++++ .../SSDeepChainedDiscoveryQueryLogic.java | 79 +++ .../ssdeep/SSDeepDiscoveryQueryLogic.java | 486 ++++++++++++++++++ .../tables/ssdeep/SSDeepScoringFunction.java | 3 +- .../SSDeepSimilarityQueryTransformer.java | 5 +- ...eryTest.java => SSDeepIndexQueryTest.java} | 7 +- .../tables/ssdeep/SSDeepIngestQueryTest.java | 80 ++- .../ssdeep/SSDeepSimilarityQueryTest.java | 3 +- .../query/SSDeepQueryLogicFactory.xml | 25 + 13 files changed, 827 insertions(+), 10 deletions(-) create mode 100644 warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/DiscoveredSSDeep.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/FullSSDeepDiscoveryChainStrategy.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepChainedDiscoveryQueryLogic.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepDiscoveryQueryLogic.java rename warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/{SSDeepQueryTest.java => SSDeepIndexQueryTest.java} (97%) diff --git a/warehouse/assemble/datawave/pom.xml b/warehouse/assemble/datawave/pom.xml index 6571d67fe5e..2ab36776881 100644 --- a/warehouse/assemble/datawave/pom.xml +++ b/warehouse/assemble/datawave/pom.xml @@ -82,6 +82,11 @@ gov.nsa.datawave datawave-ingest-scripts + + gov.nsa.datawave + datawave-ingest-ssdeep + ${project.version} + gov.nsa.datawave datawave-ingest-wikipedia @@ -672,6 +677,7 @@ ${project.groupId}:datawave-edge-model-configuration-core ${project.groupId}:datawave-ingest-wikipedia ${project.groupId}:datawave-ingest-nyctlc + ${project.groupId}:datawave-ingest-ssdeep gov.nsa.datawave.microservice:accumulo-utils gov.nsa.datawave.microservice:metadata-utils gov.nsa.datawave.microservice:type-utils diff --git a/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh b/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh index 21040a2f42e..562a3fb83bb 100644 --- a/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh +++ b/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh @@ -31,9 +31,11 @@ DATAWAVE_INGEST_CSV_JAR=$(findJar datawave-ingest-csv) DATAWAVE_INGEST_JSON_JAR=$(findJar datawave-ingest-json) DATAWAVE_INGEST_WIKIPEDIA_JAR=$(findJar datawave-ingest-wikipedia) DATAWAVE_INGEST_NYCTLC_JAR=$(findJar datawave-ingest-nyctlc) +DATAWAVE_INGEST_SSDEEP_JAR=$(findJar datawave-ingest-ssdeep) DATAWAVE_INGEST_CORE_JAR=$(findJar datawave-ingest-core) DATAWAVE_INGEST_CONFIG_JAR=$(findJar datawave-ingest-configuration) DATAWAVE_COMMON_JAR=$(findJar datawave-common) +DATAWAVE_COMMON_SSDEEP_JAR=$(findJar datawave-ssdeep-common) DATAWAVE_ACCUMULO_EXTENSIONS_JAR=$(findJar datawave-accumulo-extensions) DATAWAVE_METRICS_CORE_JAR=$(findJar datawave-metrics-core) DATAWAVE_METADATA_UTILS_JAR=$(findJar metadata-utils) diff --git a/warehouse/ingest-scripts/src/main/resources/bin/ingest/ingest-libs.sh b/warehouse/ingest-scripts/src/main/resources/bin/ingest/ingest-libs.sh index d4cb7b30fe9..3f4386b54fc 100755 --- a/warehouse/ingest-scripts/src/main/resources/bin/ingest/ingest-libs.sh +++ b/warehouse/ingest-scripts/src/main/resources/bin/ingest/ingest-libs.sh @@ -18,6 +18,7 @@ CLASSPATH=${CLASSPATH}:${DATAWAVE_ACCUMULO_UTILS_JAR} CLASSPATH=${CLASSPATH}:${INMEMORY_ACCUMULO_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_BASE_REST_RESPONSES_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_COMMON_UTILS_JAR} +CLASSPATH=${CLASSPATH}:${DATAWAVE_COMMON_SSDEEP_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INDEX_STATS_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_CORE_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_CONFIG_JAR} @@ -25,6 +26,7 @@ CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_CSV_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_JSON_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_WIKIPEDIA_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_NYCTLC_JAR} +CLASSPATH=${CLASSPATH}:${DATAWAVE_INGEST_SSDEEP_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_METADATA_UTILS_JAR} CLASSPATH=${CLASSPATH}:${DATAWAVE_TYPE_UTILS_JAR} CLASSPATH=${CLASSPATH}:${CURATOR_FRAMEWORK_JAR} diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/DiscoveredSSDeep.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/DiscoveredSSDeep.java new file mode 100644 index 00000000000..5f4cb926042 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/DiscoveredSSDeep.java @@ -0,0 +1,26 @@ +package datawave.query.tables.ssdeep; + +import datawave.query.discovery.DiscoveredThing; + +/** + * Captures a ssdeep query, matching ssdeep and the discovery data about that match. This class immutable once created + */ +public class DiscoveredSSDeep { + /** A scored match between two ssdeep hashes, output by the SSDeep similarity query logic */ + public final ScoredSSDeepPair scoredSSDeepPair; + /** The discovered information about the matching SSDeep hash */ + public final DiscoveredThing discoveredThing; + + public DiscoveredSSDeep(ScoredSSDeepPair scoredSSDeepPair, DiscoveredThing discoveredThing) { + this.scoredSSDeepPair = scoredSSDeepPair; + this.discoveredThing = discoveredThing; + } + + public ScoredSSDeepPair getScoredSSDeepPair() { + return scoredSSDeepPair; + } + + public DiscoveredThing getDiscoveredThing() { + return discoveredThing; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/FullSSDeepDiscoveryChainStrategy.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/FullSSDeepDiscoveryChainStrategy.java new file mode 100644 index 00000000000..8a14d0acf04 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/FullSSDeepDiscoveryChainStrategy.java @@ -0,0 +1,113 @@ +package datawave.query.tables.ssdeep; + +import java.util.Iterator; +import java.util.Set; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.log4j.Logger; + +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; + +import datawave.query.discovery.DiscoveredThing; +import datawave.query.tables.chained.strategy.FullChainStrategy; +import datawave.webservice.query.Query; +import datawave.webservice.query.QueryImpl; +import datawave.webservice.query.logic.QueryLogic; + +/** + * A chain strategy that is designed to first run a ssdeep similarity query and then run a subsequent discovery query for each matching ssdeep hash found by + * that similarity query. Effectively allows the user to discover information related to hashes that are similar to one or more query hashes + */ +public class FullSSDeepDiscoveryChainStrategy extends FullChainStrategy { + private static final Logger log = Logger.getLogger(FullSSDeepDiscoveryChainStrategy.class); + + private Multimap scoredMatches; + + @Override + protected Query buildLatterQuery(Query initialQuery, Iterator initialQueryResults, String latterLogicName) { + log.debug("buildLatterQuery() called..."); + + // track the scored matches we've seen while traversing the initial query results. + // this has to be case-insensitive because the CHECKSUM_SSDEEP index entries are most likely downcased. + scoredMatches = TreeMultimap.create(String.CASE_INSENSITIVE_ORDER, ScoredSSDeepPair.NATURAL_ORDER); + + String queryString = captureScoredMatchesAndBuildQuery(initialQueryResults, scoredMatches); + + Query q = new QueryImpl(); // TODO, need to use a factory? don't hardcode this. + q.setQuery(queryString); + q.setId(UUID.randomUUID()); + q.setPagesize(Integer.MAX_VALUE); // TODO: choose something reasonable. + q.setQueryAuthorizations(initialQuery.getQueryAuthorizations()); + q.setUserDN(initialQuery.getUserDN()); + return q; + } + + @Override + public Iterator runChainedQuery(AccumuloClient client, Query initialQuery, Set auths, + Iterator initialQueryResults, QueryLogic latterQueryLogic) throws Exception { + final Iterator it = super.runChainedQuery(client, initialQuery, auths, initialQueryResults, latterQueryLogic); + + // Create a defensive copy of the score map because stream evaluation may be delayed. + final Multimap localScoredMatches = TreeMultimap.create(String.CASE_INSENSITIVE_ORDER, ScoredSSDeepPair.NATURAL_ORDER); + localScoredMatches.putAll(scoredMatches); + + return getEnrichedDiscoveredSSDeepIterator(it, localScoredMatches); + } + + /** + * + * @param initialQueryResults + * an iterator of scored ssdeep pairs that represent the results of the initial ssdeep similarity query. + * @param scoredMatches + * used to capture the scored matches contained within the initialQueryResults + * @return the query string for the next stage of the query. + */ + public static String captureScoredMatchesAndBuildQuery(Iterator initialQueryResults, + final Multimap scoredMatches) { + // extract the matched ssdeeps from the query results and generate the discovery query. + return StreamSupport.stream(Spliterators.spliteratorUnknownSize(initialQueryResults, Spliterator.ORDERED), false) + .filter(queryResult -> scoredMatches.put(queryResult.getMatchingHash().toString(), queryResult)) + .map(queryResult -> queryResult.getMatchingHash().toString()).distinct().peek(ssdeep -> log.debug("Added new ssdeep " + ssdeep)) + .map(ssdeep -> "CHECKSUM_SSDEEP:\"" + ssdeep + "\"").collect(Collectors.joining(" OR ", "", "")); + } + + /** + * Given an iterator of DiscoveredSSDeep objects that have no matching query or weighted score, lookup the potential queries that returned them and the + * weighted score associated with that query and use them to produce enriched results. + * + * @param resultsIterator + * an iterator of unenrched DiscoveredSSDeep's that don't have query or score info. + * @param scoredMatches + * the colletion of matchin hashes and the original queries that lead them to be returned. + * @return an iterator of DiscoveredSSDeep's enriched with the queries that returned them. + */ + public static Iterator getEnrichedDiscoveredSSDeepIterator(Iterator resultsIterator, + final Multimap scoredMatches) { + return StreamSupport.stream(Spliterators.spliteratorUnknownSize(resultsIterator, Spliterator.ORDERED), false) + .flatMap(discoveredSSdeep -> enrichDiscoveredSSDeep(discoveredSSdeep, scoredMatches)).iterator(); + } + + /** + * Given a single discovered ssdeep, use the scoredMatches map to determine which queries it is related to. This will return zero to many new + * DiscoveredSSDeep entries for each query that the matching ssdeep hash appeared in. + * + * @param discoveredSSDeep + * the ssdeep discovery information a single matched hash + * @param scoredMatches + * the set of scored matches from the ssdeep similarity logic, used to look up score and query info for the matched hash. + * @return a stream of DiscoveredSSDeep objects that align discovery information with the original query hashes. + */ + public static Stream enrichDiscoveredSSDeep(DiscoveredSSDeep discoveredSSDeep, final Multimap scoredMatches) { + final DiscoveredThing discoveredThing = discoveredSSDeep.getDiscoveredThing(); + final String term = discoveredThing.getTerm(); + return scoredMatches.get(term).stream().map(scoredPair -> new DiscoveredSSDeep(scoredPair, discoveredThing)); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepChainedDiscoveryQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepChainedDiscoveryQueryLogic.java new file mode 100644 index 00000000000..6635e6d3846 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepChainedDiscoveryQueryLogic.java @@ -0,0 +1,79 @@ +package datawave.query.tables.ssdeep; + +import java.util.Collections; +import java.util.Iterator; +import java.util.Set; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.log4j.Logger; + +import datawave.query.tables.chained.ChainedQueryTable; +import datawave.webservice.query.Query; +import datawave.webservice.query.configuration.GenericQueryConfiguration; +import datawave.webservice.query.logic.QueryLogicTransformer; + +/** + * Implements a ChainedQueryTable that will first use the SSDeepSimilarityQueryLogic to find similar hashes for a set of query hashes and then run the + * SSDeepDiscoveryQueryLogic to retrieve discovery info for those matched hashes. + */ +public class SSDeepChainedDiscoveryQueryLogic extends ChainedQueryTable { + + private static final Logger log = Logger.getLogger(SSDeepChainedDiscoveryQueryLogic.class); + + private Query discoveryQuery = null; + + public SSDeepChainedDiscoveryQueryLogic() { + super(); + } + + @SuppressWarnings("CopyConstructorMissesField") + public SSDeepChainedDiscoveryQueryLogic(SSDeepChainedDiscoveryQueryLogic other) { + super(other); + } + + @Override + public void close() { + super.close(); + } + + public GenericQueryConfiguration initialize(AccumuloClient client, Query settings, Set auths) throws Exception { + super.initialize(client, settings, auths); + this.discoveryQuery = settings.duplicate(settings.getQueryName() + "_discovery_query"); + + log.debug("Initial settings parameters: " + settings.getParameters().toString()); + GenericQueryConfiguration config = this.logic1.initialize(client, settings, auths); + return config; + } + + public void setupQuery(GenericQueryConfiguration config) throws Exception { + if (null == this.getChainStrategy()) { + final String error = "No transformed ChainStrategy provided for SSDeepChainedDiscoveryQueryLogic!"; + log.error(error); + throw new RuntimeException(error); + } + + log.info("Setting up ssdeep query using config"); + this.logic1.setupQuery(config); + + final Iterator iter1 = this.logic1.iterator(); + + log.info("Running chained discovery query"); + this.iterator = this.getChainStrategy().runChainedQuery(config.getClient(), this.discoveryQuery, config.getAuthorizations(), iter1, this.logic2); + } + + @Override + public QueryLogicTransformer getTransformer(Query settings) { + return this.logic2.getTransformer(settings); + } + + @Override + public SSDeepChainedDiscoveryQueryLogic clone() throws CloneNotSupportedException { + return new SSDeepChainedDiscoveryQueryLogic(this); + } + + public Set getExampleQueries() { + return Collections.emptySet(); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepDiscoveryQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepDiscoveryQueryLogic.java new file mode 100644 index 00000000000..5567dbe9505 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepDiscoveryQueryLogic.java @@ -0,0 +1,486 @@ +package datawave.query.tables.ssdeep; + +import java.security.Principal; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.commons.collections4.Transformer; +import org.apache.commons.collections4.iterators.TransformIterator; + +import datawave.audit.SelectorExtractor; +import datawave.marking.MarkingFunctions; +import datawave.query.discovery.DiscoveredThing; +import datawave.query.discovery.DiscoveryLogic; +import datawave.query.discovery.DiscoveryTransformer; +import datawave.query.model.QueryModel; +import datawave.query.util.MetadataHelperFactory; +import datawave.security.authorization.UserOperations; +import datawave.webservice.common.audit.Auditor; +import datawave.webservice.common.connection.AccumuloConnectionFactory; +import datawave.webservice.query.Query; +import datawave.webservice.query.configuration.GenericQueryConfiguration; +import datawave.webservice.query.exception.QueryException; +import datawave.webservice.query.iterator.DatawaveTransformIterator; +import datawave.webservice.query.logic.AbstractQueryLogicTransformer; +import datawave.webservice.query.logic.BaseQueryLogic; +import datawave.webservice.query.logic.QueryLogicTransformer; +import datawave.webservice.query.logic.ResponseEnricherBuilder; +import datawave.webservice.query.logic.RoleManager; +import datawave.webservice.query.result.event.EventBase; +import datawave.webservice.query.result.event.FieldBase; +import datawave.webservice.query.result.event.ResponseObjectFactory; +import datawave.webservice.result.BaseQueryResponse; + +/** + * Implements the SSDeepDiscoveryLogic which will retrieve discovery info for an SSDeep hash. Expected to be used as a part of the SSDeepChainedDiscoveryLogic, + * this class largely delegates to an instance of the DiscoveryLogic itself, but is required so that we can return DiscoveredSSDeep objects, which encapsulate + * the DiscoveredThing from the DiscoveryLogic and adds additional information about the query hash that resulted in the retrieval of this hash, including a + * similarity score. This is performed by the inline transformer class implemented in the getTransformer method. + *

+ * Note: as additional functionality is added to the DiscoveryLogic, delegate methods may need to be added here as well. + */ +public class SSDeepDiscoveryQueryLogic extends BaseQueryLogic { + public DiscoveryLogic discoveryDelegate; + + @SuppressWarnings("ConstantConditions") + public SSDeepDiscoveryQueryLogic() { + super(); + if (this.discoveryDelegate == null) { // may be set by super constructor + this.discoveryDelegate = new DiscoveryLogic(); + } + } + + public SSDeepDiscoveryQueryLogic(SSDeepDiscoveryQueryLogic other) { + super(other); + this.discoveryDelegate = (DiscoveryLogic) other.discoveryDelegate.clone(); + } + + @Override + public QueryLogicTransformer getTransformer(final Query settings) { + final DiscoveryTransformer discoveryTransformer = (DiscoveryTransformer) discoveryDelegate.getTransformer(settings); + QueryLogicTransformer ssdeepTransformer = new AbstractQueryLogicTransformer<>() { + @Override + public BaseQueryResponse createResponse(List resultList) { + return discoveryTransformer.createResponse(resultList); + } + + @Override + public EventBase transform(DiscoveredSSDeep discoveredSSDeep) { + EventBase eventBase = discoveryTransformer.transform(discoveredSSDeep.getDiscoveredThing()); + ResponseObjectFactory responseObjectFactory = discoveryDelegate.getResponseObjectFactory(); + ScoredSSDeepPair scoredSSDeepPair = discoveredSSDeep.getScoredSSDeepPair(); + if (scoredSSDeepPair != null) { + List> fields = eventBase.getFields(); + Optional> valueFieldOptional = fields.stream().filter(field -> "VALUE".equals(field.getName())).findFirst(); + + if (valueFieldOptional.isEmpty()) { + throw new IllegalStateException("Could not find value field in event"); + } + + FieldBase valueField = valueFieldOptional.get(); + + { + FieldBase field = responseObjectFactory.getField(); + field.setName("QUERY"); + field.setMarkings(valueField.getMarkings()); + field.setColumnVisibility(valueField.getColumnVisibility()); + field.setTimestamp(valueField.getTimestamp()); + field.setValue(scoredSSDeepPair.getQueryHash().toString()); + fields.add(field); + } + + { + FieldBase field = responseObjectFactory.getField(); + field.setName("WEIGHTED_SCORE"); + field.setMarkings(valueField.getMarkings()); + field.setColumnVisibility(valueField.getColumnVisibility()); + field.setTimestamp(valueField.getTimestamp()); + field.setValue(scoredSSDeepPair.getWeightedScore()); + fields.add(field); + } + + } + + return eventBase; + } + }; + return ssdeepTransformer; + + } + + @Override + public TransformIterator getTransformIterator(Query settings) { + return new DatawaveTransformIterator(this.iterator(), this.getTransformer(settings)); + } + + /** + * Return an iterator over the logic's results. This method must return an Iterator with the generic type compatible with generic type on the superclass + * BaseQueryLogic<DiscoveredSSDeep>, but the actual DiscoveredSSDeep with a complete scoredSSDeepPair is generated by the enrichDiscoveredSSDeep() + * method in FullSSDeepDiscoveryChainStrategy. + */ + @Override + public Iterator iterator() { + return new TransformIterator<>(discoveryDelegate.iterator(), discoveredThing -> new DiscoveredSSDeep(null, discoveredThing)); + } + + // All delegate methods past this point // + + public void setTableName(String tableName) { + discoveryDelegate.setTableName(tableName); + } + + public void setIndexTableName(String tableName) { + discoveryDelegate.setIndexTableName(tableName); + } + + public void setReverseIndexTableName(String tableName) { + discoveryDelegate.setReverseIndexTableName(tableName); + } + + public void setModelTableName(String tableName) { + discoveryDelegate.setModelTableName(tableName); + } + + public void setModelName(String modelName) { + discoveryDelegate.setModelName(modelName); + } + + public void setQueryModel(QueryModel model) { + discoveryDelegate.setQueryModel(model); + } + + public String getModelName() { + return discoveryDelegate.getModelName(); + } + + public void setMetadataHelperFactory(MetadataHelperFactory metadataHelperFactory) { + discoveryDelegate.setMetadataHelperFactory(metadataHelperFactory); + } + + public void setResponseObjectFactory(ResponseObjectFactory responseObjectFactory) { + discoveryDelegate.setResponseObjectFactory(responseObjectFactory); + } + + public void setMarkingFunctions(MarkingFunctions markingFunctions) { + discoveryDelegate.setMarkingFunctions(markingFunctions); + } + + @Override + public GenericQueryConfiguration initialize(AccumuloClient client, Query settings, Set runtimeQueryAuthorizations) throws Exception { + return discoveryDelegate.initialize(client, settings, runtimeQueryAuthorizations); + } + + @Override + public void setupQuery(GenericQueryConfiguration configuration) throws Exception { + discoveryDelegate.setupQuery(configuration); + } + + @Override + public Object clone() throws CloneNotSupportedException { + return new SSDeepDiscoveryQueryLogic(this); + } + + @Override + public AccumuloConnectionFactory.Priority getConnectionPriority() { + return discoveryDelegate.getConnectionPriority(); + } + + @Override + public Set getOptionalQueryParameters() { + return discoveryDelegate.getOptionalQueryParameters(); + } + + public void setFullTableScanEnabled(boolean fullTableScanEnabled) { + discoveryDelegate.setFullTableScanEnabled(fullTableScanEnabled); + } + + public void setAllowLeadingWildcard(boolean allowLeadingWildcard) { + discoveryDelegate.setAllowLeadingWildcard(allowLeadingWildcard); + } + + @Override + public Set getRequiredQueryParameters() { + return discoveryDelegate.getRequiredQueryParameters(); + } + + @Override + public Set getExampleQueries() { + return discoveryDelegate.getExampleQueries(); + } + + @Override + public GenericQueryConfiguration getConfig() { + if (discoveryDelegate == null) { + discoveryDelegate = new DiscoveryLogic(); + } + return discoveryDelegate.getConfig(); + } + + @Override + public String getPlan(AccumuloClient client, Query settings, Set runtimeQueryAuthorizations, boolean expandFields, boolean expandValues) + throws Exception { + return discoveryDelegate.getPlan(client, settings, runtimeQueryAuthorizations, expandFields, expandValues); + } + + @Override + public MarkingFunctions getMarkingFunctions() { + return discoveryDelegate.getMarkingFunctions(); + } + + @Override + public ResponseObjectFactory getResponseObjectFactory() { + return discoveryDelegate.getResponseObjectFactory(); + } + + @Override + public Principal getPrincipal() { + return discoveryDelegate.getPrincipal(); + } + + @Override + public void setPrincipal(Principal principal) { + discoveryDelegate.setPrincipal(principal); + } + + @Override + public String getTableName() { + return discoveryDelegate.getTableName(); + } + + @Override + public long getMaxResults() { + return discoveryDelegate.getMaxResults(); + } + + @Override + public long getMaxWork() { + return discoveryDelegate.getMaxWork(); + } + + @Override + public void setMaxResults(long maxResults) { + discoveryDelegate.setMaxResults(maxResults); + } + + @Override + public void setMaxWork(long maxWork) { + discoveryDelegate.setMaxWork(maxWork); + } + + @Override + public int getMaxPageSize() { + return discoveryDelegate.getMaxPageSize(); + } + + @Override + public void setMaxPageSize(int maxPageSize) { + discoveryDelegate.setMaxPageSize(maxPageSize); + } + + @Override + public long getPageByteTrigger() { + return discoveryDelegate.getPageByteTrigger(); + } + + @Override + public void setPageByteTrigger(long pageByteTrigger) { + discoveryDelegate.setPageByteTrigger(pageByteTrigger); + } + + @Override + public int getBaseIteratorPriority() { + return discoveryDelegate.getBaseIteratorPriority(); + } + + @Override + public void setBaseIteratorPriority(int baseIteratorPriority) { + discoveryDelegate.setBaseIteratorPriority(baseIteratorPriority); + } + + @Override + public String getLogicName() { + return discoveryDelegate.getLogicName(); + } + + @Override + public void setLogicName(String logicName) { + discoveryDelegate.setLogicName(logicName); + } + + @Override + public boolean getBypassAccumulo() { + return discoveryDelegate.getBypassAccumulo(); + } + + @Override + public void setBypassAccumulo(boolean bypassAccumulo) { + discoveryDelegate.setBypassAccumulo(bypassAccumulo); + } + + @Override + public String getAccumuloPassword() { + return discoveryDelegate.getAccumuloPassword(); + } + + @Override + public void setAccumuloPassword(String accumuloPassword) { + discoveryDelegate.setAccumuloPassword(accumuloPassword); + } + + @Override + public Auditor.AuditType getAuditType(Query query) { + return discoveryDelegate.getAuditType(query); + } + + @Override + public Auditor.AuditType getAuditType() { + return discoveryDelegate.getAuditType(); + } + + @Override + public void setAuditType(Auditor.AuditType auditType) { + discoveryDelegate.setAuditType(auditType); + } + + @Override + public void setLogicDescription(String logicDescription) { + discoveryDelegate.setLogicDescription(logicDescription); + } + + @Override + public String getLogicDescription() { + return discoveryDelegate.getLogicDescription(); + } + + @Override + public boolean getCollectQueryMetrics() { + return discoveryDelegate.getCollectQueryMetrics(); + } + + @Override + public void setCollectQueryMetrics(boolean collectQueryMetrics) { + discoveryDelegate.setCollectQueryMetrics(collectQueryMetrics); + } + + @Override + public RoleManager getRoleManager() { + return discoveryDelegate.getRoleManager(); + } + + @Override + public void setRoleManager(RoleManager roleManager) { + discoveryDelegate.setRoleManager(roleManager); + } + + @Override + public String getConnPoolName() { + return discoveryDelegate.getConnPoolName(); + } + + @Override + public void setConnPoolName(String connPoolName) { + discoveryDelegate.setConnPoolName(connPoolName); + } + + @Override + public boolean canRunQuery() { + return discoveryDelegate.canRunQuery(); + } + + @Override + public boolean canRunQuery(Principal principal) { + return discoveryDelegate.canRunQuery(principal); + } + + @Override + public List getSelectors(Query settings) throws IllegalArgumentException { + return discoveryDelegate.getSelectors(settings); + } + + @Override + public void setSelectorExtractor(SelectorExtractor selectorExtractor) { + discoveryDelegate.setSelectorExtractor(selectorExtractor); + } + + @Override + public SelectorExtractor getSelectorExtractor() { + return discoveryDelegate.getSelectorExtractor(); + } + + @Override + public Set getAuthorizedDNs() { + return discoveryDelegate.getAuthorizedDNs(); + } + + @Override + public void setAuthorizedDNs(Set authorizedDNs) { + discoveryDelegate.setAuthorizedDNs(authorizedDNs); + } + + @Override + public void setDnResultLimits(Map dnResultLimits) { + discoveryDelegate.setDnResultLimits(dnResultLimits); + } + + @Override + public Map getDnResultLimits() { + return discoveryDelegate.getDnResultLimits(); + } + + @Override + public void setSystemFromResultLimits(Map systemFromLimits) { + discoveryDelegate.setSystemFromResultLimits(systemFromLimits); + } + + @Override + public Map getSystemFromResultLimits() { + return discoveryDelegate.getSystemFromResultLimits(); + } + + @Override + public void setPageProcessingStartTime(long pageProcessingStartTime) { + discoveryDelegate.setPageProcessingStartTime(pageProcessingStartTime); + } + + @Override + public boolean isLongRunningQuery() { + return discoveryDelegate.isLongRunningQuery(); + } + + @Override + public ResponseEnricherBuilder getResponseEnricherBuilder() { + return discoveryDelegate.getResponseEnricherBuilder(); + } + + @Override + public void setResponseEnricherBuilder(ResponseEnricherBuilder responseEnricherBuilder) { + discoveryDelegate.setResponseEnricherBuilder(responseEnricherBuilder); + } + + @Override + public UserOperations getUserOperations() { + return discoveryDelegate.getUserOperations(); + } + + @Override + public String getResponseClass(Query query) throws QueryException { + return discoveryDelegate.getResponseClass(query); + } + + @Override + public boolean containsDNWithAccess(Collection dns) { + return discoveryDelegate.containsDNWithAccess(dns); + } + + @Override + public long getResultLimit(Query settings) { + return discoveryDelegate.getResultLimit(settings); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepScoringFunction.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepScoringFunction.java index 3b02e982a0c..70102fd3b9b 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepScoringFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepScoringFunction.java @@ -1,7 +1,5 @@ package datawave.query.tables.ssdeep; -import static datawave.query.tables.ssdeep.SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER; - import java.util.Collection; import java.util.Map; import java.util.function.Function; @@ -27,6 +25,7 @@ /** A function that transforms entries retrieved from Accumulo into Scored SSDeep hash matches */ public class SSDeepScoringFunction implements Function,Stream> { + public static final String MIN_SSDEEP_SCORE_PARAMETER = "minScore"; private static final Logger log = Logger.getLogger(SSDeepScoringFunction.class); /** Used to encode the chunk size as a character which is included in the ranges used to retrieve ngram tuples */ diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTransformer.java b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTransformer.java index f0fe24f1963..e8e9302cd29 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTransformer.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTransformer.java @@ -16,10 +16,11 @@ import datawave.webservice.result.BaseQueryResponse; import datawave.webservice.result.EventQueryResponseBase; +/** + * Transforms the results from an SSDeepSimilarityQuery into an EventBase suitable for a datawave web service api response + */ public class SSDeepSimilarityQueryTransformer extends BaseQueryLogicTransformer { - public static final String MIN_SSDEEP_SCORE_PARAMETER = "minScore"; - protected final Authorizations auths; protected final ResponseObjectFactory responseObjectFactory; diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIndexQueryTest.java similarity index 97% rename from warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepQueryTest.java rename to warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIndexQueryTest.java index 5e3834b4b3c..745719f0c87 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIndexQueryTest.java @@ -52,7 +52,8 @@ import datawave.webservice.query.runner.RunningQuery; import datawave.webservice.result.EventQueryResponseBase; -public class SSDeepQueryTest { +/** Simple unit test against the SSDeepIndex / SSDeepSimilarityLogic code */ +public class SSDeepIndexQueryTest { public static String[] TEST_SSDEEPS = {"12288:002r/VG4GjeZHkwuPikQ7lKH5p5H9x1beZHkwulizQ1lK55pGxlXTd8zbW:002LVG4GjeZEXi37l6Br1beZEdic1lmu", "6144:02C3nq73v1kHGhs6y7ppFj93NRW6/ftZTgC6e8o4toHZmk6ZxoXb0ns:02C4cGCLjj9Swfj9koHEk6/Fns", @@ -62,7 +63,7 @@ public class SSDeepQueryTest { "48:1aBhsiUw69/UXX0x0qzNkVkydf2klA8a7Z35:155w69MXAlNkmkWTF5", "196608:wEEE+EEEEE0LEEEEEEEEEEREEEEhEEETEEEEEWUEEEJEEEEcEEEEEEEE3EEEEEEN:", "1536:0YgNvw/OmgPgiQeI+25Nh6+RS5Qa8LmbyfAiIRgizy1cBx76UKYbD+iD/RYgNvw6:", "12288:222222222222222222222222222222222:"}; - private static final Logger log = Logger.getLogger(SSDeepQueryTest.class); + private static final Logger log = Logger.getLogger(SSDeepIndexQueryTest.class); private static final Authorizations auths = AbstractDataTypeConfig.getTestAuths(); @@ -209,7 +210,7 @@ public EventQueryResponseBase runSSDeepQuery(String query, int minScoreThreshold q.setQueryAuthorizations(auths.toString()); if (minScoreThreshold > 0) { - q.addParameter(SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); + q.addParameter(SSDeepScoringFunction.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); } RunningQuery runner = new RunningQuery(accumuloClient, AccumuloConnectionFactory.Priority.NORMAL, this.logic, q, "", principal, diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIngestQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIngestQueryTest.java index 2ee8a55824e..090fd6d3b3c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIngestQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepIngestQueryTest.java @@ -19,7 +19,6 @@ import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.Sets; @@ -39,6 +38,7 @@ import datawave.query.testframework.FieldConfig; import datawave.query.testframework.FileType; import datawave.query.testframework.QueryLogicTestHarness; +import datawave.query.util.MetadataHelperFactory; import datawave.security.authorization.DatawavePrincipal; import datawave.security.authorization.DatawaveUser; import datawave.security.authorization.SubjectIssuerDNPair; @@ -53,6 +53,9 @@ import datawave.webservice.query.runner.RunningQuery; import datawave.webservice.result.EventQueryResponseBase; +/** + * Ingests some test data into the ssdeepIndex and shard tables and then tests that various SSDeep query logics against that data produce the expected results + */ public class SSDeepIngestQueryTest extends AbstractFunctionalQuery { @ClassRule @@ -62,6 +65,10 @@ public class SSDeepIngestQueryTest extends AbstractFunctionalQuery { SSDeepSimilarityQueryLogic similarityQueryLogic; + SSDeepDiscoveryQueryLogic discoveryQueryLogic; + + SSDeepChainedDiscoveryQueryLogic similarityDiscoveryQueryLogic; + @BeforeClass public static void filterSetup() throws Exception { log.setLevel(Level.DEBUG); @@ -82,6 +89,7 @@ public static void filterSetup() throws Exception { public void setupQuery() { MarkingFunctions markingFunctions = new MarkingFunctions.Default(); ResponseObjectFactory responseFactory = new DefaultResponseObjectFactory(); + MetadataHelperFactory metadataHelperFactory = new MetadataHelperFactory(); similarityQueryLogic = new SSDeepSimilarityQueryLogic(); similarityQueryLogic.setTableName(SSDeepIndexHandler.DEFAULT_SSDEEP_INDEX_TABLE_NAME); @@ -91,6 +99,24 @@ public void setupQuery() { similarityQueryLogic.setBucketEncodingLength(BUCKET_ENCODING_LENGTH); similarityQueryLogic.setIndexBuckets(BUCKET_COUNT); + discoveryQueryLogic = new SSDeepDiscoveryQueryLogic(); + discoveryQueryLogic.setTableName("shardIndex"); + discoveryQueryLogic.setIndexTableName("shardIndex"); + discoveryQueryLogic.setReverseIndexTableName("shardReverseIndex"); + discoveryQueryLogic.setModelTableName("metadata"); + discoveryQueryLogic.setMarkingFunctions(markingFunctions); + discoveryQueryLogic.setMetadataHelperFactory(metadataHelperFactory); + discoveryQueryLogic.setResponseObjectFactory(responseFactory); + + // FUTURE: Implement a streaming chain strategy for the SSDeepChainedDiscoveryQueryLogic + FullSSDeepDiscoveryChainStrategy ssdeepDiscoveryChainStrategy = new FullSSDeepDiscoveryChainStrategy(); + + similarityDiscoveryQueryLogic = new SSDeepChainedDiscoveryQueryLogic(); + similarityDiscoveryQueryLogic.setTableName("ssdeepIndex"); + similarityDiscoveryQueryLogic.setLogic1(similarityQueryLogic); + similarityDiscoveryQueryLogic.setLogic2(discoveryQueryLogic); + similarityDiscoveryQueryLogic.setChainStrategy(ssdeepDiscoveryChainStrategy); + // init must set auths testInit(); @@ -126,6 +152,56 @@ public void testSSDeepSimilarity() throws Exception { SSDeepTestUtil.assertSSDeepSimilarityMatch(testSSDeep, testSSDeep, "38", "100", observedEvents); } + @Test + public void testSSDeepDiscovery() throws Exception { + log.info("------ testDiscovery ------"); + String testSSDeep = "384:nv/fP9FmWVMdRFj2aTgSO+u5QT4ZE1PIVS:nDmWOdRFNTTs504cQS"; + String query = "CHECKSUM_SSDEEP:\"" + testSSDeep + "\""; + EventQueryResponseBase response = runSSDeepQuery(query, discoveryQueryLogic, 0); + + List events = response.getEvents(); + Assert.assertEquals(1, events.size()); + Map> observedEvents = extractObservedEvents(events); + + Map.Entry> result = observedEvents.entrySet().iterator().next(); + Map resultFields = result.getValue(); + Assert.assertEquals(testSSDeep, resultFields.get("VALUE")); + Assert.assertEquals("CHECKSUM_SSDEEP", resultFields.get("FIELD")); + Assert.assertEquals("20201031", resultFields.get("DATE")); + Assert.assertEquals("ssdeep", resultFields.get("DATA TYPE")); + Assert.assertEquals("4", resultFields.get("RECORD COUNT")); + + // At this point, the results have not been enriched with these fields, so they should not exist. + Assert.assertNull(null, resultFields.get("QUERY")); + Assert.assertNull(null, resultFields.get("WEIGHTED_SCORE")); + } + + @Test + public void testChainedSSDeepDiscovery() throws Exception { + log.info("------ testSSDeepDiscovery ------"); + String testSSDeep = "384:nv/fP9FmWVMdRFj2aTgSO+u5QT4ZE1PIVS:nDmWOdRFNTTs504---"; + String targetSSDeep = "384:nv/fP9FmWVMdRFj2aTgSO+u5QT4ZE1PIVS:nDmWOdRFNTTs504cQS"; + String query = "CHECKSUM_SSDEEP:" + testSSDeep; + EventQueryResponseBase response = runSSDeepQuery(query, similarityDiscoveryQueryLogic, 0); + + List events = response.getEvents(); + Assert.assertEquals(1, events.size()); + Map> observedEvents = extractObservedEvents(events); + + Map.Entry> result = observedEvents.entrySet().iterator().next(); + Map resultFields = result.getValue(); + Assert.assertEquals(targetSSDeep, resultFields.get("VALUE")); + + Assert.assertEquals("CHECKSUM_SSDEEP", resultFields.get("FIELD")); + Assert.assertEquals("20201031", resultFields.get("DATE")); + Assert.assertEquals("ssdeep", resultFields.get("DATA TYPE")); + Assert.assertEquals("4", resultFields.get("RECORD COUNT")); + + // The results have been enriched with these fields at this point. + Assert.assertEquals(testSSDeep, resultFields.get("QUERY")); + Assert.assertEquals("100", resultFields.get("WEIGHTED_SCORE")); + } + @SuppressWarnings("rawtypes") public EventQueryResponseBase runSSDeepQuery(String query, QueryLogic queryLogic, int minScoreThreshold) throws Exception { QueryImpl q = new QueryImpl(); @@ -135,7 +211,7 @@ public EventQueryResponseBase runSSDeepQuery(String query, QueryLogic queryLo q.setQueryAuthorizations(auths.toString()); if (minScoreThreshold > 0) { - q.addParameter(SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); + q.addParameter(SSDeepScoringFunction.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); } RunningQuery runner = new RunningQuery(client, AccumuloConnectionFactory.Priority.NORMAL, queryLogic, q, "", principal, new QueryMetricFactoryImpl()); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTest.java index 59c7b75f675..1a267afef9a 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ssdeep/SSDeepSimilarityQueryTest.java @@ -44,6 +44,7 @@ import datawave.webservice.query.runner.RunningQuery; import datawave.webservice.result.EventQueryResponseBase; +/** Additional unit test against the SSDeepIndex / SSDeepSimilarityLogic code */ public class SSDeepSimilarityQueryTest { private static final Logger log = Logger.getLogger(SSDeepSimilarityQueryTest.class); @@ -149,7 +150,7 @@ public EventQueryResponseBase runSSDeepQuery(String query, int minScoreThreshold q.setQueryAuthorizations(auths.toString()); if (minScoreThreshold > 0) { - q.addParameter(SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); + q.addParameter(SSDeepScoringFunction.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold)); } RunningQuery runner = new RunningQuery(accumuloClient, AccumuloConnectionFactory.Priority.NORMAL, this.logic, q, "", principal, diff --git a/web-services/deploy/configuration/src/main/resources/datawave/query/SSDeepQueryLogicFactory.xml b/web-services/deploy/configuration/src/main/resources/datawave/query/SSDeepQueryLogicFactory.xml index 29c840eee61..e6a272e1aea 100644 --- a/web-services/deploy/configuration/src/main/resources/datawave/query/SSDeepQueryLogicFactory.xml +++ b/web-services/deploy/configuration/src/main/resources/datawave/query/SSDeepQueryLogicFactory.xml @@ -18,4 +18,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + +