Add ExactMatch scorer (#79)

By popular request -- this scorer simply compares two values and tells you whether they're equal or not. Of course, things get a little tricky if one thing is an object and the other is a string (not an uncommon scenario when generating JSON).
braintrustdata · Jul 19, 2024 · 67d0bf8 · 67d0bf8
1 parent a7c7135
commit 67d0bf8
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -188,6 +188,7 @@ npx braintrust run example.eval.js
 ### Heuristic
 
 - Levenshtein distance
+- Exact match
 - Numeric difference
 - JSON diff
 - Jaccard distance

diff --git a/js/manifest.ts b/js/manifest.ts
@@ -24,6 +24,7 @@ import {
 import { ListContains } from "./list";
 import { ScorerWithPartial } from "./partial";
 import { Moderation } from "./moderation";
+import { ExactMatch } from "./value";
 
 interface AutoevalMethod {
   method: ScorerWithPartial<any, any>;
@@ -164,6 +165,11 @@ export const Evaluators: {
         method: Levenshtein,
         description: "Uses the Levenshtein distance to compare two strings.",
       },
+      {
+        method: ExactMatch,
+        description:
+          "Compares two values for exact equality. If the values are objects, they are converted to JSON strings before comparison.",
+      },
       {
         method: NumericDiff,
         description: "Compares numbers by normalizing their difference.",

diff --git a/js/value.test.ts b/js/value.test.ts
@@ -1,6 +1,7 @@
 import { ListContains } from "./list";
 import { NumericDiff } from "./number";
 import { LevenshteinScorer } from "./string";
+import { ExactMatch } from "./value";
 
 test("Levenshtein Test", async () => {
   const cases = [
@@ -89,3 +90,32 @@ test("ListContains Test", async () => {
     ).score,
   ).toBe(1);
 });
+
+// Table-driven test: runs ExactMatch on every { output, expected } pair below
+// and checks that the returned score matches expectedScore.
+test("ExactMatch", async () => {
+  const cases = [
+    { output: "hello", expected: "hello", expectedScore: 1 },
+    { output: "hello", expected: "world", expectedScore: 0 },
+    { output: 123, expected: 123, expectedScore: 1 },
+    // A number and its string form normalize to the same string.
+    { output: 123, expected: "123", expectedScore: 1 },
+    { output: { a: 1, b: 2 }, expected: { a: 1, b: 2 }, expectedScore: 1 },
+    { output: { a: 1, b: 2 }, expected: { a: 1, b: 3 }, expectedScore: 0 },
+    { output: [1, 2, 3], expected: [1, 2, 3], expectedScore: 1 },
+    { output: [1, 2, 3], expected: [3, 2, 1], expectedScore: 0 },
+    { output: { a: 1, b: 2 }, expected: { b: 2, a: 1 }, expectedScore: 0 }, // Order matters
+    { output: { a: 1, b: 2 }, expected: '{"a": 1, "b": 2}', expectedScore: 1 }, // String representation matches dict
+    { output: { a: 1, b: 2 }, expected: '{"a":1, "b":2}', expectedScore: 1 }, // String representation matches dict
+    // Key order also matters when the expected side is a JSON string.
+    { output: { a: 1, b: 2 }, expected: '{"b":2, "a":1}', expectedScore: 0 },
+    {
+      output: { a: 1, b: 2 },
+      expected: { b: 2, a: 1, c: 3 },
+      expectedScore: 0,
+    }, // Extra key, not equal
+    { output: null, expected: null, expectedScore: 1 },
+    // The scorer coerces undefined to null before comparing.
+    { output: null, expected: undefined, expectedScore: 1 },
+  ];
+
+  for (const { output, expected, expectedScore } of cases) {
+    const score = (await ExactMatch({ output, expected })).score;
+    expect(score).toBeCloseTo(expectedScore, 4);
+  }
+});
diff --git a/js/value.ts b/js/value.ts
@@ -0,0 +1,41 @@
+import { makePartial, ScorerWithPartial } from "./partial";
+
+/**
+ * Scores 1 when `output` and `expected` are considered equal and 0 otherwise.
+ *
+ * Both sides are reduced to strings before being compared. Objects, arrays and
+ * `null` are JSON-serialized; when either side is such a value, a string on the
+ * other side is parsed and re-serialized as JSON (if it parses) so that
+ * formatting differences in the string are ignored. `undefined` is treated as
+ * `null`.
+ */
+export const ExactMatch: ScorerWithPartial<unknown, {}> = makePartial(
+  (args) => {
+    // Decided from the raw arguments, before `undefined` is coerced to `null`.
+    const compareAsJSON = needsJSON(args.output) || needsJSON(args.expected);
+    const actual = normalizeValue(args.output ?? null, compareAsJSON);
+    const wanted = normalizeValue(args.expected ?? null, compareAsJSON);
+
+    return {
+      name: "ExactMatch",
+      score: actual === wanted ? 1 : 0,
+    };
+  },
+  "ExactMatch",
+);
+
+/**
+ * Reports whether `value` must be JSON-serialized before it can be compared.
+ * True for anything whose `typeof` is "object": plain objects, arrays, and
+ * also `null`.
+ */
+function needsJSON(value: unknown): boolean {
+  if (Array.isArray(value)) {
+    return true;
+  }
+  return typeof value === "object";
+}
+
+/**
+ * Reduces `value` to a string suitable for equality comparison.
+ *
+ * @param value - The value to normalize.
+ * @param maybeObject - When true, a string `value` is treated as possible JSON:
+ *   it is parsed and re-serialized so it can match a serialized object.
+ * @returns The JSON serialization for objects/arrays/`null` (and for strings
+ *   that parse as JSON when `maybeObject` is set); otherwise the value
+ *   interpolated into a template string.
+ */
+export function normalizeValue(value: unknown, maybeObject: boolean): string {
+  if (needsJSON(value)) {
+    return JSON.stringify(value);
+  }
+
+  const mayBeSerializedJSON = maybeObject && typeof value === "string";
+  if (mayBeSerializedJSON) {
+    try {
+      return JSON.stringify(JSON.parse(value));
+    } catch (_notJSON) {
+      // Not valid JSON: fall through and compare the raw string instead.
+    }
+  }
+
+  return `${value}`;
+}
diff --git a/py/autoevals/test_values.py b/py/autoevals/test_values.py
@@ -1,8 +1,10 @@
+import pytest
 from pytest import approx
 
 from autoevals.list import ListContains
 from autoevals.number import NumericDiff
 from autoevals.string import LevenshteinScorer
+from autoevals.value import ExactMatch
 
 
 def test_levenshtein():
@@ -77,3 +79,30 @@ def test_list_contains():
     assert (
         ListContains(pairwise_evaluator=LevenshteinScorer(), allow_extra_entities=True)(["a", "b"], ["a"]).score == 1
     )
+
+
+def test_exact_match():
+    # Table-driven test: each row is [output, expected, expected_score].
+    cases = [
+        ["hello", "hello", 1],
+        ["hello", "world", 0],
+        [123, 123, 1],
+        # A number and its string form normalize to the same string.
+        [123, "123", 1],
+        [{"a": 1, "b": 2}, {"a": 1, "b": 2}, 1],
+        [{"a": 1, "b": 2}, {"a": 1, "b": 3}, 0],
+        [[1, 2, 3], [1, 2, 3], 1],
+        [[1, 2, 3], [3, 2, 1], 0],
+        [{"a": 1, "b": 2}, {"b": 2, "a": 1}, 0],  # Order matters
+        [{"a": 1, "b": 2}, '{"a": 1, "b": 2}', 1],  # String representation matches dict
+        [{"a": 1, "b": 2}, '{"a":1, "b":2}', 1],  # String representation matches dict
+        # Key order also matters when the expected side is a JSON string.
+        [{"a": 1, "b": 2}, '{"b": 2, "a": 1}', 0],
+        [{"a": 1, "b": 2}, {"b": 2, "a": 1, "c": 3}, 0],  # Extra key, not equal
+        [None, None, 1],
+        # Non-container values fall back to str(), and str(None) == "None".
+        [None, "None", 1],
+    ]
+
+    # The trailing tuple is the assertion message, identifying the failing row.
+    for output, expected, expected_score in cases:
+        assert ExactMatch()(output, expected).score == approx(expected_score, abs=1e-4), (
+            output,
+            expected,
+            expected_score,
+        )
diff --git a/py/autoevals/value.py b/py/autoevals/value.py
@@ -0,0 +1,37 @@
+import json
+from typing import Any
+
+from braintrust_core.score import Score
+
+from autoevals.partial import ScorerWithPartial
+
+
+class ExactMatch(ScorerWithPartial):
+    """
+    A simple scorer that tests whether two values are equal. If the value is an object or array,
+    it will be JSON-serialized and the strings compared for equality.
+    """
+
+    def _run_eval_sync(self, output, expected=None, **kwargs):
+        maybe_object = needs_json(output) or needs_json(expected)
+        output, expected = normalize_value(output, maybe_object), normalize_value(expected, maybe_object)
+        score = 1 if output == expected else 0
+
+        return Score(name=self._name(), score=score)
+
+
+def needs_json(value: Any) -> bool:
+    return isinstance(value, (dict, list))
+
+
+def normalize_value(value: Any, maybe_object: bool) -> str:
+    if needs_json(value):
+        return json.dumps(value)
+
+    try:
+        if maybe_object:
+            return json.dumps(json.loads(value))
+    except json.JSONDecodeError:
+        pass
+
+    return str(value)