diff --git a/README.md b/README.md
index 2b24a50..34fa420 100644
--- a/README.md
+++ b/README.md
@@ -188,6 +188,7 @@ npx braintrust run example.eval.js
 ### Heuristic
 
 - Levenshtein distance
+- Exact match
 - Numeric difference
 - JSON diff
 - Jaccard distance
diff --git a/js/manifest.ts b/js/manifest.ts
index a937a7b..6e75df8 100644
--- a/js/manifest.ts
+++ b/js/manifest.ts
@@ -24,6 +24,7 @@ import {
 import { ListContains } from "./list";
 import { ScorerWithPartial } from "./partial";
 import { Moderation } from "./moderation";
+import { ExactMatch } from "./value";
 
 interface AutoevalMethod {
   method: ScorerWithPartial;
@@ -164,6 +165,11 @@ export const Evaluators: {
         method: Levenshtein,
         description: "Uses the Levenshtein distance to compare two strings.",
       },
+      {
+        method: ExactMatch,
+        description:
+          "Compares two values for exact equality. If the values are objects, they are converted to JSON strings before comparison.",
+      },
       {
         method: NumericDiff,
         description: "Compares numbers by normalizing their difference.",
diff --git a/js/value.test.ts b/js/value.test.ts
index ae00fdc..13bc1be 100644
--- a/js/value.test.ts
+++ b/js/value.test.ts
@@ -1,6 +1,7 @@
 import { ListContains } from "./list";
 import { NumericDiff } from "./number";
 import { LevenshteinScorer } from "./string";
+import { ExactMatch } from "./value";
 
 test("Levenshtein Test", async () => {
   const cases = [
@@ -89,3 +90,32 @@ test("ListContains Test", async () => {
     ).score,
   ).toBe(1);
 });
+
+test("ExactMatch", async () => {
+  const cases = [
+    { output: "hello", expected: "hello", expectedScore: 1 },
+    { output: "hello", expected: "world", expectedScore: 0 },
+    { output: 123, expected: 123, expectedScore: 1 },
+    { output: 123, expected: "123", expectedScore: 1 },
+    { output: { a: 1, b: 2 }, expected: { a: 1, b: 2 }, expectedScore: 1 },
+    { output: { a: 1, b: 2 }, expected: { a: 1, b: 3 }, expectedScore: 0 },
+    { output: [1, 2, 3], expected: [1, 2, 3], expectedScore: 1 },
+    { output: [1, 2, 3], expected: [3, 2, 1], expectedScore: 0 },
+    { output: { a: 1, b: 2 }, expected: { b: 2, a: 1 }, expectedScore: 0 }, // Order matters
+    { output: { a: 1, b: 2 }, expected: '{"a": 1, "b": 2}', expectedScore: 1 }, // String representation matches dict
+    { output: { a: 1, b: 2 }, expected: '{"a":1, "b":2}', expectedScore: 1 }, // String representation matches dict
+    { output: { a: 1, b: 2 }, expected: '{"b":2, "a":1}', expectedScore: 0 },
+    {
+      output: { a: 1, b: 2 },
+      expected: { b: 2, a: 1, c: 3 },
+      expectedScore: 0,
+    }, // Extra key, not equal
+    { output: null, expected: null, expectedScore: 1 },
+    { output: null, expected: undefined, expectedScore: 1 },
+  ];
+
+  for (const { output, expected, expectedScore } of cases) {
+    const score = (await ExactMatch({ output, expected })).score;
+    expect(score).toBeCloseTo(expectedScore, 4);
+  }
+});
diff --git a/js/value.ts b/js/value.ts
new file mode 100644
index 0000000..40254cf
--- /dev/null
+++ b/js/value.ts
@@ -0,0 +1,56 @@
+import { makePartial, ScorerWithPartial } from "./partial";
+
+/**
+ * A simple scorer that tests whether two values are equal. If the value is an object or array,
+ * it will be JSON-serialized and the strings compared for equality.
+ */
+export const ExactMatch: ScorerWithPartial = makePartial(
+  (args) => {
+    const maybeObject = needsJSON(args.output) || needsJSON(args.expected);
+    const [output, expected] = [
+      normalizeValue(args.output ?? null, maybeObject),
+      normalizeValue(args.expected ?? null, maybeObject),
+    ];
+
+    const score = output === expected ? 1 : 0;
+
+    return {
+      name: "ExactMatch",
+      score,
+    };
+  },
+  "ExactMatch",
+);
+
+/**
+ * Returns true for arrays and non-null objects, which must be JSON-serialized
+ * before they can be compared. `null` is excluded even though `typeof null` is
+ * "object", so that it is treated like `undefined` rather than like an object.
+ */
+function needsJSON(value: unknown): boolean {
+  return value !== null && typeof value === "object";
+}
+
+/**
+ * Converts a value into the string form that `ExactMatch` compares.
+ *
+ * @param value The value to normalize.
+ * @param maybeObject Whether either side of the comparison is an object or
+ * array. When true, a string that parses as JSON is re-serialized so that
+ * formatting differences (e.g. whitespace) do not affect the comparison.
+ * @returns The JSON serialization of an object or array, otherwise the string
+ * representation of the value.
+ */
+export function normalizeValue(value: unknown, maybeObject: boolean): string {
+  if (needsJSON(value)) {
+    return JSON.stringify(value);
+  }
+  try {
+    if (typeof value === "string" && maybeObject) {
+      return JSON.stringify(JSON.parse(value));
+    }
+  } catch (e) {
+    // That's ok, just return the string representation
+  }
+  return `${value}`;
+}
diff --git a/py/autoevals/test_values.py b/py/autoevals/test_values.py
index 83da32b..f808b2f 100644
--- a/py/autoevals/test_values.py
+++ b/py/autoevals/test_values.py
@@ -1,8 +1,10 @@
+import pytest
 from pytest import approx
 
 from autoevals.list import ListContains
 from autoevals.number import NumericDiff
 from autoevals.string import LevenshteinScorer
+from autoevals.value import ExactMatch
 
 
 def test_levenshtein():
@@ -77,3 +79,32 @@ def test_list_contains():
     assert (
         ListContains(pairwise_evaluator=LevenshteinScorer(), allow_extra_entities=True)(["a", "b"], ["a"]).score == 1
     )
+
+
+def test_exact_match():
+    cases = [
+        ["hello", "hello", 1],
+        ["hello", "world", 0],
+        [123, 123, 1],
+        [123, "123", 1],
+        [{"a": 1, "b": 2}, {"a": 1, "b": 2}, 1],
+        [{"a": 1, "b": 2}, {"a": 1, "b": 3}, 0],
+        [[1, 2, 3], [1, 2, 3], 1],
+        [[1, 2, 3], [3, 2, 1], 0],
+        [{"a": 1, "b": 2}, {"b": 2, "a": 1}, 0],  # Order matters
+        [{"a": 1, "b": 2}, '{"a": 1, "b": 2}', 1],  # String representation matches dict
+        [{"a": 1, "b": 2}, '{"a":1, "b":2}', 1],  # String representation matches dict
+        [{"a": 1, "b": 2}, '{"b": 2, "a": 1}', 0],
+        [{"a": 1, "b": 2}, {"b": 2, "a": 1, "c": 3}, 0],  # Extra key, not equal
+        [None, None, 1],
+        [None, "None", 1],
+        [{"a": 1, "b": 2}, 123, 0],  # Non-string scalar compared to a dict must not raise
+        [{"a": 1, "b": 2}, None, 0],  # Missing expected value compared to a dict must not raise
+    ]
+
+    for output, expected, expected_score in cases:
+        assert ExactMatch()(output, expected).score == approx(expected_score, abs=1e-4), (
+            output,
+            expected,
+            expected_score,
+        )
diff --git a/py/autoevals/value.py b/py/autoevals/value.py
new file mode 100644
index 0000000..605cc4b
--- /dev/null
+++ b/py/autoevals/value.py
@@ -0,0 +1,48 @@
+import json
+from typing import Any
+
+from braintrust_core.score import Score
+
+from autoevals.partial import ScorerWithPartial
+
+
+class ExactMatch(ScorerWithPartial):
+    """
+    A simple scorer that tests whether two values are equal. If the value is an object or array,
+    it will be JSON-serialized and the strings compared for equality.
+    """
+
+    def _run_eval_sync(self, output, expected=None, **kwargs):
+        maybe_object = needs_json(output) or needs_json(expected)
+        output, expected = normalize_value(output, maybe_object), normalize_value(expected, maybe_object)
+        score = 1 if output == expected else 0
+
+        return Score(name=self._name(), score=score)
+
+
+def needs_json(value: Any) -> bool:
+    """Return True if `value` is a dict or list and must be JSON-serialized for comparison."""
+    return isinstance(value, (dict, list))
+
+
+def normalize_value(value: Any, maybe_object: bool) -> str:
+    """
+    Convert a value into the string form that `ExactMatch` compares.
+
+    Dicts and lists are JSON-serialized. If `maybe_object` is set (either side of the
+    comparison is a dict or list), text that parses as JSON is re-serialized so that
+    formatting differences do not affect the comparison. Anything else is passed to `str`.
+    """
+    if needs_json(value):
+        return json.dumps(value)
+
+    # `json.loads` only accepts str/bytes/bytearray; for other types such as int or None it
+    # raises TypeError (which is not a JSONDecodeError), so those must skip this step.
+    if maybe_object and isinstance(value, (str, bytes, bytearray)):
+        try:
+            return json.dumps(json.loads(value))
+        except json.JSONDecodeError:
+            # That's ok, just fall back to the string representation
+            pass
+
+    return str(value)