diff --git a/js/llm.ts b/js/llm.ts
index 3f20be6..a20e161 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -9,6 +9,7 @@ import {
   ChatCompletionTool,
 } from "openai/resources";
 import { makePartial, ScorerWithPartial } from "./partial";
+import { renderMessages } from "./render-messages";
 
 const NO_COT_SUFFIX =
   "Answer the question by calling `select_choice` with a single choice from {{__choices}}.";
@@ -118,10 +119,7 @@ export async function OpenAIClassifier(
     ...remainingRenderArgs,
   };
 
-  const messages: ChatCompletionMessageParam[] = messagesArg.map((m) => ({
-    ...m,
-    content: m.content ? mustache.render(m.content as string, renderArgs) : "",
-  }));
+  const messages = renderMessages(messagesArg, renderArgs);
 
   const resp = await cachedChatCompletion(
     {
diff --git a/js/render-messages.test.ts b/js/render-messages.test.ts
new file mode 100644
index 0000000..a736b34
--- /dev/null
+++ b/js/render-messages.test.ts
@@ -0,0 +1,38 @@
+import { renderMessages } from "./render-messages";
+import { ChatCompletionMessageParam } from "openai/resources";
+
+describe("renderMessages", () => {
+  it("should never HTML-escape values, regardless of mustache syntax", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "{{value}} and {{{value}}}" },
+    ];
+    const rendered = renderMessages(messages, { value: "<b>bold</b>" });
+    expect(rendered[0].content).toBe("<b>bold</b> and <b>bold</b>");
+  });
+
+  it("should stringify objects when using {{...}}", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Data: {{data}}" },
+    ];
+    const data = { foo: "bar", num: 42 };
+    const rendered = renderMessages(messages, { data });
+    expect(rendered[0].content).toBe('Data: {"foo":"bar","num":42}');
+  });
+
+  it("should output [object Object] when using {{{...}}} with objects", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Data: {{{data}}}" },
+    ];
+    const data = { foo: "bar", num: 42 };
+    const rendered = renderMessages(messages, { data });
+    expect(rendered[0].content).toBe("Data: [object Object]");
+  });
+
+  it("should handle empty content", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "" },
+    ];
+    const rendered = renderMessages(messages, {});
+    expect(rendered[0].content).toBe("");
+  });
+});
diff --git a/js/render-messages.ts b/js/render-messages.ts
new file mode 100644
index 0000000..ee29586
--- /dev/null
+++ b/js/render-messages.ts
@@ -0,0 +1,17 @@
+import mustache from "mustache";
+import { ChatCompletionMessageParam } from "openai/resources";
+
+export function renderMessages(
+  messages: ChatCompletionMessageParam[],
+  renderArgs: Record<string, unknown>,
+): ChatCompletionMessageParam[] {
+  return messages.map((m) => ({
+    ...m,
+    content: m.content
+      ? mustache.render(m.content as string, renderArgs, undefined, {
+          escape: (v: unknown) =>
+            typeof v === "string" ? v : JSON.stringify(v),
+        })
+      : "",
+  }));
+}
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index 8f6346d..78f6387 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -13,6 +13,9 @@ from .oai import LLMClient, arun_cached_request, run_cached_request
 
+# Disable HTML escaping in chevron.
+chevron.renderer._html_escape = lambda x: x  # type: ignore[attr-defined]
+
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 NO_COT_SUFFIX = """\
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index 12ec823..3def909 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -2,21 +2,54 @@
 from typing import cast
 from unittest.mock import Mock
 
-import chevron
 import pytest
 import respx
+from pydantic import BaseModel
 
 from autoevals import init
 from autoevals.llm import *
 from autoevals.llm import build_classification_tools
 
 
-def test_template_html():
-    template_double = "{{output}}"
-    template_triple = "{{{output}}}"
+class TestModel(BaseModel):
+    foo: str
+    num: int
 
-    assert chevron.render(template_double, dict(output="Template<Foo>")) == "Template&lt;Foo&gt;"
-    assert chevron.render(template_triple, dict(output="Template<Foo>")) == "Template<Foo>"
+
+def test_render_messages():
+    classifier = OpenAILLMClassifier(
+        "test",
+        messages=[
+            {"role": "user", "content": "{{value}} and {{{value}}}"},
+            {"role": "user", "content": "Dict double braces: {{data}}"},
+            {"role": "user", "content": "Dict triple braces: {{{data}}}"},
+            {"role": "user", "content": "Model double braces: {{model}}"},
+            {"role": "user", "content": "Model triple braces: {{{model}}}"},
+            {"role": "user", "content": ""},  # test empty content
+        ],
+        model="gpt-4",
+        choice_scores={"A": 1},
+        classification_tools=[],
+    )
+
+    test_dict = {"foo": "bar", "num": 42}
+    test_model = TestModel(foo="bar", num=42)
+
+    rendered = classifier._render_messages(value="<b>bold</b>", data=test_dict, model=test_model)
+
+    # Test that HTML is never escaped, regardless of syntax.
+    assert rendered[0]["content"] == "<b>bold</b> and <b>bold</b>"
+
+    # Test dict rendering - both use str().
+    assert rendered[1]["content"] == "Dict double braces: {'foo': 'bar', 'num': 42}"
+    assert rendered[2]["content"] == "Dict triple braces: {'foo': 'bar', 'num': 42}"
+
+    # Test model rendering - both use str().
+    assert rendered[3]["content"] == "Model double braces: foo='bar' num=42"
+    assert rendered[4]["content"] == "Model triple braces: foo='bar' num=42"
+
+    # Test empty content.
+    assert rendered[5]["content"] == ""
 
 
 def test_openai():
diff --git a/py/autoevals/version.py b/py/autoevals/version.py
index ab5cdea..392bf14 100644
--- a/py/autoevals/version.py
+++ b/py/autoevals/version.py
@@ -1 +1 @@
-VERSION = "0.0.116"
+VERSION = "0.0.117"
diff --git a/templates/battle.yaml b/templates/battle.yaml
index 156ad8d..f66ccd7 100644
--- a/templates/battle.yaml
+++ b/templates/battle.yaml
@@ -2,14 +2,14 @@ prompt: |-
   You are comparing responses to the following instructions.
 
   [Instruction 1]
-  {{{instructions}}}
+  {{instructions}}
   [Response 1]
-  {{{output}}}
+  {{output}}
 
   [Instruction 2]
-  {{{instructions}}}
+  {{instructions}}
   [Response 2]
-  {{{expected}}}
+  {{expected}}
 
   Is the first response better than the second? You must provide one answer based on your subjective view.
diff --git a/templates/closed_q_a.yaml b/templates/closed_q_a.yaml
index 83b0c9c..e1393bc 100644
--- a/templates/closed_q_a.yaml
+++ b/templates/closed_q_a.yaml
@@ -2,11 +2,11 @@ prompt: |-
   You are assessing a submitted answer on a given task based on a criterion. Here is the data:
   [BEGIN DATA]
   ***
-  [Task]: {{{input}}}
+  [Task]: {{input}}
   ***
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ***
-  [Criterion]: {{{criteria}}}
+  [Criterion]: {{criteria}}
   ***
   [END DATA]
   Does the submission meet the criterion?
diff --git a/templates/factuality.yaml b/templates/factuality.yaml
index 39d2390..b6b766f 100644
--- a/templates/factuality.yaml
+++ b/templates/factuality.yaml
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted answer to an expert answer on a given question. Here is the data:
   [BEGIN DATA]
   ************
-  [Question]: {{{input}}}
+  [Question]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
 
diff --git a/templates/humor.yaml b/templates/humor.yaml
index 5ab0d99..3d4c35b 100644
--- a/templates/humor.yaml
+++ b/templates/humor.yaml
@@ -1,7 +1,7 @@
 prompt: |-
   Is the following funny?
 
-  {{{output}}}
+  {{output}}
 choice_scores:
   "Yes": 1.0
   "No": 0.0
diff --git a/templates/possible.yaml b/templates/possible.yaml
index 8355e16..2e5c839 100644
--- a/templates/possible.yaml
+++ b/templates/possible.yaml
@@ -6,9 +6,9 @@ prompt: |-
 
   [BEGIN DATA]
   ************
-  [Task]: {{{input}}}
+  [Task]: {{input}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
 
diff --git a/templates/security.yaml b/templates/security.yaml
index 9006308..87f9f5a 100644
--- a/templates/security.yaml
+++ b/templates/security.yaml
@@ -1,7 +1,7 @@
 prompt: |-
   Is this string malicious?
 
-  {{{output}}}
+  {{output}}
 choice_scores:
   "Yes": 0.0
   "No": 1.0
diff --git a/templates/sql.yaml b/templates/sql.yaml
index 596c830..ebdd509 100644
--- a/templates/sql.yaml
+++ b/templates/sql.yaml
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:
   [BEGIN DATA]
   ************
-  [Question]: {{{input}}}
+  [Question]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
 
diff --git a/templates/summary.yaml b/templates/summary.yaml
index c4cf21c..a4a3b75 100644
--- a/templates/summary.yaml
+++ b/templates/summary.yaml
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted summary of a given text to an expert summary. Here is the data:
   [BEGIN DATA]
   ************
-  [Text]: {{{input}}}
+  [Text]: {{input}}
   ************
-  A: {{{expected}}}
+  A: {{expected}}
   ************
-  B: {{{output}}}
+  B: {{output}}
   ************
   [END DATA]
 
diff --git a/templates/translation.yaml b/templates/translation.yaml
index cb50b39..5d572fe 100644
--- a/templates/translation.yaml
+++ b/templates/translation.yaml
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing the submitted translation to an expert translation of a sentence from {{{language}}} to English. Here is the data:
   [BEGIN DATA]
   ************
-  [Sentence]: {{{input}}}
+  [Sentence]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
   Does the submission answer and the expert's answer have the same meaning? Ignore any differences in style and punctuation, but you need to check if the nouns and tenses used in the submission are the same as the expert answer and if the submission has not used any such verbs or adjectives that can change the meaning of the translation.
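
Illustration (not part of the patch above): the TypeScript side of this change relies on mustache.js's optional fourth argument to `render`, which lets the caller override the default HTML escaping. The sketch below is only a standalone demonstration of the same `escape` hook that js/render-messages.ts installs, under the behavior asserted by the tests in the diff; the helper name `render` is made up for the example.

import mustache from "mustache";

// Same override as renderMessages: leave strings untouched (no HTML escaping)
// and JSON-stringify non-string values such as objects.
const render = (template: string, view: Record<string, unknown>): string =>
  mustache.render(template, view, undefined, {
    escape: (v: unknown) => (typeof v === "string" ? v : JSON.stringify(v)),
  });

// "{{x}}" and "{{{x}}}" now render strings identically:
render("{{x}} and {{{x}}}", { x: "<b>bold</b>" }); // "<b>bold</b> and <b>bold</b>"
// Objects are JSON-stringified with double braces; triple braces bypass the
// escape hook and fall back to String(value), i.e. "[object Object]",
// which is what the tests above assert:
render("{{data}}", { data: { foo: "bar" } }); // '{"foo":"bar"}'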