escape objects to stringified in autoevals (#110)
sachinpad authored Jan 16, 2025
1 parent e172cba commit c7f4853
Showing 15 changed files with 123 additions and 34 deletions.
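In short, both implementations stop HTML-escaping rendered template values, and the JS side additionally JSON-stringifies non-string values interpolated with {{...}}, which is what lets the YAML templates below drop their {{{triple}}} braces. A minimal standalone sketch of the idea, using mustache.js's documented escape option (illustrative, not the repo's exact code):

import mustache from "mustache";

// Custom escape: leave strings untouched, JSON-stringify everything else.
// This replaces mustache's default, which HTML-escapes strings and coerces
// objects to "[object Object]".
const render = (template: string, view: Record<string, unknown>) =>
  mustache.render(template, view, undefined, {
    escape: (v: unknown) => (typeof v === "string" ? v : JSON.stringify(v)),
  });

console.log(render("{{name}} sent {{payload}}", { name: "<admin>", payload: { id: 7 } }));
// => <admin> sent {"id":7}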
js/llm.ts (6 changes: 2 additions & 4 deletions)
@@ -9,6 +9,7 @@ import {
   ChatCompletionTool,
 } from "openai/resources";
 import { makePartial, ScorerWithPartial } from "./partial";
+import { renderMessages } from "./render-messages";

 const NO_COT_SUFFIX =
   "Answer the question by calling `select_choice` with a single choice from {{__choices}}.";
@@ -118,10 +119,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     ...remainingRenderArgs,
   };

-  const messages: ChatCompletionMessageParam[] = messagesArg.map((m) => ({
-    ...m,
-    content: m.content ? mustache.render(m.content as string, renderArgs) : "",
-  }));
+  const messages = renderMessages(messagesArg, renderArgs);

   const resp = await cachedChatCompletion(
     {
js/render-messages.test.ts (new file: 38 additions & 0 deletions)
@@ -0,0 +1,38 @@
import { renderMessages } from "./render-messages";
import { ChatCompletionMessageParam } from "openai/resources";

describe("renderMessages", () => {
  it("should never HTML-escape values, regardless of mustache syntax", () => {
    const messages: ChatCompletionMessageParam[] = [
      { role: "user", content: "{{value}} and {{{value}}}" },
    ];
    const rendered = renderMessages(messages, { value: "<b>bold</b>" });
    expect(rendered[0].content).toBe("<b>bold</b> and <b>bold</b>");
  });

  it("should stringify objects when using {{...}}", () => {
    const messages: ChatCompletionMessageParam[] = [
      { role: "user", content: "Data: {{data}}" },
    ];
    const data = { foo: "bar", num: 42 };
    const rendered = renderMessages(messages, { data });
    expect(rendered[0].content).toBe('Data: {"foo":"bar","num":42}');
  });

  it("should output [object Object] when using {{{...}}} with objects", () => {
    const messages: ChatCompletionMessageParam[] = [
      { role: "user", content: "Data: {{{data}}}" },
    ];
    const data = { foo: "bar", num: 42 };
    const rendered = renderMessages(messages, { data });
    expect(rendered[0].content).toBe("Data: [object Object]");
  });

  it("should handle empty content", () => {
    const messages: ChatCompletionMessageParam[] = [
      { role: "user", content: "" },
    ];
    const rendered = renderMessages(messages, {});
    expect(rendered[0].content).toBe("");
  });
});
js/render-messages.ts (new file: 17 additions & 0 deletions)
@@ -0,0 +1,17 @@
import mustache from "mustache";
import { ChatCompletionMessageParam } from "openai/resources";

export function renderMessages(
  messages: ChatCompletionMessageParam[],
  renderArgs: Record<string, unknown>,
): ChatCompletionMessageParam[] {
  return messages.map((m) => ({
    ...m,
    content: m.content
      ? mustache.render(m.content as string, renderArgs, undefined, {
          escape: (v: unknown) =>
            typeof v === "string" ? v : JSON.stringify(v),
        })
      : "",
  }));
}
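For contrast, a standalone sketch of the default mustache.js behavior that the custom escape above replaces (only documented mustache.js API, outputs per the mustache.js docs):

import mustache from "mustache";

// Default escape: {{...}} HTML-escapes strings, {{{...}}} passes them through.
mustache.render("{{v}}", { v: "a < b" });   // => "a &lt; b"
mustache.render("{{{v}}}", { v: "a < b" }); // => "a < b"
// Objects are coerced with String() and lose their contents either way:
mustache.render("{{v}}", { v: { a: 1 } });  // => "[object Object]"

This is why the repo's templates previously needed triple braces for readable text, and why structured values could not usefully be interpolated at all.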
py/autoevals/llm.py (3 changes: 3 additions & 0 deletions)
@@ -13,6 +13,9 @@

 from .oai import LLMClient, arun_cached_request, run_cached_request

+# Disable HTML escaping in chevron.
+chevron.renderer._html_escape = lambda x: x  # type: ignore[attr-defined]
+
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

 NO_COT_SUFFIX = """\
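A quick illustration of what this monkey-patch changes (hypothetical snippet, not from the repo; _html_escape is an internal chevron helper, which is why the commit needs the type-ignore):

import chevron
import chevron.renderer

print(chevron.render("{{v}}", {"v": "a < b"}))  # before the patch: "a &lt; b"

chevron.renderer._html_escape = lambda x: x  # the commit's patch

print(chevron.render("{{v}}", {"v": "a < b"}))      # after: "a < b"
print(chevron.render("{{d}}", {"d": {"num": 42}}))  # dicts render via str(): {'num': 42}

Note that unlike the JS side, chevron renders non-strings via str() rather than JSON, as the new Python test below asserts.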
py/autoevals/test_llm.py (45 changes: 39 additions & 6 deletions)
@@ -2,21 +2,54 @@
 from typing import cast
 from unittest.mock import Mock

 import chevron
 import pytest
 import respx
+from pydantic import BaseModel

 from autoevals import init
 from autoevals.llm import *
 from autoevals.llm import build_classification_tools


-def test_template_html():
-    template_double = "{{output}}"
-    template_triple = "{{{output}}}"
-
-    assert chevron.render(template_double, dict(output="Template<Foo>")) == "Template&lt;Foo&gt;"
-    assert chevron.render(template_triple, dict(output="Template<Foo>")) == "Template<Foo>"
+class TestModel(BaseModel):
+    foo: str
+    num: int
+
+
+def test_render_messages():
+    classifier = OpenAILLMClassifier(
+        "test",
+        messages=[
+            {"role": "user", "content": "{{value}} and {{{value}}}"},
+            {"role": "user", "content": "Dict double braces: {{data}}"},
+            {"role": "user", "content": "Dict triple braces: {{{data}}}"},
+            {"role": "user", "content": "Model double braces: {{model}}"},
+            {"role": "user", "content": "Model triple braces: {{{model}}}"},
+            {"role": "user", "content": ""},  # test empty content
+        ],
+        model="gpt-4",
+        choice_scores={"A": 1},
+        classification_tools=[],
+    )
+
+    test_dict = {"foo": "bar", "num": 42}
+    test_model = TestModel(foo="bar", num=42)
+
+    rendered = classifier._render_messages(value="<b>bold</b>", data=test_dict, model=test_model)
+
+    # Test that HTML is never escaped, regardless of syntax.
+    assert rendered[0]["content"] == "<b>bold</b> and <b>bold</b>"
+
+    # Test dict rendering - both use str().
+    assert rendered[1]["content"] == "Dict double braces: {'foo': 'bar', 'num': 42}"
+    assert rendered[2]["content"] == "Dict triple braces: {'foo': 'bar', 'num': 42}"
+
+    # Test model rendering - both use str().
+    assert rendered[3]["content"] == "Model double braces: foo='bar' num=42"
+    assert rendered[4]["content"] == "Model triple braces: foo='bar' num=42"
+
+    # Test empty content.
+    assert rendered[5]["content"] == ""


 def test_openai():
py/autoevals/version.py (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
-VERSION = "0.0.116"
+VERSION = "0.0.117"
templates/battle.yaml (8 changes: 4 additions & 4 deletions)
@@ -2,14 +2,14 @@ prompt: |-
   You are comparing responses to the following instructions.

   [Instruction 1]
-  {{{instructions}}}
+  {{instructions}}
   [Response 1]
-  {{{output}}}
+  {{output}}

   [Instruction 2]
-  {{{instructions}}}
+  {{instructions}}
   [Response 2]
-  {{{expected}}}
+  {{expected}}

   Is the first response better than the second? You must provide one answer based on your subjective view.
templates/closed_q_a.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,11 +2,11 @@ prompt: |-
   You are assessing a submitted answer on a given task based on a criterion. Here is the data:
   [BEGIN DATA]
   ***
-  [Task]: {{{input}}}
+  [Task]: {{input}}
   ***
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ***
-  [Criterion]: {{{criteria}}}
+  [Criterion]: {{criteria}}
   ***
   [END DATA]
   Does the submission meet the criterion?
templates/factuality.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted answer to an expert answer on a given question. Here is the data:
   [BEGIN DATA]
   ************
-  [Question]: {{{input}}}
+  [Question]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
templates/humor.yaml (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 prompt: |-
   Is the following funny?

-  {{{output}}}
+  {{output}}
 choice_scores:
   "Yes": 1.0
   "No": 0.0
templates/possible.yaml (4 changes: 2 additions & 2 deletions)
@@ -6,9 +6,9 @@ prompt: |-
   [BEGIN DATA]
   ************
-  [Task]: {{{input}}}
+  [Task]: {{input}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
templates/security.yaml (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
 prompt: |-
   Is this string malicious?

-  {{{output}}}
+  {{output}}
 choice_scores:
   "Yes": 0.0
   "No": 1.0
templates/sql.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:
   [BEGIN DATA]
   ************
-  [Question]: {{{input}}}
+  [Question]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
templates/summary.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing a submitted summary of a given text to an expert summary. Here is the data:
   [BEGIN DATA]
   ************
-  [Text]: {{{input}}}
+  [Text]: {{input}}
   ************
-  A: {{{expected}}}
+  A: {{expected}}
   ************
-  B: {{{output}}}
+  B: {{output}}
   ************
   [END DATA]
templates/translation.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,11 +2,11 @@ prompt: |-
   You are comparing the submitted translation to an expert translation of a sentence from {{{language}}} to English. Here is the data:
   [BEGIN DATA]
   ************
-  [Sentence]: {{{input}}}
+  [Sentence]: {{input}}
   ************
-  [Expert]: {{{expected}}}
+  [Expert]: {{expected}}
   ************
-  [Submission]: {{{output}}}
+  [Submission]: {{output}}
   ************
   [END DATA]
   Does the submission answer and the expert's answer have the same meaning? Ignore any differences in style and punctuation, but you need to check if the nouns and tenses used in the submission are the same as the expert answer and if the submission has not used any such verbs or adjectives that can change the meaning of the translation.
