Skip to content

Commit

Permalink
fix gap in our client= coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
ibolmo committed Dec 12, 2024
1 parent 6cf7d73 commit 24522e6
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 6 deletions.
20 changes: 14 additions & 6 deletions py/autoevals/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,21 +269,29 @@ def __init__(
)

@classmethod
def from_spec(cls, name: str, spec: ModelGradedSpec, client: Optional[AutoEvalClient] = None, **kwargs):
    """Construct a classifier from an in-memory spec.

    Args:
        name: Human-readable name for the resulting classifier.
        spec: Parsed spec carrying the prompt template and choice scores.
        client: Optional pre-configured client. Forwarded explicitly so a
            caller-supplied client (e.g. a mock in tests) is used instead of
            any default; this closes the gap where ``client=`` was dropped.
        **kwargs: Passed through unchanged to the class constructor.

    Returns:
        A new instance of ``cls`` built from the spec's prompt and scores.
    """
    # Pass client by keyword so it reaches the constructor even when other
    # keyword arguments are present in **kwargs.
    return cls(name, spec.prompt, spec.choice_scores, client=client, **kwargs)

@classmethod
def from_spec_file(cls, name: str, path: str, client: Optional[AutoEvalClient] = None, **kwargs):
    """Construct a classifier from a YAML spec file on disk.

    Args:
        name: Human-readable name for the resulting classifier.
        path: Filesystem path to the YAML spec.
        client: Optional pre-configured client, forwarded to
            :meth:`from_spec` so injected clients are honored.
        **kwargs: Passed through unchanged to :meth:`from_spec`.

    Returns:
        A new instance of ``cls`` built from the parsed spec.
    """
    # The raw file text is cached on the class so repeated construction does
    # not re-read from disk. NOTE(review): the cache keys on the class, not
    # on `path` — presumably each subclass maps to exactly one spec file;
    # confirm before reusing one class with multiple paths.
    if cls._SPEC_FILE_CONTENTS is None:
        with open(path) as f:
            cls._SPEC_FILE_CONTENTS = f.read()
    spec = yaml.safe_load(cls._SPEC_FILE_CONTENTS)
    return cls.from_spec(name, ModelGradedSpec(**spec), client=client, **kwargs)


class SpecFileClassifier(LLMClassifier):
def __new__(
cls, model=None, engine=None, use_cot=None, max_tokens=None, temperature=None, api_key=None, base_url=None
cls,
model=None,
engine=None,
use_cot=None,
max_tokens=None,
temperature=None,
api_key=None,
base_url=None,
client: Optional[AutoEvalClient] = None,
):
kwargs = {}
if model is not None:
Expand Down Expand Up @@ -311,7 +319,7 @@ def __new__(

extra_render_args = cls._partial_args() if hasattr(cls, "_partial_args") else {}

return LLMClassifier.from_spec_file(cls_name, template_path, **kwargs, **extra_render_args)
return LLMClassifier.from_spec_file(cls_name, template_path, client=client, **kwargs, **extra_render_args)


class Battle(SpecFileClassifier):
Expand Down
61 changes: 61 additions & 0 deletions py/autoevals/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,67 @@ def test_factuality():
assert result.score == 1


def test_factuality_client():
    """Factuality must route its completion call through an injected client."""
    # Stand-in client. RateLimitError must be a real exception type because
    # the retry logic references it in an `except` clause.
    mock_client = Mock()
    mock_client.RateLimitError = Exception

    # Canned chat-completion response whose tool call selects choice "C"
    # (the "same factual content" verdict, which scores 1).
    fake_completion = Mock()
    fake_completion.to_dict.return_value = {
        "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp",
        "choices": [
            {
                "finish_reason": "stop",
                "index": 0,
                "logprobs": None,
                "message": {
                    "content": None,
                    "refusal": None,
                    "role": "assistant",
                    "tool_calls": [
                        {
                            "id": "call_JKoeGAX2zGPJAmF2muDgjpHp",
                            "function": {
                                "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}',
                                "name": "select_choice",
                            },
                            "type": "function",
                        }
                    ],
                },
            }
        ],
        "created": 1734029028,
        "model": "gpt-4o-2024-08-06",
        "object": "chat.completion",
        "system_fingerprint": "fp_cc5cf1c6e3",
        "usage": {
            "completion_tokens": 149,
            "prompt_tokens": 404,
            "total_tokens": 553,
            "completion_tokens_details": {
                "accepted_prediction_tokens": 0,
                "audio_tokens": 0,
                "reasoning_tokens": 0,
                "rejected_prediction_tokens": 0,
            },
            "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
        },
    }
    mock_client.complete.return_value = fake_completion

    evaluator = Factuality(client=cast(AutoEvalClient, mock_client))
    result = evaluator.eval(
        output="6",
        expected="6",
        input="Add the following numbers: 1, 2, 3",
    )

    # Exactly one round-trip through the injected client — no silent
    # fallback to a default OpenAI client.
    assert mock_client.complete.call_count == 1
    assert result.score == 1


# make sure we deny any leaked calls to OpenAI
@respx.mock(base_url="https://api.openai.com/v1/")
def test_init_client():
Expand Down

0 comments on commit 24522e6

Please sign in to comment.