Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@
"builtin.tool_call_success",
}

# Evaluators whose data mapping gets a ``tool_definitions`` entry when the
# items carry tool definitions: every tool evaluator plus the ones listed here.
_TOOL_DEFINITION_EVALUATORS: set[str] = _TOOL_EVALUATORS.union(
    {
        "builtin.intent_resolution",
        "builtin.task_adherence",
        "builtin.task_completion",
        "builtin.task_navigation_efficiency",
    }
)

# Evaluators that require a ground_truth / expected_output field.
_GROUND_TRUTH_EVALUATORS: set[str] = {
"builtin.similarity",
Expand Down Expand Up @@ -161,6 +168,7 @@ def _build_testing_criteria(
model: str,
*,
include_data_mapping: bool = False,
include_tool_definitions: bool = False,
) -> list[dict[str, Any]]:
"""Build ``testing_criteria`` for ``evals.create()``.

Expand All @@ -169,6 +177,7 @@ def _build_testing_criteria(
model: Model deployment for the LLM judge.
include_data_mapping: Whether to include field-level data mapping
(required for the JSONL data source, not needed for response-based).
include_tool_definitions: Whether JSONL items include tool definitions.
"""
criteria: list[dict[str, Any]] = []
for name in evaluators:
Expand Down Expand Up @@ -203,7 +212,7 @@ def _build_testing_criteria(
mapping["context"] = "{{item.context}}"
if qualified in _GROUND_TRUTH_EVALUATORS:
mapping["ground_truth"] = "{{item.ground_truth}}"
if qualified in _TOOL_EVALUATORS:
if include_tool_definitions and qualified in _TOOL_DEFINITION_EVALUATORS:
mapping["tool_definitions"] = "{{item.tool_definitions}}"
entry["data_mapping"] = mapping

Expand Down Expand Up @@ -713,6 +722,7 @@ async def _evaluate_via_dataset(
evaluators,
self._model,
include_data_mapping=True,
include_tool_definitions=has_tools,
),
)

Expand Down
33 changes: 30 additions & 3 deletions python/packages/foundry/tests/test_foundry_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,12 @@ def test_with_data_mapping(self) -> None:
assert "conversation" not in criteria[1]["data_mapping"]

def test_tool_evaluator_includes_tool_definitions(self) -> None:
criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True)
criteria = _build_testing_criteria(
["relevance", "tool_call_accuracy"],
"gpt-4o",
include_data_mapping=True,
include_tool_definitions=True,
)
# relevance: string query/response
assert criteria[0]["data_mapping"]["query"] == "{{item.query}}"
assert criteria[0]["data_mapping"]["response"] == "{{item.response}}"
Expand All @@ -762,6 +767,17 @@ def test_agent_evaluators_use_message_arrays(self) -> None:
assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}"
assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}"

def test_agent_evaluators_include_tool_definitions_for_tool_items(self) -> None:
    """Each listed evaluator maps ``tool_definitions`` when tool definitions are enabled."""
    expected_mapping = "{{item.tool_definitions}}"
    built = _build_testing_criteria(
        ["task_adherence", "intent_resolution", "task_completion", "task_navigation_efficiency"],
        "gpt-4o",
        include_data_mapping=True,
        include_tool_definitions=True,
    )
    # Every criterion must carry the mapping; the failure message names the offender.
    for criterion in built:
        assert criterion["data_mapping"]["tool_definitions"] == expected_mapping, f"{criterion['name']}"

def test_quality_evaluators_use_strings(self) -> None:
quality_evals = ["coherence", "relevance", "fluency"]
criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True)
Expand All @@ -781,7 +797,12 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
"tool_output_utilization",
"tool_call_success",
]
criteria = _build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True)
criteria = _build_testing_criteria(
tool_evals,
"gpt-4o",
include_data_mapping=True,
include_tool_definitions=True,
)
for c in criteria:
assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"

Expand Down Expand Up @@ -2646,7 +2667,7 @@ def test_sync_client_raises(self):


class TestEvaluatorSetConsistency:
"""Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS."""
"""Verify that evaluator helper sets are subsets of _BUILTIN_EVALUATORS."""

def test_agent_evaluators_subset(self):
from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS
Expand All @@ -2660,6 +2681,12 @@ def test_tool_evaluators_subset(self):
diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"

def test_tool_definition_evaluators_subset(self):
    """Check that every name in _TOOL_DEFINITION_EVALUATORS is a value of _BUILTIN_EVALUATORS."""
    from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_DEFINITION_EVALUATORS

    known_names = set(_BUILTIN_EVALUATORS.values())
    unknown = _TOOL_DEFINITION_EVALUATORS - known_names
    assert not unknown, f"_TOOL_DEFINITION_EVALUATORS has names not in _BUILTIN_EVALUATORS: {unknown}"


# ---------------------------------------------------------------------------
# r5 review: evaluate_traces with agent_id only
Expand Down