Gate the ``tool_definitions`` data mapping in ``_build_testing_criteria``
behind a new keyword-only ``include_tool_definitions`` flag (default False),
apply it to every evaluator in ``_TOOL_DEFINITION_EVALUATORS``, and pass
``has_tools`` for it from ``_evaluate_via_dataset``. Tests updated to match.

diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 2f68816591..098a8ba34e 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -75,6 +75,13 @@
     "builtin.tool_call_success",
 }
 
+_TOOL_DEFINITION_EVALUATORS: set[str] = _TOOL_EVALUATORS | {
+    "builtin.intent_resolution",
+    "builtin.task_adherence",
+    "builtin.task_completion",
+    "builtin.task_navigation_efficiency",
+}
+
 # Evaluators that require a ground_truth / expected_output field.
 _GROUND_TRUTH_EVALUATORS: set[str] = {
     "builtin.similarity",
@@ -161,6 +168,7 @@ def _build_testing_criteria(
     model: str,
     *,
     include_data_mapping: bool = False,
+    include_tool_definitions: bool = False,
 ) -> list[dict[str, Any]]:
     """Build ``testing_criteria`` for ``evals.create()``.
 
@@ -169,6 +177,7 @@ def _build_testing_criteria(
         model: Model deployment for the LLM judge.
         include_data_mapping: Whether to include field-level data mapping
             (required for the JSONL data source, not needed for response-based).
+        include_tool_definitions: Whether JSONL items include tool definitions.
     """
     criteria: list[dict[str, Any]] = []
     for name in evaluators:
@@ -203,7 +212,7 @@ def _build_testing_criteria(
                 mapping["context"] = "{{item.context}}"
             if qualified in _GROUND_TRUTH_EVALUATORS:
                 mapping["ground_truth"] = "{{item.ground_truth}}"
-            if qualified in _TOOL_EVALUATORS:
+            if include_tool_definitions and qualified in _TOOL_DEFINITION_EVALUATORS:
                 mapping["tool_definitions"] = "{{item.tool_definitions}}"
             entry["data_mapping"] = mapping
 
@@ -713,6 +722,7 @@ async def _evaluate_via_dataset(
                 evaluators,
                 self._model,
                 include_data_mapping=True,
+                include_tool_definitions=has_tools,
             ),
         )
 
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index 937a3cf524..65efb9eb94 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -745,7 +745,12 @@ def test_with_data_mapping(self) -> None:
         assert "conversation" not in criteria[1]["data_mapping"]
 
     def test_tool_evaluator_includes_tool_definitions(self) -> None:
-        criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True)
+        criteria = _build_testing_criteria(
+            ["relevance", "tool_call_accuracy"],
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,
+        )
         # relevance: string query/response
         assert criteria[0]["data_mapping"]["query"] == "{{item.query}}"
         assert criteria[0]["data_mapping"]["response"] == "{{item.response}}"
@@ -762,6 +767,17 @@ def test_agent_evaluators_use_message_arrays(self) -> None:
             assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}"
             assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}"
 
+    def test_agent_evaluators_include_tool_definitions_for_tool_items(self) -> None:
+        agent_evals = ["task_adherence", "intent_resolution", "task_completion", "task_navigation_efficiency"]
+        criteria = _build_testing_criteria(
+            agent_evals,
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,
+        )
+        for c in criteria:
+            assert c["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}", f"{c['name']}"
+
     def test_quality_evaluators_use_strings(self) -> None:
         quality_evals = ["coherence", "relevance", "fluency"]
         criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True)
@@ -781,7 +797,12 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
             "tool_output_utilization",
             "tool_call_success",
         ]
-        criteria = _build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True)
+        criteria = _build_testing_criteria(
+            tool_evals,
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,
+        )
         for c in criteria:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
@@ -2646,7 +2667,7 @@ def test_sync_client_raises(self):
 
 
 class TestEvaluatorSetConsistency:
-    """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS."""
+    """Verify that evaluator helper sets are subsets of _BUILTIN_EVALUATORS."""
 
     def test_agent_evaluators_subset(self):
         from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS
@@ -2660,6 +2681,12 @@ def test_tool_evaluators_subset(self):
         diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
         assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
 
+    def test_tool_definition_evaluators_subset(self):
+        from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_DEFINITION_EVALUATORS
+
+        diff = _TOOL_DEFINITION_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
+        assert not diff, f"_TOOL_DEFINITION_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
+
 
 # ---------------------------------------------------------------------------
 # r5 review: evaluate_traces with agent_id only