Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 50 additions & 50 deletions data/airline_dataset.jsonl

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/eva/assistant/tools/airline_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,14 @@ def get_reservation(params: dict, db: dict, call_index: int) -> dict:
"message": f"Last name does not match reservation {confirmation_number}",
}

# Write session data to mark successful authentication
db.setdefault("session", {}).update(
{
"confirmation_number": confirmation_number,
"last_name": last_name.lower() if last_name else "",
}
)

# Return success — sort journeys by first segment's date, then journey_id for readability
result_reservation = copy.deepcopy(reservation)
result_reservation["bookings"].sort(
Expand Down
70 changes: 36 additions & 34 deletions src/eva/metrics/diagnostic/authentication_success.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Authentication success metric - checks if get_reservation was called successfully.
"""Authentication success metric - checks if the session was authenticated correctly.

Debug metric for diagnosing model performance issues, not directly used in
final evaluation scores.
Expand All @@ -11,56 +11,58 @@

@register_metric
class AuthenticationSuccessMetric(CodeMetric):
"""Checks whether the agent successfully authenticated the user via get_reservation.
"""Checks whether the agent successfully authenticated the user.

Looks at tool_responses for entries where tool_name == "get_reservation"
and checks if any had tool_response.status == "success".
Compares the 'session' key in the final scenario database against the
expected session in the ground truth. Authentication is successful if the
final session is a superset of the expected session — i.e., every key-value
pair in expected_session is present in the actual final session.

Score: 1.0 if at least one get_reservation call succeeded, 0.0 otherwise.
Score: 1.0 if final session is a superset of expected session, 0.0 otherwise.

This is a diagnostic metric used for diagnosing model performance issues.
It is not directly used in final evaluation scores.
"""

name = "authentication_success"
description = "Debug metric: checks if get_reservation was called with a successful result"
description = "Checks if session state in final DB is a superset of expected session"
category = "diagnostic"
exclude_from_pass_at_k = True

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute authentication success from tool responses."""
"""Compute authentication success from final scenario database session state."""
try:
tool_responses = context.tool_responses or []

get_reservation_calls = [resp for resp in tool_responses if resp.get("tool_name") == "get_reservation"]

found = len(get_reservation_calls) > 0

success_count = 0
for resp in get_reservation_calls:
tool_response = resp.get("tool_response", {})
if isinstance(tool_response, dict) and tool_response.get("status") == "success":
success_count += 1

score = 1.0 if success_count > 0 else 0.0

# Determine reason for failure
if not found:
reason = "get_reservation tool was never called"
elif success_count == 0:
reason = "get_reservation was called but never returned status success"
else:
reason = "get_reservation called successfully"
expected_session = context.expected_scenario_db.get("session", {})
actual_session = context.final_scenario_db.get("session", {})

if not expected_session:
return MetricScore(
name=self.name,
score=1.0,
normalized_score=1.0,
details={"reason": "No expected session to verify — skipping auth check"},
)

# Check superset: every key-value in expected_session must be in actual_session
mismatches = {
k: {"expected": v, "actual": actual_session.get(k)}
for k, v in expected_session.items()
if actual_session.get(k) != v
}

success = len(mismatches) == 0

return MetricScore(
name=self.name,
score=score,
normalized_score=score,
score=1.0 if success else 0.0,
normalized_score=1.0 if success else 0.0,
details={
"get_reservation_found": found,
"get_reservation_call_count": len(get_reservation_calls),
"get_reservation_success_count": success_count,
"reason": reason,
"expected_session": expected_session,
"actual_session": actual_session,
"mismatches": mismatches,
"reason": "Authentication successful"
if success
else f"Session mismatch on keys: {list(mismatches)}",
},
)

Expand Down
6 changes: 5 additions & 1 deletion src/eva/utils/hash_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,17 @@ def get_dict_hash(obj: dict) -> str:
- Use default=str for non-JSON-serializable types
- Compute SHA-256 hash of the serialized string
The 'session' key is always excluded from hashing — auth success is
tracked separately via the authentication_success metric.
Args:
obj: Dictionary to hash
Returns:
Hexadecimal SHA-256 hash string
"""
normalized = normalize_for_comparison(obj)
obj_for_hash = {k: v for k, v in obj.items() if k != "session"} if isinstance(obj, dict) else obj
normalized = normalize_for_comparison(obj_for_hash)
serialized = json.dumps(normalized, sort_keys=True, default=str, separators=(",", ":"))
return hashlib.sha256(serialized.encode()).hexdigest()

Expand Down
31 changes: 29 additions & 2 deletions tests/unit/airline/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,14 +1068,41 @@ def test_get_reservation_success(sample_db):
assert result["reservation"]["bookings"][1]["status"] == "confirmed"


def test_get_reservation_not_found(sample_db):
"""Test reservation not found."""
def test_get_reservation_writes_session(sample_db):
    """A successful lookup should persist the caller's auth details in db["session"]."""
    get_reservation(
        {"confirmation_number": "abc123", "last_name": "Doe"},
        sample_db,
        call_index=1,
    )

    # Confirmation number is stored uppercased, last name lowercased.
    session = sample_db["session"]
    assert session["confirmation_number"] == "ABC123"
    assert session["last_name"] == "doe"


def test_get_reservation_session_last_name_lowercased(sample_db):
    """The session should normalize last_name to lowercase no matter how it was typed."""
    request = {"confirmation_number": "ABC123", "last_name": "DOE"}
    get_reservation(request, sample_db, call_index=1)

    assert sample_db["session"]["last_name"] == "doe"


def test_get_reservation_failed_auth_does_not_write_session(sample_db):
    """A last-name mismatch must leave the db without any session entry."""
    response = get_reservation(
        {"confirmation_number": "ABC123", "last_name": "Wrong"},
        sample_db,
        call_index=1,
    )

    assert response["status"] == "error"
    # Verification failed, so no auth state may leak into the db.
    assert "session" not in sample_db


def test_get_reservation_not_found_does_not_write_session(sample_db):
    """An unknown confirmation number must not create a session entry."""
    response = get_reservation(
        {"confirmation_number": "XXXXXX", "last_name": "Smith"},
        sample_db,
        call_index=1,
    )

    assert response["status"] == "error"
    assert response["error_type"] == "not_found"
    # Error message should echo the unknown confirmation number back.
    assert "XXXXXX" in response["message"]
    assert "session" not in sample_db


def test_get_flight_status_success(sample_db):
Expand Down
125 changes: 54 additions & 71 deletions tests/unit/metrics/test_authentication_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,98 +13,81 @@ def metric():


@pytest.mark.asyncio
async def test_no_tool_calls(metric):
"""No tool calls at all should return 0.0 with tool not found."""
context = make_metric_context(tool_params=[], tool_responses=[])
result = await metric.compute(context)
async def test_session_matches_expected(metric):
"""Final session matching expected session exactly should score 1.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.normalized_score == 0.0
assert result.details["get_reservation_found"] is False
assert result.details["get_reservation_call_count"] == 0
assert result.details["get_reservation_success_count"] == 0
assert result.score == 1.0
assert result.normalized_score == 1.0
assert result.details["mismatches"] == {}


@pytest.mark.asyncio
async def test_successful_get_reservation(metric):
"""A successful get_reservation call should return 1.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "success", "reservation": {"confirmation_number": "ABC123"}},
},
],
async def test_session_is_superset(metric):
"""Final session with extra keys beyond expected should still score 1.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe", "extra_key": "value"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 1.0
assert result.normalized_score == 1.0
assert result.details["get_reservation_found"] is True
assert result.details["get_reservation_call_count"] == 1
assert result.details["get_reservation_success_count"] == 1
assert result.details["mismatches"] == {}


@pytest.mark.asyncio
async def test_failed_get_reservation(metric):
"""A get_reservation call with error status should return 0.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "error", "error_type": "not_found", "message": "No reservation found"},
},
],
async def test_wrong_confirmation_number(metric):
"""Final session with wrong confirmation number should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "WRONG1", "last_name": "doe"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.details["get_reservation_found"] is True
assert result.details["get_reservation_call_count"] == 1
assert result.details["get_reservation_success_count"] == 0
assert result.normalized_score == 0.0
assert "confirmation_number" in result.details["mismatches"]


@pytest.mark.asyncio
async def test_mixed_calls(metric):
"""One failed and one successful get_reservation plus other tools should return 1.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "error", "error_type": "verification_failed", "message": "Bad last name"},
},
{
"tool_name": "search_rebooking_options",
"tool_response": {"status": "success", "options": []},
},
{
"tool_name": "get_reservation",
"tool_response": {"status": "success", "reservation": {}},
},
],
async def test_wrong_last_name(metric):
"""Final session with wrong last name should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "smith"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 1.0
assert result.details["get_reservation_call_count"] == 2
assert result.details["get_reservation_success_count"] == 1
assert result.score == 0.0
assert "last_name" in result.details["mismatches"]


@pytest.mark.asyncio
async def test_other_tools_only(metric):
"""Only non-get_reservation tools should return 0.0 with tool not found."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{"tool_name": "search_rebooking_options", "tool_response": {"status": "success", "options": []}},
{"tool_name": "rebook_flight", "tool_response": {"status": "success"}},
],
async def test_empty_final_session(metric):
"""No session written (agent never authenticated) should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.details["get_reservation_found"] is False
assert result.details["get_reservation_call_count"] == 0
assert result.details["actual_session"] == {}
assert len(result.details["mismatches"]) == 2


@pytest.mark.asyncio
async def test_no_expected_session(metric):
    """With no expected session in the ground truth, the metric passes trivially."""
    ctx = make_metric_context(expected_scenario_db={}, final_scenario_db={})

    result = await metric.compute(ctx)

    assert result.score == 1.0
    # The reason string documents that the auth check was skipped, not passed.
    assert "skipping" in result.details["reason"]
12 changes: 12 additions & 0 deletions tests/unit/utils/test_hash_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,18 @@ def test_hash_differs_for_key_order_irrelevant(self):
d2 = {"a": 2, "b": 1}
assert get_dict_hash(d1) == get_dict_hash(d2)

def test_session_key_excluded_from_hash(self):
    """Adding a 'session' key must leave the hash unchanged."""
    base_db = {"reservations": {"ABC": {"status": "confirmed"}}}
    # Same db plus a session entry — the hash must ignore it.
    with_session = dict(base_db, session={"confirmation_number": "ABC", "last_name": "doe"})
    assert get_dict_hash(base_db) == get_dict_hash(with_session)

def test_different_sessions_produce_same_hash(self):
    """DBs that differ only in session content should hash identically."""
    first = {"reservations": {}, "session": {"confirmation_number": "AAA", "last_name": "smith"}}
    second = {"reservations": {}, "session": {"confirmation_number": "BBB", "last_name": "jones"}}
    # Session content is excluded from hashing, so both hashes match.
    assert get_dict_hash(first) == get_dict_hash(second)


class TestComputeDbDiff:
def test_no_diff_after_normalization(self):
Expand Down