Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 50 additions & 50 deletions data/airline_dataset.jsonl

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/eva/assistant/tools/airline_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,14 @@ def get_reservation(params: dict, db: dict, call_index: int) -> dict:
"message": f"Last name does not match reservation {confirmation_number}",
}

# Write session data to mark successful authentication
db.setdefault("session", {}).update(
{
"confirmation_number": confirmation_number,
"last_name": last_name.lower() if last_name else "",
}
)

# Return success — sort journeys by first segment's date, then journey_id for readability
result_reservation = copy.deepcopy(reservation)
result_reservation["bookings"].sort(
Expand Down
70 changes: 36 additions & 34 deletions src/eva/metrics/diagnostic/authentication_success.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Authentication success metric - checks if get_reservation was called successfully.
"""Authentication success metric - checks if the session was authenticated correctly.

Debug metric for diagnosing model performance issues, not directly used in
final evaluation scores.
Expand All @@ -11,56 +11,58 @@

@register_metric
class AuthenticationSuccessMetric(CodeMetric):
"""Checks whether the agent successfully authenticated the user via get_reservation.
"""Checks whether the agent successfully authenticated the user.

Looks at tool_responses for entries where tool_name == "get_reservation"
and checks if any had tool_response.status == "success".
Compares the 'session' key in the final scenario database against the
expected session in the ground truth. Authentication is successful if the
final session is a superset of the expected session — i.e., every key-value
pair in expected_session is present in the actual final session.

Score: 1.0 if at least one get_reservation call succeeded, 0.0 otherwise.
Score: 1.0 if final session is a superset of expected session, 0.0 otherwise.

This is a diagnostic metric used for diagnosing model performance issues.
It is not directly used in final evaluation scores.
"""

name = "authentication_success"
description = "Debug metric: checks if get_reservation was called with a successful result"
description = "Checks if session state in final DB is a superset of expected session"
category = "diagnostic"
exclude_from_pass_at_k = True

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute authentication success from tool responses."""
"""Compute authentication success from final scenario database session state."""
try:
tool_responses = context.tool_responses or []

get_reservation_calls = [resp for resp in tool_responses if resp.get("tool_name") == "get_reservation"]

found = len(get_reservation_calls) > 0

success_count = 0
for resp in get_reservation_calls:
tool_response = resp.get("tool_response", {})
if isinstance(tool_response, dict) and tool_response.get("status") == "success":
success_count += 1

score = 1.0 if success_count > 0 else 0.0

# Determine reason for failure
if not found:
reason = "get_reservation tool was never called"
elif success_count == 0:
reason = "get_reservation was called but never returned status success"
else:
reason = "get_reservation called successfully"
expected_session = context.expected_scenario_db.get("session", {})
actual_session = context.final_scenario_db.get("session", {})

if not expected_session:
return MetricScore(
name=self.name,
score=1.0,
normalized_score=1.0,
details={"reason": "No expected session to verify — skipping auth check"},
)

# Check superset: every key-value in expected_session must be in actual_session
mismatches = {
k: {"expected": v, "actual": actual_session.get(k)}
for k, v in expected_session.items()
if actual_session.get(k) != v
}

success = len(mismatches) == 0

return MetricScore(
name=self.name,
score=score,
normalized_score=score,
score=1.0 if success else 0.0,
normalized_score=1.0 if success else 0.0,
details={
"get_reservation_found": found,
"get_reservation_call_count": len(get_reservation_calls),
"get_reservation_success_count": success_count,
"reason": reason,
"expected_session": expected_session,
"actual_session": actual_session,
"mismatches": mismatches,
"reason": "Authentication successful"
if success
else f"Session mismatch on keys: {list(mismatches)}",
},
)

Expand Down
6 changes: 5 additions & 1 deletion src/eva/utils/hash_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,17 @@ def get_dict_hash(obj: dict) -> str:
- Use default=str for non-JSON-serializable types
- Compute SHA-256 hash of the serialized string
The 'session' key is always excluded from hashing — auth success is
tracked separately via the authentication_success metric.
Args:
obj: Dictionary to hash
Returns:
Hexadecimal SHA-256 hash string
"""
normalized = normalize_for_comparison(obj)
obj_for_hash = {k: v for k, v in obj.items() if k != "session"} if isinstance(obj, dict) else obj
normalized = normalize_for_comparison(obj_for_hash)
serialized = json.dumps(normalized, sort_keys=True, default=str, separators=(",", ":"))
return hashlib.sha256(serialized.encode()).hexdigest()

Expand Down
31 changes: 29 additions & 2 deletions tests/unit/airline/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,14 +1068,41 @@ def test_get_reservation_success(sample_db):
assert result["reservation"]["bookings"][1]["status"] == "confirmed"


def test_get_reservation_not_found(sample_db):
"""Test reservation not found."""
def test_get_reservation_writes_session(sample_db):
    """A successful lookup should persist the caller's auth details in db["session"]."""
    get_reservation(
        {"confirmation_number": "abc123", "last_name": "Doe"},
        sample_db,
        call_index=1,
    )

    # Confirmation number is stored uppercased, last name lowercased.
    session = sample_db["session"]
    assert session["confirmation_number"] == "ABC123"
    assert session["last_name"] == "doe"


def test_get_reservation_session_last_name_lowercased(sample_db):
    """The session should normalize last_name to lowercase no matter how it was typed."""
    request = {"confirmation_number": "ABC123", "last_name": "DOE"}
    get_reservation(request, sample_db, call_index=1)

    assert sample_db["session"]["last_name"] == "doe"


def test_get_reservation_failed_auth_does_not_write_session(sample_db):
    """A last-name mismatch must leave the db without any session entry."""
    response = get_reservation(
        {"confirmation_number": "ABC123", "last_name": "Wrong"},
        sample_db,
        call_index=1,
    )

    assert response["status"] == "error"
    # Verification failed, so no auth state may leak into the db.
    assert "session" not in sample_db


def test_get_reservation_not_found_does_not_write_session(sample_db):
    """An unknown confirmation number must not create a session entry."""
    response = get_reservation(
        {"confirmation_number": "XXXXXX", "last_name": "Smith"},
        sample_db,
        call_index=1,
    )

    assert response["status"] == "error"
    assert response["error_type"] == "not_found"
    # Error message should echo the unknown confirmation number back.
    assert "XXXXXX" in response["message"]
    assert "session" not in sample_db


def test_get_flight_status_success(sample_db):
Expand Down
125 changes: 54 additions & 71 deletions tests/unit/metrics/test_authentication_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,98 +13,81 @@ def metric():


@pytest.mark.asyncio
async def test_no_tool_calls(metric):
"""No tool calls at all should return 0.0 with tool not found."""
context = make_metric_context(tool_params=[], tool_responses=[])
result = await metric.compute(context)
async def test_session_matches_expected(metric):
"""Final session matching expected session exactly should score 1.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.normalized_score == 0.0
assert result.details["get_reservation_found"] is False
assert result.details["get_reservation_call_count"] == 0
assert result.details["get_reservation_success_count"] == 0
assert result.score == 1.0
assert result.normalized_score == 1.0
assert result.details["mismatches"] == {}


@pytest.mark.asyncio
async def test_successful_get_reservation(metric):
"""A successful get_reservation call should return 1.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "success", "reservation": {"confirmation_number": "ABC123"}},
},
],
async def test_session_is_superset(metric):
"""Final session with extra keys beyond expected should still score 1.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe", "extra_key": "value"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 1.0
assert result.normalized_score == 1.0
assert result.details["get_reservation_found"] is True
assert result.details["get_reservation_call_count"] == 1
assert result.details["get_reservation_success_count"] == 1
assert result.details["mismatches"] == {}


@pytest.mark.asyncio
async def test_failed_get_reservation(metric):
"""A get_reservation call with error status should return 0.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "error", "error_type": "not_found", "message": "No reservation found"},
},
],
async def test_wrong_confirmation_number(metric):
"""Final session with wrong confirmation number should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "WRONG1", "last_name": "doe"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.details["get_reservation_found"] is True
assert result.details["get_reservation_call_count"] == 1
assert result.details["get_reservation_success_count"] == 0
assert result.normalized_score == 0.0
assert "confirmation_number" in result.details["mismatches"]


@pytest.mark.asyncio
async def test_mixed_calls(metric):
"""One failed and one successful get_reservation plus other tools should return 1.0."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{
"tool_name": "get_reservation",
"tool_response": {"status": "error", "error_type": "verification_failed", "message": "Bad last name"},
},
{
"tool_name": "search_rebooking_options",
"tool_response": {"status": "success", "options": []},
},
{
"tool_name": "get_reservation",
"tool_response": {"status": "success", "reservation": {}},
},
],
async def test_wrong_last_name(metric):
"""Final session with wrong last name should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "smith"}},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 1.0
assert result.details["get_reservation_call_count"] == 2
assert result.details["get_reservation_success_count"] == 1
assert result.score == 0.0
assert "last_name" in result.details["mismatches"]


@pytest.mark.asyncio
async def test_other_tools_only(metric):
"""Only non-get_reservation tools should return 0.0 with tool not found."""
context = make_metric_context(
tool_params=[],
tool_responses=[
{"tool_name": "search_rebooking_options", "tool_response": {"status": "success", "options": []}},
{"tool_name": "rebook_flight", "tool_response": {"status": "success"}},
],
async def test_empty_final_session(metric):
"""No session written (agent never authenticated) should score 0.0."""
ctx = make_metric_context(
expected_scenario_db={"session": {"confirmation_number": "ABC123", "last_name": "doe"}},
final_scenario_db={},
)
result = await metric.compute(context)
result = await metric.compute(ctx)

assert result.score == 0.0
assert result.details["get_reservation_found"] is False
assert result.details["get_reservation_call_count"] == 0
assert result.details["actual_session"] == {}
assert len(result.details["mismatches"]) == 2


@pytest.mark.asyncio
async def test_no_expected_session(metric):
    """With no expected session in the ground truth, the metric passes trivially."""
    ctx = make_metric_context(expected_scenario_db={}, final_scenario_db={})

    result = await metric.compute(ctx)

    assert result.score == 1.0
    # The reason string documents that the auth check was skipped, not passed.
    assert "skipping" in result.details["reason"]
12 changes: 12 additions & 0 deletions tests/unit/utils/test_hash_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,18 @@ def test_hash_differs_for_key_order_irrelevant(self):
d2 = {"a": 2, "b": 1}
assert get_dict_hash(d1) == get_dict_hash(d2)

def test_session_key_excluded_from_hash(self):
    """Adding a 'session' key must leave the hash unchanged."""
    base_db = {"reservations": {"ABC": {"status": "confirmed"}}}
    # Same db plus a session entry — the hash must ignore it.
    with_session = dict(base_db, session={"confirmation_number": "ABC", "last_name": "doe"})
    assert get_dict_hash(base_db) == get_dict_hash(with_session)

def test_different_sessions_produce_same_hash(self):
    """DBs that differ only in session content should hash identically."""
    first = {"reservations": {}, "session": {"confirmation_number": "AAA", "last_name": "smith"}}
    second = {"reservations": {}, "session": {"confirmation_number": "BBB", "last_name": "jones"}}
    # Session content is excluded from hashing, so both hashes match.
    assert get_dict_hash(first) == get_dict_hash(second)


class TestComputeDbDiff:
def test_no_diff_after_normalization(self):
Expand Down