Skip to content

Commit bc36e7c

Browse files
slister1001 and Copilot
committed
fix: track errored/undetermined objectives in red team result counts
Previously, objectives that failed during attack execution or risk categories with zero prepared objectives were silently dropped from the pipeline. The result_counts.errored field always showed 0 because _compute_result_count only counted existing output items.

Changes:
- _execution_manager.py: Record 0-objective categories as failed in red_team_info instead of silently skipping. Add expected_count to all red_team_info entries to track expected vs actual objectives.
- _result_processor.py: Add _extract_expected_total() to compute total expected objectives from red_team_info (de-duplicated by risk category). Pass expected_total to _compute_result_count() which now computes errored as the delta between expected and actual items. Add partial_failure to _determine_run_status failure detection.
- test_result_processor_errored.py: 31 new unit tests covering _compute_result_count with expected_total, _extract_expected_total de-duplication logic, and _determine_run_status failure detection.
- test_foundry.py: 3 new tests for 0-objective recording and expected_count propagation in FoundryExecutionManager.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 24212d1 commit bc36e7c

File tree

4 files changed

+464
-10
lines changed

4 files changed

+464
-10
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,22 @@ async def execute_attacks(
122122
objectives = objectives_by_risk.get(risk_value, [])
123123

124124
if not objectives:
125-
self.logger.info(f"No objectives for {risk_value}, skipping")
125+
self.logger.info(f"No objectives for {risk_value}, recording as failed")
126+
# Record zero-objective categories so _determine_run_status
127+
# detects the failure and errored counts reflect the gap.
128+
from .._utils.formatting_utils import get_strategy_name
129+
130+
if include_baseline:
131+
strategy_key = get_strategy_name(AttackStrategy.Baseline)
132+
if strategy_key not in red_team_info:
133+
red_team_info[strategy_key] = {}
134+
red_team_info[strategy_key][risk_value] = {
135+
"data_file": "",
136+
"status": "failed",
137+
"error": "No attack objectives could be prepared for this risk category",
138+
"asr": 0.0,
139+
"expected_count": 0,
140+
}
126141
continue
127142

128143
self.logger.info(f"Processing {len(objectives)} objectives for {risk_value}")
@@ -186,6 +201,7 @@ async def execute_attacks(
186201
"error": str(e),
187202
"partial_failure": True,
188203
"asr": 0.0,
204+
"expected_count": len(objectives),
189205
}
190206
else:
191207
self.logger.error(f"Error executing attacks for {risk_value}: {e}")
@@ -197,6 +213,7 @@ async def execute_attacks(
197213
"status": "failed",
198214
"error": str(e),
199215
"asr": 0.0,
216+
"expected_count": len(objectives),
200217
}
201218
continue
202219

@@ -223,6 +240,7 @@ async def execute_attacks(
223240
output_path=output_path,
224241
attack_strategies=attack_strategies,
225242
include_baseline=include_baseline,
243+
num_objectives=len(objectives),
226244
)
227245

228246
for strategy_name, strategy_data in strategy_results.items():
@@ -357,6 +375,7 @@ def _group_results_by_strategy(
357375
output_path: str,
358376
attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]],
359377
include_baseline: bool,
378+
num_objectives: int = 0,
360379
) -> Dict[str, Dict[str, Any]]:
361380
"""Group attack results by strategy for red_team_info format.
362381
@@ -375,6 +394,8 @@ def _group_results_by_strategy(
375394
:type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
376395
:param include_baseline: Whether baseline was included in execution
377396
:type include_baseline: bool
397+
:param num_objectives: Number of objectives sent for this risk category
398+
:type num_objectives: int
378399
:return: Dictionary mapping strategy name to result data
379400
:rtype: Dict[str, Dict[str, Any]]
380401
"""
@@ -395,6 +416,7 @@ def _group_results_by_strategy(
395416
"data_file": output_path,
396417
"status": "completed",
397418
"asr": overall_asr,
419+
"expected_count": num_objectives,
398420
}
399421

400422
# Add entries for special strategies that were executed (e.g., IndirectJailbreak via XPIA)
@@ -407,6 +429,7 @@ def _group_results_by_strategy(
407429
"data_file": output_path,
408430
"status": "completed",
409431
"asr": overall_asr,
432+
"expected_count": num_objectives,
410433
}
411434

412435
# Add baseline entry if it was included
@@ -415,6 +438,7 @@ def _group_results_by_strategy(
415438
"data_file": output_path,
416439
"status": "completed",
417440
"asr": overall_asr,
441+
"expected_count": num_objectives,
418442
}
419443

420444
# Fallback if no strategies produced results
@@ -423,6 +447,7 @@ def _group_results_by_strategy(
423447
"data_file": output_path,
424448
"status": "completed",
425449
"asr": overall_asr,
450+
"expected_count": num_objectives,
426451
}
427452

428453
return results

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 53 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1424,16 +1424,52 @@ def _format_thresholds_for_output(self) -> Dict[str, Any]:
14241424
return formatted_thresholds
14251425

14261426
@staticmethod
1427-
def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
1427+
def _extract_expected_total(red_team_info: Optional[Dict]) -> Optional[int]:
1428+
"""Extract the total expected objective count from red_team_info.
1429+
1430+
Each strategy/risk-category entry may carry an ``expected_count`` field
1431+
set by the execution layer. Since the same risk category can appear
1432+
under multiple strategies (e.g. baseline + crescendo), we de-duplicate
1433+
by risk category to avoid double-counting.
1434+
1435+
:param red_team_info: The red_team_info tracking dictionary
1436+
:return: Total expected objectives, or None if information is unavailable
1437+
"""
1438+
if not isinstance(red_team_info, dict):
1439+
return None
1440+
1441+
# Collect the max expected_count per risk category across strategies
1442+
per_risk: Dict[str, int] = {}
1443+
for risk_data in red_team_info.values():
1444+
if not isinstance(risk_data, dict):
1445+
continue
1446+
for risk_category, details in risk_data.items():
1447+
if not isinstance(details, dict):
1448+
continue
1449+
count = details.get("expected_count")
1450+
if count is not None:
1451+
per_risk[risk_category] = max(per_risk.get(risk_category, 0), int(count))
1452+
1453+
if not per_risk:
1454+
return None
1455+
1456+
return sum(per_risk.values())
1457+
1458+
@staticmethod
1459+
def _compute_result_count(
1460+
output_items: List[Dict[str, Any]],
1461+
expected_total: Optional[int] = None,
1462+
) -> Dict[str, int]:
14281463
"""Aggregate run-level pass/fail counts from individual output items.
14291464
14301465
Counts reflect attack success rate (ASR) semantics:
14311466
- passed: attacks that were unsuccessful (system defended successfully)
14321467
- failed: attacks that were successful (system was compromised)
1433-
- errored: rows that failed to process due to errors
1468+
- errored: rows that failed to process due to errors, plus any
1469+
objectives that never produced output items (expected vs actual delta)
14341470
"""
14351471

1436-
total = len(output_items)
1472+
actual_total = len(output_items)
14371473
passed = failed = errored = 0
14381474

14391475
for item in output_items:
@@ -1474,6 +1510,15 @@ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
14741510
else:
14751511
errored += 1
14761512

1513+
# Account for objectives that never produced output items (e.g., attack
1514+
# execution errors that were caught and swallowed, or risk categories
1515+
# with zero objectives prepared).
1516+
if expected_total is not None and expected_total > actual_total:
1517+
missing = expected_total - actual_total
1518+
errored += missing
1519+
1520+
total = expected_total if expected_total is not None and expected_total > actual_total else actual_total
1521+
14771522
return {
14781523
"total": total,
14791524
"passed": passed,
@@ -1675,7 +1720,7 @@ def _determine_run_status(
16751720
if not isinstance(details, dict):
16761721
continue
16771722
status = details.get("status", "").lower()
1678-
if status in ("incomplete", "failed", "timeout", "pending", "running"):
1723+
if status in ("incomplete", "failed", "timeout", "pending", "running", "partial_failure"):
16791724
return "failed"
16801725

16811726
return "completed"
@@ -1769,7 +1814,10 @@ def _build_results_payload(
17691814
if run_name is None:
17701815
run_name = scan_name or f"redteam-run-{run_id[:8]}"
17711816

1772-
result_count = self._compute_result_count(output_items)
1817+
result_count = self._compute_result_count(
1818+
output_items,
1819+
expected_total=self._extract_expected_total(red_team_info),
1820+
)
17731821
per_testing_results = self._compute_per_testing_criteria(output_items)
17741822
data_source = self._build_data_source_section(parameters, red_team_info)
17751823
status = self._determine_run_status(scan_result, red_team_info, output_items)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py

Lines changed: 67 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1749,7 +1749,7 @@ def test_group_results_by_strategy_with_indirect_jailbreak(
17491749

17501750
@pytest.mark.asyncio
17511751
async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azure_ai_project, mock_logger):
1752-
"""Test execute_attacks with no objectives."""
1752+
"""Test execute_attacks with no objectives for any risk category."""
17531753
manager = FoundryExecutionManager(
17541754
credential=mock_credential,
17551755
azure_ai_project=mock_azure_ai_project,
@@ -1762,12 +1762,75 @@ async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azur
17621762
result = await manager.execute_attacks(
17631763
objective_target=mock_target,
17641764
risk_categories=[RiskCategory.Violence],
1765-
attack_strategies=[AttackStrategy.Base64],
1765+
attack_strategies=[AttackStrategy.Baseline],
17661766
objectives_by_risk={}, # No objectives
17671767
)
17681768

1769-
# Should return empty dict when no objectives
1770-
assert result == {}
1769+
# When no objectives are available at all the category should be
1770+
# recorded as failed in red_team_info so _determine_run_status can
1771+
# detect the gap.
1772+
assert "baseline" in result
1773+
assert "violence" in result["baseline"]
1774+
entry = result["baseline"]["violence"]
1775+
assert entry["status"] == "failed"
1776+
assert entry["expected_count"] == 0
1777+
assert "error" in entry
1778+
1779+
@pytest.mark.asyncio
1780+
async def test_execute_attacks_zero_objectives_records_failed(
1781+
self, mock_credential, mock_azure_ai_project, mock_logger
1782+
):
1783+
"""When a risk category has zero objectives, it should be recorded as failed
1784+
in red_team_info so that _determine_run_status marks the run as failed."""
1785+
manager = FoundryExecutionManager(
1786+
credential=mock_credential,
1787+
azure_ai_project=mock_azure_ai_project,
1788+
logger=mock_logger,
1789+
output_dir="/test/output",
1790+
)
1791+
1792+
mock_target = MagicMock()
1793+
1794+
result = await manager.execute_attacks(
1795+
objective_target=mock_target,
1796+
risk_categories=[RiskCategory.Violence, RiskCategory.SelfHarm],
1797+
attack_strategies=[AttackStrategy.Baseline],
1798+
objectives_by_risk={
1799+
"violence": [], # explicitly empty
1800+
# self_harm not present at all
1801+
},
1802+
)
1803+
1804+
# Both risk categories should be recorded as failed
1805+
assert "baseline" in result
1806+
assert result["baseline"]["violence"]["status"] == "failed"
1807+
assert result["baseline"]["self_harm"]["status"] == "failed"
1808+
1809+
def test_group_results_by_strategy_includes_expected_count(
1810+
self, mock_credential, mock_azure_ai_project, mock_logger
1811+
):
1812+
"""Verify _group_results_by_strategy includes expected_count in entries."""
1813+
manager = FoundryExecutionManager(
1814+
credential=mock_credential,
1815+
azure_ai_project=mock_azure_ai_project,
1816+
logger=mock_logger,
1817+
output_dir="/test/output",
1818+
)
1819+
1820+
mock_orchestrator = MagicMock()
1821+
mock_orchestrator.calculate_asr.return_value = 0.5
1822+
1823+
results = manager._group_results_by_strategy(
1824+
orchestrator=mock_orchestrator,
1825+
risk_value="violence",
1826+
output_path="/test/output.jsonl",
1827+
attack_strategies=[AttackStrategy.Baseline],
1828+
include_baseline=True,
1829+
num_objectives=32,
1830+
)
1831+
1832+
assert "baseline" in results
1833+
assert results["baseline"]["expected_count"] == 32
17711834

17721835
@pytest.mark.asyncio
17731836
async def test_execute_attacks_filters_multi_turn_without_adversarial(

0 commit comments

Comments (0)