Skip to content

Commit 992ddd2

Browse files
slister1001 and Copilot committed
fix: track errored/undetermined objectives in red team result counts
Previously, objectives that failed during attack execution or risk categories with zero prepared objectives were silently dropped from the pipeline. The result_counts.errored field always showed 0 because _compute_result_count only counted existing output items. Changes: - _execution_manager.py: Record 0-objective categories as failed in red_team_info instead of silently skipping. Add expected_count to all red_team_info entries to track expected vs actual objectives. - _result_processor.py: Add _extract_expected_total() to compute total expected objectives from red_team_info (de-duplicated by risk category). Pass expected_total to _compute_result_count() which now computes errored as the delta between expected and actual items. Add partial_failure to _determine_run_status failure detection. - test_result_processor_errored.py: 31 new unit tests covering _compute_result_count with expected_total, _extract_expected_total de-duplication logic, and _determine_run_status failure detection. - test_foundry.py: 3 new tests for 0-objective recording and expected_count propagation in FoundryExecutionManager. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 24212d1 commit 992ddd2

File tree

4 files changed

+520
-10
lines changed

4 files changed

+520
-10
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,37 @@ async def execute_attacks(
122122
objectives = objectives_by_risk.get(risk_value, [])
123123

124124
if not objectives:
125-
self.logger.info(f"No objectives for {risk_value}, skipping")
125+
self.logger.info(f"No objectives for {risk_value}, recording as failed")
126+
# Record zero-objective categories for every requested strategy
127+
# so _determine_run_status detects the failure and errored
128+
# counts reflect the gap.
129+
from .._utils.formatting_utils import get_strategy_name
130+
131+
failed_entry = {
132+
"data_file": "",
133+
"status": "failed",
134+
"error": "No attack objectives could be prepared for this risk category",
135+
"asr": 0.0,
136+
"expected_count": 0,
137+
}
138+
foundry_strats, special_strats = StrategyMapper.filter_for_foundry(attack_strategies)
139+
for strategy in foundry_strats:
140+
strategy_key = get_strategy_name(strategy)
141+
if strategy_key not in red_team_info:
142+
red_team_info[strategy_key] = {}
143+
red_team_info[strategy_key][risk_value] = {**failed_entry}
144+
for strategy in special_strats:
145+
flat = strategy if not isinstance(strategy, list) else strategy[0]
146+
if flat != AttackStrategy.Baseline:
147+
strategy_key = get_strategy_name(strategy)
148+
if strategy_key not in red_team_info:
149+
red_team_info[strategy_key] = {}
150+
red_team_info[strategy_key][risk_value] = {**failed_entry}
151+
if include_baseline:
152+
strategy_key = get_strategy_name(AttackStrategy.Baseline)
153+
if strategy_key not in red_team_info:
154+
red_team_info[strategy_key] = {}
155+
red_team_info[strategy_key][risk_value] = {**failed_entry}
126156
continue
127157

128158
self.logger.info(f"Processing {len(objectives)} objectives for {risk_value}")
@@ -186,6 +216,7 @@ async def execute_attacks(
186216
"error": str(e),
187217
"partial_failure": True,
188218
"asr": 0.0,
219+
"expected_count": len(objectives),
189220
}
190221
else:
191222
self.logger.error(f"Error executing attacks for {risk_value}: {e}")
@@ -197,6 +228,7 @@ async def execute_attacks(
197228
"status": "failed",
198229
"error": str(e),
199230
"asr": 0.0,
231+
"expected_count": len(objectives),
200232
}
201233
continue
202234

@@ -223,6 +255,7 @@ async def execute_attacks(
223255
output_path=output_path,
224256
attack_strategies=attack_strategies,
225257
include_baseline=include_baseline,
258+
num_objectives=len(objectives),
226259
)
227260

228261
for strategy_name, strategy_data in strategy_results.items():
@@ -357,6 +390,7 @@ def _group_results_by_strategy(
357390
output_path: str,
358391
attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]],
359392
include_baseline: bool,
393+
num_objectives: int = 0,
360394
) -> Dict[str, Dict[str, Any]]:
361395
"""Group attack results by strategy for red_team_info format.
362396
@@ -375,6 +409,8 @@ def _group_results_by_strategy(
375409
:type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
376410
:param include_baseline: Whether baseline was included in execution
377411
:type include_baseline: bool
412+
:param num_objectives: Number of objectives sent for this risk category
413+
:type num_objectives: int
378414
:return: Dictionary mapping strategy name to result data
379415
:rtype: Dict[str, Dict[str, Any]]
380416
"""
@@ -395,6 +431,7 @@ def _group_results_by_strategy(
395431
"data_file": output_path,
396432
"status": "completed",
397433
"asr": overall_asr,
434+
"expected_count": num_objectives,
398435
}
399436

400437
# Add entries for special strategies that were executed (e.g., IndirectJailbreak via XPIA)
@@ -407,6 +444,7 @@ def _group_results_by_strategy(
407444
"data_file": output_path,
408445
"status": "completed",
409446
"asr": overall_asr,
447+
"expected_count": num_objectives,
410448
}
411449

412450
# Add baseline entry if it was included
@@ -415,6 +453,7 @@ def _group_results_by_strategy(
415453
"data_file": output_path,
416454
"status": "completed",
417455
"asr": overall_asr,
456+
"expected_count": num_objectives,
418457
}
419458

420459
# Fallback if no strategies produced results
@@ -423,6 +462,7 @@ def _group_results_by_strategy(
423462
"data_file": output_path,
424463
"status": "completed",
425464
"asr": overall_asr,
465+
"expected_count": num_objectives,
426466
}
427467

428468
return results

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,16 +1424,55 @@ def _format_thresholds_for_output(self) -> Dict[str, Any]:
14241424
return formatted_thresholds
14251425

14261426
@staticmethod
1427-
def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
1427+
def _extract_expected_total(red_team_info: Optional[Dict]) -> Optional[int]:
1428+
"""Extract the total expected objective count from red_team_info.
1429+
1430+
Each strategy/risk-category entry may carry an ``expected_count`` field
1431+
set by the execution layer. Since the same risk category can appear
1432+
under multiple strategies (e.g. baseline + crescendo), we de-duplicate
1433+
by risk category to avoid double-counting.
1434+
1435+
:param red_team_info: The red_team_info tracking dictionary
1436+
:return: Total expected objectives, or None if information is unavailable
1437+
"""
1438+
if not isinstance(red_team_info, dict):
1439+
return None
1440+
1441+
# Collect the max expected_count per risk category across strategies
1442+
per_risk: Dict[str, int] = {}
1443+
for risk_data in red_team_info.values():
1444+
if not isinstance(risk_data, dict):
1445+
continue
1446+
for risk_category, details in risk_data.items():
1447+
if not isinstance(details, dict):
1448+
continue
1449+
count = details.get("expected_count")
1450+
if count is not None:
1451+
try:
1452+
per_risk[risk_category] = max(per_risk.get(risk_category, 0), int(count))
1453+
except (ValueError, TypeError):
1454+
continue
1455+
1456+
if not per_risk:
1457+
return None
1458+
1459+
return sum(per_risk.values())
1460+
1461+
@staticmethod
1462+
def _compute_result_count(
1463+
output_items: List[Dict[str, Any]],
1464+
expected_total: Optional[int] = None,
1465+
) -> Dict[str, int]:
14281466
"""Aggregate run-level pass/fail counts from individual output items.
14291467
14301468
Counts reflect attack success rate (ASR) semantics:
14311469
- passed: attacks that were unsuccessful (system defended successfully)
14321470
- failed: attacks that were successful (system was compromised)
1433-
- errored: rows that failed to process due to errors
1471+
- errored: rows that failed to process due to errors, plus any
1472+
objectives that never produced output items (expected vs actual delta)
14341473
"""
14351474

1436-
total = len(output_items)
1475+
actual_total = len(output_items)
14371476
passed = failed = errored = 0
14381477

14391478
for item in output_items:
@@ -1474,6 +1513,15 @@ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
14741513
else:
14751514
errored += 1
14761515

1516+
# Account for objectives that never produced output items (e.g., attack
1517+
# execution errors that were caught and swallowed, or risk categories
1518+
# with zero objectives prepared).
1519+
if expected_total is not None and expected_total > actual_total:
1520+
missing = expected_total - actual_total
1521+
errored += missing
1522+
1523+
total = expected_total if expected_total is not None and expected_total > actual_total else actual_total
1524+
14771525
return {
14781526
"total": total,
14791527
"passed": passed,
@@ -1675,7 +1723,7 @@ def _determine_run_status(
16751723
if not isinstance(details, dict):
16761724
continue
16771725
status = details.get("status", "").lower()
1678-
if status in ("incomplete", "failed", "timeout", "pending", "running"):
1726+
if status in ("incomplete", "failed", "timeout", "pending", "running", "partial_failure"):
16791727
return "failed"
16801728

16811729
return "completed"
@@ -1769,7 +1817,10 @@ def _build_results_payload(
17691817
if run_name is None:
17701818
run_name = scan_name or f"redteam-run-{run_id[:8]}"
17711819

1772-
result_count = self._compute_result_count(output_items)
1820+
result_count = self._compute_result_count(
1821+
output_items,
1822+
expected_total=self._extract_expected_total(red_team_info),
1823+
)
17731824
per_testing_results = self._compute_per_testing_criteria(output_items)
17741825
data_source = self._build_data_source_section(parameters, red_team_info)
17751826
status = self._determine_run_status(scan_result, red_team_info, output_items)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1749,7 +1749,7 @@ def test_group_results_by_strategy_with_indirect_jailbreak(
17491749

17501750
@pytest.mark.asyncio
17511751
async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azure_ai_project, mock_logger):
1752-
"""Test execute_attacks with no objectives."""
1752+
"""Test execute_attacks with no objectives for any risk category."""
17531753
manager = FoundryExecutionManager(
17541754
credential=mock_credential,
17551755
azure_ai_project=mock_azure_ai_project,
@@ -1762,12 +1762,103 @@ async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azur
17621762
result = await manager.execute_attacks(
17631763
objective_target=mock_target,
17641764
risk_categories=[RiskCategory.Violence],
1765-
attack_strategies=[AttackStrategy.Base64],
1765+
attack_strategies=[AttackStrategy.Baseline],
17661766
objectives_by_risk={}, # No objectives
17671767
)
17681768

1769-
# Should return empty dict when no objectives
1770-
assert result == {}
1769+
# When no objectives are available at all the category should be
1770+
# recorded as failed in red_team_info so _determine_run_status can
1771+
# detect the gap.
1772+
assert "baseline" in result
1773+
assert "violence" in result["baseline"]
1774+
entry = result["baseline"]["violence"]
1775+
assert entry["status"] == "failed"
1776+
assert entry["expected_count"] == 0
1777+
assert "error" in entry
1778+
1779+
@pytest.mark.asyncio
1780+
async def test_execute_attacks_zero_objectives_records_failed(
1781+
self, mock_credential, mock_azure_ai_project, mock_logger
1782+
):
1783+
"""When a risk category has zero objectives, it should be recorded as failed
1784+
in red_team_info so that _determine_run_status marks the run as failed."""
1785+
manager = FoundryExecutionManager(
1786+
credential=mock_credential,
1787+
azure_ai_project=mock_azure_ai_project,
1788+
logger=mock_logger,
1789+
output_dir="/test/output",
1790+
)
1791+
1792+
mock_target = MagicMock()
1793+
1794+
result = await manager.execute_attacks(
1795+
objective_target=mock_target,
1796+
risk_categories=[RiskCategory.Violence, RiskCategory.SelfHarm],
1797+
attack_strategies=[AttackStrategy.Baseline],
1798+
objectives_by_risk={
1799+
"violence": [], # explicitly empty
1800+
# self_harm not present at all
1801+
},
1802+
)
1803+
1804+
# Both risk categories should be recorded as failed
1805+
assert "baseline" in result
1806+
assert result["baseline"]["violence"]["status"] == "failed"
1807+
assert result["baseline"]["self_harm"]["status"] == "failed"
1808+
1809+
@pytest.mark.asyncio
1810+
async def test_execute_attacks_zero_objectives_records_all_strategies(
1811+
self, mock_credential, mock_azure_ai_project, mock_logger
1812+
):
1813+
"""When a risk category has zero objectives with multiple strategies,
1814+
failed entries should be created for every strategy, not just baseline."""
1815+
manager = FoundryExecutionManager(
1816+
credential=mock_credential,
1817+
azure_ai_project=mock_azure_ai_project,
1818+
logger=mock_logger,
1819+
output_dir="/test/output",
1820+
)
1821+
1822+
mock_target = MagicMock()
1823+
1824+
result = await manager.execute_attacks(
1825+
objective_target=mock_target,
1826+
risk_categories=[RiskCategory.Violence],
1827+
attack_strategies=[AttackStrategy.Base64, AttackStrategy.Baseline],
1828+
objectives_by_risk={}, # No objectives
1829+
)
1830+
1831+
# Both base64 and baseline strategies should have a failed entry
1832+
assert "base64" in result
1833+
assert result["base64"]["violence"]["status"] == "failed"
1834+
assert "baseline" in result
1835+
assert result["baseline"]["violence"]["status"] == "failed"
1836+
1837+
def test_group_results_by_strategy_includes_expected_count(
1838+
self, mock_credential, mock_azure_ai_project, mock_logger
1839+
):
1840+
"""Verify _group_results_by_strategy includes expected_count in entries."""
1841+
manager = FoundryExecutionManager(
1842+
credential=mock_credential,
1843+
azure_ai_project=mock_azure_ai_project,
1844+
logger=mock_logger,
1845+
output_dir="/test/output",
1846+
)
1847+
1848+
mock_orchestrator = MagicMock()
1849+
mock_orchestrator.calculate_asr.return_value = 0.5
1850+
1851+
results = manager._group_results_by_strategy(
1852+
orchestrator=mock_orchestrator,
1853+
risk_value="violence",
1854+
output_path="/test/output.jsonl",
1855+
attack_strategies=[AttackStrategy.Baseline],
1856+
include_baseline=True,
1857+
num_objectives=32,
1858+
)
1859+
1860+
assert "baseline" in results
1861+
assert results["baseline"]["expected_count"] == 32
17711862

17721863
@pytest.mark.asyncio
17731864
async def test_execute_attacks_filters_multi_turn_without_adversarial(

0 commit comments

Comments (0)