4 changes: 4 additions & 0 deletions pyrit/scenario/__init__.py
@@ -30,15 +30,18 @@
# This allows: from pyrit.scenario.airt import ContentHarms
# without needing separate pyrit/scenario/airt/ directories
from pyrit.scenario.scenarios import airt as _airt_module
from pyrit.scenario.scenarios import benchmark as _benchmark_module
from pyrit.scenario.scenarios import foundry as _foundry_module
from pyrit.scenario.scenarios import garak as _garak_module

sys.modules["pyrit.scenario.airt"] = _airt_module
sys.modules["pyrit.scenario.benchmark"] = _benchmark_module
sys.modules["pyrit.scenario.garak"] = _garak_module
sys.modules["pyrit.scenario.foundry"] = _foundry_module

# Also expose as attributes for IDE support
airt = _airt_module
benchmark = _benchmark_module
garak = _garak_module
foundry = _foundry_module

@@ -53,6 +56,7 @@
"ScenarioIdentifier",
"ScenarioResult",
"airt",
"benchmark",
"garak",
"foundry",
]
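
For context on the aliasing above, a minimal sketch of what it enables once pyrit.scenario has been imported (import paths taken from this diff):

import sys

import pyrit.scenario  # executing __init__ installs the sys.modules aliases

# The flat alias and the real nested path now resolve to the same module:
from pyrit.scenario.benchmark import Benchmark
from pyrit.scenario.scenarios.benchmark import Benchmark as RealBenchmark
assert Benchmark is RealBenchmark

# The module is also reachable as a plain attribute, which is what gives
# IDEs something to autocomplete:
assert pyrit.scenario.benchmark is sys.modules["pyrit.scenario.benchmark"]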
29 changes: 29 additions & 0 deletions pyrit/scenario/scenarios/benchmark/__init__.py
@@ -0,0 +1,29 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Benchmark scenario classes."""

from typing import Any

from pyrit.scenario.scenarios.benchmark.benchmark import Benchmark


def __getattr__(name: str) -> Any:
"""
Lazily resolve the dynamic BenchmarkStrategy class.

Returns:
Any: The resolved strategy class.

Raises:
AttributeError: If the attribute name is not recognized.
"""
if name == "BenchmarkStrategy":
return Benchmark.get_strategy_class()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = [
"Benchmark",
"BenchmarkStrategy",
]
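
The module-level __getattr__ above is the PEP 562 hook: BenchmarkStrategy is not a real name in the module's namespace, so looking it up falls through to the hook, which defers building the enum until first use. A small sketch of the observable behavior (names taken from this diff):

import pyrit.scenario.scenarios.benchmark as bench

# "Benchmark" is a normal module attribute; the hook is never consulted.
cls = bench.Benchmark

# "BenchmarkStrategy" is absent from the module dict, so Python invokes
# the module-level __getattr__, which defers to Benchmark.get_strategy_class().
strategy_cls = bench.BenchmarkStrategy
assert strategy_cls is bench.Benchmark.get_strategy_class()  # cached, same object

# Any other unknown name still raises AttributeError, as the hook specifies.
try:
    bench.NoSuchName
except AttributeError:
    pass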
232 changes: 232 additions & 0 deletions pyrit/scenario/scenarios/benchmark/benchmark.py
@@ -0,0 +1,232 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
Benchmark scenario: compare adversarial-model attack success rate (ASR) across attack techniques.

Strategies are built dynamically by filtering ``SCENARIO_TECHNIQUES`` to those
that accept an adversarial chat model but don't have one baked in. The
constructor takes a ``dict[str, PromptChatTarget]`` mapping user-chosen labels
to adversarial targets. At attack-creation time each model is injected via
``attack_adversarial_config_override``, producing a technique × model × dataset
cross-product for side-by-side comparison.

New adversarial techniques added to ``SCENARIO_TECHNIQUES`` are automatically
discovered — no changes to this module needed.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, ClassVar, cast

from pyrit.common import apply_defaults
from pyrit.registry.object_registries.attack_technique_registry import AttackTechniqueRegistry, AttackTechniqueSpec
from pyrit.registry.tag_query import TagQuery
from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.scenario.core.scenario_techniques import SCENARIO_TECHNIQUES

if TYPE_CHECKING:
from pyrit.prompt_target import PromptChatTarget
from pyrit.scenario.core.scenario_strategy import ScenarioStrategy
from pyrit.score import TrueFalseScorer

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Dynamic technique filter — auto-discover adversarial-capable techniques
# ---------------------------------------------------------------------------


def _get_benchmarkable_specs() -> list[AttackTechniqueSpec]:
"""
Return techniques from ``SCENARIO_TECHNIQUES`` that accept an adversarial
model but don't have one already baked in.

This is the dual guard: ``_accepts_adversarial`` ensures the technique
CAN use an adversarial model, and ``adversarial_chat is None`` ensures
it doesn't already have one set — we inject our own at create-time.

Returns:
list[AttackTechniqueSpec]: Filtered, adversarial-ready specs.
"""
return [
spec
for spec in SCENARIO_TECHNIQUES
if AttackTechniqueRegistry._accepts_adversarial(spec.attack_class) and spec.adversarial_chat is None
]


def _build_benchmark_strategy() -> type[ScenarioStrategy]:
Contributor Author commented:

So much of the strategy is shared with rapid response; these two functions could likely use a helper:

build_strategy_from_techniques
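
A sketch of what such a helper could look like, assembled only from calls that already appear in this diff (the name comes from the comment above; the signature is hypothetical, not the project's API):

from pyrit.registry.object_registries.attack_technique_registry import (
    AttackTechniqueRegistry,
    AttackTechniqueSpec,
)
from pyrit.registry.tag_query import TagQuery
from pyrit.scenario.core.scenario_strategy import ScenarioStrategy


def build_strategy_from_techniques(
    class_name: str,
    specs: list[AttackTechniqueSpec],
    aggregate_tags: dict[str, TagQuery],
) -> type[ScenarioStrategy]:
    # Hypothetical shared helper; body reuses the exact calls from this diff.
    return AttackTechniqueRegistry.build_strategy_class_from_specs(
        class_name=class_name,
        specs=TagQuery.all("core").filter(specs),
        aggregate_tags=aggregate_tags,
    )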

"""
Build the BenchmarkStrategy enum from adversarial-capable ``SCENARIO_TECHNIQUES``.

Returns a strategy class whose concrete members are adversarial-capable
techniques (no baked-in adversarial chat) and whose aggregates allow
selecting by turn style.

Returns:
type[ScenarioStrategy]: The dynamically generated strategy enum class.
"""
specs = _get_benchmarkable_specs()
return AttackTechniqueRegistry.build_strategy_class_from_specs(
class_name="BenchmarkStrategy",
specs=TagQuery.all("core").filter(specs),
aggregate_tags={
"all": TagQuery.any_of("core"),
"single_turn": TagQuery.any_of("single_turn"),
"multi_turn": TagQuery.any_of("multi_turn"),
},
)


class Benchmark(Scenario):
"""
Benchmarking scenario that compares the ASR of several adversarial models.

Each selected technique is executed once per adversarial model per dataset,
producing a cross-product of atomic attacks. Results are grouped by model
label so that ASR can be compared side-by-side.
"""

VERSION: int = 1
_cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None

@classmethod
def get_strategy_class(cls) -> type[ScenarioStrategy]:
"""
Return the BenchmarkStrategy enum, building on first access.

Returns:
type[ScenarioStrategy]: The BenchmarkStrategy enum class.
"""
if cls._cached_strategy_class is None:
cls._cached_strategy_class = _build_benchmark_strategy()
return cls._cached_strategy_class

@classmethod
def get_default_strategy(cls) -> ScenarioStrategy:
"""
Return the default strategy (``ALL`` — run every benchmark technique).

Returns:
ScenarioStrategy: The ``all`` aggregate member.
"""
return cls.get_strategy_class()("all")

@classmethod
def default_dataset_config(cls) -> DatasetConfiguration:
"""
Return the default dataset configuration for benchmarking.

Returns:
DatasetConfiguration: Configuration with the HarmBench dataset.
"""
return DatasetConfiguration(
dataset_names=["harmbench"],
max_dataset_size=8,
)

@apply_defaults
def __init__(
self,
*,
adversarial_models: dict[str, PromptChatTarget],
objective_scorer: TrueFalseScorer | None = None,
scenario_result_id: str | None = None,
) -> None:
"""
Initialize the Benchmark scenario.

Args:
adversarial_models: Mapping of user-chosen label → adversarial
chat target. Each model will be benchmarked across all
selected techniques and datasets.
objective_scorer: Scorer for evaluating attack success.
Defaults to the registered default objective scorer.
scenario_result_id: Optional ID of an existing scenario
result to resume.

Raises:
ValueError: If ``adversarial_models`` is empty.
"""
if not adversarial_models:
raise ValueError("adversarial_models must be a non-empty dict mapping labels to PromptChatTarget instances.")

self._adversarial_models = dict(adversarial_models)
self._objective_scorer: TrueFalseScorer = (
objective_scorer if objective_scorer else self._get_default_objective_scorer()
)

super().__init__(
version=self.VERSION,
objective_scorer=self._objective_scorer,
strategy_class=self.get_strategy_class(),
scenario_result_id=scenario_result_id,
)

async def _get_atomic_attacks_async(self) -> list[AtomicAttack]:
"""
Build atomic attacks from the cross-product of techniques × models × datasets.

Factories are built locally from adversarial-capable ``SCENARIO_TECHNIQUES``
(not the registry singleton). Each model is injected at create-time via
``attack_adversarial_config_override``.

Returns:
list[AtomicAttack]: One atomic attack per technique/model/dataset combination.

Raises:
ValueError: If the scenario has not been initialized.
"""
if self._objective_target is None:
raise ValueError(
"Scenario not properly initialized. Call await scenario.initialize_async() before running."
)

from pyrit.executor.attack import AttackAdversarialConfig, AttackScoringConfig

benchmarkable_specs = _get_benchmarkable_specs()
local_factories = {
spec.name: AttackTechniqueRegistry.build_factory_from_spec(spec) for spec in benchmarkable_specs
}
scorer_override_map = {spec.name: spec.accepts_scorer_override for spec in benchmarkable_specs}

selected_techniques = {s.value for s in self._scenario_strategies}
seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups()
scoring_config = AttackScoringConfig(objective_scorer=cast("TrueFalseScorer", self._objective_scorer))

atomic_attacks: list[AtomicAttack] = []
for technique_name in selected_techniques:
factory = local_factories.get(technique_name)
if factory is None:
logger.warning("No factory for technique '%s', skipping.", technique_name)
continue

scoring_for_technique = scoring_config if scorer_override_map.get(technique_name, True) else None

for model_label, model_target in self._adversarial_models.items():
adv_config = AttackAdversarialConfig(target=model_target)

for dataset_name, seed_groups in seed_groups_by_dataset.items():
attack_technique = factory.create(
objective_target=self._objective_target,
attack_adversarial_config_override=adv_config,
attack_scoring_config_override=scoring_for_technique,
)
atomic_attacks.append(
AtomicAttack(
atomic_attack_name=f"{technique_name}__{model_label}_{dataset_name}",
attack_technique=attack_technique,
seed_groups=list(seed_groups),
adversarial_chat=model_target,
objective_scorer=cast("TrueFalseScorer", self._objective_scorer),
memory_labels=self._memory_labels,
display_group=model_label,
)
)

return atomic_attacks
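
Putting the three files together, a minimal usage sketch. Only the constructor keywords and the initialize_async() requirement are taken from this diff; the target class, the objective_target keyword on initialize_async(), and run_async() are assumptions about the surrounding PyRIT API.

import asyncio

from pyrit.prompt_target import OpenAIChatTarget  # assumed target class
from pyrit.scenario.benchmark import Benchmark


async def main() -> None:
    scenario = Benchmark(
        adversarial_models={
            # label -> PromptChatTarget; each model is run against every
            # selected technique and dataset (technique x model x dataset).
            "model-a": OpenAIChatTarget(),  # hypothetical endpoint config
            "model-b": OpenAIChatTarget(),
        },
    )
    # The diff's error message requires initialize_async() before running;
    # passing the objective target here is an assumption, not shown above.
    await scenario.initialize_async(objective_target=OpenAIChatTarget())
    await scenario.run_async()  # assumed run entry point


asyncio.run(main())

Each resulting atomic attack is named {technique}__{label}_{dataset} and carries display_group=model_label, which is what lets results be grouped by model for the side-by-side ASR comparison.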