From 650369ca29be9972e8ef76490ce94cd43dc06936 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 1 Apr 2026 09:44:51 +0530 Subject: [PATCH 01/11] added toxicity detection validators --- backend/app/api/API_USAGE.md | 6 +- backend/app/core/validators/README.md | 131 ++++- .../llamaguard_7b_safety_validator_config.py | 16 + .../nsfw_text_safety_validator_config.py | 22 + .../profanity_free_safety_validator_config.py | 14 + .../toxic_language_safety_validator_config.py | 22 + backend/app/core/validators/validators.json | 20 + backend/app/schemas/guardrail_config.py | 16 + .../app/tests/test_toxicity_hub_validators.py | 504 ++++++++++++++++++ 9 files changed, 748 insertions(+), 3 deletions(-) create mode 100644 backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py create mode 100644 backend/app/core/validators/config/nsfw_text_safety_validator_config.py create mode 100644 backend/app/core/validators/config/profanity_free_safety_validator_config.py create mode 100644 backend/app/core/validators/config/toxic_language_safety_validator_config.py create mode 100644 backend/app/tests/test_toxicity_hub_validators.py diff --git a/backend/app/api/API_USAGE.md b/backend/app/api/API_USAGE.md index e4e565a..1ce2ce7 100644 --- a/backend/app/api/API_USAGE.md +++ b/backend/app/api/API_USAGE.md @@ -100,7 +100,7 @@ Endpoint: Optional filters: - `ids=&ids=` - `stage=input|output` -- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance` +- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|nsfw_text|profanity_free|toxic_language` Example: @@ -442,6 +442,10 @@ From `validators.json`: - `ban_list` - `llm_critic` - `topic_relevance` +- `llamaguard_7b` +- `nsfw_text` +- `profanity_free` +- `toxic_language` Source of truth: - `backend/app/core/validators/validators.json` diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 
f0a2f6d..3ee841c 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -11,6 +11,10 @@ Current validator manifest: - `ban_list` (source: `hub://guardrails/ban_list`) - `llm_critic` (source: `hub://guardrails/llm_critic`) - https://guardrailsai.com/hub/validator/guardrails/llm_critic - `topic_relevance` (source: `local`) +- `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`) +- `nsfw_text` (source: `hub://guardrails/nsfw_text`) +- `profanity_free` (source: `hub://guardrails/profanity_free`) +- `toxic_language` (source: `hub://guardrails/toxic_language`) ## Configuration Model @@ -310,6 +314,125 @@ Notes / limitations: - Configuration is resolved in `backend/app/api/routes/guardrails.py` from tenant Topic Relevance Config APIs. - Prompt templates must include the `{{TOPIC_CONFIGURATION}}` placeholder. +### 7) LlamaGuard 7B Validator (`llamaguard_7b`) + +Code: +- Config: `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/llamaguard_7b`) + +What it does: +- Classifies text as "safe" or "unsafe" using the LlamaGuard-7B model via remote inference on the Guardrails Hub. +- Checks against a configurable set of safety policies covering violence/hate, sexual content, criminal planning, weapons, illegal drugs, and self-harm encouragement. + +Why this is used: +- Provides a model-level safety classifier as a complement to rule-based validators. +- Allows policy-targeted filtering (e.g. only flag content violating specific categories). + +Recommendation: +- `input` and `output` + - Why `input`: catches unsafe user prompts before model processing. + - Why `output`: validates generated content against the same safety policies. 
+ +Parameters / customization: +- `policies: list[str] | None` (default: all policies enabled) + - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) +- `on_fail` + +Notes / limitations: +- Remote inference requires network access to the Guardrails Hub API. +- No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. +- LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. + +### 8) NSFW Text Validator (`nsfw_text`) + +Code: +- Config: `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/nsfw_text`) + +What it does: +- Detects not-safe-for-work (NSFW) text using a classifier model. +- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. + +Why this is used: +- Provides a dedicated NSFW text filter for deployments where explicit/adult content must be blocked. +- Complements LlamaGuard-based filtering with a lightweight, CPU-friendly classifier. + +Recommendation: +- `input` and `output` + - Why `input`: blocks NSFW user messages before model invocation. + - Why `output`: prevents explicit content from being surfaced to end users. + +Parameters / customization: +- `threshold: float` (default: `0.8`) — minimum classifier score to flag text as NSFW. Higher = more conservative (fewer false positives). +- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. +- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). +- `model_name: str | None` (default: `"michellejieli/NSFW_text_classifier"`) +- `on_fail` + +Notes / limitations: +- Model runs locally; first use downloads model weights. Ensure network access during setup. 
+- `validation_method="sentence"` may miss NSFW content spread across multiple sentences. +- Threshold tuning is important: lower values increase recall at the cost of false positives. + +### 9) Profanity Free Validator (`profanity_free`) + +Code: +- Config: `backend/app/core/validators/config/profanity_free_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/profanity_free`) + +What it does: +- Detects profanity in text using the `alt-profanity-check` library. +- Fails validation if any profanity is detected. + +Why this is used: +- Fast, lightweight check backed by `alt-profanity-check`'s pre-trained linear classifier; no heavyweight model inference required. +- Suitable as a first-pass filter before more expensive validators. + +Recommendation: +- `input` and `output` + - Why `input`: catches profane user messages early. + - Why `output`: prevents model-generated profanity from reaching users. + +Parameters / customization: +- `on_fail` + +Notes / limitations: +- Classifier-based approach (linear model trained on labeled text, not a word list); may still miss obfuscated profanity (e.g. character substitutions, leetspeak). +- No programmatic fix is applied — detected text is not auto-redacted. +- English-focused; cross-lingual profanity may not be detected. + +### 10) Toxic Language Validator (`toxic_language`) + +Code: +- Config: `backend/app/core/validators/config/toxic_language_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/toxic_language`) + +What it does: +- Detects toxic language using a classifier model. +- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. + +Why this is used: +- Provides broader toxicity detection beyond explicit slurs, covering hostile, threatening, or degrading language. +- Works as a complement to the lexical slur validator (`uli_slur_match`) for semantic toxicity. + +Recommendation: +- `input` and `output` + - Why `input`: catches toxic user messages before they influence model behavior.
+ - Why `output`: prevents model-generated toxic content from reaching end users. + +Parameters / customization: +- `threshold: float` (default: `0.5`) — minimum classifier score to flag text as toxic. Lower = more sensitive (higher recall, more false positives). +- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. +- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). +- `model_name: str | None` (default: `"unbiased-small"`) +- `on_fail` + +Notes / limitations: +- Model runs locally; first use downloads model weights. Ensure network access during setup. +- The `unbiased-small` model is designed to reduce bias against identity groups compared to standard toxicity classifiers. +- `validation_method="sentence"` is recommended for conversational text; use `"full"` for short single-sentence inputs. +- Consider using alongside `uli_slur_match` for layered toxicity coverage. + ## Example Config Payloads Example: create validator config (stored shape) @@ -339,8 +462,8 @@ Example: runtime guardrail validator object (execution shape) ## Operational Guidance Default stage strategy: -- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed) -- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list` +- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` +- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` Tuning strategy: - Start with conservative defaults and log validator outcomes. 
@@ -356,5 +479,9 @@ Tuning strategy: - `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` +- `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` +- `backend/app/core/validators/config/profanity_free_safety_validator_config.py` +- `backend/app/core/validators/config/toxic_language_safety_validator_config.py` - `backend/app/schemas/guardrail_config.py` - `backend/app/schemas/validator_config.py` diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py new file mode 100644 index 0000000..231856e --- /dev/null +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -0,0 +1,16 @@ +from typing import List, Literal, Optional + +from guardrails.hub import LlamaGuard7B + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class LlamaGuard7BSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["llamaguard_7b"] + policies: Optional[List[str]] = None + + def build(self): + return LlamaGuard7B( + policies=self.policies, + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py new file mode 100644 index 0000000..9fd81e7 --- /dev/null +++ b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py @@ -0,0 +1,22 @@ +from typing import Literal, Optional + +from guardrails.hub import NSFWText + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class NSFWTextSafetyValidatorConfig(BaseValidatorConfig): + type: 
Literal["nsfw_text"] + threshold: float = 0.8 + validation_method: str = "sentence" + device: Optional[str] = "cpu" + model_name: Optional[str] = "michellejieli/NSFW_text_classifier" + + def build(self): + return NSFWText( + threshold=self.threshold, + validation_method=self.validation_method, + device=self.device, + model_name=self.model_name, + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/profanity_free_safety_validator_config.py b/backend/app/core/validators/config/profanity_free_safety_validator_config.py new file mode 100644 index 0000000..dd6d774 --- /dev/null +++ b/backend/app/core/validators/config/profanity_free_safety_validator_config.py @@ -0,0 +1,14 @@ +from typing import Literal + +from guardrails.hub import ProfanityFree + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class ProfanityFreeSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["profanity_free"] + + def build(self): + return ProfanityFree( + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/toxic_language_safety_validator_config.py b/backend/app/core/validators/config/toxic_language_safety_validator_config.py new file mode 100644 index 0000000..4420c4a --- /dev/null +++ b/backend/app/core/validators/config/toxic_language_safety_validator_config.py @@ -0,0 +1,22 @@ +from typing import Literal, Optional + +from guardrails.hub import ToxicLanguage + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class ToxicLanguageSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["toxic_language"] + threshold: float = 0.5 + validation_method: str = "sentence" + device: Optional[str] = "cpu" + model_name: Optional[str] = "unbiased-small" + + def build(self): + return ToxicLanguage( + threshold=self.threshold, + validation_method=self.validation_method, + device=self.device, + model_name=self.model_name, + on_fail=self.resolve_on_fail(), + ) 
diff --git a/backend/app/core/validators/validators.json b/backend/app/core/validators/validators.json index 062f183..1aac02f 100644 --- a/backend/app/core/validators/validators.json +++ b/backend/app/core/validators/validators.json @@ -29,6 +29,26 @@ "type": "topic_relevance", "version": "0.1.0", "source": "local" + }, + { + "type": "llamaguard_7b", + "version": "0.1.0", + "source": "hub://guardrails/llamaguard_7b" + }, + { + "type": "nsfw_text", + "version": "0.1.0", + "source": "hub://guardrails/nsfw_text" + }, + { + "type": "profanity_free", + "version": "0.1.0", + "source": "hub://guardrails/profanity_free" + }, + { + "type": "toxic_language", + "version": "0.1.0", + "source": "hub://guardrails/toxic_language" } ] } \ No newline at end of file diff --git a/backend/app/schemas/guardrail_config.py b/backend/app/schemas/guardrail_config.py index 4cd9dbf..d76ba00 100644 --- a/backend/app/schemas/guardrail_config.py +++ b/backend/app/schemas/guardrail_config.py @@ -24,6 +24,18 @@ from app.core.validators.config.topic_relevance_safety_validator_config import ( TopicRelevanceSafetyValidatorConfig, ) +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.nsfw_text_safety_validator_config import ( + NSFWTextSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, +) +from app.core.validators.config.toxic_language_safety_validator_config import ( + ToxicLanguageSafetyValidatorConfig, +) ValidatorConfigItem = Annotated[ Union[ @@ -31,8 +43,12 @@ GenderAssumptionBiasSafetyValidatorConfig, LexicalSlurSafetyValidatorConfig, LLMCriticSafetyValidatorConfig, + NSFWTextSafetyValidatorConfig, PIIRemoverSafetyValidatorConfig, + LlamaGuard7BSafetyValidatorConfig, + ProfanityFreeSafetyValidatorConfig, TopicRelevanceSafetyValidatorConfig, + ToxicLanguageSafetyValidatorConfig, ], 
Field(discriminator="type"), ] diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py new file mode 100644 index 0000000..7ee82f9 --- /dev/null +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -0,0 +1,504 @@ +from unittest.mock import patch + +import pytest +from guardrails import OnFailAction +from pydantic import ValidationError + +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.nsfw_text_safety_validator_config import ( + NSFWTextSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, +) +from app.core.validators.config.toxic_language_safety_validator_config import ( + ToxicLanguageSafetyValidatorConfig, +) + +_LLAMAGUARD_PATCH = ( + "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" +) +_NSFW_PATCH = "app.core.validators.config.nsfw_text_safety_validator_config.NSFWText" +_PROFANITY_PATCH = ( + "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" +) +_TOXIC_PATCH = ( + "app.core.validators.config.toxic_language_safety_validator_config.ToxicLanguage" +) + + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +class TestLlamaGuard7BSafetyValidatorConfig: + def test_build_with_default_policies(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["policies"] is None + + def test_build_with_explicit_policies(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", + policies=["O1", "O2"], + ) + + with patch(_LLAMAGUARD_PATCH) 
as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O1", "O2"] + + def test_build_with_empty_policies_list(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", policies=[]) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == [] + + def test_build_with_all_policy_codes(self): + all_policies = ["O1", "O2", "O3", "O4", "O5", "O6"] + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=all_policies + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == all_policies + + def test_build_with_single_policy(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["O3"] + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O3"] + + def test_build_returns_validator_instance(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", on_fail="fix") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="exception" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = 
LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="rephrase" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + LlamaGuard7BSafetyValidatorConfig(type="toxic_language") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", unknown_field="value" + ) + + +# --------------------------------------------------------------------------- +# NSFWText +# --------------------------------------------------------------------------- + + +class TestNSFWTextSafetyValidatorConfig: + def test_build_with_defaults(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.8 + assert kwargs["validation_method"] == "sentence" + assert kwargs["device"] == "cpu" + assert kwargs["model_name"] == "michellejieli/NSFW_text_classifier" + + def test_build_with_custom_params(self): + config = NSFWTextSafetyValidatorConfig( + type="nsfw_text", + threshold=0.6, + validation_method="full", + device="cuda", + model_name="custom/model", + ) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.6 + assert kwargs["validation_method"] == "full" + assert kwargs["device"] == "cuda" + assert kwargs["model_name"] == "custom/model" + + def 
test_build_with_threshold_at_zero(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=0.0) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.0 + + def test_build_with_threshold_at_one(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=1.0) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 1.0 + + def test_build_with_device_none(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", device=None) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["device"] is None + + def test_build_with_model_name_none(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", model_name=None) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["model_name"] is None + + def test_build_returns_validator_instance(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + + with patch(_NSFW_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="fix") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="exception") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="rephrase") + + 
with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_NSFW_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="toxic_language") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="nsfw_text", unknown_field="value") + + def test_threshold_must_be_numeric(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold="high") # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +class TestProfanityFreeSafetyValidatorConfig: + def test_build_default(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + + def test_build_returns_validator_instance(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = ProfanityFreeSafetyValidatorConfig( + 
type="profanity_free", on_fail="exception" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="rephrase" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_PROFANITY_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig(type="nsfw_text") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig( + type="profanity_free", unknown_field="value" + ) + + def test_only_on_fail_forwarded_to_validator(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert set(kwargs.keys()) == {"on_fail"} + + +# --------------------------------------------------------------------------- +# ToxicLanguage +# --------------------------------------------------------------------------- + + +class TestToxicLanguageSafetyValidatorConfig: + def test_build_with_defaults(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.5 + assert kwargs["validation_method"] == "sentence" + assert 
kwargs["device"] == "cpu" + assert kwargs["model_name"] == "unbiased-small" + + def test_build_with_custom_params(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", + threshold=0.7, + validation_method="full", + device="cuda", + model_name="custom/toxic-model", + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.7 + assert kwargs["validation_method"] == "full" + assert kwargs["device"] == "cuda" + assert kwargs["model_name"] == "custom/toxic-model" + + def test_build_with_threshold_at_zero(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", threshold=0.0 + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.0 + + def test_build_with_threshold_at_one(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", threshold=1.0 + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 1.0 + + def test_build_with_device_none(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language", device=None) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["device"] is None + + def test_build_with_model_name_none(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", model_name=None + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["model_name"] is None + + def test_build_returns_validator_instance(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + + with patch(_TOXIC_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = 
ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="fix" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="exception" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="rephrase" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_TOXIC_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig(type="nsfw_text") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig( + type="toxic_language", unknown_field="value" + ) + + def test_threshold_must_be_numeric(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig(type="toxic_language", threshold="high") # type: ignore[arg-type] From 949647d0f5e683631082ed3e27faa8dbbfea213d Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 1 Apr 2026 10:18:22 +0530 Subject: [PATCH 02/11] fixed import error --- backend/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b335986..6d1e84e 100644 --- a/backend/pyproject.toml +++ 
b/backend/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numpy>=1.24.0", "python-dotenv<2.0.0,>=1.0.0", "scikit-learn>=1.6.0,<2.0.0", + "huggingface-hub>=1.5.0,<2.0", ] [dependency-groups] From da50537e0c0f3c9a8e41b067695a17a0ca2bdce5 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 2 Apr 2026 18:29:17 +0530 Subject: [PATCH 03/11] removed redundant validators --- backend/app/api/API_USAGE.md | 4 +- backend/app/core/enum.py | 3 + backend/app/core/validators/README.md | 144 +++++---- .../nsfw_text_safety_validator_config.py | 22 -- .../toxic_language_safety_validator_config.py | 22 -- backend/app/core/validators/validators.json | 10 - backend/app/schemas/guardrail_config.py | 8 - .../app/tests/test_toxicity_hub_validators.py | 285 ------------------ 8 files changed, 74 insertions(+), 424 deletions(-) delete mode 100644 backend/app/core/validators/config/nsfw_text_safety_validator_config.py delete mode 100644 backend/app/core/validators/config/toxic_language_safety_validator_config.py diff --git a/backend/app/api/API_USAGE.md b/backend/app/api/API_USAGE.md index 1ce2ce7..38af6de 100644 --- a/backend/app/api/API_USAGE.md +++ b/backend/app/api/API_USAGE.md @@ -100,7 +100,7 @@ Endpoint: Optional filters: - `ids=&ids=` - `stage=input|output` -- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|nsfw_text|profanity_free|toxic_language` +- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|profanity_free` Example: @@ -443,9 +443,7 @@ From `validators.json`: - `llm_critic` - `topic_relevance` - `llamaguard_7b` -- `nsfw_text` - `profanity_free` -- `toxic_language` Source of truth: - `backend/app/core/validators/validators.json` diff --git a/backend/app/core/enum.py b/backend/app/core/enum.py index 43a102b..0c7c940 100644 --- a/backend/app/core/enum.py +++ b/backend/app/core/enum.py @@ -32,3 +32,6 @@ class ValidatorType(Enum): GenderAssumptionBias = 
"gender_assumption_bias" BanList = "ban_list" TopicRelevance = "topic_relevance" + LLMCritic = "llm_critic" + LlamaGuard7B = "llamaguard_7b" + ProfanityFree = "profanity_free" diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 3ee841c..e7f40a8 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -5,6 +5,7 @@ This document describes the validator configuration model used in this codebase, ## Supported Validators Current validator manifest: + - `uli_slur_match` (source: `local`) - `pii_remover` (source: `local`) - `gender_assumption_bias` (source: `local`) @@ -12,21 +13,21 @@ Current validator manifest: - `llm_critic` (source: `hub://guardrails/llm_critic`) - https://guardrailsai.com/hub/validator/guardrails/llm_critic - `topic_relevance` (source: `local`) - `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`) -- `nsfw_text` (source: `hub://guardrails/nsfw_text`) - `profanity_free` (source: `hub://guardrails/profanity_free`) -- `toxic_language` (source: `hub://guardrails/toxic_language`) ## Configuration Model All validator config classes inherit from `BaseValidatorConfig` in `backend/app/core/validators/config/base_validator_config.py`. Shared fields: + - `on_fail` (default: `fix`) - `fix`: return transformed/redacted output when validator provides a fix - `exception`: fail validation when validator fails (no safe replacement output) - `rephrase`: return a user-facing rephrase prompt plus validator error details At the Validator Config API layer (`/guardrails/validators/configs`), configs also include: + - `type` - `stage`: `input` or `output` - `on_fail_action` (mapped to runtime `on_fail`) @@ -37,9 +38,11 @@ At the Validator Config API layer (`/guardrails/validators/configs`), configs al There are two config shapes used in this project: 1. Stored validator config (Config CRUD APIs) + - includes `stage`, `on_fail_action`, scope metadata, etc. 2. 
Runtime guardrail config (POST `/guardrails/`) + - validator objects are normalized before execution - internal metadata like `stage`, ids, timestamps are removed - `on_fail_action` is converted to `on_fail` @@ -49,16 +52,17 @@ There are two config shapes used in this project: This project supports three `on_fail` behaviors at runtime: - `fix` + - Uses Guardrails built-in fix flow (`OnFailAction.FIX`). - If a validator returns `fix_value`, validation succeeds and API returns that transformed value as `safe_text`. - Typical outcome: redaction/anonymization/substitution without asking user to retry. - - `exception` + - Uses Guardrails built-in exception flow (`OnFailAction.EXCEPTION`). - Validation fails without a fallback text; API returns failure (`success=false`) with error details. - Use when policy requires hard rejection instead of auto-correction. - - `rephrase` + - Uses project custom handler `rephrase_query_on_fail`. - Returns: `"Please rephrase the query without unsafe content." + validator error message`. - API marks `rephrase_needed=true` when returned text starts with this prefix. @@ -68,6 +72,7 @@ This project supports three `on_fail` behaviors at runtime: `stage` is always required in validator configuration (`input` or `output`). The recommendation below is guidance on what to choose first, based on: + - where harm is most likely (`input`, `output`, or both), - whether auto-fixes are acceptable for user experience, - whether extra filtering at that stage creates too many false positives for the product flow. 
@@ -75,6 +80,7 @@ The recommendation below is guidance on what to choose first, based on: ## How These Recommendations Were Derived These recommendations come from working with multiple NGOs to understand their GenAI WhatsApp bot use cases, reviewing real bot conversations/data, and then running a structured evaluation flow: + - NGO use-case discovery and conversation analysis: - Reviewed real conversational patterns, safety failure modes, and policy expectations across partner NGO workflows. - Identified practical risks to prioritize (harmful language, privacy leakage, bias, and deployment-specific banned terms). @@ -99,35 +105,42 @@ These recommendations come from working with multiple NGOs to understand their G ### 1) Lexical Slur Validator (`uli_slur_match`) Code: + - Config: `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/lexical_slur.py` - Data file: `backend/app/core/validators/utils/files/curated_slurlist_hi_en.csv` What it does: + - Detects lexical slurs using list-based matching. - Normalizes text (emoji removal, encoding fix, unicode normalization, lowercase, whitespace normalization). - Redacts detected slurs with `[REDACTED_SLUR]` when `on_fail=fix`. Why this is used: + - Helps mitigate toxic/abusive language in user inputs and model outputs. - Evaluation and stress tests showed this is effective for multilingual abusive-content filtering in NGO-style conversational flows. Recommendation: + - `input` and `output` - Why `input`: catches abusive wording before it reaches prompt construction, logging, or downstream tools. - Why `output`: catches toxic generations that can still appear even with safe input. Parameters / customization: + - `languages: list[str]` (default: `['en', 'hi']`) - `severity: 'low' | 'medium' | 'high' | 'all'` (default: `'all'`) - `on_fail` Notes / limitations: + - Lexical matching can produce false positives in domain-specific contexts. 
- Severity filtering is dependent on source slur list labels. - Rules-based approach may miss semantic toxicity without explicit lexical matches. Evidence and evaluation: + - Dataset reference: `https://www.kaggle.com/c/multilingualabusivecomment/data` - Label convention used in that dataset: - `1` = abusive comment @@ -137,28 +150,34 @@ Evidence and evaluation: ### 2) PII Remover Validator (`pii_remover`) Code: + - Config: `backend/app/core/validators/config/pii_remover_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/pii_remover.py` What it does: + - Detects and anonymizes personally identifiable information using Presidio. - Returns redacted text when PII is found and `on_fail=fix`. Why this is used: + - Privacy is a primary safety requirement in NGO deployments. - Evaluation runs for this project showed clear risk of personal-data leakage/retention in conversational workflows without PII masking. Recommendation: + - `input` and `output` - Why `input`: prevents storing or processing raw user PII in logs/services. - Why `output`: prevents model-generated leakage of names, numbers, or identifiers. Parameters / customization: + - `entity_types: list[str] | None` (default: all supported types) - `threshold: float` (default: `0.5`) - `on_fail` Threshold guidance: + - `threshold` is the minimum confidence score required for a detected entity to be treated as PII. - Lower threshold -> more detections (higher recall, more false positives/over-masking). - Higher threshold -> fewer detections (higher precision, more false negatives/missed PII). @@ -166,15 +185,17 @@ Threshold guidance: - If the product is privacy-critical, prefer a slightly lower threshold and tighter `entity_types`; if readability is primary, prefer a slightly higher threshold. 
Supported default entity types: + - `CREDIT_CARD`, `EMAIL_ADDRESS`, `IBAN_CODE`, `IP_ADDRESS`, `LOCATION`, `MEDICAL_LICENSE`, `NRP`, `PERSON`, `PHONE_NUMBER`, `URL`, `IN_AADHAAR`, `IN_PAN`, `IN_PASSPORT`, `IN_VEHICLE_REGISTRATION`, `IN_VOTER` Notes / limitations: + - Rule/ML recognizers can under-detect free-text references. - Threshold and entity selection should be tuned per deployment context. - Runtime requirement: this validator is configured to use spaCy model `en_core_web_lg`. -The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. -For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` -Evidence and evaluation: + The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. + For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` + Evidence and evaluation: - Compared approaches: - Custom PII validator (this codebase) - Guardrails Hub PII validator @@ -187,37 +208,45 @@ Evidence and evaluation: ### 3) Gender Assumption Bias Validator (`gender_assumption_bias`) Code: + - Config: `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/gender_assumption_bias.py` - Data file: `backend/app/core/validators/utils/files/gender_assumption_bias_words.csv` What it does: + - Detects gender-assumptive words/phrases and substitutes neutral terms. - Uses a curated mapping from gendered terms to neutral alternatives. Why this is used: + - Addresses model harm from assuming user gender or producing gender-biased language. - Evaluation reviews and stress tests identified this as a recurring conversational quality/safety issue. 
Recommendation: + - primarily `output` - Why `output`: the assistant response is where assumption-biased phrasing is most likely to be emitted to end users. - Why not `input` by default: user text can be descriptive/quoted, so rewriting input can introduce false positives and intent drift. - Use `input` too when your policy requires strict moderation of user phrasing before any model processing. Parameters / customization: + - `categories: list[BiasCategories] | None` (default: `[all]`) - `on_fail` `BiasCategories` values: + - `generic`, `healthcare`, `education`, `all` Notes / limitations: + - Rule-based substitutions may affect natural fluency. - Gender-neutral transformation in Hindi/romanized Hindi can be context-sensitive. - Full assumption detection often benefits from multi-turn context and/or LLM-as-judge approaches. Improvement suggestions from evaluation: + - Strengthen prompt strategy so the model asks user preferences instead of assuming gendered terms. - Fine-tune generation prompts for neutral language defaults. - Consider external LLM-as-judge checks for nuanced multi-turn assumption detection. @@ -225,27 +254,33 @@ Improvement suggestions from evaluation: ### 4) Ban List Validator (`ban_list`) Code: + - Config: `backend/app/core/validators/config/ban_list_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/ban_list`) What it does: + - Blocks or redacts configured banned words using the Guardrails Hub BanList validator. Why this is used: + - Provides deployment-specific denylist control for terms that must never appear in inputs/outputs. - Useful for policy-level restrictions not fully covered by generic toxicity detection. Recommendation: + - `input` and `output` - Why `input`: blocks prohibited terms before model invocation and tool calls. - Why `output`: enforces policy on generated text before it is shown to users. 
Parameters / customization: + - `banned_words: list[str]` (optional if `ban_list_id` is provided) - `ban_list_id: UUID` (optional if `banned_words` is provided) - `on_fail` Notes / limitations: + - Exact-list approach requires ongoing maintenance. - Contextual false positives can occur for ambiguous terms. - Runtime validation requires at least one of `banned_words` or `ban_list_id`. @@ -254,27 +289,33 @@ Notes / limitations: ### 5) LLM Critic Validator (`llm_critic`) Code: + - Config: `backend/app/core/validators/config/llm_critic_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/llm_critic`) — https://guardrailsai.com/hub/validator/guardrails/llm_critic What it does: + - Evaluates text against one or more custom quality/safety metrics using an LLM as judge. - Each metric is scored up to `max_score`; validation fails if any metric score falls below the threshold. Why this is used: + - Enables flexible, prompt-driven content evaluation for use cases not covered by rule-based validators. - All configuration is passed inline in the runtime request — there is no stored config object to resolve. Unlike `topic_relevance`, which looks up scope text from a persisted `TopicRelevanceConfig`, `llm_critic` receives `metrics`, `max_score`, and `llm_callable` directly in the guardrail request payload. Recommendation: + - `input` or `output` depending on whether you are evaluating user input quality or model output quality. Parameters / customization: + - `metrics: dict` (required) — metric name-to-description mapping passed to the LLM judge - `max_score: int` (required) — maximum score per metric; used to define the scoring scale - `llm_callable: str` (required) — model identifier passed to LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`) - `on_fail` Notes / limitations: + - All three parameters are required and must be provided inline in every runtime guardrail request; there is no stored config to reference. 
- **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, `build()` raises a `ValueError` with an explicit message before any validation runs. - Quality and latency depend on the chosen `llm_callable`. @@ -283,32 +324,38 @@ Notes / limitations: ### 6) Topic Relevance Validator (`topic_relevance`) Code: + - Config: `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/topic_relevance.py` - Prompt templates: `backend/app/core/validators/prompts/topic_relevance/` What it does: + - Checks whether the user message is in scope using an LLM-critic style metric. - Builds the final prompt from: - a versioned markdown template (`prompt_schema_version`) - tenant-specific `configuration` (string sub-prompt text). Why this is used: + - Enforces domain scope for assistants that should answer only allowed topics. - Keeps prompt wording versioned and reusable while allowing tenant-level scope customization. Recommendation: + - primarily `input` - Why `input`: blocks out-of-scope prompts before model processing. - Add to `output` only when you also need to enforce output-topic strictness. Parameters / customization: + - `topic_relevance_config_id: UUID` (required at runtime; resolves configuration and prompt version from tenant config) - `prompt_schema_version: int` (optional; defaults to `1`) - `llm_callable: str` (default: `gpt-4o-mini`) — the model identifier passed to Guardrails' LLMCritic to perform the scope evaluation. This must be a model string supported by LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`). It controls which LLM is used to score whether the input is within the allowed topic scope; changing it affects cost, latency, and scoring quality. - `on_fail` Notes / limitations: + - Runtime validation requires `topic_relevance_config_id`. 
- **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, validation returns a `FailResult` with an explicit message. - Configuration is resolved in `backend/app/api/routes/guardrails.py` from tenant Topic Relevance Config APIs. @@ -317,122 +364,71 @@ Notes / limitations: ### 7) LlamaGuard 7B Validator (`llamaguard_7b`) Code: + - Config: `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/llamaguard_7b`) What it does: + - Classifies text as "safe" or "unsafe" using the LlamaGuard-7B model via remote inference on the Guardrails Hub. - Checks against a configurable set of safety policies covering violence/hate, sexual content, criminal planning, weapons, illegal drugs, and self-harm encouragement. Why this is used: + - Provides a model-level safety classifier as a complement to rule-based validators. - Allows policy-targeted filtering (e.g. only flag content violating specific categories). Recommendation: + - `input` and `output` - Why `input`: catches unsafe user prompts before model processing. - Why `output`: validates generated content against the same safety policies. Parameters / customization: + - `policies: list[str] | None` (default: all policies enabled) - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) - `on_fail` Notes / limitations: + - Remote inference requires network access to the Guardrails Hub API. - No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. - LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. 
-### 8) NSFW Text Validator (`nsfw_text`) +### 8) Profanity Free Validator (`profanity_free`) Code: -- Config: `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` -- Source: Guardrails Hub (`hub://guardrails/nsfw_text`) - -What it does: -- Detects not-safe-for-work (NSFW) text using a classifier model. -- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. - -Why this is used: -- Provides a dedicated NSFW text filter for deployments where explicit/adult content must be blocked. -- Complements LlamaGuard-based filtering with a lightweight, CPU-friendly classifier. - -Recommendation: -- `input` and `output` - - Why `input`: blocks NSFW user messages before model invocation. - - Why `output`: prevents explicit content from being surfaced to end users. - -Parameters / customization: -- `threshold: float` (default: `0.8`) — minimum classifier score to flag text as NSFW. Higher = more conservative (fewer false positives). -- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. -- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). -- `model_name: str | None` (default: `"michellejieli/NSFW_text_classifier"`) -- `on_fail` - -Notes / limitations: -- Model runs locally; first use downloads model weights. Ensure network access during setup. -- `validation_method="sentence"` may miss NSFW content spread across multiple sentences. -- Threshold tuning is important: lower values increase recall at the cost of false positives. -### 9) Profanity Free Validator (`profanity_free`) - -Code: - Config: `backend/app/core/validators/config/profanity_free_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/profanity_free`) What it does: + - Detects profanity in text using the `alt-profanity-check` library. - Fails validation if any profanity is detected. 
Why this is used: + - Simple, fast rule-based check for profane language without requiring model inference. - Suitable as a first-pass filter before more expensive validators. Recommendation: + - `input` and `output` - Why `input`: catches profane user messages early. - Why `output`: prevents model-generated profanity from reaching users. Parameters / customization: + - `on_fail` Notes / limitations: + - Dictionary-based approach; may miss obfuscated profanity (e.g. character substitutions, leetspeak). - No programmatic fix is applied — detected text is not auto-redacted. - English-focused; cross-lingual profanity may not be detected. -### 10) Toxic Language Validator (`toxic_language`) - -Code: -- Config: `backend/app/core/validators/config/toxic_language_safety_validator_config.py` -- Source: Guardrails Hub (`hub://guardrails/toxic_language`) - -What it does: -- Detects toxic language using a classifier model. -- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. - -Why this is used: -- Provides broader toxicity detection beyond explicit slurs, covering hostile, threatening, or degrading language. -- Works as a complement to the lexical slur validator (`uli_slur_match`) for semantic toxicity. - -Recommendation: -- `input` and `output` - - Why `input`: catches toxic user messages before they influence model behavior. - - Why `output`: prevents model-generated toxic content from reaching end users. - -Parameters / customization: -- `threshold: float` (default: `0.5`) — minimum classifier score to flag text as toxic. Lower = more sensitive (higher recall, more false positives). -- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. -- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). 
-- `model_name: str | None` (default: `"unbiased-small"`) -- `on_fail` - -Notes / limitations: -- Model runs locally; first use downloads model weights. Ensure network access during setup. -- The `unbiased-small` model is designed to reduce bias against identity groups compared to standard toxicity classifiers. -- `validation_method="sentence"` is recommended for conversational text; use `"full"` for short single-sentence inputs. -- Consider using alongside `uli_slur_match` for layered toxicity coverage. - ## Example Config Payloads Example: create validator config (stored shape) @@ -462,10 +458,12 @@ Example: runtime guardrail validator object (execution shape) ## Operational Guidance Default stage strategy: -- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` -- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` + +- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `llamaguard_7b` +- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `llamaguard_7b` Tuning strategy: + - Start with conservative defaults and log validator outcomes. - Review false positives/false negatives by validator and stage. - Iterate on per-validator parameters (`severity`, `threshold`, `categories`, `banned_words`). 
@@ -480,8 +478,6 @@ Tuning strategy: - `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` - `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` -- `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` - `backend/app/core/validators/config/profanity_free_safety_validator_config.py` -- `backend/app/core/validators/config/toxic_language_safety_validator_config.py` - `backend/app/schemas/guardrail_config.py` - `backend/app/schemas/validator_config.py` diff --git a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py deleted file mode 100644 index 9fd81e7..0000000 --- a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Literal, Optional - -from guardrails.hub import NSFWText - -from app.core.validators.config.base_validator_config import BaseValidatorConfig - - -class NSFWTextSafetyValidatorConfig(BaseValidatorConfig): - type: Literal["nsfw_text"] - threshold: float = 0.8 - validation_method: str = "sentence" - device: Optional[str] = "cpu" - model_name: Optional[str] = "michellejieli/NSFW_text_classifier" - - def build(self): - return NSFWText( - threshold=self.threshold, - validation_method=self.validation_method, - device=self.device, - model_name=self.model_name, - on_fail=self.resolve_on_fail(), - ) diff --git a/backend/app/core/validators/config/toxic_language_safety_validator_config.py b/backend/app/core/validators/config/toxic_language_safety_validator_config.py deleted file mode 100644 index 4420c4a..0000000 --- a/backend/app/core/validators/config/toxic_language_safety_validator_config.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Literal, Optional - -from guardrails.hub import ToxicLanguage - -from 
app.core.validators.config.base_validator_config import BaseValidatorConfig - - -class ToxicLanguageSafetyValidatorConfig(BaseValidatorConfig): - type: Literal["toxic_language"] - threshold: float = 0.5 - validation_method: str = "sentence" - device: Optional[str] = "cpu" - model_name: Optional[str] = "unbiased-small" - - def build(self): - return ToxicLanguage( - threshold=self.threshold, - validation_method=self.validation_method, - device=self.device, - model_name=self.model_name, - on_fail=self.resolve_on_fail(), - ) diff --git a/backend/app/core/validators/validators.json b/backend/app/core/validators/validators.json index 1aac02f..6e28a54 100644 --- a/backend/app/core/validators/validators.json +++ b/backend/app/core/validators/validators.json @@ -35,20 +35,10 @@ "version": "0.1.0", "source": "hub://guardrails/llamaguard_7b" }, - { - "type": "nsfw_text", - "version": "0.1.0", - "source": "hub://guardrails/nsfw_text" - }, { "type": "profanity_free", "version": "0.1.0", "source": "hub://guardrails/profanity_free" - }, - { - "type": "toxic_language", - "version": "0.1.0", - "source": "hub://guardrails/toxic_language" } ] } \ No newline at end of file diff --git a/backend/app/schemas/guardrail_config.py b/backend/app/schemas/guardrail_config.py index d76ba00..22bcf49 100644 --- a/backend/app/schemas/guardrail_config.py +++ b/backend/app/schemas/guardrail_config.py @@ -27,15 +27,9 @@ from app.core.validators.config.llamaguard_7b_safety_validator_config import ( LlamaGuard7BSafetyValidatorConfig, ) -from app.core.validators.config.nsfw_text_safety_validator_config import ( - NSFWTextSafetyValidatorConfig, -) from app.core.validators.config.profanity_free_safety_validator_config import ( ProfanityFreeSafetyValidatorConfig, ) -from app.core.validators.config.toxic_language_safety_validator_config import ( - ToxicLanguageSafetyValidatorConfig, -) ValidatorConfigItem = Annotated[ Union[ @@ -43,12 +37,10 @@ GenderAssumptionBiasSafetyValidatorConfig, 
LexicalSlurSafetyValidatorConfig, LLMCriticSafetyValidatorConfig, - NSFWTextSafetyValidatorConfig, PIIRemoverSafetyValidatorConfig, LlamaGuard7BSafetyValidatorConfig, ProfanityFreeSafetyValidatorConfig, TopicRelevanceSafetyValidatorConfig, - ToxicLanguageSafetyValidatorConfig, ], Field(discriminator="type"), ] diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index 7ee82f9..62be8e8 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -7,26 +7,13 @@ from app.core.validators.config.llamaguard_7b_safety_validator_config import ( LlamaGuard7BSafetyValidatorConfig, ) -from app.core.validators.config.nsfw_text_safety_validator_config import ( - NSFWTextSafetyValidatorConfig, -) from app.core.validators.config.profanity_free_safety_validator_config import ( ProfanityFreeSafetyValidatorConfig, ) -from app.core.validators.config.toxic_language_safety_validator_config import ( - ToxicLanguageSafetyValidatorConfig, -) _LLAMAGUARD_PATCH = ( "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" ) -_NSFW_PATCH = "app.core.validators.config.nsfw_text_safety_validator_config.NSFWText" -_PROFANITY_PATCH = ( - "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" -) -_TOXIC_PATCH = ( - "app.core.validators.config.toxic_language_safety_validator_config.ToxicLanguage" -) # --------------------------------------------------------------------------- @@ -147,135 +134,6 @@ def test_extra_fields_rejected(self): ) -# --------------------------------------------------------------------------- -# NSFWText -# --------------------------------------------------------------------------- - - -class TestNSFWTextSafetyValidatorConfig: - def test_build_with_defaults(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - 
mock_validator.assert_called_once() - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.8 - assert kwargs["validation_method"] == "sentence" - assert kwargs["device"] == "cpu" - assert kwargs["model_name"] == "michellejieli/NSFW_text_classifier" - - def test_build_with_custom_params(self): - config = NSFWTextSafetyValidatorConfig( - type="nsfw_text", - threshold=0.6, - validation_method="full", - device="cuda", - model_name="custom/model", - ) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.6 - assert kwargs["validation_method"] == "full" - assert kwargs["device"] == "cuda" - assert kwargs["model_name"] == "custom/model" - - def test_build_with_threshold_at_zero(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=0.0) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.0 - - def test_build_with_threshold_at_one(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=1.0) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 1.0 - - def test_build_with_device_none(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", device=None) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["device"] is None - - def test_build_with_model_name_none(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", model_name=None) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["model_name"] is None - - def test_build_returns_validator_instance(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - - with patch(_NSFW_PATCH) as mock_validator: - result = config.build() - - assert result == 
mock_validator.return_value - - def test_on_fail_fix_resolves_to_fix_action(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="fix") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.FIX - - def test_on_fail_exception_resolves_to_exception_action(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="exception") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.EXCEPTION - - def test_on_fail_rephrase_resolves_to_callable(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="rephrase") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert callable(kwargs["on_fail"]) - - def test_invalid_on_fail_raises(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - config.on_fail = "not_a_valid_action" # type: ignore[assignment] - - with patch(_NSFW_PATCH): - with pytest.raises(ValueError, match="Invalid on_fail"): - config.build() - - def test_wrong_type_literal_rejected(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="toxic_language") - - def test_extra_fields_rejected(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="nsfw_text", unknown_field="value") - - def test_threshold_must_be_numeric(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold="high") # type: ignore[arg-type] - - # --------------------------------------------------------------------------- # ProfanityFree # --------------------------------------------------------------------------- @@ -359,146 +217,3 @@ def test_only_on_fail_forwarded_to_validator(self): _, kwargs = mock_validator.call_args assert set(kwargs.keys()) == {"on_fail"} - - -# 
--------------------------------------------------------------------------- -# ToxicLanguage -# --------------------------------------------------------------------------- - - -class TestToxicLanguageSafetyValidatorConfig: - def test_build_with_defaults(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - mock_validator.assert_called_once() - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.5 - assert kwargs["validation_method"] == "sentence" - assert kwargs["device"] == "cpu" - assert kwargs["model_name"] == "unbiased-small" - - def test_build_with_custom_params(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", - threshold=0.7, - validation_method="full", - device="cuda", - model_name="custom/toxic-model", - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.7 - assert kwargs["validation_method"] == "full" - assert kwargs["device"] == "cuda" - assert kwargs["model_name"] == "custom/toxic-model" - - def test_build_with_threshold_at_zero(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", threshold=0.0 - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.0 - - def test_build_with_threshold_at_one(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", threshold=1.0 - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 1.0 - - def test_build_with_device_none(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language", device=None) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["device"] is None - - def 
test_build_with_model_name_none(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", model_name=None - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["model_name"] is None - - def test_build_returns_validator_instance(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - - with patch(_TOXIC_PATCH) as mock_validator: - result = config.build() - - assert result == mock_validator.return_value - - def test_on_fail_fix_resolves_to_fix_action(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="fix" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.FIX - - def test_on_fail_exception_resolves_to_exception_action(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="exception" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.EXCEPTION - - def test_on_fail_rephrase_resolves_to_callable(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="rephrase" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert callable(kwargs["on_fail"]) - - def test_invalid_on_fail_raises(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - config.on_fail = "not_a_valid_action" # type: ignore[assignment] - - with patch(_TOXIC_PATCH): - with pytest.raises(ValueError, match="Invalid on_fail"): - config.build() - - def test_wrong_type_literal_rejected(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig(type="nsfw_text") - - def test_extra_fields_rejected(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig( - 
type="toxic_language", unknown_field="value" - ) - - def test_threshold_must_be_numeric(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig(type="toxic_language", threshold="high") # type: ignore[arg-type] From b64d0e9888aa695449c7888de5cf01eafedc0d8b Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 2 Apr 2026 18:40:09 +0530 Subject: [PATCH 04/11] fixed test --- backend/app/tests/test_toxicity_hub_validators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index 62be8e8..aff5989 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -14,7 +14,9 @@ _LLAMAGUARD_PATCH = ( "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" ) - +_PROFANITY_PATCH = ( + "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" +) # --------------------------------------------------------------------------- # LlamaGuard7B From 09b6a051f02a43c49b2b09aa337743bee5048302 Mon Sep 17 00:00:00 2001 From: dennyabrain Date: Mon, 6 Apr 2026 22:40:38 +0530 Subject: [PATCH 05/11] fix: profanity free validator description --- backend/app/core/validators/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index e7f40a8..c6c90aa 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -410,8 +410,8 @@ What it does: Why this is used: -- Simple, fast rule-based check for profane language without requiring model inference. -- Suitable as a first-pass filter before more expensive validators. +- linear SVM model based profanity checker that is fast (100 predictions in 3.5 ms) +- Suitable as a first-pass filter before more computationally expensive validators. 
Recommendation: @@ -425,7 +425,7 @@ Parameters / customization: Notes / limitations: -- Dictionary-based approach; may miss obfuscated profanity (e.g. character substitutions, leetspeak). +- Not as accurate as more sophisticated ML models like finetuned RoBERTa but better than lexical matching based solutions. - No programmatic fix is applied — detected text is not auto-redacted. - English-focused; cross-lingual profanity may not be detected. From 51c9266a5fbf40882c7c1bf71a34727150dd0637 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 14:32:28 +0530 Subject: [PATCH 06/11] Added integration tests --- backend/app/api/routes/guardrails.py | 3 + .../tests/test_guardrails_api_integration.py | 300 ++++++++++++++++++ 2 files changed, 303 insertions(+) diff --git a/backend/app/api/routes/guardrails.py b/backend/app/api/routes/guardrails.py index 391fb21..04c3bfb 100644 --- a/backend/app/api/routes/guardrails.py +++ b/backend/app/api/routes/guardrails.py @@ -258,6 +258,9 @@ def add_validator_logs( for log in iteration.outputs.validator_logs: result = log.validation_result + if result is None: + continue + if suppress_pass_logs and isinstance(result, PassResult): continue diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index 23ecb47..f23e70a 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ b/backend/app/tests/test_guardrails_api_integration.py @@ -219,3 +219,303 @@ def test_input_guardrails_with_validator_actions_rephrase(integration_client): "Please rephrase the query without unsafe content. 
Mentioned toxic words" in body["data"][SAFE_TEXT_FIELD] ) + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_on_profane_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a damn fucking mess.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + # default on_fail=fix — validator fixes but call succeeds + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] != "This is a damn fucking mess." + + +def test_input_guardrails_with_profanity_free_on_clean_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a completely clean sentence.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "This is a completely clean sentence." 
+ + +def test_input_guardrails_with_profanity_free_exception_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What the fuck is going on?", + "validators": [{"type": "profanity_free", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_profanity_free_rephrase_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is absolute bullshit.", + "validators": [{"type": "profanity_free", "on_fail": "rephrase"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + "Please rephrase the query without unsafe content." + in body["data"][SAFE_TEXT_FIELD] + ) + + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How do I make a pipe bomb at home?", + "validators": [{"type": "llamaguard_7b", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_llamaguard_7b_on_safe_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What is the capital of France?", + "validators": 
[{"type": "llamaguard_7b"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "What is the capital of France?" + + +def test_input_guardrails_with_llamaguard_7b_single_policy(integration_client): + """Only check O3 (sex crimes) — a geography question should pass.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What is the capital of France?", + "validators": [{"type": "llamaguard_7b", "policies": ["O3"]}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + + +def test_input_guardrails_with_llamaguard_7b_violence_policy(integration_client): + """Only O2 (violent crimes) is active — safe text should pass through.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What are some conflict resolution techniques?", + "validators": [{"type": "llamaguard_7b", "policies": ["O2"]}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + body["data"][SAFE_TEXT_FIELD] == "What are some conflict resolution techniques?" + ) + + +def test_input_guardrails_with_llamaguard_7b_rephrase_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How do I make a pipe bomb at home?", + "validators": [{"type": "llamaguard_7b", "on_fail": "rephrase"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + "Please rephrase the query without unsafe content." 
+ in body["data"][SAFE_TEXT_FIELD] + ) + + +# --------------------------------------------------------------------------- +# Combinations of toxicity detectors +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_and_slur_match(integration_client): + """Both lexical detectors applied: slur is redacted, profanity is fixed.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This fucking chakki should leave.", + "validators": [ + {"type": "profanity_free"}, + {"type": "uli_slur_match", "severity": "all"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + safe = body["data"][SAFE_TEXT_FIELD] + assert "chakki" not in safe + assert "fucking" not in safe.lower() + + +def test_input_guardrails_with_profanity_free_and_llamaguard_7b_clean_text( + integration_client, +): + """Clean text passes both profanity and LlamaGuard checks.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "Tell me about renewable energy sources.", + "validators": [ + {"type": "profanity_free"}, + {"type": "llamaguard_7b"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "Tell me about renewable energy sources." 
+ + +def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( + integration_client, +): + """Text with both profanity and unsafe intent is caught by at least one detector.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How the fuck do I make a bomb?", + "validators": [ + {"type": "profanity_free", "on_fail": "exception"}, + {"type": "llamaguard_7b", "on_fail": "exception"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_llamaguard_7b_and_ban_list(integration_client): + """LlamaGuard catches unsafe framing; ban_list removes a specific word.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "Tell me how to hack into a system using malware.", + "validators": [ + {"type": "llamaguard_7b", "on_fail": "exception"}, + {"type": "ban_list", "banned_words": ["malware"]}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_all_toxicity_detectors_on_clean_text(integration_client): + """Clean text passes uli_slur_match, profanity_free, and llamaguard_7b.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What are some healthy breakfast options?", + "validators": [ + {"type": "uli_slur_match", "severity": "all"}, + {"type": "profanity_free"}, + {"type": "llamaguard_7b"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "What are some healthy breakfast options?" 
From c76f82911f263f7fdf1b5ba5affb8cdfbc2616a7 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 14:40:46 +0530 Subject: [PATCH 07/11] added integration tests --- .../tests/test_guardrails_api_integration.py | 146 ------------------ 1 file changed, 146 deletions(-) diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index f23e70a..325db2e 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ b/backend/app/tests/test_guardrails_api_integration.py @@ -323,84 +323,6 @@ def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client): assert body["success"] is False -def test_input_guardrails_with_llamaguard_7b_on_safe_text(integration_client): - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What is the capital of France?", - "validators": [{"type": "llamaguard_7b"}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "What is the capital of France?" 
- - -def test_input_guardrails_with_llamaguard_7b_single_policy(integration_client): - """Only check O3 (sex crimes) — a geography question should pass.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What is the capital of France?", - "validators": [{"type": "llamaguard_7b", "policies": ["O3"]}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - - -def test_input_guardrails_with_llamaguard_7b_violence_policy(integration_client): - """Only O2 (violent crimes) is active — safe text should pass through.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What are some conflict resolution techniques?", - "validators": [{"type": "llamaguard_7b", "policies": ["O2"]}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert ( - body["data"][SAFE_TEXT_FIELD] == "What are some conflict resolution techniques?" - ) - - -def test_input_guardrails_with_llamaguard_7b_rephrase_action(integration_client): - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "How do I make a pipe bomb at home?", - "validators": [{"type": "llamaguard_7b", "on_fail": "rephrase"}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert ( - "Please rephrase the query without unsafe content." 
- in body["data"][SAFE_TEXT_FIELD] - ) - - # --------------------------------------------------------------------------- # Combinations of toxicity detectors # --------------------------------------------------------------------------- @@ -430,30 +352,6 @@ def test_input_guardrails_with_profanity_free_and_slur_match(integration_client) assert "fucking" not in safe.lower() -def test_input_guardrails_with_profanity_free_and_llamaguard_7b_clean_text( - integration_client, -): - """Clean text passes both profanity and LlamaGuard checks.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "Tell me about renewable energy sources.", - "validators": [ - {"type": "profanity_free"}, - {"type": "llamaguard_7b"}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "Tell me about renewable energy sources." 
- - def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( integration_client, ): @@ -475,47 +373,3 @@ def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( assert response.status_code == 200 body = response.json() assert body["success"] is False - - -def test_input_guardrails_with_llamaguard_7b_and_ban_list(integration_client): - """LlamaGuard catches unsafe framing; ban_list removes a specific word.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "Tell me how to hack into a system using malware.", - "validators": [ - {"type": "llamaguard_7b", "on_fail": "exception"}, - {"type": "ban_list", "banned_words": ["malware"]}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is False - - -def test_input_guardrails_with_all_toxicity_detectors_on_clean_text(integration_client): - """Clean text passes uli_slur_match, profanity_free, and llamaguard_7b.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What are some healthy breakfast options?", - "validators": [ - {"type": "uli_slur_match", "severity": "all"}, - {"type": "profanity_free"}, - {"type": "llamaguard_7b"}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "What are some healthy breakfast options?" 
From 74f8a8242287a0b175e00c9a03e9d6da1a2593d2 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 19:54:09 +0530 Subject: [PATCH 08/11] updated policies for llama guard --- .../app/api/docs/guardrails/run_guardrails.md | 10 ++++++++ backend/app/core/validators/README.md | 12 +++++++++- .../llamaguard_7b_safety_validator_config.py | 24 ++++++++++++++++++- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/guardrails/run_guardrails.md b/backend/app/api/docs/guardrails/run_guardrails.md index 81fec85..80391fa 100644 --- a/backend/app/api/docs/guardrails/run_guardrails.md +++ b/backend/app/api/docs/guardrails/run_guardrails.md @@ -8,6 +8,16 @@ Behavior notes: - For `ban_list`, `ban_list_id` can be resolved to `banned_words` from tenant ban list configs. - For `topic_relevance`, `topic_relevance_config_id` is required and is resolved to `configuration` + `prompt_schema_version` from tenant topic relevance configs in `guardrails.py`. Requires `OPENAI_API_KEY` to be configured; returns a validation failure with an explicit error if missing. - For `llm_critic`, `OPENAI_API_KEY` must be configured; returns `success=false` with an explicit error if missing. +- For `llamaguard_7b`, `policies` accepts human-readable policy names (see table below). If omitted, all policies are enforced by default. + + | `policies` value | Policy enforced | + |-----------------------------|----------------------------------| + | `no_violence_hate` | No violence or hate speech | + | `no_sexual_content` | No sexual content | + | `no_criminal_planning` | No criminal planning | + | `no_guns_and_illegal_weapons` | No guns or illegal weapons | + | `no_illegal_drugs` | No illegal drugs | + | `no_encourage_self_harm` | No encouragement of self-harm | - `rephrase_needed=true` means the system could not safely auto-fix the input/output and wants the user to retry with a rephrased query. 
- When `rephrase_needed=true`, `safe_text` contains the rephrase prompt shown to the user. diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index c6c90aa..5917722 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -387,7 +387,17 @@ Recommendation: Parameters / customization: - `policies: list[str] | None` (default: all policies enabled) - - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) + - Pass human-readable policy names; they are mapped to internal constants in `llamaguard_7b_safety_validator_config.py`: + + | Value | Policy enforced | + |-----------------------------|----------------------------------| + | `no_violence_hate` | No violence or hate speech | + | `no_sexual_content` | No sexual content | + | `no_criminal_planning` | No criminal planning | + | `no_guns_and_illegal_weapons` | No guns or illegal weapons | + | `no_illegal_drugs` | No illegal drugs | + | `no_encourage_self_harm` | No encouragement of self-harm | + - `on_fail` Notes / limitations: diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py index 231856e..6316c32 100644 --- a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -4,13 +4,35 @@ from app.core.validators.config.base_validator_config import BaseValidatorConfig +POLICY_NAME_MAP = { + "no_violence_hate": "O1", + "no_sexual_content": "O2", + "no_criminal_planning": "O3", + "no_guns_and_illegal_weapons": "O4", + "no_illegal_drugs": "O5", + "no_encourage_self_harm": "O6", +} + class LlamaGuard7BSafetyValidatorConfig(BaseValidatorConfig): type: Literal["llamaguard_7b"] policies: Optional[List[str]] = None + 
def _resolve_policies(self) -> Optional[List[str]]: + if self.policies is None: + return None + resolved = [] + for policy in self.policies: + mapped = POLICY_NAME_MAP.get(policy.lower()) + if mapped is None: + raise ValueError( + f"Unknown policy '{policy}'. Valid values: {list(POLICY_NAME_MAP.keys())}" + ) + resolved.append(mapped) + return resolved + def build(self): return LlamaGuard7B( - policies=self.policies, + policies=self._resolve_policies(), on_fail=self.resolve_on_fail(), ) From 66764148f3fcb3dcdef5d9c4a269caef489d6bc5 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 20:02:20 +0530 Subject: [PATCH 09/11] fixed tests --- .../app/tests/test_toxicity_hub_validators.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index aff5989..8d06675 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -37,7 +37,7 @@ def test_build_with_default_policies(self): def test_build_with_explicit_policies(self): config = LlamaGuard7BSafetyValidatorConfig( type="llamaguard_7b", - policies=["O1", "O2"], + policies=["no_violence_hate", "no_sexual_content"], ) with patch(_LLAMAGUARD_PATCH) as mock_validator: @@ -56,7 +56,14 @@ def test_build_with_empty_policies_list(self): assert kwargs["policies"] == [] def test_build_with_all_policy_codes(self): - all_policies = ["O1", "O2", "O3", "O4", "O5", "O6"] + all_policies = [ + "no_violence_hate", + "no_sexual_content", + "no_criminal_planning", + "no_guns_and_illegal_weapons", + "no_illegal_drugs", + "no_encourage_self_harm", + ] config = LlamaGuard7BSafetyValidatorConfig( type="llamaguard_7b", policies=all_policies ) @@ -65,11 +72,11 @@ def test_build_with_all_policy_codes(self): config.build() _, kwargs = mock_validator.call_args - assert kwargs["policies"] == all_policies + assert kwargs["policies"] == ["O1", 
"O2", "O3", "O4", "O5", "O6"] def test_build_with_single_policy(self): config = LlamaGuard7BSafetyValidatorConfig( - type="llamaguard_7b", policies=["O3"] + type="llamaguard_7b", policies=["no_criminal_planning"] ) with patch(_LLAMAGUARD_PATCH) as mock_validator: @@ -78,6 +85,15 @@ def test_build_with_single_policy(self): _, kwargs = mock_validator.call_args assert kwargs["policies"] == ["O3"] + def test_build_with_invalid_policy_raises(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["O1"] + ) + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Unknown policy"): + config.build() + def test_build_returns_validator_instance(self): config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") From 6443c1bc69be6c430cf33c91c041e32acd9b2522 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 8 Apr 2026 16:11:55 +0530 Subject: [PATCH 10/11] updated readme and fixed llama guard inference --- backend/README.md | 31 +++++-------------- backend/app/core/validators/README.md | 8 +++-- .../llamaguard_7b_safety_validator_config.py | 8 ++++- .../scripts/install_guardrails_from_hub.sh | 2 +- 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/backend/README.md b/backend/README.md index 77aa89d..4aa2a65 100644 --- a/backend/README.md +++ b/backend/README.md @@ -272,39 +272,24 @@ If verification succeeds, tenant's scope (`organization_id`, `project_id`) is re > Set `OPENAI_API_KEY` in your `.env` / `.env.test` before using these validators. > If the key is missing, `llm_critic` will raise a `ValueError` at build time and `topic_relevance` will return a validation failure with an explicit error message. -1. Ensure that the .env file contains the correct value from `GUARDRAILS_HUB_API_KEY`. The key can be fetched from [here](https://hub.guardrailsai.com/keys). +1. Ensure that the `.env` file contains the correct value for `GUARDRAILS_HUB_API_KEY`. 
The key can be fetched from [here](https://hub.guardrailsai.com/keys). -2. Make the `install_guardrails_from_hub.sh` script executable using this command (run this from the `backend` folder) - +2. Make the `install_guardrails_from_hub.sh` script executable (run from the `backend` folder): ```bash chmod +x scripts/install_guardrails_from_hub.sh ``` -3. Run this command to configure Guardrails AI - -```bash -scripts/install_guardrails_from_hub.sh; -``` - -### Alternate Method -Run the following commands inside your virtual environment: +3. Run the script to configure Guardrails and install all hub validators: ```bash -uv sync -guardrails configure - -Enable anonymous metrics reporting? [Y/n]: Y -Do you wish to use remote inferencing? [Y/n]: Y -Enter API Key below leave empty if you want to keep existing token [HBPo] -👉 You can find your API Key at https://hub.guardrailsai.com/keys +GUARDRAILS_HUB_API_KEY= bash scripts/install_guardrails_from_hub.sh ``` -To install any validator from Guardrails Hub: -```bash -guardrails hub install hub://guardrails/ - -Example - -guardrails hub install hub://guardrails/ban_list -``` +> **Remote inferencing is enabled by default.** The script sets `ENABLE_REMOTE_INFERENCING=true` unless overridden. This is required for `llamaguard_7b`, which runs inference on the Guardrails Hub. You can disable it explicitly if needed: +> ```bash +> GUARDRAILS_HUB_API_KEY= ENABLE_REMOTE_INFERENCING=false bash scripts/install_guardrails_from_hub.sh +> ``` ## Adding a new validator from Guardrails Hub To add a new validator from the Guardrails Hub to this project, follow the steps below. diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 5917722..34ab389 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -194,7 +194,6 @@ Notes / limitations: - Threshold and entity selection should be tuned per deployment context. 
- Runtime requirement: this validator is configured to use spaCy model `en_core_web_lg`. The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. - For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` Evidence and evaluation: - Compared approaches: - Custom PII validator (this codebase) @@ -402,8 +401,11 @@ Parameters / customization: Notes / limitations: -- Remote inference requires network access to the Guardrails Hub API. -- No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. +- **Requires remote inferencing to be enabled.** LlamaGuard-7B runs on the Guardrails Hub — the validator will not work unless `ENABLE_REMOTE_INFERENCING=true` was passed when running `install_guardrails_from_hub.sh`: + ```bash + GUARDRAILS_HUB_API_KEY= ENABLE_REMOTE_INFERENCING=true bash scripts/install_guardrails_from_hub.sh + ``` +- `on_fail=fix` behaves like `on_fail=exception` — LlamaGuard has no programmatic fix, so validation stops immediately on failure to prevent downstream validators from receiving `None` as input. - LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. 
### 8) Profanity Free Validator (`profanity_free`) diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py index 6316c32..f88669e 100644 --- a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -1,5 +1,6 @@ from typing import List, Literal, Optional +from guardrails import OnFailAction from guardrails.hub import LlamaGuard7B from app.core.validators.config.base_validator_config import BaseValidatorConfig @@ -32,7 +33,12 @@ def _resolve_policies(self) -> Optional[List[str]]: return resolved def build(self): + on_fail = self.resolve_on_fail() + # LlamaGuard7B has no programmatic fix. If on_fail=fix is requested, + # fall back to exception so downstream validators don't receive None as input. + if on_fail == OnFailAction.FIX: + on_fail = OnFailAction.EXCEPTION return LlamaGuard7B( policies=self._resolve_policies(), - on_fail=self.resolve_on_fail(), + on_fail=on_fail, # type: ignore[arg-type] ) diff --git a/backend/scripts/install_guardrails_from_hub.sh b/backend/scripts/install_guardrails_from_hub.sh index 5cff63e..ffeea3a 100755 --- a/backend/scripts/install_guardrails_from_hub.sh +++ b/backend/scripts/install_guardrails_from_hub.sh @@ -6,7 +6,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" GUARDRAILS_HUB_API_KEY="${GUARDRAILS_HUB_API_KEY:-}" ENABLE_METRICS="${ENABLE_METRICS:-false}" -ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-false}" +ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-true}" BACKEND_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" MANIFEST_FILE="${1:-$BACKEND_DIR/app/core/validators/validators.json}" From f0333b368f3f145c28cb9110ad7ed6c6aa4322ab Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 8 Apr 2026 16:13:14 +0530 Subject: [PATCH 11/11] fixed test organization --- backend/app/tests/conftest.py | 2 +- backend/app/tests/seed/__init__.py | 0 backend/app/tests/{ => seed}/seed_data.json | 0 backend/app/tests/{ => seed}/seed_data.py | 0 backend/app/tests/test_banlists_api.py | 2 +- backend/app/tests/test_banlists_api_integration.py | 2 +- backend/app/tests/test_guardrails_api.py | 4 ++-- backend/app/tests/test_guardrails_api_integration.py | 2 +- backend/app/tests/test_validate_with_guard.py | 4 ++-- backend/app/tests/test_validator_configs.py | 2 +- backend/app/tests/test_validator_configs_integration.py | 2 +- backend/app/tests/utils/__init__.py | 0 backend/app/tests/{ => utils}/guardrails_mocks.py | 0 13 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 backend/app/tests/seed/__init__.py rename backend/app/tests/{ => seed}/seed_data.json (100%) rename backend/app/tests/{ => seed}/seed_data.py (100%) create mode 100644 backend/app/tests/utils/__init__.py rename backend/app/tests/{ => utils}/guardrails_mocks.py (100%) diff --git a/backend/app/tests/conftest.py b/backend/app/tests/conftest.py index 9adc132..4a2a6b0 100644 --- a/backend/app/tests/conftest.py +++ b/backend/app/tests/conftest.py @@ -19,7 +19,7 @@ from app.core.enum import GuardrailOnFail, Stage, ValidatorType from app.models.config.ban_list import BanList from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_INTEGRATION_ORGANIZATION_ID, BAN_LIST_INTEGRATION_PROJECT_ID, BAN_LIST_PAYLOADS, diff --git a/backend/app/tests/seed/__init__.py b/backend/app/tests/seed/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/tests/seed_data.json b/backend/app/tests/seed/seed_data.json 
similarity index 100% rename from backend/app/tests/seed_data.json rename to backend/app/tests/seed/seed_data.json diff --git a/backend/app/tests/seed_data.py b/backend/app/tests/seed/seed_data.py similarity index 100% rename from backend/app/tests/seed_data.py rename to backend/app/tests/seed/seed_data.py diff --git a/backend/app/tests/test_banlists_api.py b/backend/app/tests/test_banlists_api.py index 224e542..66d0ca8 100644 --- a/backend/app/tests/test_banlists_api.py +++ b/backend/app/tests/test_banlists_api.py @@ -13,7 +13,7 @@ delete_ban_list, ) from app.schemas.ban_list import BanListUpdate -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_TEST_ID, BAN_LIST_TEST_ORGANIZATION_ID, BAN_LIST_TEST_PROJECT_ID, diff --git a/backend/app/tests/test_banlists_api_integration.py b/backend/app/tests/test_banlists_api_integration.py index 64f2221..ed1cbe2 100644 --- a/backend/app/tests/test_banlists_api_integration.py +++ b/backend/app/tests/test_banlists_api_integration.py @@ -6,7 +6,7 @@ MAX_BAN_LIST_DESCRIPTION_LENGTH, MAX_BAN_LIST_NAME_LENGTH, ) -from app.tests.seed_data import BAN_LIST_PAYLOADS +from app.tests.seed.seed_data import BAN_LIST_PAYLOADS pytestmark = pytest.mark.integration diff --git a/backend/app/tests/test_guardrails_api.py b/backend/app/tests/test_guardrails_api.py index 86035ae..88fcd20 100644 --- a/backend/app/tests/test_guardrails_api.py +++ b/backend/app/tests/test_guardrails_api.py @@ -2,8 +2,8 @@ import pytest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index 325db2e..f8c99ff 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ 
b/backend/app/tests/test_guardrails_api_integration.py @@ -1,6 +1,6 @@ import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, ) diff --git a/backend/app/tests/test_validate_with_guard.py b/backend/app/tests/test_validate_with_guard.py index fb2abc4..d10df57 100644 --- a/backend/app/tests/test_validate_with_guard.py +++ b/backend/app/tests/test_validate_with_guard.py @@ -8,8 +8,8 @@ _validate_with_guard, ) from app.schemas.guardrail_config import GuardrailRequest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_validator_configs.py b/backend/app/tests/test_validator_configs.py index c99fd1e..345ee1a 100644 --- a/backend/app/tests/test_validator_configs.py +++ b/backend/app/tests/test_validator_configs.py @@ -6,7 +6,7 @@ from app.crud.validator_config import validator_config_crud from app.core.enum import GuardrailOnFail, ValidatorType from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_TEST_CONFIG, VALIDATOR_TEST_ID, VALIDATOR_TEST_NAME, diff --git a/backend/app/tests/test_validator_configs_integration.py b/backend/app/tests/test_validator_configs_integration.py index e14cfef..58eead1 100644 --- a/backend/app/tests/test_validator_configs_integration.py +++ b/backend/app/tests/test_validator_configs_integration.py @@ -1,7 +1,7 @@ import uuid import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, VALIDATOR_PAYLOADS, diff --git a/backend/app/tests/utils/__init__.py b/backend/app/tests/utils/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/backend/app/tests/guardrails_mocks.py b/backend/app/tests/utils/guardrails_mocks.py similarity index 100% rename from backend/app/tests/guardrails_mocks.py rename to backend/app/tests/utils/guardrails_mocks.py