From 650369ca29be9972e8ef76490ce94cd43dc06936 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 1 Apr 2026 09:44:51 +0530 Subject: [PATCH 01/11] added toxicity detection validators --- backend/app/api/API_USAGE.md | 6 +- backend/app/core/validators/README.md | 131 ++++- .../llamaguard_7b_safety_validator_config.py | 16 + .../nsfw_text_safety_validator_config.py | 22 + .../profanity_free_safety_validator_config.py | 14 + .../toxic_language_safety_validator_config.py | 22 + backend/app/core/validators/validators.json | 20 + backend/app/schemas/guardrail_config.py | 16 + .../app/tests/test_toxicity_hub_validators.py | 504 ++++++++++++++++++ 9 files changed, 748 insertions(+), 3 deletions(-) create mode 100644 backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py create mode 100644 backend/app/core/validators/config/nsfw_text_safety_validator_config.py create mode 100644 backend/app/core/validators/config/profanity_free_safety_validator_config.py create mode 100644 backend/app/core/validators/config/toxic_language_safety_validator_config.py create mode 100644 backend/app/tests/test_toxicity_hub_validators.py diff --git a/backend/app/api/API_USAGE.md b/backend/app/api/API_USAGE.md index e4e565a..1ce2ce7 100644 --- a/backend/app/api/API_USAGE.md +++ b/backend/app/api/API_USAGE.md @@ -100,7 +100,7 @@ Endpoint: Optional filters: - `ids=&ids=` - `stage=input|output` -- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance` +- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|nsfw_text|profanity_free|toxic_language` Example: @@ -442,6 +442,10 @@ From `validators.json`: - `ban_list` - `llm_critic` - `topic_relevance` +- `llamaguard_7b` +- `nsfw_text` +- `profanity_free` +- `toxic_language` Source of truth: - `backend/app/core/validators/validators.json` diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 
f0a2f6d..3ee841c 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -11,6 +11,10 @@ Current validator manifest: - `ban_list` (source: `hub://guardrails/ban_list`) - `llm_critic` (source: `hub://guardrails/llm_critic`) - https://guardrailsai.com/hub/validator/guardrails/llm_critic - `topic_relevance` (source: `local`) +- `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`) +- `nsfw_text` (source: `hub://guardrails/nsfw_text`) +- `profanity_free` (source: `hub://guardrails/profanity_free`) +- `toxic_language` (source: `hub://guardrails/toxic_language`) ## Configuration Model @@ -310,6 +314,125 @@ Notes / limitations: - Configuration is resolved in `backend/app/api/routes/guardrails.py` from tenant Topic Relevance Config APIs. - Prompt templates must include the `{{TOPIC_CONFIGURATION}}` placeholder. +### 7) LlamaGuard 7B Validator (`llamaguard_7b`) + +Code: +- Config: `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/llamaguard_7b`) + +What it does: +- Classifies text as "safe" or "unsafe" using the LlamaGuard-7B model via remote inference on the Guardrails Hub. +- Checks against a configurable set of safety policies covering violence/hate, sexual content, criminal planning, weapons, illegal drugs, and self-harm encouragement. + +Why this is used: +- Provides a model-level safety classifier as a complement to rule-based validators. +- Allows policy-targeted filtering (e.g. only flag content violating specific categories). + +Recommendation: +- `input` and `output` + - Why `input`: catches unsafe user prompts before model processing. + - Why `output`: validates generated content against the same safety policies. 
+ +Parameters / customization: +- `policies: list[str] | None` (default: all policies enabled) + - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) +- `on_fail` + +Notes / limitations: +- Remote inference requires network access to the Guardrails Hub API. +- No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. +- LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. + +### 8) NSFW Text Validator (`nsfw_text`) + +Code: +- Config: `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/nsfw_text`) + +What it does: +- Detects not-safe-for-work (NSFW) text using a classifier model. +- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. + +Why this is used: +- Provides a dedicated NSFW text filter for deployments where explicit/adult content must be blocked. +- Complements LlamaGuard-based filtering with a lightweight, CPU-friendly classifier. + +Recommendation: +- `input` and `output` + - Why `input`: blocks NSFW user messages before model invocation. + - Why `output`: prevents explicit content from being surfaced to end users. + +Parameters / customization: +- `threshold: float` (default: `0.8`) — minimum classifier score to flag text as NSFW. Higher = more conservative (fewer false positives). +- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. +- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). +- `model_name: str | None` (default: `"michellejieli/NSFW_text_classifier"`) +- `on_fail` + +Notes / limitations: +- Model runs locally; first use downloads model weights. Ensure network access during setup. 
+- `validation_method="sentence"` may miss NSFW content spread across multiple sentences. +- Threshold tuning is important: lower values increase recall at the cost of false positives. + +### 9) Profanity Free Validator (`profanity_free`) + +Code: +- Config: `backend/app/core/validators/config/profanity_free_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/profanity_free`) + +What it does: +- Detects profanity in text using the `alt-profanity-check` library. +- Fails validation if any profanity is detected. + +Why this is used: +- Fast, lightweight check backed by `alt-profanity-check`'s pre-trained linear classifier; no heavyweight model inference required. +- Suitable as a first-pass filter before more expensive validators. + +Recommendation: +- `input` and `output` + - Why `input`: catches profane user messages early. + - Why `output`: prevents model-generated profanity from reaching users. + +Parameters / customization: +- `on_fail` + +Notes / limitations: +- Classifier-based approach (linear model trained on labeled text, not a word list); may still miss obfuscated profanity (e.g. character substitutions, leetspeak). +- No programmatic fix is applied — detected text is not auto-redacted. +- English-focused; cross-lingual profanity may not be detected. + +### 10) Toxic Language Validator (`toxic_language`) + +Code: +- Config: `backend/app/core/validators/config/toxic_language_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/toxic_language`) + +What it does: +- Detects toxic language using a classifier model. +- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. + +Why this is used: +- Provides broader toxicity detection beyond explicit slurs, covering hostile, threatening, or degrading language. +- Works as a complement to the lexical slur validator (`uli_slur_match`) for semantic toxicity. + +Recommendation: +- `input` and `output` + - Why `input`: catches toxic user messages before they influence model behavior.
+ - Why `output`: prevents model-generated toxic content from reaching end users. + +Parameters / customization: +- `threshold: float` (default: `0.5`) — minimum classifier score to flag text as toxic. Lower = more sensitive (higher recall, more false positives). +- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. +- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). +- `model_name: str | None` (default: `"unbiased-small"`) +- `on_fail` + +Notes / limitations: +- Model runs locally; first use downloads model weights. Ensure network access during setup. +- The `unbiased-small` model is designed to reduce bias against identity groups compared to standard toxicity classifiers. +- `validation_method="sentence"` is recommended for conversational text; use `"full"` for short single-sentence inputs. +- Consider using alongside `uli_slur_match` for layered toxicity coverage. + ## Example Config Payloads Example: create validator config (stored shape) @@ -339,8 +462,8 @@ Example: runtime guardrail validator object (execution shape) ## Operational Guidance Default stage strategy: -- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed) -- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list` +- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` +- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` Tuning strategy: - Start with conservative defaults and log validator outcomes. 
@@ -356,5 +479,9 @@ Tuning strategy: - `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` +- `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` +- `backend/app/core/validators/config/profanity_free_safety_validator_config.py` +- `backend/app/core/validators/config/toxic_language_safety_validator_config.py` - `backend/app/schemas/guardrail_config.py` - `backend/app/schemas/validator_config.py` diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py new file mode 100644 index 0000000..231856e --- /dev/null +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -0,0 +1,16 @@ +from typing import List, Literal, Optional + +from guardrails.hub import LlamaGuard7B + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class LlamaGuard7BSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["llamaguard_7b"] + policies: Optional[List[str]] = None + + def build(self): + return LlamaGuard7B( + policies=self.policies, + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py new file mode 100644 index 0000000..9fd81e7 --- /dev/null +++ b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py @@ -0,0 +1,22 @@ +from typing import Literal, Optional + +from guardrails.hub import NSFWText + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class NSFWTextSafetyValidatorConfig(BaseValidatorConfig): + type: 
Literal["nsfw_text"] + threshold: float = 0.8 + validation_method: str = "sentence" + device: Optional[str] = "cpu" + model_name: Optional[str] = "michellejieli/NSFW_text_classifier" + + def build(self): + return NSFWText( + threshold=self.threshold, + validation_method=self.validation_method, + device=self.device, + model_name=self.model_name, + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/profanity_free_safety_validator_config.py b/backend/app/core/validators/config/profanity_free_safety_validator_config.py new file mode 100644 index 0000000..dd6d774 --- /dev/null +++ b/backend/app/core/validators/config/profanity_free_safety_validator_config.py @@ -0,0 +1,14 @@ +from typing import Literal + +from guardrails.hub import ProfanityFree + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class ProfanityFreeSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["profanity_free"] + + def build(self): + return ProfanityFree( + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/config/toxic_language_safety_validator_config.py b/backend/app/core/validators/config/toxic_language_safety_validator_config.py new file mode 100644 index 0000000..4420c4a --- /dev/null +++ b/backend/app/core/validators/config/toxic_language_safety_validator_config.py @@ -0,0 +1,22 @@ +from typing import Literal, Optional + +from guardrails.hub import ToxicLanguage + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class ToxicLanguageSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["toxic_language"] + threshold: float = 0.5 + validation_method: str = "sentence" + device: Optional[str] = "cpu" + model_name: Optional[str] = "unbiased-small" + + def build(self): + return ToxicLanguage( + threshold=self.threshold, + validation_method=self.validation_method, + device=self.device, + model_name=self.model_name, + on_fail=self.resolve_on_fail(), + ) 
diff --git a/backend/app/core/validators/validators.json b/backend/app/core/validators/validators.json index 062f183..1aac02f 100644 --- a/backend/app/core/validators/validators.json +++ b/backend/app/core/validators/validators.json @@ -29,6 +29,26 @@ "type": "topic_relevance", "version": "0.1.0", "source": "local" + }, + { + "type": "llamaguard_7b", + "version": "0.1.0", + "source": "hub://guardrails/llamaguard_7b" + }, + { + "type": "nsfw_text", + "version": "0.1.0", + "source": "hub://guardrails/nsfw_text" + }, + { + "type": "profanity_free", + "version": "0.1.0", + "source": "hub://guardrails/profanity_free" + }, + { + "type": "toxic_language", + "version": "0.1.0", + "source": "hub://guardrails/toxic_language" } ] } \ No newline at end of file diff --git a/backend/app/schemas/guardrail_config.py b/backend/app/schemas/guardrail_config.py index 4cd9dbf..d76ba00 100644 --- a/backend/app/schemas/guardrail_config.py +++ b/backend/app/schemas/guardrail_config.py @@ -24,6 +24,18 @@ from app.core.validators.config.topic_relevance_safety_validator_config import ( TopicRelevanceSafetyValidatorConfig, ) +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.nsfw_text_safety_validator_config import ( + NSFWTextSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, +) +from app.core.validators.config.toxic_language_safety_validator_config import ( + ToxicLanguageSafetyValidatorConfig, +) ValidatorConfigItem = Annotated[ Union[ @@ -31,8 +43,12 @@ GenderAssumptionBiasSafetyValidatorConfig, LexicalSlurSafetyValidatorConfig, LLMCriticSafetyValidatorConfig, + NSFWTextSafetyValidatorConfig, PIIRemoverSafetyValidatorConfig, + LlamaGuard7BSafetyValidatorConfig, + ProfanityFreeSafetyValidatorConfig, TopicRelevanceSafetyValidatorConfig, + ToxicLanguageSafetyValidatorConfig, ], 
Field(discriminator="type"), ] diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py new file mode 100644 index 0000000..7ee82f9 --- /dev/null +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -0,0 +1,504 @@ +from unittest.mock import patch + +import pytest +from guardrails import OnFailAction +from pydantic import ValidationError + +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.nsfw_text_safety_validator_config import ( + NSFWTextSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, +) +from app.core.validators.config.toxic_language_safety_validator_config import ( + ToxicLanguageSafetyValidatorConfig, +) + +_LLAMAGUARD_PATCH = ( + "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" +) +_NSFW_PATCH = "app.core.validators.config.nsfw_text_safety_validator_config.NSFWText" +_PROFANITY_PATCH = ( + "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" +) +_TOXIC_PATCH = ( + "app.core.validators.config.toxic_language_safety_validator_config.ToxicLanguage" +) + + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +class TestLlamaGuard7BSafetyValidatorConfig: + def test_build_with_default_policies(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["policies"] is None + + def test_build_with_explicit_policies(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", + policies=["O1", "O2"], + ) + + with patch(_LLAMAGUARD_PATCH) 
as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O1", "O2"] + + def test_build_with_empty_policies_list(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", policies=[]) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == [] + + def test_build_with_all_policy_codes(self): + all_policies = ["O1", "O2", "O3", "O4", "O5", "O6"] + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=all_policies + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == all_policies + + def test_build_with_single_policy(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["O3"] + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O3"] + + def test_build_returns_validator_instance(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", on_fail="fix") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="exception" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = 
LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="rephrase" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + LlamaGuard7BSafetyValidatorConfig(type="toxic_language") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", unknown_field="value" + ) + + +# --------------------------------------------------------------------------- +# NSFWText +# --------------------------------------------------------------------------- + + +class TestNSFWTextSafetyValidatorConfig: + def test_build_with_defaults(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.8 + assert kwargs["validation_method"] == "sentence" + assert kwargs["device"] == "cpu" + assert kwargs["model_name"] == "michellejieli/NSFW_text_classifier" + + def test_build_with_custom_params(self): + config = NSFWTextSafetyValidatorConfig( + type="nsfw_text", + threshold=0.6, + validation_method="full", + device="cuda", + model_name="custom/model", + ) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.6 + assert kwargs["validation_method"] == "full" + assert kwargs["device"] == "cuda" + assert kwargs["model_name"] == "custom/model" + + def 
test_build_with_threshold_at_zero(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=0.0) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.0 + + def test_build_with_threshold_at_one(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=1.0) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 1.0 + + def test_build_with_device_none(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", device=None) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["device"] is None + + def test_build_with_model_name_none(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", model_name=None) + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["model_name"] is None + + def test_build_returns_validator_instance(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + + with patch(_NSFW_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="fix") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="exception") + + with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="rephrase") + + 
with patch(_NSFW_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = NSFWTextSafetyValidatorConfig(type="nsfw_text") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_NSFW_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="toxic_language") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="nsfw_text", unknown_field="value") + + def test_threshold_must_be_numeric(self): + with pytest.raises(ValidationError): + NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold="high") # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +class TestProfanityFreeSafetyValidatorConfig: + def test_build_default(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + + def test_build_returns_validator_instance(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = ProfanityFreeSafetyValidatorConfig( + 
type="profanity_free", on_fail="exception" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="rephrase" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_PROFANITY_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig(type="nsfw_text") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig( + type="profanity_free", unknown_field="value" + ) + + def test_only_on_fail_forwarded_to_validator(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert set(kwargs.keys()) == {"on_fail"} + + +# --------------------------------------------------------------------------- +# ToxicLanguage +# --------------------------------------------------------------------------- + + +class TestToxicLanguageSafetyValidatorConfig: + def test_build_with_defaults(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.5 + assert kwargs["validation_method"] == "sentence" + assert 
kwargs["device"] == "cpu" + assert kwargs["model_name"] == "unbiased-small" + + def test_build_with_custom_params(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", + threshold=0.7, + validation_method="full", + device="cuda", + model_name="custom/toxic-model", + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.7 + assert kwargs["validation_method"] == "full" + assert kwargs["device"] == "cuda" + assert kwargs["model_name"] == "custom/toxic-model" + + def test_build_with_threshold_at_zero(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", threshold=0.0 + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 0.0 + + def test_build_with_threshold_at_one(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", threshold=1.0 + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["threshold"] == 1.0 + + def test_build_with_device_none(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language", device=None) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["device"] is None + + def test_build_with_model_name_none(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", model_name=None + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["model_name"] is None + + def test_build_returns_validator_instance(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + + with patch(_TOXIC_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = 
ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="fix" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="exception" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = ToxicLanguageSafetyValidatorConfig( + type="toxic_language", on_fail="rephrase" + ) + + with patch(_TOXIC_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_TOXIC_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig(type="nsfw_text") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig( + type="toxic_language", unknown_field="value" + ) + + def test_threshold_must_be_numeric(self): + with pytest.raises(ValidationError): + ToxicLanguageSafetyValidatorConfig(type="toxic_language", threshold="high") # type: ignore[arg-type] From 949647d0f5e683631082ed3e27faa8dbbfea213d Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 1 Apr 2026 10:18:22 +0530 Subject: [PATCH 02/11] fixed import error --- backend/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b335986..6d1e84e 100644 --- a/backend/pyproject.toml +++ 
b/backend/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numpy>=1.24.0", "python-dotenv<2.0.0,>=1.0.0", "scikit-learn>=1.6.0,<2.0.0", + "huggingface-hub>=1.5.0,<2.0", ] [dependency-groups] From da50537e0c0f3c9a8e41b067695a17a0ca2bdce5 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 2 Apr 2026 18:29:17 +0530 Subject: [PATCH 03/11] removed redundant validators --- backend/app/api/API_USAGE.md | 4 +- backend/app/core/enum.py | 3 + backend/app/core/validators/README.md | 144 +++++---- .../nsfw_text_safety_validator_config.py | 22 -- .../toxic_language_safety_validator_config.py | 22 -- backend/app/core/validators/validators.json | 10 - backend/app/schemas/guardrail_config.py | 8 - .../app/tests/test_toxicity_hub_validators.py | 285 ------------------ 8 files changed, 74 insertions(+), 424 deletions(-) delete mode 100644 backend/app/core/validators/config/nsfw_text_safety_validator_config.py delete mode 100644 backend/app/core/validators/config/toxic_language_safety_validator_config.py diff --git a/backend/app/api/API_USAGE.md b/backend/app/api/API_USAGE.md index 1ce2ce7..38af6de 100644 --- a/backend/app/api/API_USAGE.md +++ b/backend/app/api/API_USAGE.md @@ -100,7 +100,7 @@ Endpoint: Optional filters: - `ids=&ids=` - `stage=input|output` -- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|nsfw_text|profanity_free|toxic_language` +- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|profanity_free` Example: @@ -443,9 +443,7 @@ From `validators.json`: - `llm_critic` - `topic_relevance` - `llamaguard_7b` -- `nsfw_text` - `profanity_free` -- `toxic_language` Source of truth: - `backend/app/core/validators/validators.json` diff --git a/backend/app/core/enum.py b/backend/app/core/enum.py index 43a102b..0c7c940 100644 --- a/backend/app/core/enum.py +++ b/backend/app/core/enum.py @@ -32,3 +32,6 @@ class ValidatorType(Enum): GenderAssumptionBias = 
"gender_assumption_bias" BanList = "ban_list" TopicRelevance = "topic_relevance" + LLMCritic = "llm_critic" + LlamaGuard7B = "llamaguard_7b" + ProfanityFree = "profanity_free" diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 3ee841c..e7f40a8 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -5,6 +5,7 @@ This document describes the validator configuration model used in this codebase, ## Supported Validators Current validator manifest: + - `uli_slur_match` (source: `local`) - `pii_remover` (source: `local`) - `gender_assumption_bias` (source: `local`) @@ -12,21 +13,21 @@ Current validator manifest: - `llm_critic` (source: `hub://guardrails/llm_critic`) - https://guardrailsai.com/hub/validator/guardrails/llm_critic - `topic_relevance` (source: `local`) - `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`) -- `nsfw_text` (source: `hub://guardrails/nsfw_text`) - `profanity_free` (source: `hub://guardrails/profanity_free`) -- `toxic_language` (source: `hub://guardrails/toxic_language`) ## Configuration Model All validator config classes inherit from `BaseValidatorConfig` in `backend/app/core/validators/config/base_validator_config.py`. Shared fields: + - `on_fail` (default: `fix`) - `fix`: return transformed/redacted output when validator provides a fix - `exception`: fail validation when validator fails (no safe replacement output) - `rephrase`: return a user-facing rephrase prompt plus validator error details At the Validator Config API layer (`/guardrails/validators/configs`), configs also include: + - `type` - `stage`: `input` or `output` - `on_fail_action` (mapped to runtime `on_fail`) @@ -37,9 +38,11 @@ At the Validator Config API layer (`/guardrails/validators/configs`), configs al There are two config shapes used in this project: 1. Stored validator config (Config CRUD APIs) + - includes `stage`, `on_fail_action`, scope metadata, etc. 2. 
Runtime guardrail config (POST `/guardrails/`) + - validator objects are normalized before execution - internal metadata like `stage`, ids, timestamps are removed - `on_fail_action` is converted to `on_fail` @@ -49,16 +52,17 @@ There are two config shapes used in this project: This project supports three `on_fail` behaviors at runtime: - `fix` + - Uses Guardrails built-in fix flow (`OnFailAction.FIX`). - If a validator returns `fix_value`, validation succeeds and API returns that transformed value as `safe_text`. - Typical outcome: redaction/anonymization/substitution without asking user to retry. - - `exception` + - Uses Guardrails built-in exception flow (`OnFailAction.EXCEPTION`). - Validation fails without a fallback text; API returns failure (`success=false`) with error details. - Use when policy requires hard rejection instead of auto-correction. - - `rephrase` + - Uses project custom handler `rephrase_query_on_fail`. - Returns: `"Please rephrase the query without unsafe content." + validator error message`. - API marks `rephrase_needed=true` when returned text starts with this prefix. @@ -68,6 +72,7 @@ This project supports three `on_fail` behaviors at runtime: `stage` is always required in validator configuration (`input` or `output`). The recommendation below is guidance on what to choose first, based on: + - where harm is most likely (`input`, `output`, or both), - whether auto-fixes are acceptable for user experience, - whether extra filtering at that stage creates too many false positives for the product flow. 
@@ -75,6 +80,7 @@ The recommendation below is guidance on what to choose first, based on: ## How These Recommendations Were Derived These recommendations come from working with multiple NGOs to understand their GenAI WhatsApp bot use cases, reviewing real bot conversations/data, and then running a structured evaluation flow: + - NGO use-case discovery and conversation analysis: - Reviewed real conversational patterns, safety failure modes, and policy expectations across partner NGO workflows. - Identified practical risks to prioritize (harmful language, privacy leakage, bias, and deployment-specific banned terms). @@ -99,35 +105,42 @@ These recommendations come from working with multiple NGOs to understand their G ### 1) Lexical Slur Validator (`uli_slur_match`) Code: + - Config: `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/lexical_slur.py` - Data file: `backend/app/core/validators/utils/files/curated_slurlist_hi_en.csv` What it does: + - Detects lexical slurs using list-based matching. - Normalizes text (emoji removal, encoding fix, unicode normalization, lowercase, whitespace normalization). - Redacts detected slurs with `[REDACTED_SLUR]` when `on_fail=fix`. Why this is used: + - Helps mitigate toxic/abusive language in user inputs and model outputs. - Evaluation and stress tests showed this is effective for multilingual abusive-content filtering in NGO-style conversational flows. Recommendation: + - `input` and `output` - Why `input`: catches abusive wording before it reaches prompt construction, logging, or downstream tools. - Why `output`: catches toxic generations that can still appear even with safe input. Parameters / customization: + - `languages: list[str]` (default: `['en', 'hi']`) - `severity: 'low' | 'medium' | 'high' | 'all'` (default: `'all'`) - `on_fail` Notes / limitations: + - Lexical matching can produce false positives in domain-specific contexts. 
- Severity filtering is dependent on source slur list labels. - Rules-based approach may miss semantic toxicity without explicit lexical matches. Evidence and evaluation: + - Dataset reference: `https://www.kaggle.com/c/multilingualabusivecomment/data` - Label convention used in that dataset: - `1` = abusive comment @@ -137,28 +150,34 @@ Evidence and evaluation: ### 2) PII Remover Validator (`pii_remover`) Code: + - Config: `backend/app/core/validators/config/pii_remover_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/pii_remover.py` What it does: + - Detects and anonymizes personally identifiable information using Presidio. - Returns redacted text when PII is found and `on_fail=fix`. Why this is used: + - Privacy is a primary safety requirement in NGO deployments. - Evaluation runs for this project showed clear risk of personal-data leakage/retention in conversational workflows without PII masking. Recommendation: + - `input` and `output` - Why `input`: prevents storing or processing raw user PII in logs/services. - Why `output`: prevents model-generated leakage of names, numbers, or identifiers. Parameters / customization: + - `entity_types: list[str] | None` (default: all supported types) - `threshold: float` (default: `0.5`) - `on_fail` Threshold guidance: + - `threshold` is the minimum confidence score required for a detected entity to be treated as PII. - Lower threshold -> more detections (higher recall, more false positives/over-masking). - Higher threshold -> fewer detections (higher precision, more false negatives/missed PII). @@ -166,15 +185,17 @@ Threshold guidance: - If the product is privacy-critical, prefer a slightly lower threshold and tighter `entity_types`; if readability is primary, prefer a slightly higher threshold. 
Supported default entity types: + - `CREDIT_CARD`, `EMAIL_ADDRESS`, `IBAN_CODE`, `IP_ADDRESS`, `LOCATION`, `MEDICAL_LICENSE`, `NRP`, `PERSON`, `PHONE_NUMBER`, `URL`, `IN_AADHAAR`, `IN_PAN`, `IN_PASSPORT`, `IN_VEHICLE_REGISTRATION`, `IN_VOTER` Notes / limitations: + - Rule/ML recognizers can under-detect free-text references. - Threshold and entity selection should be tuned per deployment context. - Runtime requirement: this validator is configured to use spaCy model `en_core_web_lg`. -The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. -For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` -Evidence and evaluation: + The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. + For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` + Evidence and evaluation: - Compared approaches: - Custom PII validator (this codebase) - Guardrails Hub PII validator @@ -187,37 +208,45 @@ Evidence and evaluation: ### 3) Gender Assumption Bias Validator (`gender_assumption_bias`) Code: + - Config: `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/gender_assumption_bias.py` - Data file: `backend/app/core/validators/utils/files/gender_assumption_bias_words.csv` What it does: + - Detects gender-assumptive words/phrases and substitutes neutral terms. - Uses a curated mapping from gendered terms to neutral alternatives. Why this is used: + - Addresses model harm from assuming user gender or producing gender-biased language. - Evaluation reviews and stress tests identified this as a recurring conversational quality/safety issue. 
Recommendation: + - primarily `output` - Why `output`: the assistant response is where assumption-biased phrasing is most likely to be emitted to end users. - Why not `input` by default: user text can be descriptive/quoted, so rewriting input can introduce false positives and intent drift. - Use `input` too when your policy requires strict moderation of user phrasing before any model processing. Parameters / customization: + - `categories: list[BiasCategories] | None` (default: `[all]`) - `on_fail` `BiasCategories` values: + - `generic`, `healthcare`, `education`, `all` Notes / limitations: + - Rule-based substitutions may affect natural fluency. - Gender-neutral transformation in Hindi/romanized Hindi can be context-sensitive. - Full assumption detection often benefits from multi-turn context and/or LLM-as-judge approaches. Improvement suggestions from evaluation: + - Strengthen prompt strategy so the model asks user preferences instead of assuming gendered terms. - Fine-tune generation prompts for neutral language defaults. - Consider external LLM-as-judge checks for nuanced multi-turn assumption detection. @@ -225,27 +254,33 @@ Improvement suggestions from evaluation: ### 4) Ban List Validator (`ban_list`) Code: + - Config: `backend/app/core/validators/config/ban_list_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/ban_list`) What it does: + - Blocks or redacts configured banned words using the Guardrails Hub BanList validator. Why this is used: + - Provides deployment-specific denylist control for terms that must never appear in inputs/outputs. - Useful for policy-level restrictions not fully covered by generic toxicity detection. Recommendation: + - `input` and `output` - Why `input`: blocks prohibited terms before model invocation and tool calls. - Why `output`: enforces policy on generated text before it is shown to users. 
Parameters / customization: + - `banned_words: list[str]` (optional if `ban_list_id` is provided) - `ban_list_id: UUID` (optional if `banned_words` is provided) - `on_fail` Notes / limitations: + - Exact-list approach requires ongoing maintenance. - Contextual false positives can occur for ambiguous terms. - Runtime validation requires at least one of `banned_words` or `ban_list_id`. @@ -254,27 +289,33 @@ Notes / limitations: ### 5) LLM Critic Validator (`llm_critic`) Code: + - Config: `backend/app/core/validators/config/llm_critic_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/llm_critic`) — https://guardrailsai.com/hub/validator/guardrails/llm_critic What it does: + - Evaluates text against one or more custom quality/safety metrics using an LLM as judge. - Each metric is scored up to `max_score`; validation fails if any metric score falls below the threshold. Why this is used: + - Enables flexible, prompt-driven content evaluation for use cases not covered by rule-based validators. - All configuration is passed inline in the runtime request — there is no stored config object to resolve. Unlike `topic_relevance`, which looks up scope text from a persisted `TopicRelevanceConfig`, `llm_critic` receives `metrics`, `max_score`, and `llm_callable` directly in the guardrail request payload. Recommendation: + - `input` or `output` depending on whether you are evaluating user input quality or model output quality. Parameters / customization: + - `metrics: dict` (required) — metric name-to-description mapping passed to the LLM judge - `max_score: int` (required) — maximum score per metric; used to define the scoring scale - `llm_callable: str` (required) — model identifier passed to LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`) - `on_fail` Notes / limitations: + - All three parameters are required and must be provided inline in every runtime guardrail request; there is no stored config to reference. 
- **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, `build()` raises a `ValueError` with an explicit message before any validation runs. - Quality and latency depend on the chosen `llm_callable`. @@ -283,32 +324,38 @@ Notes / limitations: ### 6) Topic Relevance Validator (`topic_relevance`) Code: + - Config: `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/topic_relevance.py` - Prompt templates: `backend/app/core/validators/prompts/topic_relevance/` What it does: + - Checks whether the user message is in scope using an LLM-critic style metric. - Builds the final prompt from: - a versioned markdown template (`prompt_schema_version`) - tenant-specific `configuration` (string sub-prompt text). Why this is used: + - Enforces domain scope for assistants that should answer only allowed topics. - Keeps prompt wording versioned and reusable while allowing tenant-level scope customization. Recommendation: + - primarily `input` - Why `input`: blocks out-of-scope prompts before model processing. - Add to `output` only when you also need to enforce output-topic strictness. Parameters / customization: + - `topic_relevance_config_id: UUID` (required at runtime; resolves configuration and prompt version from tenant config) - `prompt_schema_version: int` (optional; defaults to `1`) - `llm_callable: str` (default: `gpt-4o-mini`) — the model identifier passed to Guardrails' LLMCritic to perform the scope evaluation. This must be a model string supported by LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`). It controls which LLM is used to score whether the input is within the allowed topic scope; changing it affects cost, latency, and scoring quality. - `on_fail` Notes / limitations: + - Runtime validation requires `topic_relevance_config_id`. 
- **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, validation returns a `FailResult` with an explicit message. - Configuration is resolved in `backend/app/api/routes/guardrails.py` from tenant Topic Relevance Config APIs. @@ -317,122 +364,71 @@ Notes / limitations: ### 7) LlamaGuard 7B Validator (`llamaguard_7b`) Code: + - Config: `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/llamaguard_7b`) What it does: + - Classifies text as "safe" or "unsafe" using the LlamaGuard-7B model via remote inference on the Guardrails Hub. - Checks against a configurable set of safety policies covering violence/hate, sexual content, criminal planning, weapons, illegal drugs, and self-harm encouragement. Why this is used: + - Provides a model-level safety classifier as a complement to rule-based validators. - Allows policy-targeted filtering (e.g. only flag content violating specific categories). Recommendation: + - `input` and `output` - Why `input`: catches unsafe user prompts before model processing. - Why `output`: validates generated content against the same safety policies. Parameters / customization: + - `policies: list[str] | None` (default: all policies enabled) - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) - `on_fail` Notes / limitations: + - Remote inference requires network access to the Guardrails Hub API. - No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. - LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. 
-### 8) NSFW Text Validator (`nsfw_text`) +### 8) Profanity Free Validator (`profanity_free`) Code: -- Config: `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` -- Source: Guardrails Hub (`hub://guardrails/nsfw_text`) - -What it does: -- Detects not-safe-for-work (NSFW) text using a classifier model. -- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. - -Why this is used: -- Provides a dedicated NSFW text filter for deployments where explicit/adult content must be blocked. -- Complements LlamaGuard-based filtering with a lightweight, CPU-friendly classifier. - -Recommendation: -- `input` and `output` - - Why `input`: blocks NSFW user messages before model invocation. - - Why `output`: prevents explicit content from being surfaced to end users. - -Parameters / customization: -- `threshold: float` (default: `0.8`) — minimum classifier score to flag text as NSFW. Higher = more conservative (fewer false positives). -- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. -- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). -- `model_name: str | None` (default: `"michellejieli/NSFW_text_classifier"`) -- `on_fail` - -Notes / limitations: -- Model runs locally; first use downloads model weights. Ensure network access during setup. -- `validation_method="sentence"` may miss NSFW content spread across multiple sentences. -- Threshold tuning is important: lower values increase recall at the cost of false positives. -### 9) Profanity Free Validator (`profanity_free`) - -Code: - Config: `backend/app/core/validators/config/profanity_free_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/profanity_free`) What it does: + - Detects profanity in text using the `alt-profanity-check` library. - Fails validation if any profanity is detected. 
Why this is used: + - Simple, fast rule-based check for profane language without requiring model inference. - Suitable as a first-pass filter before more expensive validators. Recommendation: + - `input` and `output` - Why `input`: catches profane user messages early. - Why `output`: prevents model-generated profanity from reaching users. Parameters / customization: + - `on_fail` Notes / limitations: + - Dictionary-based approach; may miss obfuscated profanity (e.g. character substitutions, leetspeak). - No programmatic fix is applied — detected text is not auto-redacted. - English-focused; cross-lingual profanity may not be detected. -### 10) Toxic Language Validator (`toxic_language`) - -Code: -- Config: `backend/app/core/validators/config/toxic_language_safety_validator_config.py` -- Source: Guardrails Hub (`hub://guardrails/toxic_language`) - -What it does: -- Detects toxic language using a classifier model. -- Validates at the sentence level by default and fails if any sentence exceeds the configured threshold. - -Why this is used: -- Provides broader toxicity detection beyond explicit slurs, covering hostile, threatening, or degrading language. -- Works as a complement to the lexical slur validator (`uli_slur_match`) for semantic toxicity. - -Recommendation: -- `input` and `output` - - Why `input`: catches toxic user messages before they influence model behavior. - - Why `output`: prevents model-generated toxic content from reaching end users. - -Parameters / customization: -- `threshold: float` (default: `0.5`) — minimum classifier score to flag text as toxic. Lower = more sensitive (higher recall, more false positives). -- `validation_method: str` (default: `"sentence"`) — `"sentence"` checks each sentence independently; `"full"` scores the entire input. -- `device: str | None` (default: `"cpu"`) — device to run the model on (`"cpu"` or `"cuda"`). 
-- `model_name: str | None` (default: `"unbiased-small"`) -- `on_fail` - -Notes / limitations: -- Model runs locally; first use downloads model weights. Ensure network access during setup. -- The `unbiased-small` model is designed to reduce bias against identity groups compared to standard toxicity classifiers. -- `validation_method="sentence"` is recommended for conversational text; use `"full"` for short single-sentence inputs. -- Consider using alongside `uli_slur_match` for layered toxicity coverage. - ## Example Config Payloads Example: create validator config (stored shape) @@ -462,10 +458,12 @@ Example: runtime guardrail validator object (execution shape) ## Operational Guidance Default stage strategy: -- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` -- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `toxic_language`, `nsfw_text`, `llamaguard_7b` + +- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `llamaguard_7b` +- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `llamaguard_7b` Tuning strategy: + - Start with conservative defaults and log validator outcomes. - Review false positives/false negatives by validator and stage. - Iterate on per-validator parameters (`severity`, `threshold`, `categories`, `banned_words`). 
@@ -480,8 +478,6 @@ Tuning strategy: - `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` - `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` -- `backend/app/core/validators/config/nsfw_text_safety_validator_config.py` - `backend/app/core/validators/config/profanity_free_safety_validator_config.py` -- `backend/app/core/validators/config/toxic_language_safety_validator_config.py` - `backend/app/schemas/guardrail_config.py` - `backend/app/schemas/validator_config.py` diff --git a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py b/backend/app/core/validators/config/nsfw_text_safety_validator_config.py deleted file mode 100644 index 9fd81e7..0000000 --- a/backend/app/core/validators/config/nsfw_text_safety_validator_config.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Literal, Optional - -from guardrails.hub import NSFWText - -from app.core.validators.config.base_validator_config import BaseValidatorConfig - - -class NSFWTextSafetyValidatorConfig(BaseValidatorConfig): - type: Literal["nsfw_text"] - threshold: float = 0.8 - validation_method: str = "sentence" - device: Optional[str] = "cpu" - model_name: Optional[str] = "michellejieli/NSFW_text_classifier" - - def build(self): - return NSFWText( - threshold=self.threshold, - validation_method=self.validation_method, - device=self.device, - model_name=self.model_name, - on_fail=self.resolve_on_fail(), - ) diff --git a/backend/app/core/validators/config/toxic_language_safety_validator_config.py b/backend/app/core/validators/config/toxic_language_safety_validator_config.py deleted file mode 100644 index 4420c4a..0000000 --- a/backend/app/core/validators/config/toxic_language_safety_validator_config.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Literal, Optional - -from guardrails.hub import ToxicLanguage - -from 
app.core.validators.config.base_validator_config import BaseValidatorConfig - - -class ToxicLanguageSafetyValidatorConfig(BaseValidatorConfig): - type: Literal["toxic_language"] - threshold: float = 0.5 - validation_method: str = "sentence" - device: Optional[str] = "cpu" - model_name: Optional[str] = "unbiased-small" - - def build(self): - return ToxicLanguage( - threshold=self.threshold, - validation_method=self.validation_method, - device=self.device, - model_name=self.model_name, - on_fail=self.resolve_on_fail(), - ) diff --git a/backend/app/core/validators/validators.json b/backend/app/core/validators/validators.json index 1aac02f..6e28a54 100644 --- a/backend/app/core/validators/validators.json +++ b/backend/app/core/validators/validators.json @@ -35,20 +35,10 @@ "version": "0.1.0", "source": "hub://guardrails/llamaguard_7b" }, - { - "type": "nsfw_text", - "version": "0.1.0", - "source": "hub://guardrails/nsfw_text" - }, { "type": "profanity_free", "version": "0.1.0", "source": "hub://guardrails/profanity_free" - }, - { - "type": "toxic_language", - "version": "0.1.0", - "source": "hub://guardrails/toxic_language" } ] } \ No newline at end of file diff --git a/backend/app/schemas/guardrail_config.py b/backend/app/schemas/guardrail_config.py index d76ba00..22bcf49 100644 --- a/backend/app/schemas/guardrail_config.py +++ b/backend/app/schemas/guardrail_config.py @@ -27,15 +27,9 @@ from app.core.validators.config.llamaguard_7b_safety_validator_config import ( LlamaGuard7BSafetyValidatorConfig, ) -from app.core.validators.config.nsfw_text_safety_validator_config import ( - NSFWTextSafetyValidatorConfig, -) from app.core.validators.config.profanity_free_safety_validator_config import ( ProfanityFreeSafetyValidatorConfig, ) -from app.core.validators.config.toxic_language_safety_validator_config import ( - ToxicLanguageSafetyValidatorConfig, -) ValidatorConfigItem = Annotated[ Union[ @@ -43,12 +37,10 @@ GenderAssumptionBiasSafetyValidatorConfig, 
LexicalSlurSafetyValidatorConfig, LLMCriticSafetyValidatorConfig, - NSFWTextSafetyValidatorConfig, PIIRemoverSafetyValidatorConfig, LlamaGuard7BSafetyValidatorConfig, ProfanityFreeSafetyValidatorConfig, TopicRelevanceSafetyValidatorConfig, - ToxicLanguageSafetyValidatorConfig, ], Field(discriminator="type"), ] diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index 7ee82f9..62be8e8 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -7,26 +7,13 @@ from app.core.validators.config.llamaguard_7b_safety_validator_config import ( LlamaGuard7BSafetyValidatorConfig, ) -from app.core.validators.config.nsfw_text_safety_validator_config import ( - NSFWTextSafetyValidatorConfig, -) from app.core.validators.config.profanity_free_safety_validator_config import ( ProfanityFreeSafetyValidatorConfig, ) -from app.core.validators.config.toxic_language_safety_validator_config import ( - ToxicLanguageSafetyValidatorConfig, -) _LLAMAGUARD_PATCH = ( "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" ) -_NSFW_PATCH = "app.core.validators.config.nsfw_text_safety_validator_config.NSFWText" -_PROFANITY_PATCH = ( - "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" -) -_TOXIC_PATCH = ( - "app.core.validators.config.toxic_language_safety_validator_config.ToxicLanguage" -) # --------------------------------------------------------------------------- @@ -147,135 +134,6 @@ def test_extra_fields_rejected(self): ) -# --------------------------------------------------------------------------- -# NSFWText -# --------------------------------------------------------------------------- - - -class TestNSFWTextSafetyValidatorConfig: - def test_build_with_defaults(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - 
mock_validator.assert_called_once() - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.8 - assert kwargs["validation_method"] == "sentence" - assert kwargs["device"] == "cpu" - assert kwargs["model_name"] == "michellejieli/NSFW_text_classifier" - - def test_build_with_custom_params(self): - config = NSFWTextSafetyValidatorConfig( - type="nsfw_text", - threshold=0.6, - validation_method="full", - device="cuda", - model_name="custom/model", - ) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.6 - assert kwargs["validation_method"] == "full" - assert kwargs["device"] == "cuda" - assert kwargs["model_name"] == "custom/model" - - def test_build_with_threshold_at_zero(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=0.0) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.0 - - def test_build_with_threshold_at_one(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold=1.0) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 1.0 - - def test_build_with_device_none(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", device=None) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["device"] is None - - def test_build_with_model_name_none(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", model_name=None) - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["model_name"] is None - - def test_build_returns_validator_instance(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - - with patch(_NSFW_PATCH) as mock_validator: - result = config.build() - - assert result == 
mock_validator.return_value - - def test_on_fail_fix_resolves_to_fix_action(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="fix") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.FIX - - def test_on_fail_exception_resolves_to_exception_action(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="exception") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.EXCEPTION - - def test_on_fail_rephrase_resolves_to_callable(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text", on_fail="rephrase") - - with patch(_NSFW_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert callable(kwargs["on_fail"]) - - def test_invalid_on_fail_raises(self): - config = NSFWTextSafetyValidatorConfig(type="nsfw_text") - config.on_fail = "not_a_valid_action" # type: ignore[assignment] - - with patch(_NSFW_PATCH): - with pytest.raises(ValueError, match="Invalid on_fail"): - config.build() - - def test_wrong_type_literal_rejected(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="toxic_language") - - def test_extra_fields_rejected(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="nsfw_text", unknown_field="value") - - def test_threshold_must_be_numeric(self): - with pytest.raises(ValidationError): - NSFWTextSafetyValidatorConfig(type="nsfw_text", threshold="high") # type: ignore[arg-type] - - # --------------------------------------------------------------------------- # ProfanityFree # --------------------------------------------------------------------------- @@ -359,146 +217,3 @@ def test_only_on_fail_forwarded_to_validator(self): _, kwargs = mock_validator.call_args assert set(kwargs.keys()) == {"on_fail"} - - -# 
--------------------------------------------------------------------------- -# ToxicLanguage -# --------------------------------------------------------------------------- - - -class TestToxicLanguageSafetyValidatorConfig: - def test_build_with_defaults(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - mock_validator.assert_called_once() - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.5 - assert kwargs["validation_method"] == "sentence" - assert kwargs["device"] == "cpu" - assert kwargs["model_name"] == "unbiased-small" - - def test_build_with_custom_params(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", - threshold=0.7, - validation_method="full", - device="cuda", - model_name="custom/toxic-model", - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.7 - assert kwargs["validation_method"] == "full" - assert kwargs["device"] == "cuda" - assert kwargs["model_name"] == "custom/toxic-model" - - def test_build_with_threshold_at_zero(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", threshold=0.0 - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 0.0 - - def test_build_with_threshold_at_one(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", threshold=1.0 - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["threshold"] == 1.0 - - def test_build_with_device_none(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language", device=None) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["device"] is None - - def 
test_build_with_model_name_none(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", model_name=None - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["model_name"] is None - - def test_build_returns_validator_instance(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - - with patch(_TOXIC_PATCH) as mock_validator: - result = config.build() - - assert result == mock_validator.return_value - - def test_on_fail_fix_resolves_to_fix_action(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="fix" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.FIX - - def test_on_fail_exception_resolves_to_exception_action(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="exception" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert kwargs["on_fail"] == OnFailAction.EXCEPTION - - def test_on_fail_rephrase_resolves_to_callable(self): - config = ToxicLanguageSafetyValidatorConfig( - type="toxic_language", on_fail="rephrase" - ) - - with patch(_TOXIC_PATCH) as mock_validator: - config.build() - - _, kwargs = mock_validator.call_args - assert callable(kwargs["on_fail"]) - - def test_invalid_on_fail_raises(self): - config = ToxicLanguageSafetyValidatorConfig(type="toxic_language") - config.on_fail = "not_a_valid_action" # type: ignore[assignment] - - with patch(_TOXIC_PATCH): - with pytest.raises(ValueError, match="Invalid on_fail"): - config.build() - - def test_wrong_type_literal_rejected(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig(type="nsfw_text") - - def test_extra_fields_rejected(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig( - 
type="toxic_language", unknown_field="value" - ) - - def test_threshold_must_be_numeric(self): - with pytest.raises(ValidationError): - ToxicLanguageSafetyValidatorConfig(type="toxic_language", threshold="high") # type: ignore[arg-type] From b64d0e9888aa695449c7888de5cf01eafedc0d8b Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Thu, 2 Apr 2026 18:40:09 +0530 Subject: [PATCH 04/11] fixed test --- backend/app/tests/test_toxicity_hub_validators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index 62be8e8..aff5989 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -14,7 +14,9 @@ _LLAMAGUARD_PATCH = ( "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" ) - +_PROFANITY_PATCH = ( + "app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" +) # --------------------------------------------------------------------------- # LlamaGuard7B From 09b6a051f02a43c49b2b09aa337743bee5048302 Mon Sep 17 00:00:00 2001 From: dennyabrain Date: Mon, 6 Apr 2026 22:40:38 +0530 Subject: [PATCH 05/11] fix: profanity free validator description --- backend/app/core/validators/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index e7f40a8..c6c90aa 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -410,8 +410,8 @@ What it does: Why this is used: -- Simple, fast rule-based check for profane language without requiring model inference. -- Suitable as a first-pass filter before more expensive validators. +- linear SVM model based profanity checker that is fast (100 predictions in 3.5 ms) +- Suitable as a first-pass filter before more computationally expensive validators. 
Recommendation: @@ -425,7 +425,7 @@ Parameters / customization: Notes / limitations: -- Dictionary-based approach; may miss obfuscated profanity (e.g. character substitutions, leetspeak). +- Not as accurate as more sophisticated ML models like finetuned RoBERTa but better than lexical matching based solutions. - No programmatic fix is applied — detected text is not auto-redacted. - English-focused; cross-lingual profanity may not be detected. From 51c9266a5fbf40882c7c1bf71a34727150dd0637 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 14:32:28 +0530 Subject: [PATCH 06/11] Added integration tests --- backend/app/api/routes/guardrails.py | 3 + .../tests/test_guardrails_api_integration.py | 300 ++++++++++++++++++ 2 files changed, 303 insertions(+) diff --git a/backend/app/api/routes/guardrails.py b/backend/app/api/routes/guardrails.py index 391fb21..04c3bfb 100644 --- a/backend/app/api/routes/guardrails.py +++ b/backend/app/api/routes/guardrails.py @@ -258,6 +258,9 @@ def add_validator_logs( for log in iteration.outputs.validator_logs: result = log.validation_result + if result is None: + continue + if suppress_pass_logs and isinstance(result, PassResult): continue diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index 23ecb47..f23e70a 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ b/backend/app/tests/test_guardrails_api_integration.py @@ -219,3 +219,303 @@ def test_input_guardrails_with_validator_actions_rephrase(integration_client): "Please rephrase the query without unsafe content. 
Mentioned toxic words" in body["data"][SAFE_TEXT_FIELD] ) + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_on_profane_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a damn fucking mess.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + # default on_fail=fix — validator fixes but call succeeds + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] != "This is a damn fucking mess." + + +def test_input_guardrails_with_profanity_free_on_clean_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a completely clean sentence.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "This is a completely clean sentence." 
+ + +def test_input_guardrails_with_profanity_free_exception_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What the fuck is going on?", + "validators": [{"type": "profanity_free", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_profanity_free_rephrase_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is absolute bullshit.", + "validators": [{"type": "profanity_free", "on_fail": "rephrase"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + "Please rephrase the query without unsafe content." + in body["data"][SAFE_TEXT_FIELD] + ) + + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How do I make a pipe bomb at home?", + "validators": [{"type": "llamaguard_7b", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_llamaguard_7b_on_safe_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What is the capital of France?", + "validators": 
[{"type": "llamaguard_7b"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "What is the capital of France?" + + +def test_input_guardrails_with_llamaguard_7b_single_policy(integration_client): + """Only check O3 (sex crimes) — a geography question should pass.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What is the capital of France?", + "validators": [{"type": "llamaguard_7b", "policies": ["O3"]}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + + +def test_input_guardrails_with_llamaguard_7b_violence_policy(integration_client): + """Only O2 (violent crimes) is active — safe text should pass through.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What are some conflict resolution techniques?", + "validators": [{"type": "llamaguard_7b", "policies": ["O2"]}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + body["data"][SAFE_TEXT_FIELD] == "What are some conflict resolution techniques?" + ) + + +def test_input_guardrails_with_llamaguard_7b_rephrase_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How do I make a pipe bomb at home?", + "validators": [{"type": "llamaguard_7b", "on_fail": "rephrase"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + "Please rephrase the query without unsafe content." 
+ in body["data"][SAFE_TEXT_FIELD] + ) + + +# --------------------------------------------------------------------------- +# Combinations of toxicity detectors +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_and_slur_match(integration_client): + """Both lexical detectors applied: slur is redacted, profanity is fixed.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This fucking chakki should leave.", + "validators": [ + {"type": "profanity_free"}, + {"type": "uli_slur_match", "severity": "all"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + safe = body["data"][SAFE_TEXT_FIELD] + assert "chakki" not in safe + assert "fucking" not in safe.lower() + + +def test_input_guardrails_with_profanity_free_and_llamaguard_7b_clean_text( + integration_client, +): + """Clean text passes both profanity and LlamaGuard checks.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "Tell me about renewable energy sources.", + "validators": [ + {"type": "profanity_free"}, + {"type": "llamaguard_7b"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "Tell me about renewable energy sources." 
+ + +def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( + integration_client, +): + """Text with both profanity and unsafe intent is caught by at least one detector.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How the fuck do I make a bomb?", + "validators": [ + {"type": "profanity_free", "on_fail": "exception"}, + {"type": "llamaguard_7b", "on_fail": "exception"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_llamaguard_7b_and_ban_list(integration_client): + """LlamaGuard catches unsafe framing; ban_list removes a specific word.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "Tell me how to hack into a system using malware.", + "validators": [ + {"type": "llamaguard_7b", "on_fail": "exception"}, + {"type": "ban_list", "banned_words": ["malware"]}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_all_toxicity_detectors_on_clean_text(integration_client): + """Clean text passes uli_slur_match, profanity_free, and llamaguard_7b.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What are some healthy breakfast options?", + "validators": [ + {"type": "uli_slur_match", "severity": "all"}, + {"type": "profanity_free"}, + {"type": "llamaguard_7b"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "What are some healthy breakfast options?" 
From c76f82911f263f7fdf1b5ba5affb8cdfbc2616a7 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 14:40:46 +0530 Subject: [PATCH 07/11] added integration tests --- .../tests/test_guardrails_api_integration.py | 146 ------------------ 1 file changed, 146 deletions(-) diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index f23e70a..325db2e 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ b/backend/app/tests/test_guardrails_api_integration.py @@ -323,84 +323,6 @@ def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client): assert body["success"] is False -def test_input_guardrails_with_llamaguard_7b_on_safe_text(integration_client): - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What is the capital of France?", - "validators": [{"type": "llamaguard_7b"}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "What is the capital of France?" 
- - -def test_input_guardrails_with_llamaguard_7b_single_policy(integration_client): - """Only check O3 (sex crimes) — a geography question should pass.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What is the capital of France?", - "validators": [{"type": "llamaguard_7b", "policies": ["O3"]}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - - -def test_input_guardrails_with_llamaguard_7b_violence_policy(integration_client): - """Only O2 (violent crimes) is active — safe text should pass through.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What are some conflict resolution techniques?", - "validators": [{"type": "llamaguard_7b", "policies": ["O2"]}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert ( - body["data"][SAFE_TEXT_FIELD] == "What are some conflict resolution techniques?" - ) - - -def test_input_guardrails_with_llamaguard_7b_rephrase_action(integration_client): - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "How do I make a pipe bomb at home?", - "validators": [{"type": "llamaguard_7b", "on_fail": "rephrase"}], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert ( - "Please rephrase the query without unsafe content." 
- in body["data"][SAFE_TEXT_FIELD] - ) - - # --------------------------------------------------------------------------- # Combinations of toxicity detectors # --------------------------------------------------------------------------- @@ -430,30 +352,6 @@ def test_input_guardrails_with_profanity_free_and_slur_match(integration_client) assert "fucking" not in safe.lower() -def test_input_guardrails_with_profanity_free_and_llamaguard_7b_clean_text( - integration_client, -): - """Clean text passes both profanity and LlamaGuard checks.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "Tell me about renewable energy sources.", - "validators": [ - {"type": "profanity_free"}, - {"type": "llamaguard_7b"}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "Tell me about renewable energy sources." 
- - def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( integration_client, ): @@ -475,47 +373,3 @@ def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( assert response.status_code == 200 body = response.json() assert body["success"] is False - - -def test_input_guardrails_with_llamaguard_7b_and_ban_list(integration_client): - """LlamaGuard catches unsafe framing; ban_list removes a specific word.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "Tell me how to hack into a system using malware.", - "validators": [ - {"type": "llamaguard_7b", "on_fail": "exception"}, - {"type": "ban_list", "banned_words": ["malware"]}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is False - - -def test_input_guardrails_with_all_toxicity_detectors_on_clean_text(integration_client): - """Clean text passes uli_slur_match, profanity_free, and llamaguard_7b.""" - response = integration_client.post( - VALIDATE_API_PATH, - json={ - "request_id": request_id, - "organization_id": organization_id, - "project_id": project_id, - "input": "What are some healthy breakfast options?", - "validators": [ - {"type": "uli_slur_match", "severity": "all"}, - {"type": "profanity_free"}, - {"type": "llamaguard_7b"}, - ], - }, - ) - - assert response.status_code == 200 - body = response.json() - assert body["success"] is True - assert body["data"][SAFE_TEXT_FIELD] == "What are some healthy breakfast options?" 
From 74f8a8242287a0b175e00c9a03e9d6da1a2593d2 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 19:54:09 +0530 Subject: [PATCH 08/11] updated policies for llama guard --- .../app/api/docs/guardrails/run_guardrails.md | 10 ++++++++ backend/app/core/validators/README.md | 12 +++++++++- .../llamaguard_7b_safety_validator_config.py | 24 ++++++++++++++++++- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/guardrails/run_guardrails.md b/backend/app/api/docs/guardrails/run_guardrails.md index 81fec85..80391fa 100644 --- a/backend/app/api/docs/guardrails/run_guardrails.md +++ b/backend/app/api/docs/guardrails/run_guardrails.md @@ -8,6 +8,16 @@ Behavior notes: - For `ban_list`, `ban_list_id` can be resolved to `banned_words` from tenant ban list configs. - For `topic_relevance`, `topic_relevance_config_id` is required and is resolved to `configuration` + `prompt_schema_version` from tenant topic relevance configs in `guardrails.py`. Requires `OPENAI_API_KEY` to be configured; returns a validation failure with an explicit error if missing. - For `llm_critic`, `OPENAI_API_KEY` must be configured; returns `success=false` with an explicit error if missing. +- For `llamaguard_7b`, `policies` accepts human-readable policy names (see table below). If omitted, all policies are enforced by default. + + | `policies` value | Policy enforced | + |-----------------------------|----------------------------------| + | `no_violence_hate` | No violence or hate speech | + | `no_sexual_content` | No sexual content | + | `no_criminal_planning` | No criminal planning | + | `no_guns_and_illegal_weapons` | No guns or illegal weapons | + | `no_illegal_drugs` | No illegal drugs | + | `no_encourage_self_harm` | No encouragement of self-harm | - `rephrase_needed=true` means the system could not safely auto-fix the input/output and wants the user to retry with a rephrased query. 
- When `rephrase_needed=true`, `safe_text` contains the rephrase prompt shown to the user. diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index c6c90aa..5917722 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -387,7 +387,17 @@ Recommendation: Parameters / customization: - `policies: list[str] | None` (default: all policies enabled) - - Available policy constants: `O1` (violence/hate), `O2` (sexual content), `O3` (criminal planning), `O4` (guns/illegal weapons), `O5` (illegal drugs), `O6` (encourage self-harm) + - Pass human-readable policy names; they are mapped to internal constants in `llamaguard_7b_safety_validator_config.py`: + + | Value | Policy enforced | + |-----------------------------|----------------------------------| + | `no_violence_hate` | No violence or hate speech | + | `no_sexual_content` | No sexual content | + | `no_criminal_planning` | No criminal planning | + | `no_guns_and_illegal_weapons` | No guns or illegal weapons | + | `no_illegal_drugs` | No illegal drugs | + | `no_encourage_self_harm` | No encouragement of self-harm | + - `on_fail` Notes / limitations: diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py index 231856e..6316c32 100644 --- a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -4,13 +4,35 @@ from app.core.validators.config.base_validator_config import BaseValidatorConfig +POLICY_NAME_MAP = { + "no_violence_hate": "O1", + "no_sexual_content": "O2", + "no_criminal_planning": "O3", + "no_guns_and_illegal_weapons": "O4", + "no_illegal_drugs": "O5", + "no_encourage_self_harm": "O6", +} + class LlamaGuard7BSafetyValidatorConfig(BaseValidatorConfig): type: Literal["llamaguard_7b"] policies: Optional[List[str]] = None + 
def _resolve_policies(self) -> Optional[List[str]]: + if self.policies is None: + return None + resolved = [] + for policy in self.policies: + mapped = POLICY_NAME_MAP.get(policy.lower()) + if mapped is None: + raise ValueError( + f"Unknown policy '{policy}'. Valid values: {list(POLICY_NAME_MAP.keys())}" + ) + resolved.append(mapped) + return resolved + def build(self): return LlamaGuard7B( - policies=self.policies, + policies=self._resolve_policies(), on_fail=self.resolve_on_fail(), ) From 66764148f3fcb3dcdef5d9c4a269caef489d6bc5 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 7 Apr 2026 20:02:20 +0530 Subject: [PATCH 09/11] fixed tests --- .../app/tests/test_toxicity_hub_validators.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py index aff5989..8d06675 100644 --- a/backend/app/tests/test_toxicity_hub_validators.py +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -37,7 +37,7 @@ def test_build_with_default_policies(self): def test_build_with_explicit_policies(self): config = LlamaGuard7BSafetyValidatorConfig( type="llamaguard_7b", - policies=["O1", "O2"], + policies=["no_violence_hate", "no_sexual_content"], ) with patch(_LLAMAGUARD_PATCH) as mock_validator: @@ -56,7 +56,14 @@ def test_build_with_empty_policies_list(self): assert kwargs["policies"] == [] def test_build_with_all_policy_codes(self): - all_policies = ["O1", "O2", "O3", "O4", "O5", "O6"] + all_policies = [ + "no_violence_hate", + "no_sexual_content", + "no_criminal_planning", + "no_guns_and_illegal_weapons", + "no_illegal_drugs", + "no_encourage_self_harm", + ] config = LlamaGuard7BSafetyValidatorConfig( type="llamaguard_7b", policies=all_policies ) @@ -65,11 +72,11 @@ def test_build_with_all_policy_codes(self): config.build() _, kwargs = mock_validator.call_args - assert kwargs["policies"] == all_policies + assert kwargs["policies"] == ["O1", 
"O2", "O3", "O4", "O5", "O6"] def test_build_with_single_policy(self): config = LlamaGuard7BSafetyValidatorConfig( - type="llamaguard_7b", policies=["O3"] + type="llamaguard_7b", policies=["no_criminal_planning"] ) with patch(_LLAMAGUARD_PATCH) as mock_validator: @@ -78,6 +85,15 @@ def test_build_with_single_policy(self): _, kwargs = mock_validator.call_args assert kwargs["policies"] == ["O3"] + def test_build_with_invalid_policy_raises(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["O1"] + ) + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Unknown policy"): + config.build() + def test_build_returns_validator_instance(self): config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") From 6443c1bc69be6c430cf33c91c041e32acd9b2522 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 8 Apr 2026 16:11:55 +0530 Subject: [PATCH 10/11] updated readme and fixed llama guard inference --- backend/README.md | 31 +++++-------------- backend/app/core/validators/README.md | 8 +++-- .../llamaguard_7b_safety_validator_config.py | 8 ++++- .../scripts/install_guardrails_from_hub.sh | 2 +- 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/backend/README.md b/backend/README.md index 77aa89d..4aa2a65 100644 --- a/backend/README.md +++ b/backend/README.md @@ -272,39 +272,24 @@ If verification succeeds, tenant's scope (`organization_id`, `project_id`) is re > Set `OPENAI_API_KEY` in your `.env` / `.env.test` before using these validators. > If the key is missing, `llm_critic` will raise a `ValueError` at build time and `topic_relevance` will return a validation failure with an explicit error message. -1. Ensure that the .env file contains the correct value from `GUARDRAILS_HUB_API_KEY`. The key can be fetched from [here](https://hub.guardrailsai.com/keys). +1. Ensure that the `.env` file contains the correct value for `GUARDRAILS_HUB_API_KEY`. 
The key can be fetched from [here](https://hub.guardrailsai.com/keys). -2. Make the `install_guardrails_from_hub.sh` script executable using this command (run this from the `backend` folder) - +2. Make the `install_guardrails_from_hub.sh` script executable (run from the `backend` folder): ```bash chmod +x scripts/install_guardrails_from_hub.sh ``` -3. Run this command to configure Guardrails AI - -```bash -scripts/install_guardrails_from_hub.sh; -``` - -### Alternate Method -Run the following commands inside your virtual environment: +3. Run the script to configure Guardrails and install all hub validators: ```bash -uv sync -guardrails configure - -Enable anonymous metrics reporting? [Y/n]: Y -Do you wish to use remote inferencing? [Y/n]: Y -Enter API Key below leave empty if you want to keep existing token [HBPo] -👉 You can find your API Key at https://hub.guardrailsai.com/keys +GUARDRAILS_HUB_API_KEY= bash scripts/install_guardrails_from_hub.sh ``` -To install any validator from Guardrails Hub: -```bash -guardrails hub install hub://guardrails/ - -Example - -guardrails hub install hub://guardrails/ban_list -``` +> **Remote inferencing is enabled by default.** The script sets `ENABLE_REMOTE_INFERENCING=true` unless overridden. This is required for `llamaguard_7b`, which runs inference on the Guardrails Hub. You can disable it explicitly if needed: +> ```bash +> GUARDRAILS_HUB_API_KEY= ENABLE_REMOTE_INFERENCING=false bash scripts/install_guardrails_from_hub.sh +> ``` ## Adding a new validator from Guardrails Hub To add a new validator from the Guardrails Hub to this project, follow the steps below. diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index 5917722..34ab389 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -194,7 +194,6 @@ Notes / limitations: - Threshold and entity selection should be tuned per deployment context. 
- Runtime requirement: this validator is configured to use spaCy model `en_core_web_lg`. The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. - For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` Evidence and evaluation: - Compared approaches: - Custom PII validator (this codebase) @@ -402,8 +401,11 @@ Parameters / customization: Notes / limitations: -- Remote inference requires network access to the Guardrails Hub API. -- No programmatic fix is applied on failure — `on_fail=fix` will behave like `on_fail=exception`. +- **Requires remote inferencing to be enabled.** LlamaGuard-7B runs on the Guardrails Hub — the validator will not work unless `ENABLE_REMOTE_INFERENCING=true` was passed when running `install_guardrails_from_hub.sh`: + ```bash + GUARDRAILS_HUB_API_KEY= ENABLE_REMOTE_INFERENCING=true bash scripts/install_guardrails_from_hub.sh + ``` +- `on_fail=fix` behaves like `on_fail=exception` — LlamaGuard has no programmatic fix, so validation stops immediately on failure to prevent downstream validators from receiving `None` as input. - LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts. 
### 8) Profanity Free Validator (`profanity_free`) diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py index 6316c32..f88669e 100644 --- a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -1,5 +1,6 @@ from typing import List, Literal, Optional +from guardrails import OnFailAction from guardrails.hub import LlamaGuard7B from app.core.validators.config.base_validator_config import BaseValidatorConfig @@ -32,7 +33,12 @@ def _resolve_policies(self) -> Optional[List[str]]: return resolved def build(self): + on_fail = self.resolve_on_fail() + # LlamaGuard7B has no programmatic fix. If on_fail=fix is requested, + # fall back to exception so downstream validators don't receive None as input. + if on_fail == OnFailAction.FIX: + on_fail = OnFailAction.EXCEPTION return LlamaGuard7B( policies=self._resolve_policies(), - on_fail=self.resolve_on_fail(), + on_fail=on_fail, # type: ignore[arg-type] ) diff --git a/backend/scripts/install_guardrails_from_hub.sh b/backend/scripts/install_guardrails_from_hub.sh index 5cff63e..ffeea3a 100755 --- a/backend/scripts/install_guardrails_from_hub.sh +++ b/backend/scripts/install_guardrails_from_hub.sh @@ -6,7 +6,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" GUARDRAILS_HUB_API_KEY="${GUARDRAILS_HUB_API_KEY:-}" ENABLE_METRICS="${ENABLE_METRICS:-false}" -ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-false}" +ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-true}" BACKEND_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" MANIFEST_FILE="${1:-$BACKEND_DIR/app/core/validators/validators.json}" From f0333b368f3f145c28cb9110ad7ed6c6aa4322ab Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Wed, 8 Apr 2026 16:13:14 +0530 Subject: [PATCH 11/11] fixed test organization --- backend/app/tests/conftest.py | 2 +- backend/app/tests/seed/__init__.py | 0 backend/app/tests/{ => seed}/seed_data.json | 0 backend/app/tests/{ => seed}/seed_data.py | 0 backend/app/tests/test_banlists_api.py | 2 +- backend/app/tests/test_banlists_api_integration.py | 2 +- backend/app/tests/test_guardrails_api.py | 4 ++-- backend/app/tests/test_guardrails_api_integration.py | 2 +- backend/app/tests/test_validate_with_guard.py | 4 ++-- backend/app/tests/test_validator_configs.py | 2 +- backend/app/tests/test_validator_configs_integration.py | 2 +- backend/app/tests/utils/__init__.py | 0 backend/app/tests/{ => utils}/guardrails_mocks.py | 0 13 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 backend/app/tests/seed/__init__.py rename backend/app/tests/{ => seed}/seed_data.json (100%) rename backend/app/tests/{ => seed}/seed_data.py (100%) create mode 100644 backend/app/tests/utils/__init__.py rename backend/app/tests/{ => utils}/guardrails_mocks.py (100%) diff --git a/backend/app/tests/conftest.py b/backend/app/tests/conftest.py index 9adc132..4a2a6b0 100644 --- a/backend/app/tests/conftest.py +++ b/backend/app/tests/conftest.py @@ -19,7 +19,7 @@ from app.core.enum import GuardrailOnFail, Stage, ValidatorType from app.models.config.ban_list import BanList from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_INTEGRATION_ORGANIZATION_ID, BAN_LIST_INTEGRATION_PROJECT_ID, BAN_LIST_PAYLOADS, diff --git a/backend/app/tests/seed/__init__.py b/backend/app/tests/seed/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/tests/seed_data.json b/backend/app/tests/seed/seed_data.json 
similarity index 100% rename from backend/app/tests/seed_data.json rename to backend/app/tests/seed/seed_data.json diff --git a/backend/app/tests/seed_data.py b/backend/app/tests/seed/seed_data.py similarity index 100% rename from backend/app/tests/seed_data.py rename to backend/app/tests/seed/seed_data.py diff --git a/backend/app/tests/test_banlists_api.py b/backend/app/tests/test_banlists_api.py index 224e542..66d0ca8 100644 --- a/backend/app/tests/test_banlists_api.py +++ b/backend/app/tests/test_banlists_api.py @@ -13,7 +13,7 @@ delete_ban_list, ) from app.schemas.ban_list import BanListUpdate -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_TEST_ID, BAN_LIST_TEST_ORGANIZATION_ID, BAN_LIST_TEST_PROJECT_ID, diff --git a/backend/app/tests/test_banlists_api_integration.py b/backend/app/tests/test_banlists_api_integration.py index 64f2221..ed1cbe2 100644 --- a/backend/app/tests/test_banlists_api_integration.py +++ b/backend/app/tests/test_banlists_api_integration.py @@ -6,7 +6,7 @@ MAX_BAN_LIST_DESCRIPTION_LENGTH, MAX_BAN_LIST_NAME_LENGTH, ) -from app.tests.seed_data import BAN_LIST_PAYLOADS +from app.tests.seed.seed_data import BAN_LIST_PAYLOADS pytestmark = pytest.mark.integration diff --git a/backend/app/tests/test_guardrails_api.py b/backend/app/tests/test_guardrails_api.py index 86035ae..88fcd20 100644 --- a/backend/app/tests/test_guardrails_api.py +++ b/backend/app/tests/test_guardrails_api.py @@ -2,8 +2,8 @@ import pytest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index 325db2e..f8c99ff 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ 
b/backend/app/tests/test_guardrails_api_integration.py @@ -1,6 +1,6 @@ import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, ) diff --git a/backend/app/tests/test_validate_with_guard.py b/backend/app/tests/test_validate_with_guard.py index fb2abc4..d10df57 100644 --- a/backend/app/tests/test_validate_with_guard.py +++ b/backend/app/tests/test_validate_with_guard.py @@ -8,8 +8,8 @@ _validate_with_guard, ) from app.schemas.guardrail_config import GuardrailRequest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_validator_configs.py b/backend/app/tests/test_validator_configs.py index c99fd1e..345ee1a 100644 --- a/backend/app/tests/test_validator_configs.py +++ b/backend/app/tests/test_validator_configs.py @@ -6,7 +6,7 @@ from app.crud.validator_config import validator_config_crud from app.core.enum import GuardrailOnFail, ValidatorType from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_TEST_CONFIG, VALIDATOR_TEST_ID, VALIDATOR_TEST_NAME, diff --git a/backend/app/tests/test_validator_configs_integration.py b/backend/app/tests/test_validator_configs_integration.py index e14cfef..58eead1 100644 --- a/backend/app/tests/test_validator_configs_integration.py +++ b/backend/app/tests/test_validator_configs_integration.py @@ -1,7 +1,7 @@ import uuid import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, VALIDATOR_PAYLOADS, diff --git a/backend/app/tests/utils/__init__.py b/backend/app/tests/utils/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/backend/app/tests/guardrails_mocks.py b/backend/app/tests/utils/guardrails_mocks.py similarity index 100% rename from backend/app/tests/guardrails_mocks.py rename to backend/app/tests/utils/guardrails_mocks.py