Changes from all commits
42 commits
650369c
added toxicity detection validators
rkritika1508 Apr 1, 2026
949647d
fixed import error
rkritika1508 Apr 1, 2026
da50537
removed redundant validators
rkritika1508 Apr 2, 2026
9ab64c7
Added NSFW text validator
rkritika1508 Apr 2, 2026
b64d0e9
fixed test
rkritika1508 Apr 2, 2026
57d97b2
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 2, 2026
09b6a05
fix: profanity free validator description
dennyabrain Apr 6, 2026
f4a11fa
doc: updated details of sentence parameter
dennyabrain Apr 7, 2026
f330f1b
fix: remove vscode files
dennyabrain Apr 7, 2026
51c9266
Added integration tests
rkritika1508 Apr 7, 2026
141e5fc
Merge branch 'main' into feat/toxicity-hub-validators
rkritika1508 Apr 7, 2026
c76f829
added integration tests
rkritika1508 Apr 7, 2026
baac9e4
fix: profanity free validator description
dennyabrain Apr 6, 2026
627fb4f
Added integration tests
rkritika1508 Apr 7, 2026
8b3da89
validator config: add name to config (#79)
nishika26 Apr 7, 2026
cc0bb14
added integration tests
rkritika1508 Apr 7, 2026
3037eb8
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 7, 2026
b69883d
added integration tests
rkritika1508 Apr 7, 2026
8f67176
updated readme
rkritika1508 Apr 7, 2026
affe72d
Added installation of huggingface model in dockerfile
rkritika1508 Apr 7, 2026
8b0a183
resolved comment
rkritika1508 Apr 7, 2026
14f6dc1
removed blank line
rkritika1508 Apr 7, 2026
74f8a82
updated policies for llama guard
rkritika1508 Apr 7, 2026
6676414
fixed tests
rkritika1508 Apr 7, 2026
0d15d0c
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 7, 2026
6443c1b
updated readme and fixed llama guard inference
rkritika1508 Apr 8, 2026
af933ef
fixed test
rkritika1508 Apr 8, 2026
9b6616a
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 9, 2026
9aca5f2
Merge branch 'main' into feat/toxicity-hub-validators
rkritika1508 Apr 10, 2026
664ded8
resolved comments
rkritika1508 Apr 10, 2026
0ce6ebb
Added evaluation readme (#82)
rkritika1508 Apr 10, 2026
ba27b80
resolved comments
rkritika1508 Apr 10, 2026
d7c5eba
resolved comments
rkritika1508 Apr 10, 2026
02fd043
fixed llama guard
rkritika1508 Apr 10, 2026
d9569ba
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 10, 2026
31af2f6
Toxicity Detection validators (#80)
rkritika1508 Apr 10, 2026
a061af8
Merge branch 'main' into feat/toxicity-huggingface-model
rkritika1508 Apr 10, 2026
88c1b56
removed unnecessary changes
rkritika1508 Apr 10, 2026
5b2fe3b
fix: update default nsfw_text model to michellejieli/NSFW_text_classi…
rkritika1508 Apr 10, 2026
fd3cddc
fix: use textdetox/xlmr-large-toxicity-classifier as default nsfw_tex…
rkritika1508 Apr 10, 2026
7264771
updated readme
rkritika1508 Apr 10, 2026
217ba9b
Merge branch 'main' into feat/toxicity-huggingface-model
nishika26 Apr 13, 2026
8 changes: 8 additions & 0 deletions backend/Dockerfile
@@ -47,6 +47,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install pinned spaCy model in the final environment used at runtime.
RUN python -m pip install --no-deps "${SPACY_MODEL_WHEEL_URL}"

# Set HuggingFace cache directory
ENV HF_HOME=/app/hf_cache

# Pre-download HuggingFace model
RUN /app/.venv/bin/python -c "from transformers import AutoTokenizer, AutoModelForSequenceClassification; \
AutoTokenizer.from_pretrained('textdetox/xlmr-large-toxicity-classifier', cache_dir='/app/hf_cache'); \
AutoModelForSequenceClassification.from_pretrained('textdetox/xlmr-large-toxicity-classifier', cache_dir='/app/hf_cache')"

# -------------------------------
# Entrypoint (runtime setup)
# -------------------------------
2 changes: 2 additions & 0 deletions backend/app/api/API_USAGE.md
@@ -100,6 +100,7 @@ Endpoint:
Optional filters:
- `ids=<uuid>&ids=<uuid>`
- `stage=input|output`
- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|profanity_free|nsfw_text`
- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|profanity_free`

Example:
@@ -461,6 +462,7 @@ From `validators.json`:
- `topic_relevance`
- `llamaguard_7b`
- `profanity_free`
- `nsfw_text`

Source of truth:
- `backend/app/core/validators/validators.json`
1 change: 1 addition & 0 deletions backend/app/core/enum.py
@@ -35,3 +35,4 @@ class ValidatorType(Enum):
LLMCritic = "llm_critic"
LlamaGuard7B = "llamaguard_7b"
ProfanityFree = "profanity_free"
NSFWText = "nsfw_text"
44 changes: 43 additions & 1 deletion backend/app/core/validators/README.md
@@ -14,6 +14,7 @@ Current validator manifest:
- `topic_relevance` (source: `local`)
- `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`)
- `profanity_free` (source: `hub://guardrails/profanity_free`)
- `nsfw_text` (source: `hub://guardrails/nsfw_text`)

## Configuration Model

@@ -409,7 +410,47 @@ Notes / limitations:
- `on_fail=fix` returns `""` on failure — LlamaGuard has no programmatic fix, so `safe_text` will be `""` and the response `metadata.reason` will identify this validator as the cause.
- LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts.

### 8) Profanity Free Validator (`profanity_free`)
### 8) NSFW Text Validator (`nsfw_text`)

Code:

- Config: `backend/app/core/validators/config/nsfw_text_safety_validator_config.py`
- Source: Guardrails Hub (`hub://guardrails/nsfw_text`)

What it does:

- Classifies text as NSFW (not safe for work) using a [HuggingFace transformer model](https://huggingface.co/textdetox/xlmr-large-toxicity-classifier).
- Validates at the sentence level by default; fails if any sentence exceeds the configured threshold.

Why this is used:

- Catches sexually explicit or otherwise inappropriate content that may not be covered by profanity or slur lists.
- Model-based approach handles paraphrased or implicit NSFW content better than keyword matching.

Recommendation:

- `input` and `output`
- Why `input`: prevents explicit user messages from being processed or logged.
- Why `output`: prevents the model from returning NSFW content to end users.
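
For illustration, a minimal request payload sketch mirroring the integration tests added in this PR; the identifier values are placeholders, not real IDs:

```python
import json

# Placeholder identifiers -- real deployments supply their own UUIDs.
payload = {
    "request_id": "req-0001",
    "organization_id": "org-0001",
    "project_id": "proj-0001",
    "input": "user message to screen",
    # The same validator entry works for input-stage and output-stage calls.
    "validators": [{"type": "nsfw_text", "threshold": 0.8, "on_fail": "exception"}],
}

print(json.dumps(payload, indent=2))
```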

Comment on lines +432 to +435
⚠️ Potential issue | 🟡 Minor

nsfw_text recommendation is inconsistent with the default stage strategy.

Line 432 recommends nsfw_text for both input and output, but the default stage strategy list (Lines 516-517) does not include it. This can mislead default rollout configs.

📌 Proposed doc fix
- - Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `llamaguard_7b`
- - Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `llamaguard_7b`
+ - Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `llamaguard_7b`, `nsfw_text`
+ - Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `llamaguard_7b`, `nsfw_text`
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/core/validators/README.md` around lines 432 - 435, The README
shows `nsfw_text` recommended for both `input` and `output` but the "default
stage strategy" list omits it; update the documentation so they match by either
adding `nsfw_text` to the default stage strategy list or removing it from the
`input`/`output` recommendation. Locate the `nsfw_text` mention and the `default
stage strategy` section (the list that defines the default rollout strategy) and
make them consistent, ensuring the doc clearly states whether `nsfw_text` is
enabled by default for `input` and/or `output`.

Parameters / customization:

- `threshold: float` (default: `0.8`) — probability threshold above which text is classified as NSFW
- `validation_method: str` (default: `"sentence"`) — granularity of validation:
- `"sentence"`: each sentence is classified independently; validation fails if **any** sentence exceeds the threshold. Preferred when inputs are multi-sentence and you want to catch a single offensive sentence without failing the whole message.
- `"full"`: the entire text is passed as one unit for classification. Use when inputs are short (single-sentence messages or responses) or when you want to evaluate overall tone rather than per-sentence content.
- `device: str | None` (default: `"cpu"`) — inference device (`"cpu"` or `"cuda"`)
- `model_name: str | None` (default: `"textdetox/xlmr-large-toxicity-classifier"`) — HuggingFace model identifier used for classification. Another supported value is `"michellejieli/NSFW_text_classifier"`.
- `on_fail`
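
The sentence/full distinction can be sketched in plain Python. Note that `stub_score` is a toy stand-in for the HuggingFace classifier, not the real scoring logic:

```python
def validate(text, score, threshold=0.8, validation_method="sentence"):
    """Return True if text passes, False if it is flagged as NSFW.

    `score` stands in for the model: a callable mapping a string to a
    probability in [0, 1].
    """
    if validation_method == "full":
        return score(text) <= threshold
    # Sentence mode: the whole input fails if ANY sentence exceeds the threshold.
    sentences = [s.strip() for s in text.split(".") if s.strip()]
    return all(score(s) <= threshold for s in sentences)

def stub_score(text):
    # Toy scorer: fraction of words equal to the (stand-in) trigger word.
    words = text.lower().split()
    return words.count("explicit") / max(len(words), 1)

text = "A harmless sentence. An explicit one."
print(validate(text, stub_score, threshold=0.3, validation_method="sentence"))  # False
print(validate(text, stub_score, threshold=0.3, validation_method="full"))      # True
```

Sentence mode is stricter on mixed content: one offending sentence fails the whole input, even when the offending words are diluted across a longer message.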

Notes / limitations:

- Model runs locally; first use will download the model weights unless pre-cached.
- Default model is English-focused; multilingual NSFW detection may require a different `model_name`.
- No programmatic fix is applied — with `on_fail=fix`, `safe_text` will be `""` and the response `metadata.reason` will identify this validator as the cause.
- **Latency**: this validator runs a local transformer model on CPU. For short, single-turn WhatsApp-style messages, sentence-level inference typically adds ~200–500 ms per request on CPU. Use `validation_method="full"` for shorter inputs to avoid per-sentence overhead. For high-throughput deployments, consider using GPU (`device="cuda"`) or moving this validator to async post-processing rather than the synchronous request path.
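
The async post-processing suggestion can be sketched with the standard library; `check_nsfw` here is a hypothetical stand-in for the real validator call, not part of this codebase:

```python
from concurrent.futures import ThreadPoolExecutor

def check_nsfw(text):
    # Stand-in for the real model call (~200-500 ms of CPU inference in practice).
    return {"flagged": "explicit" in text.lower()}

executor = ThreadPoolExecutor(max_workers=2)

def handle_request(text):
    # Reply immediately; moderation runs off the request path and can flag
    # the exchange afterwards (e.g., for review or message revocation).
    moderation = executor.submit(check_nsfw, text)
    return {"reply": "...", "moderation": moderation}

result = handle_request("Tell me about renewable energy sources.")
print(result["moderation"].result()["flagged"])  # False for this clean input
```

The trade-off: the user may briefly see content that is later flagged, so this pattern suits review-and-revoke workflows rather than hard blocking.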

### 9) Profanity Free Validator (`profanity_free`)

Code:

@@ -491,6 +532,7 @@ Tuning strategy:
- `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py`
- `backend/app/core/validators/config/topic_relevance_safety_validator_config.py`
- `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py`
- `backend/app/core/validators/config/nsfw_text_safety_validator_config.py`
- `backend/app/core/validators/config/profanity_free_safety_validator_config.py`
- `backend/app/schemas/guardrail_config.py`
- `backend/app/schemas/validator_config.py`
22 changes: 22 additions & 0 deletions backend/app/core/validators/config/nsfw_text_safety_validator_config.py
@@ -0,0 +1,22 @@
from typing import Literal, Optional

from guardrails.hub import NSFWText

from app.core.validators.config.base_validator_config import BaseValidatorConfig


class NSFWTextSafetyValidatorConfig(BaseValidatorConfig):
type: Literal["nsfw_text"]
threshold: float = 0.8
validation_method: str = "sentence"
device: Optional[str] = "cpu"
model_name: Optional[str] = "textdetox/xlmr-large-toxicity-classifier"

def build(self):
return NSFWText(
threshold=self.threshold,
validation_method=self.validation_method,
device=self.device,
model_name=self.model_name,
on_fail=self.resolve_on_fail(),
)
5 changes: 5 additions & 0 deletions backend/app/core/validators/validators.json
@@ -39,6 +39,11 @@
"type": "profanity_free",
"version": "0.1.0",
"source": "hub://guardrails/profanity_free"
},
{
"type": "nsfw_text",
"version": "0.1.0",
"source": "hub://guardrails/nsfw_text"
}
]
}
4 changes: 4 additions & 0 deletions backend/app/schemas/guardrail_config.py
@@ -27,6 +27,9 @@
from app.core.validators.config.llamaguard_7b_safety_validator_config import (
LlamaGuard7BSafetyValidatorConfig,
)
from app.core.validators.config.nsfw_text_safety_validator_config import (
NSFWTextSafetyValidatorConfig,
)
from app.core.validators.config.profanity_free_safety_validator_config import (
ProfanityFreeSafetyValidatorConfig,
)
@@ -39,6 +42,7 @@
LLMCriticSafetyValidatorConfig,
PIIRemoverSafetyValidatorConfig,
LlamaGuard7BSafetyValidatorConfig,
NSFWTextSafetyValidatorConfig,
ProfanityFreeSafetyValidatorConfig,
TopicRelevanceSafetyValidatorConfig,
],
156 changes: 156 additions & 0 deletions backend/app/tests/test_guardrails_api_integration.py
@@ -323,6 +323,65 @@ def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client):
assert body["success"] is False


# ---------------------------------------------------------------------------
# NSFWText
# ---------------------------------------------------------------------------


def test_input_guardrails_with_nsfw_text_on_explicit_content(integration_client):
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "She slowly undressed and they had explicit sex.",
"validators": [{"type": "nsfw_text", "on_fail": "exception"}],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is False


def test_input_guardrails_with_nsfw_text_with_low_threshold(integration_client):
"""A low threshold makes the detector more sensitive — explicit content is caught."""
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "She slowly undressed and they had explicit sex.",
"validators": [
{"type": "nsfw_text", "threshold": 0.1, "on_fail": "exception"}
],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is False


def test_input_guardrails_with_nsfw_text_exception_action(integration_client):
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "Explicit sexual content goes here.",
"validators": [{"type": "nsfw_text", "on_fail": "exception"}],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is False


# ---------------------------------------------------------------------------
# Combinations of toxicity detectors
# ---------------------------------------------------------------------------
@@ -373,3 +432,100 @@ def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text(
assert response.status_code == 200
body = response.json()
assert body["success"] is False


def test_input_guardrails_with_nsfw_text_and_profanity_free_on_explicit_profane_text(
integration_client,
):
"""Profane+explicit content: profanity_free reliably catches the profanity regardless
of whether the NSFWText model is available, guaranteeing a failure response."""
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "What the fuck, this is explicit adult sexual content.",
"validators": [
{"type": "profanity_free", "on_fail": "exception"},
{"type": "nsfw_text", "on_fail": "exception"},
],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is False


def test_input_guardrails_with_nsfw_text_and_slur_match_on_explicit_slur_text(
integration_client,
):
"""Slur+explicit content: uli_slur_match reliably catches the slur regardless
of whether the NSFWText model is available, guaranteeing a failure response."""
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "This chakki has explicit sexual content.",
"validators": [
{"type": "uli_slur_match", "severity": "all", "on_fail": "exception"},
{"type": "nsfw_text", "on_fail": "exception"},
],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is False


def test_input_guardrails_with_profanity_free_and_ban_list_clean_text(
integration_client,
):
"""Clean text passes both profanity_free and ban_list checks unchanged."""
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "Tell me about renewable energy sources.",
"validators": [
{"type": "profanity_free"},
{"type": "ban_list", "banned_words": ["fossil"]},
],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is True
assert body["data"][SAFE_TEXT_FIELD] == "Tell me about renewable energy sources."


def test_input_guardrails_with_lexical_toxicity_detectors_on_clean_text(
integration_client,
):
"""Clean text passes uli_slur_match, profanity_free, and ban_list unchanged."""
response = integration_client.post(
VALIDATE_API_PATH,
json={
"request_id": request_id,
"organization_id": organization_id,
"project_id": project_id,
"input": "What are some healthy breakfast options?",
"validators": [
{"type": "uli_slur_match", "severity": "all"},
{"type": "profanity_free"},
{"type": "ban_list", "banned_words": ["junk"]},
],
},
)

assert response.status_code == 200
body = response.json()
assert body["success"] is True
assert body["data"][SAFE_TEXT_FIELD] == "What are some healthy breakfast options?"