Agenta-AI · mmabrouk · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.github/workflows/06-railway-preview-build.yml b/.github/workflows/06-railway-preview-build.yml
@@ -161,3 +161,11 @@ jobs:
       image_tag: ${{ needs.prepare.outputs.image_tag }}
       pr_number: ${{ needs.prepare.outputs.pr_number }}
     secrets: inherit
+
+  playwright:  # acceptance tests against the URL reported by the deploy job
+    needs: deploy
+    if: ${{ needs.deploy.outputs.preview_url != '' }}  # skipped when no URL was reported
+    uses: ./.github/workflows/10-playwright-oss-tests.yml
+    secrets: inherit
+    with:
+      web_url: ${{ needs.deploy.outputs.preview_url }}
diff --git a/.github/workflows/07-railway-preview-deploy.yml b/.github/workflows/07-railway-preview-deploy.yml
@@ -11,6 +11,10 @@ on:
         description: "PR number"
         required: true
         type: string
+    outputs:
+      preview_url:
+        description: "The URL of the deployed preview environment"
+        value: ${{ jobs.deploy.outputs.preview_url }}
   workflow_dispatch:
     inputs:
       image_tag:
@@ -36,6 +40,8 @@ env:
 jobs:
   deploy:
     runs-on: ubuntu-latest
+    outputs:
+      preview_url: ${{ steps.deploy.outputs.preview_url }}
     steps:
       - uses: actions/checkout@v4
 

diff --git a/.github/workflows/10-playwright-oss-tests.yml b/.github/workflows/10-playwright-oss-tests.yml
@@ -0,0 +1,131 @@
+name: '10 - playwright: OSS acceptance tests'
+
+on:
+  workflow_call:  # entry point when another workflow reuses this one
+    inputs:
+      web_url:
+        type: string
+        required: true
+        description: 'URL of the Agenta deployment to test against'
+  workflow_dispatch:  # manual runs; web_url falls back to the default below
+    inputs:
+      web_url:
+        type: string
+        required: true
+        default: 'https://gateway-production-99ee.up.railway.app'
+        description: 'URL of the Agenta deployment to test against'
+
+permissions:
+  contents: read  # the only token scope requested by this workflow
+
+concurrency:
+  cancel-in-progress: true  # a newer run in the same group cancels the older one
+  group: 'playwright-oss-${{ inputs.web_url }}'
+
+jobs:
+  test:
+    name: 'Playwright OSS acceptance'
+    timeout-minutes: 15
+    runs-on: ubuntu-latest
+    env:  # job-level: visible to every step below
+      AGENTA_WEB_URL: '${{ inputs.web_url }}'
+      AGENTA_AUTH_MODE: 'password'
+      AGENTA_LICENSE: 'oss'
+      AGENTA_TEST_PROVIDER: 'mock'
+      AGENTA_EPHEMERAL_PROJECT: 'true'  # quoted so it stays a string, not a YAML boolean
+      AGENTA_OSS_OWNER_EMAIL: '${{ secrets.PLAYWRIGHT_OSS_OWNER_EMAIL }}'
+      AGENTA_OSS_OWNER_PASSWORD: '${{ secrets.PLAYWRIGHT_OSS_OWNER_PASSWORD }}'
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: 'Setup Node.js'
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: 'Install pnpm'
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: 'Install dependencies'
+        run: pnpm install --no-frozen-lockfile --filter agenta-web-tests...
+        working-directory: web
+
+      - name: 'Install Playwright browsers'
+        run: pnpm exec playwright install --with-deps chromium
+        working-directory: web/tests
+
+      - name: Wait for deployment readiness
+        run: |
+          # Health checks must target the deployment origin (scheme://host[:port]).
+          # Cut everything from the first "/", "?" or "#" after the authority, so
+          # /w, /w/ and /a/b all resolve to the same origin. A value without a
+          # scheme does not match the pattern and is used as-is.
+          BASE_URL="$(printf '%s\n' "$AGENTA_WEB_URL" | sed -E 's|^([A-Za-z][A-Za-z0-9+.-]*://[^/?#]+).*$|\1|')"
+          HEALTH_URL="${BASE_URL}/api/health"
+          echo "Waiting for ${HEALTH_URL} to be ready..."
+          for i in $(seq 1 30); do
+            # --max-time bounds each probe so a hung connection cannot stall the loop.
+            if curl -sf --max-time 10 -o /dev/null "$HEALTH_URL" 2>/dev/null; then
+              echo "Deployment is ready."
+              exit 0
+            fi
+            echo "Attempt $i/30: not ready, waiting 10s..."
+            sleep 10
+          done
+          echo "Deployment did not become ready within 5 minutes."
+          exit 1
+
+      - name: 'Run Playwright tests'
+        run: pnpm exec playwright test --reporter=html,github
+        working-directory: web/tests
+
+      - id: upload_report
+        name: 'Upload test report'
+        if: ${{ always() }}  # runs regardless of the outcome of earlier steps
+        uses: actions/upload-artifact@v4
+        with:
+          name: 'playwright-report'
+          path: 'web/tests/playwright-report/'
+          retention-days: 14
+
+      - id: upload_results
+        name: 'Upload test results'
+        if: ${{ failure() }}  # runs only when an earlier step has failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: 'playwright-results'
+          path: 'web/tests/test-results/'
+          retention-days: 7
+
+      - name: Write job summary
+        if: always()
+        env:
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          REPORT_URL: ${{ steps.upload_report.outputs.artifact-url }}
+          RESULTS_URL: ${{ steps.upload_results.outputs.artifact-url }}
+          JOB_STATUS: ${{ job.status }}
+        run: |
+          # Default rows describe the "no artifact" case; each is replaced below
+          # when the matching upload step reported an artifact URL.
+          report_row="| Playwright HTML report | Not available | Report upload did not complete |"
+          results_row="| Raw failure artifacts | Not uploaded | Only uploaded when the job fails |"
+          if [ -n "${REPORT_URL}" ]; then
+            report_row="| Playwright HTML report | [Download artifact](${REPORT_URL}) | Download the zip, then open \`index.html\` locally |"
+          fi
+          if [ -n "${RESULTS_URL}" ]; then
+            results_row="| Raw failure artifacts | [Download artifact](${RESULTS_URL}) | Includes failure screenshots, videos, and traces when present |"
+          fi
+
+          {
+            echo "## Playwright Test Summary"
+            echo
+            echo "| Item | Link | Notes |"
+            echo "| --- | --- | --- |"
+            echo "| Workflow run | [Open run](${RUN_URL}) | Full logs and step output |"
+            echo "${report_row}"
+            echo "${results_row}"
+            echo
+            echo "**Job status:** \`${JOB_STATUS}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/docs/design/playwright-oss-stabilization/README.md b/docs/design/playwright-oss-stabilization/README.md
@@ -0,0 +1,23 @@
+# Playwright OSS Stabilization
+
+Tracks the stabilization of frontend Playwright tests for OSS deployed environments and their integration into CI.
+
+## Current State
+
+**All 12 OSS acceptance tests stabilized** (10 pass, 2 skip gracefully). Phase 0 complete.
+
+## Files
+
+- `context.md` - Problem statement, what was done, goals, and constraints.
+- `research.md` - Test architecture, suite inventory, and key patterns discovered.
+- `plan.md` - Execution phases and next steps.
+- `status.md` - Test results, how to run, known issues, and key patterns.
+- `qa.md` - QA profile, environment contract, and coverage map.
+- `backlog.md` - Completed items and remaining work items (P1–P6).
+
+## Quick Links
+
+- OSS acceptance specs: `web/oss/tests/playwright/acceptance/`
+- BDD feature specs: `web/oss/tests/playwright/acceptance/features/`
+- Playwright config: `web/tests/playwright.config.ts`
+- Run instructions: see `status.md`
diff --git a/docs/design/playwright-oss-stabilization/backlog.md b/docs/design/playwright-oss-stabilization/backlog.md
@@ -0,0 +1,64 @@
+# Backlog
+
+## Completed
+
+1. ~~Fix all OSS acceptance tests against deployed environment~~ (10 pass, 2 skip)
+2. ~~Replace direct URL navigation with sidebar navigation in all tests~~
+3. ~~Fix API response interception race conditions~~
+4. ~~Add graceful skip for testset test when no testsets exist~~
+5. ~~Add BDD feature specs in Gherkin format~~
+6. ~~Add explicit safety guard for destructive teardown paths (`AGENTA_ALLOW_DESTRUCTIVE_TEARDOWN`)~~
+7. ~~Harden auth setup with explicit mode selection (auto/password/otp)~~
+8. ~~Add all required dimension tags (`coverage`, `path`, `lens`, `cost`, `license`) to every test~~
+9. ~~Analyze 15 legacy BDD feature files and produce prioritized coverage plan~~
+10. ~~Document E2E vs API/SDK test boundary and data seeding strategy~~
+11. ~~Add a project-scoped mock provider fixture for runtime tests~~
+12. ~~Move Playground runtime tests from paid provider assumptions to the mock provider path~~
+
+## P1 (Structural cleanup — Phase 1)
+
+1. Rename `testsset` folder to `testset` (requires updating EE wrapper imports).
+2. Unskip or clearly document API keys test with rationale for what setup it needs.
+3. Fix playground direct URL blank content (frontend bug, not test issue).
+4. Wire the `openai` test provider profile into the generic fixture abstraction.
+
+## P2 (CI integration — Phase 2)
+
+1. Add CI workflow running full acceptance suite on every PR.
+2. Add `test:smoke` and `test:acceptance` script aliases in `web/tests/package.json`.
+3. Create ephemeral project per CI run (global-setup creates via `POST /api/projects`, global-teardown deletes via `DELETE /api/projects/{id}`) to prevent data accumulation from repeated runs.
+4. Make the workflow a required check after stability window.
+
+## P3 (Test independence — Phase 3)
+
+1. Implement two-phase global-setup: Phase 1 browser auth -> `state.json`, Phase 2 extract token -> seed data via direct HTTP.
+2. Make each test domain self-sufficient — create own prerequisites via API instead of depending on prior test side effects.
+3. Structure CI so domain jobs can run in parallel, each in its own ephemeral project.
+
+## P4 (Mock LLM — Phase 4)
+
+1. Investigate LiteLLM `mock/` prefix for dummy LLM responses.
+2. If not viable, implement Agenta-level mock/echo provider.
+3. Convert playground tests from `@cost:paid` to `@cost:free`.
+
+## P5 (Coverage expansion — Phase 5, Tier 1)
+
+1. **Testset CRUD**: Create from UI, CSV upload, edit rows/columns, delete testset.
+2. **Variant management**: Create variant, remove variant, compare variants in overview.
+3. **Playground depth**: Load testset in playground, comparison mode, model/params changes (blocked on mock LLM).
+4. **App deletion**: Delete app from UI.
+
+## P6 (Coverage expansion — Phase 5, Tier 2)
+
+1. **Evaluations**: Run with basic evaluator, validate requirements, view results, delete.
+2. **Observability depth**: Span hierarchy, time filter, search, pagination.
+3. **Custom models**: Full provider CRUD, verify model in playground dropdown.
+4. **Evaluator debugging**: Load test case, run variant, run evaluator in debug view.
+
+## Out of scope for OSS
+
+- Membership management (EE-only, no invitations in OSS)
+- Guest scopes / RBAC (EE-only, no roles in OSS)
+- Human evaluation (being deprecated)
+- Custom workflows (requires local server infra)
+- BaseResponse SDK compat (better as integration test)
diff --git a/docs/design/playwright-oss-stabilization/context.md b/docs/design/playwright-oss-stabilization/context.md
@@ -0,0 +1,43 @@
+# Context
+
+## Problem Statement
+
+Frontend Playwright tests were failing against deployed OSS environments due to:
+
+- Direct URL navigation to workspace-scoped routes returning 404.
+- Playground direct URL rendering blank content (frontend client-side state bug).
+- Stale locators not matching actual UI (div-based table rows, changed placeholders, role mismatches).
+- API response interception race conditions (listeners set up after triggers).
+- Tests failing hard when expected data didn't exist (testsets).
+
+This undermined confidence in validating fixes on real deployments.
+
+## What Was Done
+
+All 12 OSS acceptance tests were stabilized (10 pass, 2 skip gracefully):
+
+1. Replaced all direct URL navigation with sidebar-based navigation.
+2. Fixed playground navigation to go through Overview → Playground sidebar click.
+3. Updated all locators to match actual UI (search box, `getByText`, `menuitem` roles).
+4. Fixed API interception timing (listeners before triggers).
+5. Added graceful skips for missing data (testsets).
+6. Created BDD feature specs in Gherkin format.
+
+## Goals
+
+1. Maintain reliable OSS acceptance suite against deployed environments.
+2. Integrate the suite as a CI deployment gate.
+3. Expand coverage with BDD-driven test development.
+4. Keep current auth-by-UI design, hardened with explicit mode selection.
+
+## Non-Goals
+
+- Rewriting all tests to API-seeded auth.
+- Large-scale test framework replacement.
+- Fixing the playground direct URL frontend bug (tracked separately).
+
+## Constraints
+
+- Must run against live deployments (not only localhost).
+- Must support OSS and EE trees without breaking current workflows.
+- `AGENTA_ALLOW_DESTRUCTIVE_TEARDOWN` defaults to `false` on shared environments.