Merged
42 changes: 42 additions & 0 deletions .deepreview
Original file line number Diff line number Diff line change
@@ -386,3 +386,45 @@ deepreview_config_quality:
- FAIL: Issues found. List each with the .deepreview file path,
rule name, which check failed (consolidation / description / overly-broad / placement),
and a specific recommendation.

job_schema_instruction_compatibility:
description: "Verify deepwork_jobs instruction files, templates, and examples are compatible with the job schema."
match:
include:
- "src/deepwork/jobs/job.schema.json"
- "src/deepwork/standard_jobs/deepwork_jobs/steps/*.md"
- "src/deepwork/standard_jobs/deepwork_jobs/templates/*"
- "src/deepwork/standard_jobs/deepwork_jobs/job.yml"
review:
strategy: matches_together
additional_context:
unchanged_matching_files: true
instructions: |
When the job schema or deepwork_jobs instruction files change, verify they
are still compatible with each other.

Read src/deepwork/jobs/job.schema.json to understand the current schema.
Then read each instruction file, template, and example in
src/deepwork/standard_jobs/deepwork_jobs/ and check:

1. **Field references**: Every field name mentioned in prose instructions,
templates, or examples must exist in the schema at the correct level.
Pay special attention to root-level vs step-level fields — a field
that exists on steps may not exist at the root, and vice versa.

2. **Required vs optional**: If instructions say a field is required,
verify the schema agrees. If instructions say a field is optional,
verify the schema doesn't require it.

3. **Schema structure**: Template files and examples that show YAML
structure must match the schema's property names and nesting.

4. **Terminology consistency**: Instructions should use the same field
names as the schema (e.g., if the schema uses
"common_job_info_provided_to_all_steps_at_runtime", instructions
should not call it "description" or "job_description").

Output Format:
- PASS: All instruction files are compatible with the schema.
- FAIL: Incompatibilities found. List each with the file path, line
reference, the incompatible content, and what the schema actually says.
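The field-reference check in item 1 can be approximated mechanically: collect every property name the schema defines, then flag backticked identifiers in instruction prose that the schema does not know about. A rough sketch — the schema and instruction literals below are invented stand-ins, not the contents of the real `job.schema.json` or step files:

```python
import re

def schema_property_names(schema) -> set:
    """Recursively collect every property name defined anywhere in a JSON schema."""
    names = set()
    if isinstance(schema, dict):
        for key, value in schema.items():
            if key == "properties" and isinstance(value, dict):
                names.update(value.keys())
            names |= schema_property_names(value)
    elif isinstance(schema, list):
        for item in schema:
            names |= schema_property_names(item)
    return names

# Stand-in schema and instruction text, invented for this sketch.
schema = {
    "properties": {
        "workflows": {
            "items": {"properties": {"name": {}, "steps": {}}}
        }
    }
}
known = schema_property_names(schema)
instructions = "Give each workflow a `name` and `steps`; do not add a `description` field."
mentioned = set(re.findall(r"`(\w+)`", instructions))
print(sorted(mentioned - known))  # → ['description']
```

A real reviewer agent would still need the prose judgment calls (root-level vs step-level placement, required vs optional), but this kind of pass catches outright phantom fields cheaply.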
1 change: 1 addition & 0 deletions .deepwork/.gitignore
@@ -2,6 +2,7 @@
# These files are generated during sessions and should not be committed
.last_work_tree
.last_head_ref
job.schema.json

# Temporary files (but keep the directory via .gitkeep)
tmp/*
7 changes: 3 additions & 4 deletions .github/workflows/README.md
@@ -21,7 +21,7 @@ We use a skip pattern so the same required checks pass in both PR and merge queu
|----------|--------|----------------|----------------|
| **Validate** | Runs | Runs | Runs |
| **Integration Tests** | Skipped (passes) | Runs | Runs |
| **E2E Tests** | Skipped (passes) | Runs | Runs |
| **E2E Tests** | Skipped unless workflow file changed | Runs | Runs |
| **CLA Check** | Runs | Skipped (passes) | Skipped (passes) |

### How It Works
@@ -64,7 +64,6 @@ When a job is skipped due to an `if` condition, GitHub treats it as a successful

In GitHub branch protection rules, require these checks:
- `Validate / tests`
- `Claude Code Integration Test / pr-check` (for PRs)
- `Claude Code Integration Test / validate-generation` (for merge queue)
- `Claude Code Integration Test / claude-code-e2e` (for merge queue)
- `CLA Assistant / merge-queue-pass` (for merge queue)
@@ -84,10 +83,10 @@ All checks will pass in both PR and merge queue contexts (either by running or b
### claude-code-test.yml
- **Triggers**: `pull_request` (main), `merge_group` (main), `workflow_dispatch`
- **Jobs**:
- `pr-check`: Runs on PRs only, always passes (lightweight check)
- `validate-generation`: Tests skill generation from fixtures (no API key needed)
- `claude-code-e2e`: Full end-to-end test with Claude Code CLI (requires `ANTHROPIC_API_KEY`)
- `validate-generation` and `claude-code-e2e` skip on PRs, run in merge queue and manual dispatch
- `validate-generation` skips on PRs, runs in merge queue and manual dispatch
- `claude-code-e2e` skips on PRs unless the workflow file itself is changed (so CI fixes can be iterated in PRs)

### cla.yml
- **Triggers**: `pull_request_target`, `issue_comment`, `merge_group` (main), `workflow_dispatch`
67 changes: 55 additions & 12 deletions .github/workflows/claude-code-test.yml
@@ -130,16 +130,32 @@ jobs:
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
steps:
# For PRs: just pass quickly (actual tests run in merge queue)
- name: Skip on PR
if: github.event_name == 'pull_request'
run: echo "E2E tests will run in merge queue. Passing for PR."
# Determine whether to run the full e2e test suite.
# Always runs in merge_group and workflow_dispatch.
# For PRs, only runs if the workflow file itself was changed (so we can iterate on CI fixes).
- name: Determine if tests should run
id: should-run
env:
GH_TOKEN: ${{ github.token }}
run: |
if [ "${{ github.event_name }}" != "pull_request" ]; then
echo "run=true" >> $GITHUB_OUTPUT
else
FILES=$(gh api repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/files --jq '.[].filename' 2>/dev/null || echo "")
if echo "$FILES" | grep -q '^\.github/workflows/claude-code-test\.yml$'; then
echo "run=true" >> $GITHUB_OUTPUT
echo "Workflow file changed in PR - running e2e tests"
else
echo "run=false" >> $GITHUB_OUTPUT
echo "E2E tests will run in merge queue. Passing for PR."
fi
fi

- uses: actions/checkout@v4
if: github.event_name != 'pull_request'
if: steps.should-run.outputs.run == 'true'

- name: Check for API key
if: github.event_name != 'pull_request'
if: steps.should-run.outputs.run == 'true'
id: check-key
run: |
if [ -z "$ANTHROPIC_API_KEY" ]; then
@@ -221,7 +237,8 @@ jobs:
'allow': [
'Bash(*)', 'Read(./**)', 'Edit(./**)', 'Write(./**)', 'Skill(*)',
'mcp__deepwork__get_workflows', 'mcp__deepwork__start_workflow',
'mcp__deepwork__finished_step', 'mcp__deepwork__abort_workflow'
'mcp__deepwork__finished_step', 'mcp__deepwork__abort_workflow',
'mcp__deepwork__go_to_step'
]
}
}
@@ -247,9 +264,10 @@ jobs:
echo "=== Running /deepwork to create fruits job ==="
mkdir fruits

# Use --debug to capture detailed logs for diagnosing failures.
# The debug log is dumped in the failure handler below.
claude --print --debug --model claude-sonnet-4-5 <<'PROMPT_EOF'
# Use --debug and --output-format stream-json for diagnosing failures.
# stream-json shows every tool call; output is captured to a file for the failure handler.
set -o pipefail
claude --print --verbose --output-format stream-json --max-turns 20 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions <<'PROMPT_EOF' | tee ../claude-create-job.jsonl
/deepwork I want to create a simple job called "fruits" for identifying and classifying fruits.

Here are the EXACT specifications.
@@ -268,9 +286,12 @@ jobs:
**CRITICAL**: must put the classified fruit list in `./fruits/classified_fruits.md`.

**Key Instructions:**
- Do not ask questions - just make the job
- NEVER use AskUserQuestion — you already have all the information you need above.
- You MUST complete all tool calls needed to create the files. Do not stop early.
- Do not ask questions - just make the job.
- Rules are explicitly not desired. Tell the review agents that.
- Do not give long commentary on what you did - just make the job with no commentary.
- NEVER start the "repair" or "learn" workflows. Only use "new_job". If a quality review fails, fix the issues in the files and resubmit — do not switch workflows.
- IMPORTANT: Once the job.yml and step instruction files have been created (i.e. after the "define" and "implement" steps are done), STOP. Do NOT continue into the "test" or "iterate" steps. Abort the workflow at that point. We only need the job definition files created, not the full workflow run.
PROMPT_EOF

@@ -309,6 +330,22 @@ jobs:
if: failure() && steps.check-key.outputs.has_key == 'true'
working-directory: test_project
run: |
echo "=== Claude stream-json output (create job) ==="
if [ -f "../claude-create-job.jsonl" ]; then
echo "--- Last 100 lines ---"
tail -100 ../claude-create-job.jsonl
else
echo "No stream-json output captured for create job step"
fi
echo ""
echo "=== Claude stream-json output (run workflow) ==="
if [ -f "../claude-run-workflow.jsonl" ]; then
echo "--- Last 100 lines ---"
tail -100 ../claude-run-workflow.jsonl
else
echo "No stream-json output captured for run workflow step"
fi
echo ""
echo "=== Claude debug log ==="
# Claude --debug writes to ~/.claude/debug.log
if [ -f "$HOME/.claude/debug.log" ]; then
@@ -340,8 +377,12 @@ jobs:
run: |
echo "=== Running fruits workflow with test input via /deepwork ==="

claude --print --model claude-sonnet-4-5 <<'PROMPT_EOF'
set -o pipefail
claude --print --verbose --output-format stream-json --max-turns 20 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions <<'PROMPT_EOF' | tee ../claude-run-workflow.jsonl
/deepwork Run the fruits full workflow. Process the list to the file and don't give any extra commentary or text output.
NEVER use AskUserQuestion — you already have all the information you need.
You MUST complete all tool calls needed. Do not stop early.
CRITICAL: All output files MUST be written relative to the current working directory (the project root), NOT inside .deepwork/jobs/. For example, write to ./fruits/identified_fruits.md, NOT .deepwork/jobs/fruits/identified_fruits.md.
raw_items: apple, car, banana, chair, orange, table, mango, laptop, grape, bicycle
PROMPT_EOF

@@ -405,4 +446,6 @@
test_project/.claude/skills/deepwork/
test_project/fruits/identified_fruits.md
test_project/fruits/classified_fruits.md
claude-create-job.jsonl
claude-run-workflow.jsonl
retention-days: 7
1 change: 1 addition & 0 deletions specs/deepwork/jobs/JOBS-REQ-001-mcp-workflow-tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The DeepWork MCP server exposes five workflow tools to AI agents via the Model C
10. The server MUST be named `"deepwork"`.
11. The server MUST include instructions text describing the workflow lifecycle (Discover, Start, Execute, Checkpoint, Iterate, Continue, Complete, Going Back).
12. Every tool call MUST be logged with the tool name and current stack state.
13. On startup, the server MUST copy `job.schema.json` from its package-bundled location to `.deepwork/job.schema.json` under the project root, overwriting any existing file at that path. If the copy fails (e.g., permission error), the server MUST log a warning and continue without error.

### JOBS-REQ-001.2: get_workflows Tool

24 changes: 24 additions & 0 deletions src/deepwork/jobs/mcp/server.py
@@ -14,6 +14,7 @@
from __future__ import annotations

import logging
import shutil
from pathlib import Path
from typing import Any

@@ -34,6 +35,26 @@
logger = logging.getLogger("deepwork.jobs.mcp")


def _ensure_schema_available(project_root: Path) -> None:
"""Copy job.schema.json to .deepwork/ so agents have a stable reference path.

The schema file is bundled with the DeepWork package at an install-dependent
location. This copies it to .deepwork/job.schema.json on every server start
so that agents and step instructions can always reference it at a known path.
"""
from deepwork.jobs.schema import get_schema_path

schema_source = get_schema_path()
target_dir = project_root / ".deepwork"
target = target_dir / "job.schema.json"

try:
target_dir.mkdir(parents=True, exist_ok=True)
shutil.copy2(schema_source, target)
except OSError:
logger.warning("Could not copy schema to %s", target)


def create_server(
project_root: Path | str,
enable_quality_gate: bool = True,
@@ -60,6 +81,9 @@ def create_server(
"""
project_path = Path(project_root).resolve()

# Copy the job schema to a stable location so agents can always reference it
_ensure_schema_available(project_path)

# Initialize components
state_manager = StateManager(project_path)

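The copy-and-degrade behavior added above is easy to exercise in isolation. A minimal sketch — the schema source is passed in as a parameter here rather than resolved via `deepwork.jobs.schema.get_schema_path()`, purely to keep the example self-contained:

```python
import logging
import shutil
import tempfile
from pathlib import Path

logger = logging.getLogger("deepwork.jobs.mcp")

def _ensure_schema_available(project_root: Path, schema_source: Path) -> None:
    """Copy the schema into .deepwork/, logging (not raising) on failure."""
    target_dir = project_root / ".deepwork"
    target = target_dir / "job.schema.json"
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(schema_source, target)
    except OSError:
        logger.warning("Could not copy schema to %s", target)

# Exercise the happy path in a throwaway directory.
root = Path(tempfile.mkdtemp())
source = root / "job.schema.json"
source.write_text('{"type": "object"}')
_ensure_schema_available(root, source)
copied = root / ".deepwork" / "job.schema.json"
print(copied.read_text())  # → {"type": "object"}
```

The OSError branch matches the spec requirement: a permission failure produces a warning log, and server startup continues.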
11 changes: 6 additions & 5 deletions src/deepwork/jobs/mcp/tools.py
@@ -347,15 +347,16 @@ def get_workflows(self) -> GetWorkflowsResponse:
"""
jobs, load_errors = self._load_all_jobs()
job_infos = [self._job_to_info(job) for job in jobs]
repair_hint = (
"\nThis project likely needs `/deepwork:repair` run to correct the issue"
" unless the offending file(s) were changed this session and the agent can fix it directly."
)
error_infos = [
JobLoadErrorInfo(
job_name=e.job_name,
job_dir=e.job_dir,
error=e.error + repair_hint,
error=(
f"{e.error}\n"
f"The invalid file is {e.job_dir}/job.yml. "
f"If you edited that file this session, fix it directly. "
f"If you did not edit it, the project may need `/deepwork repair` to migrate legacy formats."
),
)
for e in load_errors
]
18 changes: 18 additions & 0 deletions src/deepwork/standard_jobs/deepwork_jobs/job.yml
@@ -16,6 +16,24 @@ common_job_info_provided_to_all_steps_at_runtime: |
confusion or inefficiencies, and improves job instructions. It also captures bespoke
learnings specific to the current run into AGENTS.md files in the working folder.

## Job Schema (CRITICAL)

Before creating or editing any `job.yml` file, you MUST read the JSON schema at
`.deepwork/job.schema.json`. This schema is the authoritative source of truth for
all valid fields, types, and structures. The schema uses `additionalProperties: false`
at every level, so any extra or misspelled fields will cause validation failures.

Key schema rules that agents commonly get wrong:
- **Inputs use `oneOf`** — there are exactly two input formats, with no extra fields allowed:
- User parameter: `{name: str, description: str}` — ONLY these two fields
- File from prior step: `{file: str, from_step: str}` — ONLY these two fields
- **No `type` field on inputs** — do NOT add `type: "user_provided"` or `type: "file"` to inputs
- **No `path` field on inputs** — file paths are resolved by the framework, not specified in inputs
- **Output keys** are the output name; values have `{type: "file"|"files", description: str, required: bool}`
- **No `description` field at root level** — use `common_job_info_provided_to_all_steps_at_runtime` instead

Always read the schema file and validate your job.yml structure against it.

workflows:
- name: new_job
summary: "Create a new DeepWork job from scratch through definition, implementation, testing, and iteration"
Expand Down