diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..190d012 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,386 @@ +# .ci — CI Images and Pipeline + +``` +.ci/ +├── config.yaml # Unified config (images, jobs, agent definitions) +├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) +├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) +├── build.py # Image builder +├── run.py # CI pipeline runner (Docker layer) +├── ci_resource.py # GPU/memory detection and allocation +├── github_status.py # GitHub Commit Status reporting +├── images/ +│ ├── nvidia/Dockerfile +│ ├── iluvatar/Dockerfile +│ ├── metax/Dockerfile +│ ├── moore/Dockerfile +│ ├── cambricon/Dockerfile +│ └── ascend/Dockerfile +└── tests/ # Unit tests + ├── conftest.py + ├── test_agent.py + ├── test_build.py + ├── test_run.py + ├── test_resource.py + ├── test_github_status.py + └── test_utils.py +``` + +**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` + +--- + +## Configuration `config.yaml` + +Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. +At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). 
+ +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +github: + status_context_prefix: "ci/infiniops" + +agents: # Remote agent URLs (used by CLI for cross-machine dispatch) + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 + +platforms: + nvidia: + image: # Image definition + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: # Flattened as nvidia_gpu + resources: + ngpus: 1 # Scheduler auto-picks this many free GPUs + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: # Platform-level docker args, inherited by all jobs + - "--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: # Flattened as iluvatar_gpu + resources: + gpu_ids: "0" + gpu_style: none # CoreX: passthrough via --privileged + /dev mount + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml +``` + +### Config hierarchy + +| Level | Field | Description | +|---|---|---| +| **Platform** | `image` | Image definition (dockerfile, build_args) | +| | `image_tag` | Default image tag (defaults to `latest`) | +| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | +| | `volumes` | Extra volume mounts | +| | `setup` | In-container setup command | +| | `env` | Injected container env vars | +| **Job** | `resources.ngpus` | Number 
of GPUs — scheduler auto-picks free ones (NVIDIA only) | +| | `resources.gpu_ids` | Static GPU device IDs (e.g., `"0"`, `"0,2"`) | +| | `resources.gpu_style` | GPU passthrough: `nvidia` (default), `none`, or `mlu` | +| | `resources.memory` | Container memory limit | +| | `resources.shm_size` | Shared memory size | +| | `resources.timeout` | Max run time in seconds | +| | `stages` | Execution stage list | +| | Any platform field | Jobs can override any platform-level default | + +--- + +## Image builder `build.py` + +| Flag | Description | +|---|---| +| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | +| `--commit` | Use specific commit ref as image tag (default: HEAD) | +| `--force` | Skip Dockerfile change detection | +| `--dry-run` | Print commands without executing | + +```bash +# Build with change detection (skips if no Dockerfile changes) +python .ci/build.py --platform nvidia + +# Build Iluvatar image +python .ci/build.py --platform iluvatar --force + +# Force build all platforms +python .ci/build.py --force +``` + +Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. +Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. + +> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. + +--- + +## Pipeline runner `run.py` + +Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon` on PATH), no manual specification needed. + +| Flag | Description | +|---|---| +| `--config` | Config file path (default: `.ci/config.yaml`) | +| `--job` | Job name: short (`gpu`) or full (`nvidia_gpu`). 
Defaults to all jobs for the current platform | +| `--branch` | Override clone branch (default: config `repo.branch`) | +| `--stage` | Run only the specified stage | +| `--image-tag` | Override image tag | +| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) | +| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | +| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | +| `--local` | Mount current directory (read-only) instead of cloning from git | +| `--dry-run` | Print docker command without executing | + +```bash +# Simplest usage: auto-detect platform, run all jobs, use config default branch +python .ci/run.py + +# Specify short job name +python .ci/run.py --job gpu + +# Full job name (backward compatible) +python .ci/run.py --job nvidia_gpu + +# Run only the test stage, preview mode +python .ci/run.py --job gpu --stage test --dry-run + +# Test local uncommitted changes without pushing +python .ci/run.py --local +``` + +Container execution flow: `git clone` → `checkout` → `setup` → stages. +With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. +Proxy vars are forwarded from the host. Test results are written to `--results-dir`. Each run uses a clean environment (no host pip cache mounted). 
+ +--- + +## Platform differences + +| Platform | GPU passthrough | `gpu_style` | Base image | Detection tool | +|---|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvidia` (default) | `nvcr.io/nvidia/pytorch:24.10-py3` | `nvidia-smi` | +| Iluvatar | `--privileged` + `/dev` mount | `none` | `corex:qs_pj20250825` | `ixsmi` | +| MetaX | `--privileged` | `none` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | +| Moore | `--privileged` | `none` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | +| Cambricon | `--privileged` | `mlu` | `cambricon/pytorch:v1.25.3` | `cnmon` | +| Ascend | TODO | — | `ascend-pytorch:24.0.0` | — | + +`gpu_style` controls the Docker device injection mechanism: `nvidia` uses `--gpus`, `none` uses `CUDA_VISIBLE_DEVICES` (or skips injection for Moore), `mlu` uses `MLU_VISIBLE_DEVICES`. + +--- + +## Runner Agent `agent.py` + +The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. + +### CLI manual execution + +```bash +# Run all jobs (dispatched to remote agents, using config default branch) +python .ci/agent.py run + +# Specify branch +python .ci/agent.py run --branch feat/xxx + +# Run a specific job +python .ci/agent.py run --job nvidia_gpu + +# Filter by platform +python .ci/agent.py run --platform nvidia + +# Preview mode +python .ci/agent.py run --dry-run +``` + +| Flag | Description | +|---|---| +| `--branch` | Test branch (default: config `repo.branch`) | +| `--job` | Specific job name | +| `--platform` | Filter jobs by platform | +| `--commit` | Override commit SHA used for GitHub status reporting | +| `--image-tag` | Override image tag | +| `--dry-run` | Preview mode | + +### Webhook server + +Deploy one Agent instance per platform machine (platform is auto-detected). 
On each machine: + +```bash +python .ci/agent.py serve --port 8080 +``` + +Additional `serve` flags: + +| Flag | Description | +|---|---| +| `--port` | Listen port (default: 8080) | +| `--host` | Listen address (default: `0.0.0.0`) | +| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | +| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | +| `--results-dir` | Results directory (default: `ci-results`) | +| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | + +| Endpoint | Method | Description | +|---|---|---| +| `/webhook` | POST | GitHub webhook (push/pull_request) | +| `/api/run` | POST | Remote job trigger | +| `/api/job/{id}` | GET | Query job status | +| `/health` | GET | Health check | +| `/status` | GET | Queue + resource status | + +Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. + +### Remote agent configuration + +Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: + +```yaml +agents: + nvidia: + url: http://:8080 + iluvatar: + url: http://:8080 + metax: + url: http://:8080 + moore: + url: http://:8080 +``` + +### Resource scheduling + +The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: +- GPU utilization < threshold (default 10%) and not allocated by Agent → available +- When resources are insufficient, jobs are queued automatically; completed jobs release resources and trigger scheduling of queued tasks + +### GitHub Status + +Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: +- `pending` — job started +- `success` / `failure` — job completed + +Status context format: `ci/infiniops/{job_name}` + +--- + +## Multi-machine deployment guide + +### Per-platform setup + +Each machine needs Docker installed, the platform runtime, and the base CI image built. 
+ +| Platform | Runtime check | Base image | Build command | +|---|---|---|---| +| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:24.10-py3` (public) | `python .ci/build.py --platform nvidia` | +| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | +| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | +| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | + +### Start Agent services + +On each machine (platform is auto-detected): + +```bash +python .ci/agent.py serve --port 8080 +``` + +### Configure remote agent URLs + +On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format). + +### Trigger cross-platform tests + +```bash +# Run all platform jobs at once (using config default branch) +python .ci/agent.py run + +# Preview mode (no actual execution) +python .ci/agent.py run --dry-run + +# Run only a specific platform +python .ci/agent.py run --platform nvidia +``` + +### Optional configuration + +#### GitHub Status reporting + +Set the env var on all machines so each reports its own platform's test status: + +```bash +export GITHUB_TOKEN=ghp_xxxxxxxxxxxx +``` + +#### API Token authentication + +When agents are exposed on untrusted networks, enable token auth: + +```bash +python .ci/agent.py serve --port 8080 --api-token +# Or: export AGENT_API_TOKEN= +``` + +#### GitHub Webhook auto-trigger + +In GitHub repo → Settings → Webhooks, add a webhook for each machine: + +| Field | Value | +|---|---| +| Payload URL | `http://:8080/webhook` | +| Content type | `application/json` | +| Secret | Must match `--webhook-secret` | +| Events | `push` and `pull_request` | + +```bash +python .ci/agent.py 
serve --port 8080 --webhook-secret +# Or: export WEBHOOK_SECRET= +``` + +### Verification checklist + +```bash +# 1. Dry-run each machine individually +for platform in nvidia iluvatar metax moore; do + python .ci/agent.py run --platform $platform --dry-run +done + +# 2. Health and resource checks +for ip in ; do + curl http://$ip:8080/health + curl http://$ip:8080/status +done + +# 3. Cross-platform test +python .ci/agent.py run --branch master +``` diff --git a/.ci/agent.py b/.ci/agent.py new file mode 100644 index 0000000..3fb5d9e --- /dev/null +++ b/.ci/agent.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. + +Usage: + # Run jobs locally (or dispatch to remote agents) + python .ci/agent.py run + python .ci/agent.py run --branch master --job nvidia_gpu --dry-run + + # Start webhook server (auto-detects platform) + python .ci/agent.py serve --port 8080 +""" + +import argparse +import collections +import hashlib +import hmac +import json +import os +import shlex +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request +import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +import ci_resource as res +import github_status as gh +import run + +# Maximum POST body size (1 MB) to prevent memory exhaustion +MAX_CONTENT_LENGTH = 1 * 1024 * 1024 + +# Job states +STATE_QUEUED = "queued" +STATE_RUNNING = "running" +STATE_PENDING = "pending" +STATE_SUCCESS = "success" +STATE_FAILURE = "failure" +STATE_ERROR = "error" + +TAIL_LINES = 50 + +# urllib helpers (module-level for easier mocking in tests) +urllib_request = urllib.request.Request +urllib_urlopen = urllib.request.urlopen + + +# --------------------------------------------------------------------------- +# Data classes +# 
class JobRequest:
    """A single CI job waiting to be (or being) executed.

    Captures everything the scheduler needs to run the job: which job,
    which git ref, and where results should be written.
    """

    def __init__(
        self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None
    ):
        # A short random ID is enough to tell concurrent jobs apart.
        self.job_id = str(uuid.uuid4())[:8]
        self.job_name = job_name
        self.branch = branch
        self.commit_sha = commit_sha
        self.config = config
        self.image_tag = image_tag
        self.results_dir = results_dir or Path("ci-results")
        self.created_at = datetime.now().isoformat()
        # Platform drives agent routing; "nvidia" is the historical default.
        # Raises KeyError if the job name is unknown (same as before).
        self.platform = config["jobs"][job_name].get("platform", "nvidia")

    def to_dict(self):
        """JSON-serializable summary for API responses."""
        return {
            "job_id": self.job_id,
            "job_name": self.job_name,
            "branch": self.branch,
            "commit_sha": self.commit_sha,
            "platform": self.platform,
            "created_at": self.created_at,
        }


class JobResult:
    """Outcome of a completed job (exit code, duration, captured log tail)."""

    def __init__(
        self,
        job_id,
        job_name,
        commit_sha,
        returncode,
        results_dir,
        duration,
        error_tail=None,
    ):
        self.job_id = job_id
        self.job_name = job_name
        self.commit_sha = commit_sha
        self.returncode = returncode
        self.results_dir = results_dir
        self.duration = duration
        self.error_tail = error_tail or []
        # Any non-zero exit code counts as a failure.
        self.state = STATE_FAILURE if returncode != 0 else STATE_SUCCESS

    def to_dict(self):
        """JSON-serializable summary; error_tail is included only when non-empty."""
        summary = {
            "job_id": self.job_id,
            "job_name": self.job_name,
            "commit_sha": self.commit_sha,
            "state": self.state,
            "returncode": self.returncode,
            "results_dir": str(self.results_dir),
            "duration_seconds": round(self.duration, 1),
        }
        if self.error_tail:
            summary["error_tail"] = self.error_tail
        return summary


# ---------------------------------------------------------------------------
# Job selection and routing
# ---------------------------------------------------------------------------


def select_jobs(config, platform=None, job_name=None):
    """Return list of job names to run.

    Precedence: an explicit job_name wins (and must exist), then a platform
    filter, then every job in the config.
    """
    jobs = config.get("jobs", {})
    if job_name:
        if job_name not in jobs:
            raise ValueError(f"job {job_name!r} not in config")
        return [job_name]
    if platform:
        return [name for name, job in jobs.items() if job.get("platform") == platform]
    return list(jobs)
class Scheduler:
    """Resource-aware job scheduler with dynamic parallelism.

    Jobs are queued via submit(); _try_schedule() launches as many queued
    jobs as the resource pool can satisfy, each on a ThreadPoolExecutor
    worker. Completion releases resources and re-triggers scheduling.
    All shared state (_queue, _jobs) is guarded by _lock.
    """

    def __init__(
        self,
        config,
        platform,
        resource_pool,
        results_dir=None,
        max_workers=4,
        no_status=False,
        dry_run=False,
    ):
        self._config = config
        self._platform = platform
        self._resource_pool = resource_pool
        self._results_dir = results_dir or Path("ci-results")
        self._no_status = no_status
        self._dry_run = dry_run
        self._queue = collections.deque()
        self._jobs: dict[str, dict] = {}  # job_id -> {request, result, state, gpu_ids}
        # max_workers caps concurrent _run_job threads independently of GPU count.
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._lock = threading.Lock()
        self._done_event = threading.Event()

        # GitHub config
        github_cfg = config.get("github", {})
        self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops")
        repo = config.get("repo", {})
        repo_url = repo.get("url", "")
        self._owner, self._repo = gh.parse_repo_url(repo_url)

    def submit(self, job_request):
        """Add a job to the queue and attempt to schedule it.

        Returns the job_id.
        """
        with self._lock:
            self._jobs[job_request.job_id] = {
                "request": job_request,
                "result": None,
                "state": STATE_QUEUED,
                "gpu_ids": [],
            }
            self._queue.append(job_request)

        # Scheduling takes the lock itself, so it must run outside the block above.
        self._try_schedule()
        return job_request.job_id

    def get_job(self, job_id):
        """Get job info by ID.

        Returns a merged dict of the request summary, current state, and
        (once finished) the result summary; None for unknown IDs.
        """
        with self._lock:
            entry = self._jobs.get(job_id)

        if not entry:
            return None

        info = entry["request"].to_dict()
        info["state"] = entry["state"]

        if entry["result"]:
            info.update(entry["result"].to_dict())

        return info

    def get_status(self):
        """Return scheduler status for the /status endpoint."""
        with self._lock:
            queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue]
            running = []
            completed = []

            for entry in self._jobs.values():
                state = entry["state"]

                if state == STATE_RUNNING:
                    running.append(
                        {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}
                    )
                elif state in (STATE_SUCCESS, STATE_FAILURE):
                    completed.append(entry["result"].to_dict())

        return {
            "queued": queued,
            "running": running,
            "completed": completed[-20:],  # Last 20
            "resources": self._resource_pool.get_status(),
        }

    def wait_all(self):
        """Block until all submitted jobs are done. Returns list of JobResult."""
        while True:
            with self._lock:
                pending = any(
                    e["state"] in (STATE_QUEUED, STATE_RUNNING)
                    for e in self._jobs.values()
                )

            if not pending:
                break

            # The 2s timeout guards against a missed wakeup between the
            # pending check and the wait.
            self._done_event.wait(timeout=2.0)
            self._done_event.clear()

        with self._lock:
            return [e["result"] for e in self._jobs.values() if e["result"] is not None]

    def _try_schedule(self):
        """Try to run queued jobs that have enough resources.

        Resource allocation and job submission are split: allocation decisions
        are made under the lock, but executor.submit() happens outside to
        prevent deadlock when the thread pool is saturated.
        """
        to_launch = []  # [(req, gpu_ids), ...]

        with self._lock:
            remaining = collections.deque()

            while self._queue:
                req = self._queue.popleft()
                job_cfg = self._config["jobs"].get(req.job_name, {})
                gpu_count = res.parse_gpu_requirement(job_cfg)
                memory_mb = res.parse_memory_requirement(job_cfg)

                if self._dry_run:
                    # In dry-run mode, skip resource checks
                    gpu_ids, ok = [], True
                else:
                    # allocate() returns (gpu_ids, ok); ok=False means
                    # the job must stay queued for now.
                    gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb)

                if ok:
                    self._jobs[req.job_id]["state"] = STATE_RUNNING
                    self._jobs[req.job_id]["gpu_ids"] = gpu_ids
                    to_launch.append((req, gpu_ids))
                else:
                    # Keep FIFO order for jobs that could not be scheduled yet.
                    remaining.append(req)

            self._queue = remaining

        # Submit outside the lock to avoid deadlock with ThreadPoolExecutor
        for req, gpu_ids in to_launch:
            self._executor.submit(self._run_job, req, gpu_ids)

    def _run_job(self, req, gpu_ids):
        """Execute a single job in a worker thread.

        Wrapped in try/finally to guarantee GPU resources are always released
        and job state is updated even on unexpected exceptions.
        """
        context = gh.build_status_context(self._status_prefix, req.job_name)
        result = None

        try:
            # Post pending status
            if not self._no_status:
                gh.post_commit_status(
                    self._owner,
                    self._repo,
                    req.commit_sha,
                    STATE_PENDING,
                    context,
                    f"Running {req.job_name}...",
                )

            job_cfg = self._config["jobs"][req.job_name]
            all_stages = job_cfg.get("stages", [])
            repo_url = self._config.get("repo", {}).get("url", "")
            # Abbreviate full SHAs to 7 chars for the results directory name.
            commit_short = (
                req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha
            )
            results_dir = run.build_results_dir(
                req.results_dir, req.platform, all_stages, commit_short
            )

            gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None
            docker_args = run.build_docker_args(
                self._config,
                req.job_name,
                repo_url,
                req.branch,
                all_stages,
                "/workspace",
                req.image_tag,
                gpu_id_override=gpu_id_str,
                results_dir=results_dir,
            )

            start = time.monotonic()

            if self._dry_run:
                print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}")
                returncode = 0
                error_tail = []
            else:
                results_dir.mkdir(parents=True, exist_ok=True)
                # Merge stderr into stdout so the tail buffer captures both.
                proc = subprocess.Popen(
                    docker_args,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                )
                tail_buf = collections.deque(maxlen=TAIL_LINES)

                # Stream container output through while keeping the last
                # TAIL_LINES lines for failure reports.
                for line in proc.stdout:
                    sys.stdout.buffer.write(line)
                    tail_buf.append(line)

                proc.stdout.close()
                returncode = proc.wait()

                if returncode != 0:
                    error_tail = [
                        raw.decode("utf-8", errors="replace").rstrip("\n")
                        for raw in tail_buf
                    ]
                else:
                    error_tail = []

            duration = time.monotonic() - start

            result = JobResult(
                job_id=req.job_id,
                job_name=req.job_name,
                commit_sha=req.commit_sha,
                returncode=returncode,
                results_dir=results_dir,
                duration=duration,
                error_tail=error_tail,
            )

            # Post final status
            if not self._no_status:
                gh.post_commit_status(
                    self._owner,
                    self._repo,
                    req.commit_sha,
                    result.state,
                    context,
                    f"{req.job_name}: {result.state} in {duration:.0f}s",
                )
        except Exception as e:
            print(
                f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr
            )

            # A result may already exist if the exception happened after the
            # job finished (e.g. while posting status); keep it in that case.
            if result is None:
                result = JobResult(
                    job_id=req.job_id,
                    job_name=req.job_name,
                    commit_sha=req.commit_sha,
                    returncode=-1,
                    results_dir=req.results_dir,
                    duration=0,
                    error_tail=[str(e)],
                )

            if not self._no_status:
                gh.post_commit_status(
                    self._owner,
                    self._repo,
                    req.commit_sha,
                    STATE_ERROR,
                    context,
                    f"{req.job_name}: internal error",
                )
        finally:
            # Always release resources and update state
            self._resource_pool.release(gpu_ids)

            with self._lock:
                self._jobs[req.job_id]["result"] = result
                self._jobs[req.job_id]["state"] = (
                    result.state if result else STATE_FAILURE
                )

            # Wake wait_all() and pull the next queued job onto freed resources.
            self._done_event.set()
            self._try_schedule()

        return result


# ---------------------------------------------------------------------------
# Webhook server
# ---------------------------------------------------------------------------


def verify_signature(secret, body, signature_header):
    """Verify GitHub webhook HMAC-SHA256 signature.

    signature_header is the raw "X-Hub-Signature-256" value
    ("sha256=<hexdigest>"); returns False when it is missing or wrong.
    """
    if not signature_header:
        return False

    expected = (
        "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest()
    )
    # compare_digest avoids leaking the match position via timing.
    return hmac.compare_digest(expected, signature_header)


def _verify_api_token(handler):
    """Check Bearer token for /api/run authentication.

    Returns True if authenticated, False (and sends 401) if not.
    When no api_token is configured on the server, all requests are allowed.
    """
    api_token = getattr(handler.server, "api_token", None)

    if not api_token:
        return True

    auth_header = handler.headers.get("Authorization", "")

    # NOTE(review): plain == is not constant-time; consider
    # hmac.compare_digest here as verify_signature does — confirm whether
    # timing side channels matter for this deployment.
    if auth_header == f"Bearer {api_token}":
        return True

    handler._respond_json(401, {"error": "unauthorized"})
    return False
+ """ + api_token = getattr(handler.server, "api_token", None) + + if not api_token: + return True + + auth_header = handler.headers.get("Authorization", "") + + if auth_header == f"Bearer {api_token}": + return True + + handler._respond_json(401, {"error": "unauthorized"}) + return False + + +class WebhookHandler(BaseHTTPRequestHandler): + """HTTP handler for GitHub webhooks and API endpoints.""" + + def log_message(self, format, *args): + print(f"[agent] {args[0]}", file=sys.stderr) + + def do_GET(self): + if self.path == "/health": + self._respond_json(200, {"status": "ok", "platform": self.server.platform}) + elif self.path == "/status": + status = self.server.scheduler.get_status() + self._respond_json(200, status) + elif self.path.startswith("/api/job/"): + self._handle_api_job() + else: + self._respond_json(404, {"error": "not found"}) + + def do_POST(self): + content_length = int(self.headers.get("Content-Length", 0)) + + if content_length > MAX_CONTENT_LENGTH: + self._respond_json(413, {"error": "payload too large"}) + return + + body = self.rfile.read(content_length) + + if self.path == "/webhook": + self._handle_webhook(body) + elif self.path == "/api/run": + self._handle_api_run(body) + else: + self._respond_json(404, {"error": "not found"}) + + def _handle_webhook(self, body): + # Verify signature if secret is configured + if self.server.webhook_secret: + sig = self.headers.get("X-Hub-Signature-256", "") + + if not verify_signature(self.server.webhook_secret, body, sig): + self._respond_json(401, {"error": "invalid signature"}) + return + + event_type = self.headers.get("X-GitHub-Event", "") + + if event_type == "ping": + self._respond_json(200, {"msg": "pong"}) + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + if event_type == "push": + branch, sha = self._parse_push(payload) + elif event_type == "pull_request": + action = payload.get("action", "") + + if 
action not in ("opened", "synchronize"): + self._respond_json(200, {"msg": f"ignored PR action: {action}"}) + return + + branch, sha = self._parse_pull_request(payload) + else: + self._respond_json(200, {"msg": f"ignored event: {event_type}"}) + return + + if not branch or not sha: + self._respond_json(400, {"error": "could not extract branch/sha"}) + return + + job_ids = self._submit_jobs(branch, sha) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_run(self, body): + """Handle /api/run: remote job trigger (requires Bearer token auth).""" + if not _verify_api_token(self): + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + branch = payload.get("branch", "") + sha = payload.get("commit_sha", "") + job_name = payload.get("job") + image_tag = payload.get("image_tag") + + if not branch: + self._respond_json(400, {"error": "branch is required"}) + return + + if not sha: + sha = run.get_git_commit() + + job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_job(self): + """Handle GET /api/job/{id}.""" + parts = self.path.split("/") + + if len(parts) < 4: + self._respond_json(400, {"error": "missing job_id"}) + return + + job_id = parts[3] + info = self.server.scheduler.get_job(job_id) + + if info is None: + self._respond_json(404, {"error": f"job {job_id} not found"}) + else: + self._respond_json(200, info) + + def _parse_push(self, payload): + branch = payload.get("ref", "").removeprefix("refs/heads/") + sha = payload.get("after", "") + return branch, sha + + def _parse_pull_request(self, payload): + pr = payload.get("pull_request", {}) + head = pr.get("head", {}) + branch = head.get("ref", "") + sha = head.get("sha", "") + return branch, sha + + def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): + config = 
self.server.config + job_names = select_jobs( + config, platform=self.server.platform, job_name=job_name + ) + job_ids = [] + + for name in job_names: + req = JobRequest( + job_name=name, + branch=branch, + commit_sha=sha, + config=config, + image_tag=image_tag, + results_dir=self.server.results_dir, + ) + jid = self.server.scheduler.submit(req) + job_ids.append(jid) + + return job_ids + + def _respond_json(self, status_code, data): + body = json.dumps(data, indent=2).encode("utf-8") + self.send_response(status_code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class AgentServer(HTTPServer): + """HTTP server with scheduler and config context.""" + + def __init__( + self, + host, + port, + config, + scheduler, + platform, + webhook_secret=None, + api_token=None, + results_dir=None, + ): + super().__init__((host, port), WebhookHandler) + self.config = config + self.scheduler = scheduler + self.platform = platform + self.webhook_secret = webhook_secret + self.api_token = api_token + self.results_dir = results_dir or Path("ci-results") + + +# --------------------------------------------------------------------------- +# Remote job dispatch (for CLI triggering remote agents) +# --------------------------------------------------------------------------- + + +def dispatch_remote_job( + agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None +): + """Send a job to a remote agent via HTTP API. 
def dispatch_remote_job(
    agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None
):
    """Send a job to a remote agent via HTTP API. Returns job_id or None."""
    payload = {
        "branch": branch,
        "commit_sha": commit_sha,
        "job": job_name,
    }
    if image_tag:
        payload["image_tag"] = image_tag

    headers = {"Content-Type": "application/json"}
    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"

    req = urllib_request(
        f"{agent_url.rstrip('/')}/api/run",
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    try:
        with urllib_urlopen(req, timeout=30) as resp:
            job_ids = json.loads(resp.read()).get("job_ids", [])
    except Exception as e:
        # Best-effort dispatch: report and let the caller decide.
        print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr)
        return None
    return job_ids[0] if job_ids else None


def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200):
    """Poll a remote agent for job completion. Returns final state dict or None."""
    endpoint = f"{agent_url.rstrip('/')}/api/job/{job_id}"
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            with urllib_urlopen(urllib_request(endpoint), timeout=10) as resp:
                info = json.loads(resp.read())
            if info.get("state", "") in (STATE_SUCCESS, STATE_FAILURE):
                return info
        except Exception:
            # Transient network errors: keep polling until the deadline.
            pass
        time.sleep(interval)

    return None


def cmd_run(args):
    """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP."""
    config = run.load_config(args.config)
    agents = config.get("agents", {})
    branch = args.branch or config.get("repo", {}).get("branch", "master")
    commit_sha = args.commit or run.get_git_commit(short=False)

    # Determine which jobs to run.
    try:
        job_names = select_jobs(config, platform=args.platform, job_name=args.job)
    except ValueError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    if not job_names:
        print("error: no matching jobs found", file=sys.stderr)
        sys.exit(1)

    # Map every job onto the agent URL for its platform; any gap is fatal.
    targets = []  # [(name, agent_url)]
    for name in job_names:
        platform = config.get("jobs", {}).get(name, {}).get("platform", "")
        agent_url = agents.get(platform, {}).get("url", "")
        if not agent_url:
            print(
                f"error: no agent URL configured for platform {platform!r} (job {name})",
                file=sys.stderr,
            )
            sys.exit(1)
        targets.append((name, agent_url))

    api_token = os.environ.get("AGENT_API_TOKEN", "")
    results = []

    if args.dry_run:
        for name, agent_url in targets:
            platform, _, job = name.partition("_")
            print(f"[dry-run] dispatch {platform} {job} job to {agent_url}")
    else:
        # Dispatch everything first, then poll all agents concurrently.
        in_flight = []  # [(name, agent_url, job_id)]
        for name, agent_url in targets:
            platform, _, job = name.partition("_")
            print(f"==> dispatching {platform} {job} job to {agent_url}", file=sys.stderr)
            job_id = dispatch_remote_job(
                agent_url,
                name,
                branch,
                commit_sha,
                args.image_tag,
                api_token=api_token or None,
            )
            if not job_id:
                print(f" failed to dispatch {name}", file=sys.stderr)
                results.append({"job_name": name, "state": "error"})
                continue
            print(f" job_id: {job_id}", file=sys.stderr)
            in_flight.append((name, agent_url, job_id))

        if in_flight:
            with ThreadPoolExecutor(max_workers=len(in_flight)) as executor:
                futures = {
                    executor.submit(poll_remote_job, url, jid): (name, url, jid)
                    for name, url, jid in in_flight
                }
                # Column alignment for the per-job report lines.
                name_width = max(len(n) for n, _, _ in in_flight)

                for future in as_completed(futures):
                    name = futures[future][0]
                    info = future.result()

                    if not info:
                        print(f"<== TIMEOUT {name:<{name_width}}", file=sys.stderr)
                        results.append({"job_name": name, "state": "timeout"})
                        continue

                    state = info.get("state", "unknown")
                    duration = info.get("duration_seconds", 0)
                    tag = "PASS" if state == STATE_SUCCESS else "FAIL"
                    print(
                        f"<== {tag} {name:<{name_width}} ({duration:.0f}s)",
                        file=sys.stderr,
                    )

                    tail = info.get("error_tail", [])
                    if tail:
                        print(
                            f"--- error output (last {len(tail)} lines) ---",
                            file=sys.stderr,
                        )
                        for line in tail:
                            print(f" {line}", file=sys.stderr)
                        print("---", file=sys.stderr)

                    results.append(info)

    # Summary: only printed when at least one job did not succeed.
    failed = [r for r in results if r.get("state") != STATE_SUCCESS]
    if failed:
        print("\n========== Failed ==========", file=sys.stderr)
        name_width = max(len(r.get("job_name", "?")) for r in failed)
        for r in failed:
            name = r.get("job_name", "?")
            state = r.get("state", "unknown")
            duration = r.get("duration_seconds", 0)
            print(
                f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)",
                file=sys.stderr,
            )
        sys.exit(1)
webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "") + api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "") + + if not webhook_secret: + print( + "WARNING: No webhook secret configured. Webhook endpoint accepts " + "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.", + file=sys.stderr, + ) + + if not api_token: + print( + "WARNING: No API token configured. /api/run endpoint is unauthenticated. " + "Set --api-token or AGENT_API_TOKEN for production.", + file=sys.stderr, + ) + + server = AgentServer( + args.host, + args.port, + config, + scheduler, + platform, + webhook_secret=webhook_secret or None, + api_token=api_token or None, + results_dir=args.results_dir, + ) + + print( + f"Agent serving on {args.host}:{args.port} (platform={platform})", + file=sys.stderr, + ) + print(" POST /webhook — GitHub webhook", file=sys.stderr) + print(" POST /api/run — remote job trigger", file=sys.stderr) + print(" GET /health — health check", file=sys.stderr) + print(" GET /status — queue & resource status", file=sys.stderr) + print(" GET /api/job/{id} — job status", file=sys.stderr) + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down...", file=sys.stderr) + server.shutdown() + + +def main(): + parser = argparse.ArgumentParser( + description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", + ) + subparsers = parser.add_subparsers(dest="command") + + # --- run subcommand --- + run_parser = subparsers.add_parser("run", help="Run CI jobs") + run_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + run_parser.add_argument( + "--branch", type=str, help="Branch to test (default: config repo.branch)" + ) + run_parser.add_argument("--job", type=str, help="Specific job name") + run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") + run_parser.add_argument("--image-tag", type=str, 
help="Override image tag") + run_parser.add_argument("--commit", type=str, help="Override commit SHA") + run_parser.add_argument("--dry-run", action="store_true") + + # --- serve subcommand --- + serve_parser = subparsers.add_parser("serve", help="Start webhook server") + serve_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + serve_parser.add_argument("--port", type=int, default=8080) + serve_parser.add_argument("--host", type=str, default="0.0.0.0") + serve_parser.add_argument("--webhook-secret", type=str) + serve_parser.add_argument( + "--api-token", + type=str, + help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", + ) + serve_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + serve_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + ) + + args = parser.parse_args() + + if args.command == "run": + cmd_run(args) + elif args.command == "serve": + cmd_serve(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..7953209 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import shlex +import subprocess +import sys +from pathlib import Path + +from utils import get_git_commit, load_config + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + print( + "warning: git diff failed (shallow clone or initial commit?);" + " assuming Dockerfile changed", + file=sys.stderr, + ) + return True + + return 
def build_image_tag(registry_url, project, platform, tag):
    """Compose the full Docker image reference for a platform image.

    With a registry configured the reference is
    "{registry_url}/{project}/{platform}:{tag}"; for local-only builds it
    falls back to the "{project}-ci/{platform}:{tag}" naming scheme.
    """
    prefix = f"{registry_url}/{project}" if registry_url else f"{project}-ci"
    return f"{prefix}/{platform}:{tag}"
Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + + if private_sdk: + source_env = private_sdk.get("source_env", "") + sdk_url = os.environ.get(source_env, "") if source_env else "" + + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {shlex.join(build_cmd)}") + + if push: + if not logged_in: + print("[dry-run] (skipping push: docker login failed)") + else: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + if not logged_in: + print("error: docker login failed, cannot push", file=sys.stderr) + return False + + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", 
"push", tag]) + + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building (requires registry in config)", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist," + f" skipping 
{platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, + platform_cfg, + registry_cfg, + commit, + args.push, + args.dry_run, + logged_in=logged_in, + ) + + if not ok: + failed = True + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py new file mode 100644 index 0000000..51b181f --- /dev/null +++ b/.ci/ci_resource.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +"""Resource detection and allocation for CI Runner Agent.""" + +import json +import operator +import os +import re +import shutil +import subprocess +import threading +from dataclasses import dataclass + +# GPU passthrough styles +GPU_STYLE_NVIDIA = "nvidia" +GPU_STYLE_NONE = "none" +GPU_STYLE_MLU = "mlu" + + +@dataclass +class GpuInfo: + index: int + memory_used_mb: float + memory_total_mb: float + utilization_pct: float + + +@dataclass +class SystemResources: + total_memory_mb: float + available_memory_mb: float + cpu_count: int + + +class ResourcePool: + """Thread-safe GPU and system resource manager. + + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) + and tracks allocations to enable dynamic parallel scheduling. 
+ """ + + GPU_QUERY_TOOLS = { + "nvidia": "nvidia-smi", + "iluvatar": "ixsmi", + "metax": "mx-smi", + "moore": "mthreads-gmi", + "cambricon": "cnmon", + } + + def __init__(self, platform, utilization_threshold=10): + self._platform = platform + self._utilization_threshold = utilization_threshold + self._allocated: set[int] = set() + self._lock = threading.Lock() + + @property + def platform(self): + return self._platform + + @property + def allocated(self): + with self._lock: + return set(self._allocated) + + def detect_gpus(self) -> list[GpuInfo]: + """Query GPU status via platform-specific CLI tool.""" + if self._platform == "metax": + return self._detect_gpus_metax() + + if self._platform == "moore": + return self._detect_gpus_moore() + + if self._platform == "cambricon": + return self._detect_gpus_cambricon() + + tool = self.GPU_QUERY_TOOLS.get(self._platform) + + if not tool: + return [] + + try: + result = subprocess.run( + [ + tool, + "--query-gpu=index,memory.used,memory.total,utilization.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + + if len(parts) < 4: + continue + + try: + gpus.append( + GpuInfo( + index=int(parts[0]), + memory_used_mb=float(parts[1]), + memory_total_mb=float(parts[2]), + utilization_pct=float(parts[3]), + ) + ) + except (ValueError, IndexError): + continue + + return gpus + + def _detect_gpus_metax(self) -> list[GpuInfo]: + """Parse mx-smi output for MetaX GPUs. + + Runs --show-memory and --show-usage separately and merges results. 
+ Output format example: + GPU#0 MXC550 0000:1a:00.0 + Memory + vis_vram total : 67108864 KB + vis_vram used : 879032 KB + Utilization + GPU : 0 % + """ + + def run_mxsmi(flag): + try: + r = subprocess.run( + ["mx-smi", flag], + capture_output=True, + text=True, + timeout=10, + ) + return r.stdout if r.returncode == 0 else "" + except (FileNotFoundError, subprocess.TimeoutExpired): + return "" + + mem_out = run_mxsmi("--show-memory") + util_out = run_mxsmi("--show-usage") + + # Parse memory: collect {index: (used_kb, total_kb)} + mem = {} + current = None + for line in mem_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + mem[current] = [0.0, 0.0] + continue + if current is None: + continue + m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][1] = float(m.group(1)) / 1024 # KB -> MB + m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][0] = float(m.group(1)) / 1024 # KB -> MB + + # Parse utilization: collect {index: utilization_pct} + util = {} + current = None + in_util = False + for line in util_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + in_util = False + continue + if current is None: + continue + if "Utilization" in line: + in_util = True + continue + if in_util: + m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) + if m: + util[current] = float(m.group(1)) + in_util = False + + gpus = [] + for idx in sorted(mem): + used_mb, total_mb = mem[idx] + gpus.append( + GpuInfo( + index=idx, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util.get(idx, 0.0), + ) + ) + return gpus + + def _detect_gpus_moore(self) -> list[GpuInfo]: + """Parse mthreads-gmi JSON output for Moore Threads GPUs. 
+ + Uses: mthreads-gmi -q --json + Expected JSON structure: + { + "Attached GPUs": { + "GPU 00000000:3B:00.0": { + "Minor Number": "0", + "Memory Usage": { + "Total": "24576 MiB", + "Used": "512 MiB" + }, + "Utilization": { + "Gpu": "5 %" + } + } + } + } + """ + + def extract_number(s): + m = re.search(r"([\d.]+)", str(s)) + return float(m.group(1)) if m else 0.0 + + try: + result = subprocess.run( + ["mthreads-gmi", "-q", "--json"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + try: + data = json.loads(result.stdout) + except json.JSONDecodeError: + return [] + + gpus = [] + attached = data.get("Attached GPUs", {}) + + for gpu_data in attached.values(): + try: + index = int(gpu_data.get("Minor Number", len(gpus))) + + mem = gpu_data.get("Memory Usage", {}) + total_mb = extract_number(mem.get("Total", "0 MiB")) + used_mb = extract_number(mem.get("Used", "0 MiB")) + util_pct = extract_number( + gpu_data.get("Utilization", {}).get("Gpu", "0 %") + ) + + gpus.append( + GpuInfo( + index=index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) + except (ValueError, AttributeError): + continue + + return sorted(gpus, key=operator.attrgetter("index")) + + def _detect_gpus_cambricon(self) -> list[GpuInfo]: + """Parse cnmon output for Cambricon MLU cards. + + Each card appears as two consecutive data rows: + Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} | + Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... | + """ + try: + result = subprocess.run( + ["cnmon"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + lines = result.stdout.splitlines() + i = 0 + + while i < len(lines): + line = lines[i] + # Row 1: "| {index} ... 
| {bus_id} | {util}% {ecc} |" + m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line) + + if m1 and i + 1 < len(lines): + try: + card_index = int(m1.group(1)) + util_pct = float(m1.group(2)) + row2 = lines[i + 1] + mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2) + + if mem_m: + used_mb = float(mem_m.group(1)) + total_mb = float(mem_m.group(2)) + else: + used_mb, total_mb = 0.0, 0.0 + + gpus.append( + GpuInfo( + index=card_index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) + except (ValueError, AttributeError): + pass + i += 2 + continue + + i += 1 + + return sorted(gpus, key=operator.attrgetter("index")) + + def detect_system_resources(self) -> SystemResources: + """Read system memory from /proc/meminfo and CPU count.""" + total_mb = 0.0 + available_mb = 0.0 + + try: + with open("/proc/meminfo", encoding="utf-8") as f: + for line in f: + if line.startswith("MemTotal:"): + total_mb = float(line.split()[1]) / 1024 + elif line.startswith("MemAvailable:"): + available_mb = float(line.split()[1]) / 1024 + except OSError: + pass + + return SystemResources( + total_memory_mb=total_mb, + available_memory_mb=available_mb, + cpu_count=os.cpu_count() or 1, + ) + + def get_free_gpus(self) -> list[int]: + """Return GPU indices with utilization below threshold.""" + gpus = self.detect_gpus() + return [ + g.index for g in gpus if g.utilization_pct < self._utilization_threshold + ] + + def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: + """Try to allocate GPUs and check memory. + + Returns (allocated_gpu_ids, success). On failure returns ([], False). + GPU detection and memory checks run outside the lock to avoid blocking + other threads while subprocess.run (nvidia-smi) executes. 
+ """ + if gpu_count <= 0: + if memory_mb > 0: + sys_res = self.detect_system_resources() + + if sys_res.available_memory_mb < memory_mb: + return ([], False) + + return ([], True) + + # Detect GPUs and memory outside the lock (subprocess.run can block) + free_gpus = set(self.get_free_gpus()) + sys_res = self.detect_system_resources() if memory_mb > 0 else None + + with self._lock: + available = free_gpus - self._allocated + + if len(available) < gpu_count: + return ([], False) + + if sys_res is not None and sys_res.available_memory_mb < memory_mb: + return ([], False) + + selected = sorted(available)[:gpu_count] + self._allocated.update(selected) + return (selected, True) + + def release(self, gpu_ids): + """Return GPUs to the free pool.""" + with self._lock: + self._allocated -= set(gpu_ids) + + def get_status(self) -> dict: + """Return current resource status for API endpoints.""" + gpus = self.detect_gpus() + sys_res = self.detect_system_resources() + + with self._lock: + allocated = sorted(self._allocated) + + return { + "platform": self._platform, + "gpus": [ + { + "index": g.index, + "memory_used_mb": g.memory_used_mb, + "memory_total_mb": g.memory_total_mb, + "utilization_pct": g.utilization_pct, + "allocated_by_agent": g.index in allocated, + } + for g in gpus + ], + "allocated_gpu_ids": allocated, + "system": { + "total_memory_mb": round(sys_res.total_memory_mb, 1), + "available_memory_mb": round(sys_res.available_memory_mb, 1), + "cpu_count": sys_res.cpu_count, + }, + "utilization_threshold": self._utilization_threshold, + } + + +def parse_gpu_requirement(job_config) -> int: + """Extract GPU count requirement from a job config.""" + resources = job_config.get("resources", {}) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NONE: + return 0 + + ngpus = resources.get("ngpus") + if ngpus is not None: + return int(ngpus) + + gpu_ids = str(resources.get("gpu_ids", "")) + + if not gpu_ids: + return 
resources.get("gpu_count", 0) + + if gpu_ids == "all": + return 0 # "all" means use all available, don't reserve specific count + + return len(gpu_ids.split(",")) + + +def parse_memory_requirement(job_config) -> float: + """Extract memory requirement in MB from a job config.""" + resources = job_config.get("resources", {}) + memory = str(resources.get("memory", "")) + + if not memory: + return 0 + + memory = memory.lower().strip() + + if memory.endswith("gb"): + return float(memory[:-2]) * 1024 + elif memory.endswith("g"): + return float(memory[:-1]) * 1024 + elif memory.endswith("mb"): + return float(memory[:-2]) + elif memory.endswith("m"): + return float(memory[:-1]) + + try: + return float(memory) * 1024 # Default: GB + except ValueError: + return 0 + + +def detect_platform(): + """Auto-detect the current platform by probing GPU query tools on PATH.""" + for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items(): + if shutil.which(tool): + return platform + + return None diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..b70e7df --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,146 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +github: + status_context_prefix: "ci/infiniops" + +# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote +# machines via `agent.py run`. Required on the trigger machine when each platform's +# agent runs on a separate host. See the README for multi-machine deployment details. 
+# agents: +# nvidia: +# url: http://nvidia-host:8080 +# iluvatar: +# url: http://iluvatar-host:8080 +# metax: +# url: http://metax-host:8080 +# moore: +# url: http://moore-host:8080 +# cambricon: +# url: http://cambricon-host:8080 + +platforms: + nvidia: + image: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + ngpus: 1 # Scheduler auto-picks this many free GPUs + memory: 32GB + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit + timeout: 3600 + # env: # Uncomment to inject extra env vars into the container. + # MY_VAR: value + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # GPU visibility via CUDA_VISIBLE_DEVICES + gpu_style: none # CoreX: passthrough via --privileged + /dev mount + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + metax: + image: + dockerfile: .ci/images/metax/ + build_args: + BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--ulimit=memlock=-1" + - "--ulimit=stack=67108864" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: none # 
MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + + moore: + image: + dockerfile: .ci/images/moore/ + build_args: + BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + + cambricon: + image: + dockerfile: .ci/images/cambricon/ + build_args: + BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + + ascend: # TODO: Ascend image is not ready yet + image: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source_env: PRIVATE_SDK_URL diff --git a/.ci/github_status.py b/.ci/github_status.py new file mode 100644 index 0000000..f8f017f --- /dev/null +++ b/.ci/github_status.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""GitHub Commit Status API wrapper using urllib (zero external dependencies).""" + +import json +import os +import re +import sys 
def parse_repo_url(url):
    """Extract (owner, repo) from a GitHub remote URL.

    Supports both HTTPS ("https://github.com/Owner/Repo.git") and SSH
    ("git@github.com:Owner/Repo.git") forms; a trailing ".git" is dropped.
    Returns ("", "") when the URL matches neither form.
    """
    patterns = (
        r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$",  # HTTPS remote
        r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$",  # SSH remote
    )

    for pattern in patterns:
        match = re.match(pattern, url)

        if match:
            return match.groups()

    return "", ""
GitHub status API error: {e.reason}", file=sys.stderr) + return False diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..66392eb --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,39 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile new file mode 100644 index 0000000..138f3cb --- /dev/null +++ b/.ci/images/cambricon/Dockerfile @@ -0,0 +1,33 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. +ENV PATH=/usr/local/python3.10/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. +RUN dnf install -y ninja-build && dnf clean all + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. 
+RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile new file mode 100644 index 0000000..79afc85 --- /dev/null +++ b/.ci/images/iluvatar/Dockerfile @@ -0,0 +1,53 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, +# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). +ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages +ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + ninja-build \ + coreutils \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +RUN pip config set global.index-url https://pypi.org/simple + +# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. 
+RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile new file mode 100644 index 0000000..540bc9d --- /dev/null +++ b/.ci/images/metax/Dockerfile @@ -0,0 +1,46 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# `conda` Python is used in this image. +ENV PATH=/opt/conda/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile new file mode 100644 index 0000000..a95d9bd --- /dev/null +++ b/.ci/images/moore/Dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. 
+ +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. +RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..b4984da --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,46 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. 
+RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..24a8867 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import os +import shlex +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +from ci_resource import ( + GPU_STYLE_NVIDIA, + GPU_STYLE_NONE, + GPU_STYLE_MLU, + ResourcePool, + detect_platform, +) +from utils import get_git_commit, load_config + +# Flags that consume the next token as their value (e.g. -n 4, -k expr). +_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} + + +def apply_test_override(run_cmd, test_path): + """Replace positional test path(s) in a pytest stage command. + + For example: ``pytest tests/ -n 4 ...`` becomes + ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is + ``tests/test_gemm.py``. + """ + parts = shlex.split(run_cmd) + + if not parts or parts[0] != "pytest": + return run_cmd + + result = ["pytest", test_path] + skip_next = False + + for p in parts[1:]: + if skip_next: + result.append(p) + skip_next = False + continue + + if p.startswith("-"): + result.append(p) + if p in _PYTEST_VALUE_FLAGS: + skip_next = True + continue + + # Skip existing test paths; the override is already in result[1]. 
+ if not ("/" in p or p.endswith(".py") or "::" in p): + result.append(p) + + return shlex.join(result) + + +def build_results_dir(base, platform, stages, commit): + """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" + stage_names = "+".join(s["name"] for s in stages) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + dirname = f"{platform}_{stage_names}_{commit}_{timestamp}" + + return Path(base) / dirname + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference to a full image name. + + Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config + contains a registry section, returns a registry-prefixed URL. Otherwise + returns a local tag (current default). + """ + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +set -e +cd /workspace +mkdir -p /workspace/results +if [ -n "$LOCAL_SRC" ]; then + cp -r "$LOCAL_SRC" /tmp/src + cd /tmp/src +else + git clone "$REPO_URL" repo + cd repo + git checkout "$BRANCH" +fi +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + [ -n "$cmd" ] && { eval "$cmd" || failed=1; } +done +echo "========== Summary ==========" +if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then + chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true +fi +exit $failed +""" + + +def build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + workdir, + image_tag_override, + gpu_id_override=None, + results_dir=None, + local_path=None, +): + job = config["jobs"][job_name] + platform = 
job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "latest") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_raw = job.get("setup", "pip install .[dev]") + + if isinstance(setup_raw, list): + setup_cmd = "\n".join(setup_raw) + else: + setup_cmd = setup_raw + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + "-e", + f"HOST_UID={os.getuid()}", + "-e", + f"HOST_GID={os.getgid()}", + ] + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + args.extend(["-e", f"{proxy_var}={proxy_val}"]) + args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) + + for key, value in job.get("env", {}).items(): + args.extend(["-e", f"{key}={value}"]) + + if results_dir: + args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) + + if local_path: + args.extend(["-v", f"{local_path}:/workspace/repo:ro"]) + args.extend(["-e", "LOCAL_SRC=/workspace/repo"]) + + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") + + # Platform-specific device access + for flag in job.get("docker_args", []): + args.append(flag) + + for vol in job.get("volumes", []): + args.extend(["-v", vol]) + + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) + ngpus = resources.get("ngpus") + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NVIDIA: + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif ngpus: + args.extend(["--gpus", f"count={ngpus}"]) + elif gpu_style == GPU_STYLE_NONE and gpu_id 
and gpu_id != "all": + # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, + # control visible GPUs via CUDA_VISIBLE_DEVICES. + args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) + elif gpu_style == GPU_STYLE_MLU and gpu_id and gpu_id != "all": + # For Cambricon MLU platforms that use --privileged, + # control visible devices via MLU_VISIBLE_DEVICES. + args.extend(["-e", f"MLU_VISIBLE_DEVICES={gpu_id}"]) + + memory = resources.get("memory") + + if memory: + mem = str(memory).lower().replace("gb", "g").replace("mb", "m") + + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + + args.extend(["--memory", mem]) + + shm_size = resources.get("shm_size") + + if shm_size: + args.extend(["--shm-size", str(shm_size)]) + + timeout_sec = resources.get("timeout") + args.append(image) + + if timeout_sec: + # Requires coreutils `timeout` inside the container image. + args.extend(["timeout", str(timeout_sec)]) + + args.extend(["bash", "-c", build_runner_script().strip()]) + + return args + + +def resolve_job_names(jobs, platform, job=None): + """Resolve job names for a platform. + + - ``job=None`` — all jobs for the platform. + - ``job="gpu"`` (short name) — matched via ``short_name`` field. + - ``job="nvidia_gpu"`` (full name) — direct lookup. 
+ """ + if job and job in jobs: + return [job] + + if job: + matches = [ + name + for name, cfg in jobs.items() + if cfg.get("platform") == platform and cfg.get("short_name") == job + ] + + if not matches: + print( + f"error: job {job!r} not found for platform {platform!r}", + file=sys.stderr, + ) + sys.exit(1) + + return matches + + matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] + + if not matches: + print(f"error: no jobs for platform {platform!r}", file=sys.stderr) + sys.exit(1) + + return matches + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--branch", type=str, help="Override repo branch (default: config repo.branch)" + ) + parser.add_argument( + "--job", + type=str, + help="Job name: short name (gpu) or full name (nvidia_gpu). Default: all jobs", + ) + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) + parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + help="Base directory for test results (default: ./ci-results)", + ) + parser.add_argument( + "--test", + type=str, + help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', + ) + parser.add_argument( + "--local", + action="store_true", + help="Mount current directory (read-only) into the container instead of cloning from git", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "master") + + platform = detect_platform() + + if not platform: + tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) + print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) + sys.exit(1) + + print(f"platform: {platform}", file=sys.stderr) + + jobs = config.get("jobs", {}) + + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + + job_names = resolve_job_names(jobs, platform, job=args.job) + failed = 0 + + for job_name in job_names: + job = jobs[job_name] + all_stages = job.get("stages", []) + + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + + if not stages: + print( + f"error: stage {args.stage!r} not found in {job_name}", + file=sys.stderr, + ) + sys.exit(1) + else: + stages = all_stages + + if args.test: + stages = [ + {**s, "run": apply_test_override(s.get("run", ""), args.test)} + for s in stages + ] + + job_platform = job.get("platform", platform) + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) + + local_path = Path.cwd().resolve() if args.local else None + docker_args = build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + "/workspace", + args.image_tag, + gpu_id_override=args.gpu_id, + results_dir=results_dir, + local_path=local_path, + ) + + if args.dry_run: + print(shlex.join(docker_args)) + continue + + print(f"==> running job: {job_name}", 
file=sys.stderr) + results_dir.mkdir(parents=True, exist_ok=True) + returncode = subprocess.run(docker_args).returncode + + if returncode != 0: + print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr) + failed += 1 + + sys.exit(1 if failed else 0) + + +if __name__ == "__main__": + main() diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py new file mode 100644 index 0000000..38ed716 --- /dev/null +++ b/.ci/tests/conftest.py @@ -0,0 +1,46 @@ +import sys +from pathlib import Path + +# Allow `import run` and `import build` directly. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import pytest + +from utils import normalize_config + + +@pytest.fixture +def minimal_config(): + """Minimal platform-centric config, normalized to flat format.""" + raw = { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], + } + }, + } + }, + } + return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py new file mode 100644 index 0000000..73708db --- /dev/null +++ b/.ci/tests/test_agent.py @@ -0,0 +1,535 @@ +import hashlib +import hmac +import json +import threading +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +import agent +import ci_resource as res +from utils import normalize_config + + +# --------------------------------------------------------------------------- +# Test fixtures. 
+# --------------------------------------------------------------------------- + + +@pytest.fixture +def agent_config(): + raw = { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "github": { + "status_context_prefix": "ci/infiniops", + }, + "agents": { + "nvidia": {"url": "http://nvidia-host:8080"}, + "iluvatar": {"url": "http://iluvatar-host:8080"}, + }, + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + "iluvatar": { + "image": { + "dockerfile": ".ci/images/iluvatar/", + "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "gpu_style": "none", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + }, + } + return normalize_config(raw) + + +@pytest.fixture +def mock_resource_pool(): + pool = MagicMock(spec=res.ResourcePool) + pool.platform = "nvidia" + pool.allocate.return_value = ([0], True) + pool.release.return_value = None + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } + return pool + + +# --------------------------------------------------------------------------- +# Tests for `select_jobs`. 
+# --------------------------------------------------------------------------- + + +def test_select_jobs_by_name(agent_config): + jobs = agent.select_jobs(agent_config, job_name="nvidia_gpu") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform(agent_config): + jobs = agent.select_jobs(agent_config, platform="nvidia") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform_iluvatar(agent_config): + jobs = agent.select_jobs(agent_config, platform="iluvatar") + assert jobs == ["iluvatar_gpu"] + + +def test_select_jobs_all(agent_config): + jobs = agent.select_jobs(agent_config) + assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} + + +def test_select_jobs_invalid_name(agent_config): + with pytest.raises(ValueError, match="not_exist"): + agent.select_jobs(agent_config, job_name="not_exist") + + +# --------------------------------------------------------------------------- +# Tests for `verify_signature`. +# --------------------------------------------------------------------------- + + +def test_verify_signature_valid(): + secret = "my-secret" + body = b'{"action": "push"}' + sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + assert agent.verify_signature(secret, body, sig) is True + + +def test_verify_signature_invalid(): + assert agent.verify_signature("secret", b"body", "sha256=wrong") is False + + +def test_verify_signature_empty(): + assert agent.verify_signature("secret", b"body", "") is False + + +# --------------------------------------------------------------------------- +# Tests for `JobRequest` and `JobResult`. 
+# --------------------------------------------------------------------------- + + +def test_job_request_fields(agent_config): + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + assert req.job_name == "nvidia_gpu" + assert req.platform == "nvidia" + assert req.commit_sha == "abc123" + assert len(req.job_id) == 8 + d = req.to_dict() + assert d["job_name"] == "nvidia_gpu" + + +def test_job_result_success(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) + assert r.state == "success" + + +def test_job_result_failure(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) + assert r.state == "failure" + + +# --------------------------------------------------------------------------- +# Tests for the `Scheduler` class. +# --------------------------------------------------------------------------- + + +def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + results_dir=Path("/tmp/test-results"), + no_status=True, + dry_run=True, + ) + req = agent.JobRequest( + "nvidia_gpu", + "master", + "abc123", + agent_config, + results_dir=Path("/tmp/test-results"), + ) + scheduler.submit(req) + results = scheduler.wait_all() + assert len(results) == 1 + assert results[0].state == "success" + + +def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): + pool = MagicMock(spec=res.ResourcePool) + pool.allocate.return_value = ([], False) + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + pool, + no_status=True, + dry_run=False, + ) + + req = agent.JobRequest("nvidia_gpu", "master", "abc123", 
agent_config) + scheduler.submit(req) + + info = scheduler.get_job(req.job_id) + assert info["state"] == "queued" + + +def test_scheduler_get_status(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, + ) + + status = scheduler.get_status() + assert "queued" in status + assert "running" in status + assert "completed" in status + assert "resources" in status + + +# --------------------------------------------------------------------------- +# Tests for `WebhookHandler` push event parsing. +# --------------------------------------------------------------------------- + + +def test_webhook_parse_push(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} + branch, sha = handler._parse_push(payload) + assert branch == "feat/test" + assert sha == "abc123def456" + + +def test_webhook_parse_pr(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = { + "pull_request": { + "head": { + "ref": "feat/pr-branch", + "sha": "def789", + } + } + } + branch, sha = handler._parse_pull_request(payload) + assert branch == "feat/pr-branch" + assert sha == "def789" + + +# --------------------------------------------------------------------------- +# Integration-style webhook HTTP tests. 
+# --------------------------------------------------------------------------- + + +def _urlopen_no_proxy(url_or_req, **kwargs): + """`urlopen` mock that bypasses any `HTTP_PROXY`.""" + import urllib.request + + opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) + return opener.open(url_or_req, **kwargs) + + +def test_health_endpoint(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + try: + resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) + data = json.loads(resp.read()) + assert data["status"] == "ok" + assert data["platform"] == "nvidia" + finally: + server.server_close() + + +def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + assert len(data["job_ids"]) >= 1 + finally: + server.server_close() + + +def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): + 
monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, + ) + secret = "test-secret" + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + webhook_secret=secret, + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + payload = json.dumps( + { + "ref": "refs/heads/master", + "after": "abc123def456", + } + ).encode() + sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() + + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": sig, + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() + + +def test_webhook_invalid_signature(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + webhook_secret="real-secret", + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + payload = b'{"ref": "refs/heads/master", "after": "abc"}' + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": "sha256=invalid", + }, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +# 
--------------------------------------------------------------------------- +# Tests for API token authentication. +# --------------------------------------------------------------------------- + + +def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): + """When `api_token` is set, `/api/run` rejects requests without a valid token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + api_token="my-secret-token", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): + """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", + api_token="my-secret-token", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": 
"master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer my-secret-token", + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py new file mode 100644 index 0000000..4d28885 --- /dev/null +++ b/.ci/tests/test_build.py @@ -0,0 +1,186 @@ +import build + + +# --------------------------------------------------------------------------- +# Tests for `build_image_tag`. +# --------------------------------------------------------------------------- + + +def test_build_image_tag_with_registry(): + tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") + assert tag == "localhost:5000/infiniops/nvidia:latest" + + +def test_build_image_tag_without_registry(): + tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") + assert tag == "infiniops-ci/nvidia:abc1234" + + +def test_build_image_tag_commit_hash(): + tag = build.build_image_tag( + "registry.example.com:5000", "proj", "ascend", "deadbeef" + ) + assert tag == "registry.example.com:5000/proj/ascend:deadbeef" + + +# --------------------------------------------------------------------------- +# Tests for `has_dockerfile_changed`. 
+# --------------------------------------------------------------------------- + + +def test_has_dockerfile_changed_true_when_stdout_nonempty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout="Dockerfile\n"), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +def test_has_dockerfile_changed_false_when_stdout_empty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is False + + +def test_has_dockerfile_changed_true_on_git_error(mocker): + # Shallow clone or initial commit: `git diff` returns non-zero. + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=128, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +# --------------------------------------------------------------------------- +# Tests for `docker_login`. +# --------------------------------------------------------------------------- + + +def test_docker_login_no_credentials_env(mocker): + run_mock = mocker.patch("subprocess.run") + result = build.docker_login({"url": "localhost:5000"}, dry_run=False) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_token_not_set(mocker, monkeypatch, capsys): + monkeypatch.delenv("REGISTRY_TOKEN", raising=False) + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is False + run_mock.assert_not_called() + + +def test_docker_login_dry_run_does_not_call_subprocess(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=True) + assert result is True + run_mock.assert_not_called() + + +def 
test_docker_login_success(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is True + run_mock.assert_called_once() + cmd = run_mock.call_args[0][0] + assert "docker" in cmd + assert "login" in cmd + + +# --------------------------------------------------------------------------- +# Tests for `build_image` dry-run mode and proxy forwarding. +# --------------------------------------------------------------------------- + + +def _platform_cfg(): + return { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + + +def _registry_cfg(): + return {"url": "localhost:5000", "project": "infiniops"} + + +def test_build_image_dry_run_no_subprocess(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + run_mock = mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + run_mock.assert_not_called() + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +def test_build_image_dry_run_output_contains_image_tag(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + captured = capsys.readouterr() + assert "abc1234" in captured.out + + +def test_build_image_proxy_in_build_args(mocker, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + 
push=False, + dry_run=False, + logged_in=True, + ) + called_cmd = run_mock.call_args[0][0] + joined = " ".join(called_cmd) + assert "HTTP_PROXY=http://proxy.test:3128" in joined + assert "http_proxy=http://proxy.test:3128" in joined + + +def test_build_image_returns_false_on_docker_error(mocker, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=1), + ) + result = build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + assert result is False diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py new file mode 100644 index 0000000..9e29c79 --- /dev/null +++ b/.ci/tests/test_github_status.py @@ -0,0 +1,145 @@ +import json +from unittest.mock import MagicMock + + +import github_status as gh + + +# --------------------------------------------------------------------------- +# Tests for `parse_repo_url`. +# --------------------------------------------------------------------------- + + +def test_parse_repo_url_https(): + owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") + assert owner == "InfiniTensor" + assert repo == "InfiniOps" + + +def test_parse_repo_url_https_no_git(): + owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_ssh(): + owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_invalid(): + owner, repo = gh.parse_repo_url("not-a-url") + assert owner == "" + assert repo == "" + + +# --------------------------------------------------------------------------- +# Tests for `build_status_context`. 
+# --------------------------------------------------------------------------- + + +def test_build_status_context(): + ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") + assert ctx == "ci/infiniops/nvidia_gpu" + + +# --------------------------------------------------------------------------- +# Tests for `post_commit_status`. +# --------------------------------------------------------------------------- + + +def test_post_status_no_token(monkeypatch): + monkeypatch.delenv("GITHUB_TOKEN", raising=False) + result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") + assert result is False + + +def test_post_status_missing_owner(): + result = gh.post_commit_status( + "", "repo", "abc123", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_success(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured_req = {} + + def mock_urlopen(req, **kwargs): + captured_req["url"] = req.full_url + captured_req["data"] = json.loads(req.data) + captured_req["headers"] = dict(req.headers) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "InfiniTensor", + "InfiniOps", + "abc123def", + "success", + "ci/infiniops/nvidia_gpu", + "Tests passed", + token="ghp_test_token", + ) + + assert result is True + assert "abc123def" in captured_req["url"] + assert captured_req["data"]["state"] == "success" + assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" + assert "ghp_test_token" in captured_req["headers"]["Authorization"] + + +def test_post_status_http_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.HTTPError( + url="", code=422, msg="Unprocessable", hdrs=None, fp=None + ) + + monkeypatch.setattr("urllib.request.urlopen", 
mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_url_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.URLError("connection refused") + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_truncates_description(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured = {} + + def mock_urlopen(req, **kwargs): + captured["data"] = json.loads(req.data) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + long_desc = "x" * 200 + gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") + + assert len(captured["data"]["description"]) == 140 diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py new file mode 100644 index 0000000..0db3fbb --- /dev/null +++ b/.ci/tests/test_resource.py @@ -0,0 +1,327 @@ +import threading + + +import ci_resource as res + + +# --------------------------------------------------------------------------- +# Tests for `GpuInfo` and `SystemResources`. +# --------------------------------------------------------------------------- + + +def test_gpu_info_fields(): + g = res.GpuInfo( + index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 + ) + assert g.index == 0 + assert g.memory_total_mb == 8000 + + +def test_system_resources_fields(): + s = res.SystemResources( + total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 + ) + assert s.cpu_count == 8 + + +# --------------------------------------------------------------------------- +# Tests for `detect_gpus`. 
+# --------------------------------------------------------------------------- + + +def test_detect_gpus_nvidia_parses_csv(monkeypatch): + csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + gpus = pool.detect_gpus() + assert len(gpus) == 2 + assert gpus[0].index == 0 + assert gpus[0].memory_used_mb == 512 + assert gpus[0].utilization_pct == 5 + assert gpus[1].index == 1 + assert gpus[1].utilization_pct == 80 + + +def test_detect_gpus_empty_on_failure(monkeypatch): + def mock_run(cmd, **kwargs): + class R: + returncode = 1 + stdout = "" + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_unknown_platform(): + pool = res.ResourcePool("unknown_platform") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_file_not_found(monkeypatch): + def mock_run(cmd, **kwargs): + raise FileNotFoundError("nvidia-smi not found") + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +# --------------------------------------------------------------------------- +# Tests for `detect_system_resources`. 
+# --------------------------------------------------------------------------- + + +def test_detect_system_resources(monkeypatch, tmp_path): + meminfo = tmp_path / "meminfo" + meminfo.write_text( + "MemTotal: 32000000 kB\n" + "MemFree: 10000000 kB\n" + "MemAvailable: 20000000 kB\n" + ) + + + _real_open = open + + def fake_open(path, **kw): + if str(path) == "/proc/meminfo": + return _real_open(str(meminfo), **kw) + return _real_open(path, **kw) + + monkeypatch.setattr("builtins.open", fake_open) + + pool = res.ResourcePool("nvidia") + sys_res = pool.detect_system_resources() + assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 + assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 + assert sys_res.cpu_count > 0 + + +# --------------------------------------------------------------------------- +# Tests for `get_free_gpus`. +# --------------------------------------------------------------------------- + + +def test_get_free_gpus_filters_by_utilization(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + free = pool.get_free_gpus() + assert 0 in free + assert 2 in free + assert 1 not in free + + +# --------------------------------------------------------------------------- +# Tests for `allocate` and `release`. 
+# --------------------------------------------------------------------------- + + +def test_allocate_success(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(1) + assert ok is True + assert len(gpu_ids) == 1 + assert gpu_ids[0] in (0, 1) + + +def test_allocate_insufficient_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(3) + assert ok is False + assert gpu_ids == [] + + +def test_allocate_zero_gpus(): + pool = res.ResourcePool("unknown") + gpu_ids, ok = pool.allocate(0) + assert ok is True + assert gpu_ids == [] + + +def test_release_frees_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(2) + assert ok is True + assert len(gpu_ids) == 2 + + # All GPUs allocated; next allocation should fail. + _, ok2 = pool.allocate(1) + assert ok2 is False + + # Release one GPU. 
+ pool.release([gpu_ids[0]]) + gpu_ids2, ok3 = pool.allocate(1) + assert ok3 is True + assert gpu_ids2 == [gpu_ids[0]] + + +def test_allocate_excludes_allocated(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids1, _ = pool.allocate(1) + gpu_ids2, _ = pool.allocate(1) + + assert gpu_ids1 != gpu_ids2 + assert set(gpu_ids1 + gpu_ids2) == {0, 1} + + +def test_thread_safety(monkeypatch): + csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=50) + allocated_all = [] + lock = threading.Lock() + + def allocate_one(): + ids, ok = pool.allocate(1) + + if ok: + with lock: + allocated_all.extend(ids) + + threads = [threading.Thread(target=allocate_one) for _ in range(4)] + + for t in threads: + t.start() + + for t in threads: + t.join() + + assert len(allocated_all) == 4 + assert len(set(allocated_all)) == 4 + + +# --------------------------------------------------------------------------- +# Tests for `get_status`. 
+# --------------------------------------------------------------------------- + + +def test_get_status(monkeypatch): + csv_output = "0, 512, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + status = pool.get_status() + assert status["platform"] == "nvidia" + assert len(status["gpus"]) == 1 + assert "system" in status + + +# --------------------------------------------------------------------------- +# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. +# --------------------------------------------------------------------------- + + +def test_parse_gpu_requirement_nvidia(): + job = {"resources": {"gpu_ids": "0,1", "gpu_style": "nvidia"}} + assert res.parse_gpu_requirement(job) == 2 + + +def test_parse_gpu_requirement_none(): + job = {"resources": {"gpu_style": "none"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_all(): + job = {"resources": {"gpu_ids": "all"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_default(): + job = {"resources": {"gpu_ids": "0"}} + assert res.parse_gpu_requirement(job) == 1 + + +def test_parse_memory_requirement_gb(): + assert res.parse_memory_requirement({"resources": {"memory": "32GB"}}) == 32 * 1024 + + +def test_parse_memory_requirement_mb(): + assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 + + +def test_parse_memory_requirement_empty(): + assert res.parse_memory_requirement({"resources": {}}) == 0 diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py new file mode 100644 index 0000000..93987e5 --- /dev/null +++ b/.ci/tests/test_run.py @@ -0,0 +1,298 @@ +from pathlib import Path + +import pytest + +import run + + +# --------------------------------------------------------------------------- +# Tests for `resolve_image`. 
+# --------------------------------------------------------------------------- + + +def test_resolve_image_with_registry(): + cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} + img = run.resolve_image(cfg, "nvidia", "latest") + assert img == "localhost:5000/infiniops/nvidia:latest" + + +def test_resolve_image_without_registry(minimal_config): + img = run.resolve_image(minimal_config, "nvidia", "abc1234") + assert img == "infiniops-ci/nvidia:abc1234" + + +# --------------------------------------------------------------------------- +# Tests for `build_runner_script`. +# --------------------------------------------------------------------------- + + +def test_runner_script_contains_git_clone(): + script = run.build_runner_script() + assert "git clone" in script + + +def test_runner_script_contains_setup_cmd(): + script = run.build_runner_script() + assert "SETUP_CMD" in script + + +def test_runner_script_exits_on_failure(): + script = run.build_runner_script() + assert "exit $failed" in script + + +def test_runner_script_creates_results_dir(): + script = run.build_runner_script() + assert "mkdir -p /workspace/results" in script + + +# --------------------------------------------------------------------------- +# Tests for `build_docker_args` basic structure. 
+# --------------------------------------------------------------------------- + + +def test_docker_args_basic_structure(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert args[0] == "docker" + assert "run" in args + assert "--rm" in args + + +def test_docker_args_correct_image(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "infiniops-ci/nvidia:latest" in args + + +def test_docker_args_image_tag_override(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + "abc1234", + ) + assert "infiniops-ci/nvidia:abc1234" in args + + +# --------------------------------------------------------------------------- +# Tests for `build_docker_args` proxy passthrough. 
+# --------------------------------------------------------------------------- + + +def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "-e" in args + assert "HTTP_PROXY=http://proxy.example.com:8080" in args + assert "http_proxy=http://proxy.example.com:8080" in args + + +def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("http_proxy", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.delenv("https_proxy", raising=False) + monkeypatch.delenv("NO_PROXY", raising=False) + monkeypatch.delenv("no_proxy", raising=False) + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + + for arg in args: + assert not arg.startswith("HTTP_PROXY=") + assert not arg.startswith("http_proxy=") + assert not arg.startswith("HTTPS_PROXY=") + assert not arg.startswith("https_proxy=") + assert not arg.startswith("NO_PROXY=") + assert not arg.startswith("no_proxy=") + + +def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "HTTP_PROXY=http://lowercase.proxy:3128" in args + assert "http_proxy=http://lowercase.proxy:3128" in args + + +# 
--------------------------------------------------------------------------- +# Tests for `build_docker_args` GPU flags. +# --------------------------------------------------------------------------- + + +def _make_args(config, gpu_id_override=None): + return run.build_docker_args( + config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + gpu_id_override=gpu_id_override, + ) + + +def test_docker_args_gpu_device(minimal_config): + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert "device=0" in args[idx + 1] + + +def test_docker_args_gpu_all(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert args[idx + 1] == "all" + + +def test_docker_args_no_gpu(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "" + minimal_config["jobs"]["nvidia_gpu"]["resources"].pop("gpu_count", None) + args = _make_args(minimal_config) + assert "--gpus" not in args + + +def test_docker_args_gpu_override(minimal_config): + args = _make_args(minimal_config, gpu_id_override="2,3") + idx = args.index("--gpus") + assert "2,3" in args[idx + 1] + + +# --------------------------------------------------------------------------- +# Tests for `build_docker_args` memory format. 
+# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("32GB", "32g"), + ("512MB", "512m"), + ("8", "8g"), + ("16gb", "16g"), + ("256mb", "256m"), + ], +) +def test_docker_args_memory_format(minimal_config, raw, expected): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw + args = _make_args(minimal_config) + idx = args.index("--memory") + assert args[idx + 1] == expected + + +# --------------------------------------------------------------------------- +# Tests for `build_docker_args` stages encoding. +# --------------------------------------------------------------------------- + + +def test_docker_args_num_stages(minimal_config): + args = _make_args(minimal_config) + assert "NUM_STAGES=1" in args + + +def test_docker_args_stage_name_cmd(minimal_config): + args = _make_args(minimal_config) + assert "STAGE_1_NAME=test" in args + assert any(a.startswith("STAGE_1_CMD=") for a in args) + + +def test_docker_args_multiple_stages(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ + {"name": "lint", "run": "ruff check ."}, + {"name": "test", "run": "pytest tests/"}, + ] + args = _make_args(minimal_config) + assert "NUM_STAGES=2" in args + assert "STAGE_1_NAME=lint" in args + assert "STAGE_2_NAME=test" in args + + +# --------------------------------------------------------------------------- +# Tests for `build_docker_args` `results_dir` mount. 
+# --------------------------------------------------------------------------- + + +def test_docker_args_results_dir(minimal_config, tmp_path): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + results_dir=tmp_path, + ) + joined = " ".join(str(a) for a in args) + assert "-v" in args + assert "/workspace/results" in joined + + +# --------------------------------------------------------------------------- +# Tests for `build_results_dir`. +# --------------------------------------------------------------------------- + + +def test_build_results_dir_contains_platform(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "nvidia" in d.name + + +def test_build_results_dir_contains_commit(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "abc1234" in d.name + + +def test_build_results_dir_contains_stage_names(): + stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "lint+test" in d.name + + +def test_build_results_dir_under_base(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") + assert d.parent == Path("/tmp/my-results") diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py new file mode 100644 index 0000000..b07011c --- /dev/null +++ b/.ci/tests/test_utils.py @@ -0,0 +1,90 @@ +from utils import normalize_config + + +def test_normalize_creates_flat_jobs(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "platforms": { + "nvidia": { + "image": {"dockerfile": ".ci/images/nvidia/"}, + "setup": "pip install .", + "docker_args": ["--gpus", "all"], + "jobs": { + "gpu": { + 
"resources": {"gpu_ids": "0"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + "multi_gpu": { + "resources": {"gpu_ids": "0,1"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + }, + }, + }, + } + config = normalize_config(raw) + + assert "nvidia_gpu" in config["jobs"] + assert "nvidia_multi_gpu" in config["jobs"] + assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" + assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." + assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] + assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" + assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" + + +def test_normalize_extracts_images(): + raw = { + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "pytorch:latest"}, + }, + "jobs": {}, + }, + }, + } + config = normalize_config(raw) + assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" + assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" + + +def test_normalize_job_overrides_platform_defaults(): + raw = { + "platforms": { + "nvidia": { + "setup": "default setup", + "jobs": { + "special": { + "setup": "custom setup", + "stages": [], + }, + }, + }, + }, + } + config = normalize_config(raw) + assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" + + +def test_normalize_preserves_top_level_keys(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "github": {"status_context_prefix": "ci/test"}, + "agents": {"nvidia": {"url": "http://host:8080"}}, + "platforms": {}, + } + config = normalize_config(raw) + assert config["repo"]["url"] == "https://github.com/org/repo.git" + assert config["github"]["status_context_prefix"] == "ci/test" + assert config["agents"]["nvidia"]["url"] == "http://host:8080" + + +def test_normalize_passthrough_flat_config(): + """Old flat format without `platforms` key is returned 
as-is.""" + flat = { + "images": {"nvidia": {}}, + "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, + } + assert normalize_config(flat) is flat diff --git a/.ci/utils.py b/.ci/utils.py new file mode 100644 index 0000000..07dec87 --- /dev/null +++ b/.ci/utils.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Shared utilities for the CI toolchain.""" + +import subprocess +import sys + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def normalize_config(raw): + """Convert platform-centric config to flat images/jobs format. + + Input (new format): + platforms: + nvidia: + image: {dockerfile: ..., build_args: ...} + setup: pip install .[dev] + jobs: + gpu: {resources: ..., stages: ...} + + Output (flat format consumed by run.py / build.py / agent.py): + images: + nvidia: {dockerfile: ..., build_args: ...} + jobs: + nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} + + If the config already uses the flat format (no 'platforms' key), returns as-is. 
+ """ + if "platforms" not in raw: + return raw + + config = {} + + for key in ("repo", "github", "agents"): + if key in raw: + config[key] = raw[key] + + config["images"] = {} + config["jobs"] = {} + + for platform, pcfg in raw.get("platforms", {}).items(): + # Image config + if "image" in pcfg: + config["images"][platform] = pcfg["image"] + + # Platform-level defaults inherited by jobs + defaults = {} + + for key in ("image_tag", "docker_args", "volumes", "setup", "env"): + if key in pcfg: + defaults[key] = pcfg[key] + + # Flatten jobs: {platform}_{job_name} + for job_name, job_cfg in pcfg.get("jobs", {}).items(): + full_name = f"{platform}_{job_name}" + flat = { + "platform": platform, + "short_name": job_name, + "image": defaults.get("image_tag", "latest"), + } + + # Apply platform defaults + for key in ("docker_args", "volumes", "setup", "env"): + if key in defaults: + flat[key] = defaults[key] + + # Job-level overrides + flat.update(job_cfg) + + config["jobs"][full_name] = flat + + # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). + agent_keys = set(config.get("agents", {}).keys()) + platform_keys = set(raw.get("platforms", {}).keys()) + + for key in agent_keys - platform_keys: + print( + f"warning: agents.{key} has no matching platform in platforms.*", + file=sys.stderr, + ) + + return config + + +def load_config(path): + """Load a YAML config file and normalize to flat format.""" + with open(path, encoding="utf-8") as f: + raw = yaml.safe_load(f) + + return normalize_config(raw) + + +def get_git_commit(ref="HEAD", short=True): + """Get git commit SHA. 
Returns 'unknown' on failure.""" + cmd = ["git", "rev-parse"] + + if short: + cmd.append("--short") + + cmd.append(ref) + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..2296f7d --- /dev/null +++ b/.clang-format @@ -0,0 +1,3 @@ +--- +BasedOnStyle: Google +... diff --git a/.gitignore b/.gitignore index d4fb281..2effaff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Generated files +build/ +generated/ + # Prerequisites *.d @@ -39,3 +43,220 @@ # debug information files *.dwo + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. 
+# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..b9e2deb --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,192 @@ +cmake_minimum_required(VERSION 3.18) +project(InfiniOps LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Internal variable to control pybind11's automatic optimization flags (like `-flto`). +set(PYBIND11_ENABLE_EXTRAS ON) + +# Options for backends. 
+option(WITH_CPU "Enable CPU backend" OFF) +option(WITH_NVIDIA "Enable CUDA backend" OFF) +option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF) +option(WITH_METAX "Enable MetaX backend" OFF) +option(WITH_CAMBRICON "Enable Cambricon backend" OFF) +option(WITH_MOORE "Enable Moore backend" OFF) + +option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) +option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF) + +if(AUTO_DETECT_DEVICES) + message(STATUS "Auto-detecting available devices...") + + set(WITH_CPU ON) + + file(GLOB NVIDIA_DEV_FILES "/dev/nvidia*") + + if(NVIDIA_DEV_FILES) + set(WITH_NVIDIA ON) + message(STATUS "Auto-detected NVIDIA environment.") + endif() + + file(GLOB ILUVATAR_DEV_FILES "/dev/iluvatar*") + + if(ILUVATAR_DEV_FILES) + set(WITH_ILUVATAR ON) + message(STATUS "Auto-detected Iluvatar environment.") + endif() + + if(DEFINED ENV{MACA_PATH}) + set(WITH_METAX ON) + message(STATUS "Auto-detected MetaX environment from MACA_PATH") + else() + execute_process( + COMMAND sh -c "grep -h 9999 /sys/bus/pci/devices/*/vendor 2>/dev/null" + OUTPUT_VARIABLE _pci_vendor_output + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(FIND "${_pci_vendor_output}" "9999" _found_pos) + + if(_found_pos GREATER -1) + set(WITH_METAX ON) + message(STATUS "Detected MetaX GPU from PCI vendor ID 0x9999") + else() + set(WITH_METAX OFF) + message(STATUS "No MetaX GPU detected") + endif() + endif() + + if(DEFINED ENV{NEUWARE_HOME}) + set(WITH_CAMBRICON ON) + message(STATUS "Auto-detected Cambricon environment.") + endif() + + if(DEFINED ENV{MUSA_ROOT} OR DEFINED ENV{MUSA_HOME} OR DEFINED ENV{MUSA_PATH}) + set(WITH_MOORE ON) + set(WITH_MOORE ON CACHE BOOL "Enable Moore backend" FORCE) + message(STATUS "Auto-detected Moore environment.") + else() + set(WITH_MOORE OFF) + set(WITH_MOORE OFF CACHE BOOL "Enable Moore backend" FORCE) + endif() +endif() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) + +# Only one CUDA-like GPU backend can be enabled at a 
time. +set(_gpu_backend_count 0) +foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE) + if(${_gpu_backend}) + math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1") + endif() +endforeach() + +if(_gpu_backend_count GREATER 1) + message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.") +endif() + +if(WITH_NVIDIA) + add_compile_definitions(WITH_NVIDIA=1) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +# Iluvatar: CUDA-compatible device, uses `clang++` with `-x ivcore` (not `nvcc`). +# Reference: `InfiniCore` `xmake/iluvatar.lua`. +if(WITH_ILUVATAR) + add_compile_definitions(WITH_ILUVATAR=1) + set(ILUVATAR_ARCH "ivcore20" CACHE STRING "Iluvatar GPU architecture") + find_program(CLANGXX NAMES clang++) + if(CLANGXX) + set(CMAKE_CUDA_COMPILER "${CLANGXX}" CACHE STRING "Iluvatar CUDA compiler (clang++)") + else() + set(CMAKE_CUDA_COMPILER "clang++" CACHE STRING "Iluvatar CUDA compiler (clang++)") + endif() + set(CMAKE_CUDA_FLAGS "-x ivcore -std=c++17 --cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable" CACHE STRING "Iluvatar CUDA flags") + set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Iluvatar") + message(STATUS "Iluvatar: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${ILUVATAR_ARCH}") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +if(WITH_METAX) + add_compile_definitions(WITH_METAX=1) + + # Normally can be found at: `/opt/maca/`. + set(MACA_PATH $ENV{MACA_PATH}) + set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + + include_directories("${MACA_PATH}/include") + link_directories("${MACA_PATH}/lib") + + # Libraries: mcruntime / mcdnn / mcblas. 
+ find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED) + find_library(MACA_DNN_LIB NAMES mcdnn HINTS "${MACA_PATH}/lib" REQUIRED) + find_library(MACA_BLAS_LIB NAMES mcblas HINTS "${MACA_PATH}/lib" REQUIRED) +endif() + +if(WITH_MOORE) + add_compile_definitions(WITH_MOORE=1) + + set(MUSA_ROOT "") + foreach(_musa_env MUSA_ROOT MUSA_HOME MUSA_PATH) + if(NOT MUSA_ROOT AND DEFINED ENV{${_musa_env}} AND NOT "$ENV{${_musa_env}}" STREQUAL "") + set(MUSA_ROOT "$ENV{${_musa_env}}") + endif() + endforeach() + + if(NOT MUSA_ROOT AND EXISTS "/usr/local/musa") + set(MUSA_ROOT "/usr/local/musa") + endif() + + if(NOT MUSA_ROOT) + message(FATAL_ERROR "`WITH_MOORE` is `ON` but `MUSA_ROOT`/`MUSA_HOME`/`MUSA_PATH` is not set and `/usr/local/musa` was not found.") + endif() + + if(NOT EXISTS "${MUSA_ROOT}/bin/mcc") + message(FATAL_ERROR "Could not find `mcc` under `${MUSA_ROOT}/bin`.") + endif() + + message(STATUS "Using Moore from `${MUSA_ROOT}`.") + + set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mcc_wrapper.sh) + set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mcc_wrapper.sh) + + include_directories("${MUSA_ROOT}/include") + link_directories("${MUSA_ROOT}/lib") + + find_library(MUSA_LIB NAMES musa HINTS "${MUSA_ROOT}/lib" REQUIRED) + find_library(MUSART_LIB NAMES musart HINTS "${MUSA_ROOT}/lib" REQUIRED) + find_library(MUBLAS_LIB NAMES mublas HINTS "${MUSA_ROOT}/lib" REQUIRED) +endif() + +if(WITH_CAMBRICON) + add_compile_definitions(WITH_CAMBRICON=1) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + + include_directories("${NEUWARE_HOME}/include") + link_directories("${NEUWARE_HOME}/lib") + link_directories("${NEUWARE_HOME}/lib64") + + # Libraries: `cnrt` / `cnnl` / `cnnl_extra` / `cnpapi`. 
+ find_library(CAMBRICON_RUNTIME_LIB NAMES cnrt HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_LIB NAMES cnnl HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_EXTRA_LIB NAMES cnnl_extra HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_PAPI_LIB NAMES cnpapi HINTS "${NEUWARE_HOME}/lib64" REQUIRED) +endif() + +# If all other platforms are not enabled, CPU is enabled by default. +if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON) + add_compile_definitions(WITH_CPU=1) +endif() + +if(WITH_METAX OR WITH_MOORE) + set(PYBIND11_ENABLE_EXTRAS OFF) +endif() + +add_subdirectory(src) + +add_subdirectory(examples) diff --git a/README.md b/README.md new file mode 100644 index 0000000..875a936 --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# InfiniOps + +InfiniOps is a high-performance, hardware-agnostic operator library. + +## 🛠️ Prerequisites + +Ensure your environment meets the following requirements based on your target backend: + + - C++17 compatible compiler + - CMake 3.18+ + - Hardware-specific SDKs (e.g., CUDA Toolkit) + +--- + +## ⚙️ Installation & Building + +InfiniOps uses CMake to manage backends. + +### 1. Setup Environment + +Ensure you have the corresponding SDK installed and environment variables set up for the platform/accelerator you are working on. + +### 2. Configure and Build + +Using these commands at the root directory of this project: + +```bash +mkdir build && cd build + +cmake .. 
+ +make -j$(nproc) +``` + +For the ``: + +| Option | Functionality | Default +|----------------------------------------|------------------------------------|:-: +| `-DWITH_CPU=[ON\|OFF]` | Compile the CPU implementation | n +| `-DWITH_NVIDIA=[ON\|OFF]` | Compile the NVIDIA implementation | n +| `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | n +| `-DGENERATE_PYTHON_BINDINGS=[ON\|OFF]` | Generate Python bindings | n + +*Note: If no accelerator options are provided, `WITH_CPU` is enabled by default.* + +## 🚀 Running Examples +After a successful build, the executables are located in the `build/examples` directory. + +Run the GEMM example: + +```bash +./examples/gemm +``` + +Run the data_type example: + +```bash +./examples/data_type +``` + +Run the tensor example: + +```bash +./examples/tensor +``` + +Run the pybind11 example: + +```bash +PYTHONPATH=src python ../examples/gemm.py +``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..68ebc1b --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,16 @@ +file(GLOB_RECURSE EXAMPLE_SOURCES CONFIGURE_DEPENDS "*.cc") + +# Iterate through each file and create an executable. 
+foreach(source_file ${EXAMPLE_SOURCES}) + get_filename_component(example_name ${source_file} NAME_WE) + + add_executable(${example_name} ${source_file}) + + target_link_libraries(${example_name} PRIVATE infiniops) + + target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + + get_filename_component(example_dir ${source_file} DIRECTORY) + + target_include_directories(${example_name} PRIVATE ${example_dir}) +endforeach() diff --git a/examples/data_type.cc b/examples/data_type.cc new file mode 100644 index 0000000..f937123 --- /dev/null +++ b/examples/data_type.cc @@ -0,0 +1,28 @@ +#include "data_type.h" + +#include +#include +#include + +static void PrintDataTypeInfo(const infini::ops::DataType& dtype) {} + +int main() { + using namespace infini::ops; + + static const std::vector kDataTypes{ + DataType::kInt8, DataType::kInt16, DataType::kInt32, + DataType::kInt64, DataType::kUInt8, DataType::kUInt16, + DataType::kUInt32, DataType::kUInt64, DataType::kFloat16, + DataType::kBFloat16, DataType::kFloat32, DataType::kFloat64}; + + std::cout << std::left << std::setw(10) << "Name" << std::left + << std::setw(10) << "Element Size\n"; + + for (const auto& dtype : kDataTypes) { + std::cout << std::left << std::setw(10) << kDataTypeToDesc.at(dtype) + << std::left << std::setw(10) << kDataTypeToSize.at(dtype) + << '\n'; + } + + return 0; +} diff --git a/examples/gemm.py b/examples/gemm.py new file mode 100644 index 0000000..cd707c1 --- /dev/null +++ b/examples/gemm.py @@ -0,0 +1,15 @@ +import infini.ops +import torch + +m, n, k = 2, 3, 4 + +x = torch.randn(m, k, device="cpu") +y = torch.randn(k, n, device="cpu") +z = torch.empty(m, n, device="cpu") + +infini.ops.gemm(x, y, z) + +print(x) +print(y) +print(z) +print(torch.mm(x, y)) diff --git a/examples/gemm/gemm.cc b/examples/gemm/gemm.cc new file mode 100644 index 0000000..4664740 --- /dev/null +++ b/examples/gemm/gemm.cc @@ -0,0 +1,91 @@ +#include +#include +#include + +#ifdef WITH_CPU +#include 
"cpu/gemm/gemm.h" +#endif +#if WITH_NVIDIA +#include "nvidia/gemm/cublas.h" +#endif +#if WITH_ILUVATAR +#include "iluvatar/gemm/cublas.h" +#endif +#if WITH_METAX +#include "metax/gemm/mcblas.h" +#endif +#if WITH_CAMBRICON +#include "cambricon/gemm/cnblas.h" +#endif +#if WITH_MOORE +#include "moore/gemm/mublas.h" +#endif + +#include "runtime_api.h" +#include "tensor.h" + +int main() { + using namespace infini::ops; + + constexpr auto m{2}; + constexpr auto k{3}; + constexpr auto n{4}; + + std::vector a_shape{m, k}; + std::vector b_shape{k, n}; + std::vector c_shape{m, n}; + + const auto a_num_elements{std::accumulate(a_shape.cbegin(), a_shape.cend(), 1, + std::multiplies())}; + const auto b_num_elements{std::accumulate(b_shape.cbegin(), b_shape.cend(), 1, + std::multiplies())}; + const auto c_num_elements{std::accumulate(c_shape.cbegin(), c_shape.cend(), 1, + std::multiplies())}; + + std::vector a_vec(a_num_elements); + std::vector b_vec(b_num_elements); + std::vector c_vec(c_num_elements); + + std::iota(a_vec.begin(), a_vec.end(), 0); + std::iota(b_vec.begin(), b_vec.end(), 0); + + Device dev{DEFAULT_DEVICE_TYPE}; + + Tensor a_host{a_vec.data(), a_shape, dev}; + Tensor b_host{b_vec.data(), b_shape, dev}; + Tensor c_host{c_vec.data(), c_shape, dev}; + + const auto a_size{a_num_elements * kDataTypeToSize.at(a_host.dtype())}; + const auto b_size{b_num_elements * kDataTypeToSize.at(b_host.dtype())}; + const auto c_size{c_num_elements * kDataTypeToSize.at(c_host.dtype())}; + + void *a_ptr, *b_ptr, *c_ptr; + + DEVICE_MALLOC(&a_ptr, a_size); + DEVICE_MALLOC(&b_ptr, b_size); + DEVICE_MALLOC(&c_ptr, c_size); + + DEVICE_MEMCPY(a_ptr, a_vec.data(), a_size, DEVICE_MEMCPY_HOST_TO_DEVICE); + DEVICE_MEMCPY(b_ptr, b_vec.data(), b_size, DEVICE_MEMCPY_HOST_TO_DEVICE); + DEVICE_MEMSET(c_ptr, 0, c_size); + + Tensor a_device{a_ptr, a_host.shape(), a_host.dtype(), a_host.device(), + a_host.strides()}; + Tensor b_device{b_ptr, b_host.shape(), b_host.dtype(), a_host.device(), + 
b_host.strides()}; + Tensor c_device{c_ptr, c_host.shape(), c_host.dtype(), a_host.device(), + c_host.strides()}; + + Gemm::call(a_device, b_device, c_device); + + DEVICE_MEMCPY(c_vec.data(), c_ptr, c_size, DEVICE_MEMCPY_DEVICE_TO_HOST); + DEVICE_FREE(a_ptr); + DEVICE_FREE(b_ptr); + DEVICE_FREE(c_ptr); + + std::cout << "A: " << a_host.ToString() << "\n"; + std::cout << "B: " << b_host.ToString() << "\n"; + std::cout << "C: " << c_host.ToString() << "\n"; + + return 0; +} diff --git a/examples/runtime_api.h b/examples/runtime_api.h new file mode 100644 index 0000000..c5b7597 --- /dev/null +++ b/examples/runtime_api.h @@ -0,0 +1,61 @@ +#ifndef INFINI_OPS_EXAMPLES_RUNTIME_API_H_ +#define INFINI_OPS_EXAMPLES_RUNTIME_API_H_ + +#ifdef WITH_NVIDIA +#include +#define DEVICE_MALLOC cudaMalloc +#define DEVICE_FREE cudaFree +#define DEVICE_MEMCPY cudaMemcpy +#define DEVICE_MEMSET cudaMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice +#define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kNvidia +#elif WITH_ILUVATAR +#include +#define DEVICE_MALLOC cudaMalloc +#define DEVICE_FREE cudaFree +#define DEVICE_MEMCPY cudaMemcpy +#define DEVICE_MEMSET cudaMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice +#define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kIluvatar +#elif WITH_METAX +#include +#define DEVICE_MALLOC mcMalloc +#define DEVICE_FREE mcFree +#define DEVICE_MEMCPY mcMemcpy +#define DEVICE_MEMSET mcMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE mcMemcpyHostToDevice +#define DEVICE_MEMCPY_DEVICE_TO_HOST mcMemcpyDeviceToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kMetax +#elif WITH_CAMBRICON +#include +#define DEVICE_MALLOC cnrtMalloc +#define DEVICE_FREE cnrtFree +#define DEVICE_MEMCPY cnrtMemcpy +#define DEVICE_MEMSET cnrtMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cnrtMemcpyHostToDev +#define DEVICE_MEMCPY_DEVICE_TO_HOST 
cnrtMemcpyDevToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kCambricon +#elif WITH_MOORE +#include +#define DEVICE_MALLOC musaMalloc +#define DEVICE_FREE musaFree +#define DEVICE_MEMCPY musaMemcpy +#define DEVICE_MEMSET musaMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE musaMemcpyHostToDevice +#define DEVICE_MEMCPY_DEVICE_TO_HOST musaMemcpyDeviceToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kMoore +#elif WITH_CPU +#include +#include +#define DEVICE_MALLOC(ptr, size) (*(ptr) = std::malloc(size)) +#define DEVICE_FREE std::free +#define DEVICE_MEMCPY(dst, src, size, kind) std::memcpy(dst, src, size) +#define DEVICE_MEMSET std::memset +#define DEVICE_MEMCPY_HOST_TO_DEVICE 0 +#define DEVICE_MEMCPY_DEVICE_TO_HOST 1 +#define DEFAULT_DEVICE_TYPE Device::Type::kCpu +#endif + +#endif diff --git a/examples/tensor.cc b/examples/tensor.cc new file mode 100644 index 0000000..ff768bd --- /dev/null +++ b/examples/tensor.cc @@ -0,0 +1,25 @@ +#include "tensor.h" + +#include +#include +#include +#include + +int main() { + using namespace infini::ops; + + const Tensor::Shape shape{2, 3, 4}; + + const auto num_elements{ + std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies())}; + + std::vector elems(num_elements); + + std::iota(elems.begin(), elems.end(), 0); + + Tensor x{elems.data(), shape}; + + std::cout << x.ToString() << '\n'; + + return 0; +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3dbc186 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["scikit-build-core", "pybind11", "libclang"] +build-backend = "scikit_build_core.build" + +[project] +name = "InfiniOps" +version = "0.1.0" + +[project.optional-dependencies] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] + +[tool.scikit-build.wheel] +install-dir = "infini" + +[tool.scikit-build.cmake.define] +AUTO_DETECT_DEVICES = "ON" +GENERATE_PYTHON_BINDINGS = "ON" + +[tool.pytest.ini_options] +testpaths = ["tests"] diff 
--git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py new file mode 100644 index 0000000..edde67c --- /dev/null +++ b/scripts/generate_wrappers.py @@ -0,0 +1,442 @@ +import argparse +import json +import pathlib +import shutil +import subprocess +import textwrap + +import clang.cindex +from clang.cindex import CursorKind + +_SRC_DIR = pathlib.Path("src") + +_BASE_DIR = _SRC_DIR / "base" + +_GENERATION_DIR = pathlib.Path("generated") + +_BINDINGS_DIR = _GENERATION_DIR / "bindings" + +_GENERATED_SRC_DIR = _GENERATION_DIR / "src" + +_INCLUDE_DIR = _GENERATION_DIR / "include" + +_INDENTATION = " " + + +class _OperatorExtractor: + def __call__(self, op_name): + def _get_system_include_flags(): + def _get_compilers(): + compilers = [] + + for compiler in ("clang++", "g++"): + if shutil.which(compiler) is not None: + compilers.append(compiler) + + return compilers + + system_include_flags = [] + + for compiler in _get_compilers(): + for line in subprocess.getoutput( + f"{compiler} -E -x c++ -v /dev/null" + ).splitlines(): + if not line.startswith(" "): + continue + + system_include_flags.append("-isystem") + system_include_flags.append(line.strip()) + + return system_include_flags + + system_include_flags = _get_system_include_flags() + + index = clang.cindex.Index.create() + args = ("-std=c++17", "-x", "c++", "-I", "src") + tuple(system_include_flags) + translation_unit = index.parse(f"src/base/{op_name}.h", args=args) + + nodes = tuple(type(self)._find(translation_unit.cursor, op_name)) + + constructors = [] + calls = [] + + for node in nodes: + if node.kind == CursorKind.CONSTRUCTOR: + constructors.append(node) + elif node.kind == CursorKind.CXX_METHOD and node.spelling == "operator()": + calls.append(node) + + return _Operator(op_name, constructors, calls) + + @staticmethod + def _find(node, op_name): + pascal_case_op_name = _snake_to_pascal(op_name) + + if ( + node.semantic_parent + and node.semantic_parent.spelling == pascal_case_op_name + ): + yield 
node + + for child in node.get_children(): + yield from _OperatorExtractor._find(child, op_name) + + +class _Operator: + def __init__(self, name, constructors, calls): + self.name = name + + self.constructors = constructors + + self.calls = calls + + +def _generate_pybind11(operator): + def _generate_params(node): + return ( + ", ".join( + f"{arg.type.spelling} {arg.spelling}" + for arg in node.get_arguments() + if arg.spelling != "stream" + ) + .replace("const Tensor", "py::object") + .replace("Tensor", "py::object") + ) + + def _generate_arguments(node): + return ", ".join( + f"TensorFromPybind11Handle({arg.spelling})" + if "Tensor" in arg.type.spelling + else arg.spelling + for arg in node.get_arguments() + if arg.spelling != "stream" + ) + + op_name = operator.name + + def _generate_init(constructor): + constructor_params = _generate_params(constructor) + + return f""" .def(py::init([]({constructor_params}) {{ + return std::unique_ptr{{static_cast(Self::make({_generate_arguments(constructor)}).release())}}; + }}))""" + + def _generate_call(op_name, call, method=True): + call_params = _generate_params(call) + + if not method: + return f""" m.def("{op_name}", []({call_params}) {{ return Self::call({_generate_arguments(call)}); }});""" + + return f""" .def("__call__", [](const Self& self, {call_params}) {{ + return static_cast&>(self)({_generate_arguments(call)}); + }})""" + + inits = "\n".join( + _generate_init(constructor) for constructor in operator.constructors + ) + calls = "\n".join(_generate_call(operator.name, call) for call in operator.calls) + callers = "\n".join( + _generate_call(operator.name, call, method=False) for call in operator.calls + ) + + pascal_case_op_name = _snake_to_pascal(op_name) + + return f"""#ifndef INFINI_OPS_BINDINGS_{op_name.upper()}_H_ +#define INFINI_OPS_BINDINGS_{op_name.upper()}_H_ + +#include +#include + +#include "base/{op_name}.h" +#include "pybind11_utils.h" + +namespace py = pybind11; + +namespace infini::ops {{ + +void 
Bind{pascal_case_op_name}(py::module& m) {{ + using Self = {pascal_case_op_name}; + + py::class_(m, "{pascal_case_op_name}") +{inits} +{calls}; + +{callers} +}} + +}} // namespace infini::ops + +#endif +""" + + +def _generate_legacy_c(operator, paths): + def _generate_source(operator): + impl_includes = "\n".join( + f'#include "{str(path).removeprefix("src/")}"' for path in paths + ) + + return f"""#include "../../handle.h" +#include "../../tensor.h" +#include "infiniop/ops/{operator.name.lower()}.h" +{impl_includes} + +static infini::ops::DataType DataTypeFromInfiniDType( + const infiniDtype_t& dtype) {{ + static constexpr infini::ops::ConstexprMap + kInfiniDTypeToDataType{{ + {{{{{{INFINI_DTYPE_I8, infini::ops::DataType::kInt8}}, + {{INFINI_DTYPE_I16, infini::ops::DataType::kInt16}}, + {{INFINI_DTYPE_I32, infini::ops::DataType::kInt32}}, + {{INFINI_DTYPE_I64, infini::ops::DataType::kInt64}}, + {{INFINI_DTYPE_U8, infini::ops::DataType::kUInt8}}, + {{INFINI_DTYPE_U16, infini::ops::DataType::kUInt16}}, + {{INFINI_DTYPE_U32, infini::ops::DataType::kUInt32}}, + {{INFINI_DTYPE_U64, infini::ops::DataType::kUInt64}}, + {{INFINI_DTYPE_F16, infini::ops::DataType::kFloat16}}, + {{INFINI_DTYPE_BF16, infini::ops::DataType::kBFloat16}}, + {{INFINI_DTYPE_F32, infini::ops::DataType::kFloat32}}, + {{INFINI_DTYPE_F64, infini::ops::DataType::kFloat64}}}}}}}}; + + return kInfiniDTypeToDataType.at(dtype); +}} + +static infini::ops::Device::Type DeviceTypeFromInfiniDevice( + const infiniDevice_t& device) {{ + static constexpr infini::ops::ConstexprMap< + infiniDevice_t, infini::ops::Device::Type, + static_cast(INFINI_DEVICE_TYPE_COUNT)> + kInfiniDeviceToDeviceType{{ + {{{{{{INFINI_DEVICE_CPU, infini::ops::Device::Type::kCpu}}, + {{INFINI_DEVICE_NVIDIA, infini::ops::Device::Type::kNvidia}}, + {{INFINI_DEVICE_CAMBRICON, infini::ops::Device::Type::kCambricon}}, + {{INFINI_DEVICE_ASCEND, infini::ops::Device::Type::kAscend}}, + {{INFINI_DEVICE_METAX, infini::ops::Device::Type::kMetax}}, + 
{{INFINI_DEVICE_MOORE, infini::ops::Device::Type::kMoore}}, + {{INFINI_DEVICE_ILUVATAR, infini::ops::Device::Type::kIluvatar}}, + {{INFINI_DEVICE_KUNLUN, infini::ops::Device::Type::kKunlun}}, + {{INFINI_DEVICE_HYGON, infini::ops::Device::Type::kHygon}}, + {{INFINI_DEVICE_QY, infini::ops::Device::Type::kQy}}}}}}}}; + + return kInfiniDeviceToDeviceType.at(device); +}} + +__C {_generate_create_func_def(operator)} + +__C {_generate_get_workspace_size_func_def(operator)} + +__C {_generate_call_func_def(operator)} + +__C {_generate_destroy_func_def(operator)} +""" + + def _generate_header(operator): + return f"""#ifndef __INFINIOP_{operator.name.upper()}_API_H__ +#define __INFINIOP_{operator.name.upper()}_API_H__ + +#include "base/{operator.name.lower()}.h" + +typedef struct infini::ops::Operator *infiniop{operator.name}Descriptor_t; + +__C __export {_generate_create_func_decl(operator)}; + +__C __export {_generate_get_workspace_size_func_decl(operator)}; + +__C __export {_generate_call_func_decl(operator)}; + +__C __export {_generate_destroy_func_decl(operator)}; + +#endif +""" + + def _generate_create_func_def(operator): + name = operator.name + constructor = operator.constructors[-1] + + return f"""{_generate_create_func_decl(operator)} {{ + *desc_ptr = infini::ops::Operator::make({_generate_arguments(constructor)}).release(); + + return INFINI_STATUS_SUCCESS; +}}""" + + def _generate_get_workspace_size_func_def(operator): + return f"""{_generate_get_workspace_size_func_decl(operator)} {{ + *size = 0; // desc->workspace_size(); + + return INFINI_STATUS_SUCCESS; +}}""" + + def _generate_call_func_def(operator): + call = operator.calls[-1] + + return f"""{_generate_call_func_decl(operator)} {{ + (*desc)(stream, {_generate_arguments(call, is_data=True)}); + + return INFINI_STATUS_SUCCESS; +}}""" + + def _generate_destroy_func_def(operator): + return f"""{_generate_destroy_func_decl(operator)} {{ + delete desc; + + return INFINI_STATUS_SUCCESS; +}}""" + + def 
_generate_create_func_decl(operator): + name = operator.name + constructor = operator.constructors[-1] + params = _generate_params(constructor) + + return f"infiniStatus_t infiniopCreate{name}Descriptor(infiniopHandle_t handle, infiniop{name}Descriptor_t *desc_ptr, {params})" + + def _generate_get_workspace_size_func_decl(operator): + name = operator.name + + return f"infiniStatus_t infiniopGet{name}WorkspaceSize(infiniop{name}Descriptor_t desc, size_t *size)" + + def _generate_call_func_decl(operator): + name = operator.name + call = operator.calls[-1] + params = _generate_params(call, call=True) + params = params.replace("void * stream, ", "") + + return f"infiniStatus_t infiniop{name}(infiniop{name}Descriptor_t desc, void *workspace, size_t workspace_size, {params}, void *stream)" + + def _generate_destroy_func_decl(operator): + name = operator.name + + return f"infiniStatus_t infiniopDestroy{name}Descriptor(infiniop{name}Descriptor_t desc)" + + def _generate_params(node, call=False): + arguments = tuple(node.get_arguments()) + + arguments = (arguments[-1], *arguments[:-1]) + + def _handle_tensor(spelling): + if call: + return spelling.replace("Tensor", "void *") + return spelling.replace("Tensor", "infiniopTensorDescriptor_t") + + def _handle_std_optional(spelling): + return spelling.replace("std::optional<", "").replace(">", "") + + return ", ".join( + f"{_handle_std_optional(_handle_tensor(arg.type.spelling))} {arg.spelling}" + for arg in arguments + ) + + def _generate_arguments(node, is_data=False): + return ", ".join( + _generate_tensor_caster(arg.spelling, is_data=is_data) + if "Tensor" in arg.type.spelling + else arg.spelling + for arg in node.get_arguments() + if arg.spelling != "handle" and arg.spelling != "stream" + ) + + def _generate_tensor_caster(name, is_data=False): + if is_data: + return f"infini::ops::Tensor(const_cast({name}), infini::ops::Tensor::Shape{{}})" + + return f"infini::ops::Tensor{{nullptr, {name}->shape(), 
DataTypeFromInfiniDType({name}->dtype()), infini::ops::Device{{DeviceTypeFromInfiniDevice(handle->device), handle->device_id}}, {name}->strides()}}" + + return _generate_source(operator), _generate_header(operator) + + +def _snake_to_pascal(snake_str): + return "".join(word.capitalize() for word in snake_str.split("_")) + + +def _get_all_ops(devices): + ops = {} + + for file_path in _BASE_DIR.iterdir(): + if not file_path.is_file(): + continue + + op_name = file_path.stem + + ops[op_name] = [] + + for file_path in _SRC_DIR.rglob("*"): + if not file_path.is_file() or file_path.parent.parent.name not in devices: + continue + + if f"class Operator<{_snake_to_pascal(op_name)}" in file_path.read_text(): + ops[op_name].append(file_path) + + return ops + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="An automatic wrapper generator.") + + parser.add_argument( + "--devices", + nargs="+", + default="cpu", + type=str, + help="Devices to use. Please pick from cpu, nvidia, cambricon, ascend, metax, moore, iluvatar, kunlun, hygon, and qy. 
(default: cpu)", + ) + + args = parser.parse_args() + + _BINDINGS_DIR.mkdir(parents=True, exist_ok=True) + _GENERATED_SRC_DIR.mkdir(parents=True, exist_ok=True) + _INCLUDE_DIR.mkdir(parents=True, exist_ok=True) + + ops_json = pathlib.Path("ops.json") + + if ops_json.exists(): + ops = json.loads(ops_json.read_text()) + else: + ops = _get_all_ops(args.devices) + + header_paths = [] + bind_func_names = [] + + for op_name, impl_paths in ops.items(): + extractor = _OperatorExtractor() + operator = extractor(op_name) + + source_path = _GENERATED_SRC_DIR / op_name + header_name = f"{op_name}.h" + bind_func_name = f"Bind{_snake_to_pascal(op_name)}" + + (_BINDINGS_DIR / header_name).write_text(_generate_pybind11(operator)) + + legacy_c_source, legacy_c_header = _generate_legacy_c(operator, impl_paths) + source_path.mkdir(exist_ok=True) + (_GENERATED_SRC_DIR / op_name / "operator.cc").write_text(legacy_c_source) + (_INCLUDE_DIR / header_name).write_text(legacy_c_header) + + header_paths.append(header_name) + bind_func_names.append(bind_func_name) + + impl_includes = "\n".join( + f'#include "{impl_path}"' + for impl_paths in ops.values() + for impl_path in impl_paths + ) + op_includes = "\n".join(f'#include "{header_path}"' for header_path in header_paths) + bind_func_calls = "\n".join( + f"{bind_func_name}(m);" for bind_func_name in bind_func_names + ) + + (_BINDINGS_DIR / "ops.cc").write_text(f"""#include + +// clang-format off +{impl_includes} +// clang-format on + +{op_includes} + +namespace infini::ops {{ + +PYBIND11_MODULE(ops, m) {{ +{textwrap.indent(bind_func_calls, _INDENTATION)} +}} + +}} // namespace infini::ops +""") diff --git a/scripts/mcc_wrapper.sh b/scripts/mcc_wrapper.sh new file mode 100755 index 0000000..29ce5cd --- /dev/null +++ b/scripts/mcc_wrapper.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Filter out flags unsupported by `mcc`. 
+ARGS=() +skip_next=0 +linking=1 +for arg in "$@"; do + if [ $skip_next -eq 1 ]; then + skip_next=0 + continue + fi + case "$arg" in + -c|-E|-S) + linking=0 + ARGS+=("$arg") + ;; + -pthread) + ;; + -B) + skip_next=1 + ;; + -B*) + ;; + *) + ARGS+=("$arg") + ;; + esac +done + +MUSA_ROOT_DIR="${MUSA_ROOT:-${MUSA_HOME:-${MUSA_PATH:-/usr/local/musa}}}" + +if command -v g++ >/dev/null 2>&1; then + GXX_MAJOR="$(g++ -dumpversion | cut -d. -f1)" + if [ -d "/usr/include/c++/${GXX_MAJOR}" ]; then + ARGS=( + "-isystem" "/usr/include/c++/${GXX_MAJOR}" + "-isystem" "/usr/include/x86_64-linux-gnu/c++/${GXX_MAJOR}" + "-isystem" "/usr/include/c++/${GXX_MAJOR}/backward" + "${ARGS[@]}" + ) + fi + + STDCPP_LIB="$(g++ -print-file-name=libstdc++.so)" + if [ $linking -eq 1 ] && [ -f "${STDCPP_LIB}" ]; then + ARGS=("-L$(dirname "${STDCPP_LIB}")" "${ARGS[@]}") + fi +fi + +exec "${MUSA_ROOT_DIR}/bin/mcc" "${ARGS[@]}" diff --git a/scripts/mxcc_wrapper.sh b/scripts/mxcc_wrapper.sh new file mode 100755 index 0000000..0010617 --- /dev/null +++ b/scripts/mxcc_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Filter out flags unsupported by `mxcc`. 
+ARGS=()
+skip_next=0
+for arg in "$@"; do
+  # Drop the argument that followed a bare `-B` (consumed together with it).
+  if [ $skip_next -eq 1 ]; then
+    skip_next=0
+    continue
+  fi
+  case "$arg" in
+    -pthread)
+      # Dropped: unsupported by mxcc.
+      ;;
+    -B)
+      skip_next=1
+      ;;
+    -B*)
+      ;;
+    *)
+      ARGS+=("$arg")
+      ;;
+  esac
+done
+
+# FIX: quote the compiler path so a MACA_PATH containing spaces (or unusual
+# characters) cannot be word-split into multiple exec arguments.
+exec "${MACA_PATH}/mxgpu_llvm/bin/mxcc" "${ARGS[@]}"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..585e3ab
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,217 @@
+add_library(infiniops SHARED)
+
+file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc")
+target_sources(infiniops PRIVATE ${BASE_SRCS})
+
+# Accumulates the enabled backend names; forwarded to the wrapper generator.
+set(DEVICE_LIST "")
+
+if(WITH_CPU)
+  set(CPU_PATTERNS
+    "cpu/*.cc"
+    "cpu/*.cpp"
+  )
+
+  file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS})
+  # FIX: these sources were appended to CORE_SOURCES, a variable that is
+  # never read again, so the CPU backend was silently left out of the build.
+  # Register them on the target directly, as every other backend section does.
+  target_sources(infiniops PRIVATE ${CPU_SOURCES})
+
+  target_compile_definitions(infiniops PUBLIC WITH_CPU=1)
+
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(infiniops PRIVATE OpenMP::OpenMP_CXX)
+
+  list(APPEND DEVICE_LIST "cpu")
+endif()
+
+if(WITH_NVIDIA)
+  set(NVIDIA_PATTERNS
+    "cuda/*.cc"
+    "cuda/*.cpp"
+    "cuda/*.cu"
+    "nvidia/*.cc"
+    "nvidia/*.cpp"
+    "nvidia/*.cu"
+  )
+
+  file(GLOB_RECURSE NVIDIA_SOURCES CONFIGURE_DEPENDS ${NVIDIA_PATTERNS})
+
+  enable_language(CUDA)
+
+  target_compile_definitions(infiniops PUBLIC WITH_NVIDIA=1)
+  target_sources(infiniops PRIVATE ${NVIDIA_SOURCES})
+
+  find_package(CUDAToolkit REQUIRED)
+  target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver)
+
+  list(APPEND DEVICE_LIST "nvidia")
+  set_target_properties(infiniops PROPERTIES
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+  )
+endif()
+
+if(WITH_ILUVATAR)
+  set(ILUVATAR_PATTERNS
+    "cuda/*.cc"
+    "cuda/*.cpp"
+    "cuda/*.cu"
+    "iluvatar/*.cc"
+    "iluvatar/*.cpp"
+    "iluvatar/*.cu"
+  )
+
+  file(GLOB_RECURSE ILUVATAR_SOURCES CONFIGURE_DEPENDS ${ILUVATAR_PATTERNS})
+
+  enable_language(CUDA)
+
+  target_compile_definitions(infiniops PUBLIC WITH_ILUVATAR=1)
+  target_sources(infiniops PRIVATE ${ILUVATAR_SOURCES})
+
+  find_package(CUDAToolkit REQUIRED)
+
 target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver)
+
+  set_target_properties(infiniops PROPERTIES
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+  )
+
+  list(APPEND DEVICE_LIST "iluvatar")
+endif()
+
+if(WITH_METAX)
+  # NOTE(review): cuda/* is also globbed by the NVIDIA/ILUVATAR sections --
+  # presumably only one GPU backend is enabled per build; confirm, otherwise
+  # the same files are registered on the target more than once.
+  set(METAX_PATTERNS
+    "cuda/*.cc"
+    "cuda/*.cpp"
+    "metax/*.cc"
+    "metax/*.maca"
+  )
+
+  file(GLOB_RECURSE METAX_SOURCES CONFIGURE_DEPENDS ${METAX_PATTERNS})
+
+  # Compiled as C++; `-x maca` below switches the front-end language.
+  set_source_files_properties(${METAX_SOURCES} PROPERTIES LANGUAGE CXX)
+
+  # NOTE(review): WITH_CPU/WITH_NVIDIA/WITH_ILUVATAR export their macro with
+  # PUBLIC visibility, but WITH_METAX (and WITH_MOORE below) use PRIVATE, so
+  # consumers linking infiniops will not see the define. Confirm whether the
+  # asymmetry is intentional.
+  target_compile_definitions(infiniops PRIVATE WITH_METAX=1)
+  target_compile_options(infiniops PUBLIC "-x" "maca")
+  target_sources(infiniops PRIVATE ${METAX_SOURCES})
+
+  target_include_directories(infiniops PUBLIC "${MACA_PATH}/include")
+  target_link_libraries(infiniops PUBLIC
+    ${MACA_RUNTIME_LIB}
+    ${MACA_DNN_LIB}
+    ${MACA_BLAS_LIB}
+  )
+
+  list(APPEND DEVICE_LIST "metax")
+endif()
+
+if(WITH_MOORE)
+  set(MOORE_PATTERNS
+    "cuda/*.cc"
+    "cuda/*.cpp"
+    "moore/*.cc"
+    "moore/*.cpp"
+    "moore/*.mu"
+  )
+
+  file(GLOB_RECURSE MOORE_SOURCES CONFIGURE_DEPENDS ${MOORE_PATTERNS})
+
+  # Compiled as C++; `-x musa` below switches the front-end language.
+  set_source_files_properties(${MOORE_SOURCES} PROPERTIES LANGUAGE CXX)
+
+  target_compile_definitions(infiniops PRIVATE WITH_MOORE=1)
+  target_compile_options(infiniops PUBLIC "-x" "musa")
+  target_sources(infiniops PRIVATE ${MOORE_SOURCES})
+
+  target_include_directories(infiniops PUBLIC "${MUSA_ROOT}/include")
+  target_link_libraries(infiniops PUBLIC ${MUSA_LIB} ${MUSART_LIB} ${MUBLAS_LIB})
+
+  list(APPEND DEVICE_LIST "moore")
+endif()
+
+if(WITH_CAMBRICON)
+  # MLU kernels (.mlu) are compiled out-of-band with cncc (CMake has no
+  # first-class BANG language support); everything else is plain C++.
+  file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "cambricon/*/*.mlu")
+  find_program(CNCC_COMPILER cncc HINTS "${NEUWARE_HOME}/bin" "$ENV{NEUWARE_HOME}/bin" /usr/local/neuware/bin)
+  if(CNCC_COMPILER)
+    message(STATUS "Found cncc: ${CNCC_COMPILER}")
+    set(MLU_COMPILE_OPTS
+      -c --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread
+      -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include
+      -idirafter
 /usr/local/neuware/lib/clang/11.1.0/include
+    )
+    # Compile one .mlu translation unit to an object file with cncc and
+    # collect it on the directory-level CAMBRICON_OBJECTS property.
+    function(compile_mlu_file src_file)
+      get_filename_component(name ${src_file} NAME_WE)
+      get_filename_component(path ${src_file} DIRECTORY)
+      set(out_file "${CMAKE_CURRENT_BINARY_DIR}/${path}/${name}.o")
+      file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${path}")
+      # NOTE(review): MLU_COMPILE_OPTS already contains `-c`, so `-c` is
+      # passed twice here -- harmless if cncc tolerates the duplicate, but
+      # worth confirming.
+      add_custom_command(OUTPUT ${out_file}
+        COMMAND ${CNCC_COMPILER} ${MLU_COMPILE_OPTS} -c ${src_file} -o ${out_file}
+        DEPENDS ${src_file}
+        COMMENT "Building MLU kernel: ${src_file}"
+      )
+      set_property(DIRECTORY APPEND PROPERTY CAMBRICON_OBJECTS ${out_file})
+    endfunction()
+    foreach(src ${CAMBRICON_MLU_SOURCES})
+      compile_mlu_file(${src})
+    endforeach()
+    get_directory_property(CAMBRICON_OBJECT_FILES CAMBRICON_OBJECTS)
+    if(CAMBRICON_OBJECT_FILES)
+      # Link the pre-built kernel objects into the library.
+      target_sources(infiniops PRIVATE ${CAMBRICON_OBJECT_FILES})
+    endif()
+  else()
+    message(WARNING "cncc compiler not found. MLU kernels will not be compiled.")
+  endif()
+  # NOTE(review): PRIVATE here vs PUBLIC in the CPU/NVIDIA sections -- same
+  # asymmetry as METAX/MOORE; confirm intent.
+  target_compile_definitions(infiniops PRIVATE WITH_CAMBRICON=1)
+
+  target_include_directories(infiniops PUBLIC "${NEUWARE_HOME}/include")
+  target_link_libraries(infiniops PUBLIC ${CAMBRICON_RUNTIME_LIB} ${CAMBRICON_CNNL_LIB} ${CAMBRICON_CNNL_EXTRA_LIB} ${CAMBRICON_PAPI_LIB})
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    # NOTE(review): this generator expression looks truncated (empty first
+    # condition), and the clang resource dir is hard-coded to 11.1.0 under
+    # /usr/local/neuware instead of ${NEUWARE_HOME} -- verify against the
+    # original file.
+    target_compile_options(infiniops PUBLIC
+      "$<$:SHELL:-idirafter /usr/local/neuware/lib/clang/11.1.0/include>"
+    )
+  endif()
+
+  list(APPEND DEVICE_LIST "cambricon")
+endif()
+
+target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+if(GENERATE_PYTHON_BINDINGS)
+  # Regenerate the pybind11 wrappers at configure time for the enabled
+  # device list; a non-zero exit status aborts the configure step.
+  find_package(Python COMPONENTS Interpreter REQUIRED)
+  execute_process(
+    COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py --devices ${DEVICE_LIST}
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    RESULT_VARIABLE script_result
+  )
+
+  if(NOT script_result EQUAL 0)
+    message(FATAL_ERROR "Generating wrappers - failed")
+  else()
+    message(STATUS "Generating wrappers - done")
+  endif()
+
+  set(PYBIND11_SOURCES
"${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc") + + # TODO: There might be a better solution. + if(WITH_NVIDIA OR WITH_ILUVATAR) + set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA) + endif() + + find_package(Python COMPONENTS Interpreter Development) + find_package(pybind11 CONFIG) + + if(PYBIND11_ENABLE_EXTRAS) + pybind11_add_module(ops ${PYBIND11_SOURCES}) + else() + pybind11_add_module(ops NO_EXTRAS ${PYBIND11_SOURCES}) + endif() + + target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR}) + target_link_libraries(ops PRIVATE infiniops) + + set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN") + set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN") + + install(TARGETS infiniops ops DESTINATION .) + + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "") + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .) +endif() diff --git a/src/base/add.h b/src/base/add.h new file mode 100644 index 0000000..06bfa4c --- /dev/null +++ b/src/base/add.h @@ -0,0 +1,71 @@ +#ifndef INFINI_OPS_BASE_ADD_H_ +#define INFINI_OPS_BASE_ADD_H_ + +#include + +#include "operator.h" + +namespace infini::ops { + +class Add : public Operator { + public: + Add(const Tensor input, const Tensor other, Tensor out) + : ndim_{out.ndim()}, + output_size_{out.numel()}, + input_type_{input.dtype()}, + other_type_{other.dtype()}, + out_type_{out.dtype()}, + input_shape_{input.shape()}, + other_shape_{other.shape()}, + out_shape_{out.shape()}, + input_strides_{input.strides()}, + other_strides_{other.strides()}, + out_strides_{out.strides()}, + is_input_contiguous_{input.IsContiguous()}, + is_other_contiguous_{other.IsContiguous()}, + is_out_contiguous_{out.IsContiguous()} { + assert(!out.HasBroadcastDim() && + "the output of `Add` should NOT have broadcasted dim!"); + // TODO(lzm): support mix-precision later using the generic elementwise + // framework. 
+    assert(input_type_ == other_type_ && other_type_ == out_type_ &&
+           "operator `Add` requires all input and output Tensors to have the "
+           "same dtype");
+  }
+
+  // Computes out = input + other; implemented per device backend.
+  virtual void operator()(const Tensor input, const Tensor other,
+                          Tensor out) const = 0;
+
+ protected:
+  // Rank of the output tensor.
+  Tensor::Size ndim_{0};
+
+  // Total number of output elements.
+  Tensor::Size output_size_{0};
+
+  const DataType input_type_;
+
+  const DataType other_type_;
+
+  const DataType out_type_;
+
+  Tensor::Shape input_shape_;
+
+  Tensor::Shape other_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides other_strides_;
+
+  Tensor::Strides out_strides_;
+
+  // Contiguity flags cached so backends can pick a fast path.
+  bool is_input_contiguous_{false};
+
+  bool is_other_contiguous_{false};
+
+  bool is_out_contiguous_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/causal_softmax.h b/src/base/causal_softmax.h
new file mode 100644
index 0000000..b8393d8
--- /dev/null
+++ b/src/base/causal_softmax.h
@@ -0,0 +1,52 @@
+#ifndef INFINI_OPS_BASE_CAUSAL_SOFTMAX_H_
+#define INFINI_OPS_BASE_CAUSAL_SOFTMAX_H_
+
+#include
+#include
+
+#include "operator.h"
+#include "tensor.h"
+
+namespace infini::ops {
+
+// Device-agnostic base for causal softmax over the last dimension of a
+// 2D (seq_len, total_seq_len) or 3D (batch, seq_len, total_seq_len) tensor.
+class CausalSoftmax : public Operator {
+ public:
+  CausalSoftmax(const Tensor input, Tensor out)
+      : dtype_{input.dtype()},
+        ndim_{out.ndim()},
+        // 2D inputs are treated as a single batch.
+        batch_size_{ndim_ == 2 ?
 1 : out.size(-3)},
+        seq_len_{out.size(-2)},
+        total_seq_len_{out.size(-1)},
+        input_strides_{input.strides()},
+        out_strides_{out.strides()} {
+    assert(input.shape() == out.shape() &&
+           "`CausalSoftmax` requires `input` and `out` same shape");
+    assert(input.dtype() == out.dtype() &&
+           "`CausalSoftmax` requires `input` and `out` same dtype");
+    assert((ndim_ == 2 || ndim_ == 3) &&
+           "`CausalSoftmax` requires 2D or 3D tensor");
+    assert(seq_len_ <= total_seq_len_ &&
+           "`CausalSoftmax` requires shape[-2] <= shape[-1]");
+  }
+
+  // Applies causal softmax from `input` into `out`; implemented per backend.
+  virtual void operator()(const Tensor input, Tensor out) const = 0;
+
+ protected:
+  const DataType dtype_;
+
+  Tensor::Size ndim_{0};
+
+  Tensor::Size batch_size_{0};
+
+  // Rows of the attention score matrix (shape[-2]).
+  Tensor::Size seq_len_{0};
+
+  // Columns of the attention score matrix (shape[-1]).
+  Tensor::Size total_seq_len_{0};
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides out_strides_;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/gemm.h b/src/base/gemm.h
new file mode 100644
index 0000000..0bb3502
--- /dev/null
+++ b/src/base/gemm.h
@@ -0,0 +1,102 @@
+#ifndef INFINI_OPS_BASE_GEMM_H_
+#define INFINI_OPS_BASE_GEMM_H_
+
+#include
+#include
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Device-agnostic base for (batched) GEMM. Captures alpha/beta, transpose
+// flags, problem sizes (m, n, k), leading dimensions and batch strides at
+// construction; alpha and beta default to 1.0, transposes to false.
+class Gemm : public Operator {
+ public:
+  Gemm(const Tensor a, const Tensor b, std::optional alpha,
+       std::optional beta, std::optional trans_a,
+       std::optional trans_b, Tensor c)
+      : alpha_{alpha.value_or(1.0)},
+        beta_{beta.value_or(1.0)},
+        trans_a_{static_cast(trans_a.value_or(false))},
+        trans_b_{static_cast(trans_b.value_or(false))},
+        m_{c.size(-2)},
+        n_{c.size(-1)},
+        k_{trans_a_ ? a.size(-2) : a.size(-1)},
+        a_type_{a.dtype()},
+        b_type_{b.dtype()},
+        c_type_{c.dtype()},
+        a_strides_{a.strides()},
+        b_strides_{b.strides()},
+        c_strides_{c.strides()},
+        // Leading dimension: the larger of the two trailing strides, which
+        // covers both row- and column-major 2D layouts.
+        lda_{std::max(a.stride(-2), a.stride(-1))},
+        ldb_{std::max(b.stride(-2), b.stride(-1))},
+        ldc_{std::max(c.stride(-2), c.stride(-1))},
+        // Tensors with more than 2 dims are treated as batched; a batch
+        // stride of 0 marks the non-batched case.
+        batch_count_{c.strides().size() > 2 ? c.size(-3) : 1},
+        batch_stride_a_{a.strides().size() > 2 ?
 a.stride(-3) : 0},
+        batch_stride_b_{b.strides().size() > 2 ? b.stride(-3) : 0},
+        batch_stride_c_{c.strides().size() > 2 ? c.stride(-3) : 0} {
+    // TODO: Check constraints.
+  }
+
+  // Convenience constructor: all optional parameters left at their defaults.
+  Gemm(const Tensor a, const Tensor b, Tensor c)
+      : Gemm{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, c} {}
+
+  // Full-parameter entry point; implemented per device backend.
+  virtual void operator()(const Tensor a, const Tensor b,
+                          std::optional alpha, std::optional beta,
+                          std::optional trans_a,
+                          std::optional trans_b, Tensor c) const = 0;
+
+  // Convenience overloads that forward to the full-parameter operator().
+  virtual void operator()(const Tensor a, const Tensor b, Tensor c) const {
+    return operator()(a, b, std::nullopt, std::nullopt, std::nullopt,
+                      std::nullopt, c);
+  }
+
+  virtual void operator()(const Tensor a, const Tensor b,
+                          std::optional alpha, std::optional beta,
+                          Tensor c) const {
+    return operator()(a, b, alpha, beta, std::nullopt, std::nullopt, c);
+  }
+
+ protected:
+  float alpha_{1.0};
+
+  float beta_{1.0};
+
+  bool trans_a_{false};
+
+  bool trans_b_{false};
+
+  Tensor::Size m_{0};
+
+  Tensor::Size n_{0};
+
+  Tensor::Size k_{0};
+
+  const DataType a_type_;
+
+  const DataType b_type_;
+
+  const DataType c_type_;
+
+  Tensor::Strides a_strides_;
+
+  Tensor::Strides b_strides_;
+
+  Tensor::Strides c_strides_;
+
+  Tensor::Stride lda_{0};
+
+  Tensor::Stride ldb_{0};
+
+  Tensor::Stride ldc_{0};
+
+  Tensor::Size batch_count_{1};
+
+  Tensor::Stride batch_stride_a_{0};
+
+  Tensor::Stride batch_stride_b_{0};
+
+  Tensor::Stride batch_stride_c_{0};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/rms_norm.h b/src/base/rms_norm.h
new file mode 100644
index 0000000..dc28f0a
--- /dev/null
+++ b/src/base/rms_norm.h
@@ -0,0 +1,61 @@
+#ifndef INFINI_OPS_BASE_RMS_NORM_H_
+#define INFINI_OPS_BASE_RMS_NORM_H_
+
+#include
+#include
+
+#include "operator.h"
+#include "tensor.h"
+
+namespace infini::ops {
+
+// Device-agnostic base for RMS normalization over the last dimension.
+class RmsNorm : public Operator {
+ public:
+  RmsNorm(const Tensor input, const Tensor weight, float eps, Tensor out)
+      : input_shape_{input.shape()},
+        out_shape_{out.shape()},
 input_strides_{input.strides()},
+        out_strides_{out.strides()},
+        eps_{eps},
+        // Normalized dimension length (shape[-1]).
+        dim_{out.size(-1)},
+        ndim_{out.ndim()},
+        // For 2D tensors: (batch, dim); for 3D: (batch, nhead, dim).
+        batch_size_{ndim_ == 2 ? out.size(-2) : out.size(-3)},
+        nhead_{ndim_ == 2 ? 1 : out.size(-2)} {
+    // NOTE(review): weight's dtype is not checked against input/out --
+    // presumably intentional, since mixed-precision weight instantiations
+    // exist in the Cambricon kernel; confirm for other backends.
+    assert(input.dtype() == out.dtype());
+  }
+
+  // Convenience constructor with the conventional epsilon default.
+  RmsNorm(const Tensor input, const Tensor weight, Tensor out)
+      : RmsNorm{input, weight, 1e-6f, out} {}
+
+  // TODO: Type of `eps` should be `std::optional` instead of `float`.
+  virtual void operator()(const Tensor input, const Tensor weight, float eps,
+                          Tensor out) const = 0;
+
+  // Forwards to the full overload using the epsilon captured at construction.
+  virtual void operator()(const Tensor input, const Tensor weight,
+                          Tensor out) const {
+    return operator()(input, weight, eps_, out);
+  }
+
+ protected:
+  Tensor::Shape input_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides out_strides_;
+
+  float eps_{1e-6f};
+
+  Tensor::Size dim_{0};
+
+  Tensor::Size ndim_{0};
+
+  Tensor::Size batch_size_{0};
+
+  Tensor::Size nhead_{1};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/swiglu.h b/src/base/swiglu.h
new file mode 100644
index 0000000..023b14a
--- /dev/null
+++ b/src/base/swiglu.h
@@ -0,0 +1,68 @@
+#ifndef INFINI_OPS_BASE_SWIGLU_H_
+#define INFINI_OPS_BASE_SWIGLU_H_
+
+#include
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Device-agnostic base for SwiGLU: snapshots shapes, strides, dtypes and
+// contiguity of `input`, `gate` and `out` at construction time.
+class Swiglu : public Operator {
+ public:
+  Swiglu(const Tensor input, const Tensor gate, Tensor out)
+      : ndim_{out.ndim()},
+        output_size_{out.numel()},
+        input_type_{input.dtype()},
+        gate_type_{gate.dtype()},
+        out_type_{out.dtype()},
+        input_shape_{input.shape()},
+        gate_shape_{gate.shape()},
+        out_shape_{out.shape()},
+        input_strides_{input.strides()},
+        gate_strides_{gate.strides()},
+        out_strides_{out.strides()},
+        is_input_contiguous_{input.IsContiguous()},
+        is_gate_contiguous_{gate.IsContiguous()},
+        is_out_contiguous_{out.IsContiguous()} {
+    assert(
+        input_type_ == gate_type_ && gate_type_ == out_type_ &&
+        "operator `Swiglu` requires all input and output tensors to have the "
+        "same dtype");
+  }
+
+  // Computes the SwiGLU activation from `input` and `gate` into `out`;
+  // implemented per device backend.
+  virtual void operator()(const Tensor input, const Tensor gate,
+                          Tensor out) const = 0;
+
+ protected:
+  // Rank of the output tensor.
+  Tensor::Size ndim_{0};
+
+  // Total number of output elements.
+  Tensor::Size output_size_{0};
+
+  const DataType input_type_;
+
+  const DataType gate_type_;
+
+  const DataType out_type_;
+
+  Tensor::Shape input_shape_;
+
+  Tensor::Shape gate_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides gate_strides_;
+
+  Tensor::Strides out_strides_;
+
+  // Contiguity flags cached so backends can pick a fast path.
+  bool is_input_contiguous_{false};
+
+  bool is_gate_contiguous_{false};
+
+  bool is_out_contiguous_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cambricon/common.h b/src/cambricon/common.h
new file mode 100644
index 0000000..fc8ede0
--- /dev/null
+++ b/src/cambricon/common.h
@@ -0,0 +1,75 @@
+#ifndef INFINI_OPS_CAMBRICON_COMMON_H_
+#define INFINI_OPS_CAMBRICON_COMMON_H_
+
+#include
+#include
+
+#include "data_type.h"
+#include "device.h"
+
+// Size in bytes of the per-core NRAM scratch arena (240 KiB).
+#define NRAM_MAX_SIZE (1024 * 240)
+
+#ifdef __BANG__
+
+namespace infini::ops::reduce {
+
+// Number of floats per 128-byte reduction lane.
+constexpr int batch_size = 128 / sizeof(float);
+
+// Sums `max_batch` floats from `src` into dst[0]. Uses the sumpool +
+// reduce_sum intrinsics when there are at least 4 full lanes, otherwise
+// falls back to a scalar loop.
+__mlu_func__ void SumInternal(float* dst, float* src, int max_batch) {
+  const int width = max_batch / batch_size;
+
+  if (width >= 4) {
+    __bang_sumpool(dst, src, batch_size, 1, width, 1, width, 1, 1);
+    __bang_reduce_sum(dst, dst, batch_size);
+  } else {
+    float sum = 0.0f;
+    for (int i = 0; i < max_batch; ++i) {
+      sum += src[i];
+    }
+    dst[0] = sum;
+  }
+}
+
+}  // namespace infini::ops::reduce
+
+#endif  // __BANG__
+
+namespace infini::ops::cnnl_utils {
+
+// Maps the project DataType enum onto the CNNL dtype enum; unsupported
+// values map to CNNL_DTYPE_INVALID.
+inline cnnlDataType_t GetDataType(DataType dtype) {
+  switch (dtype) {
+    case DataType::kInt8:
+      return CNNL_DTYPE_INT8;
+    case DataType::kUInt8:
+      return CNNL_DTYPE_UINT8;
+    case DataType::kInt32:
+      return CNNL_DTYPE_INT32;
+    case DataType::kInt64:
+      return CNNL_DTYPE_INT64;
+    case DataType::kFloat16:
+      return CNNL_DTYPE_HALF;
+    case DataType::kFloat32:
+      return CNNL_DTYPE_FLOAT;
+    case DataType::kBFloat16:
+      return CNNL_DTYPE_BFLOAT16;
 case DataType::kFloat64:
+      return CNNL_DTYPE_DOUBLE;
+    default:
+      return CNNL_DTYPE_INVALID;
+  }
+}
+
+}  // namespace infini::ops::cnnl_utils
+
+namespace infini::ops::cnrt_utils {
+
+// Queries the cluster count and cores-per-cluster of `device` via the CNRT
+// device-attribute API; used to size kernel launch dimensions.
+inline void GetLaunchConfig(const Device& device, int* core_per_cluster,
+                            int* cluster_count) {
+  int device_id = device.index();
+  cnrtDeviceGetAttribute(cluster_count, cnrtAttrClusterCount, device_id);
+  cnrtDeviceGetAttribute(core_per_cluster, cnrtAttrMcorePerCluster, device_id);
+}
+
+}  // namespace infini::ops::cnrt_utils
+
+#endif
diff --git a/src/cambricon/device_.h b/src/cambricon/device_.h
new file mode 100644
index 0000000..224a8d8
--- /dev/null
+++ b/src/cambricon/device_.h
@@ -0,0 +1,23 @@
+#ifndef INFINI_OPS_CAMBRICON_DEVICE__H_
+#define INFINI_OPS_CAMBRICON_DEVICE__H_
+
+#include "bang_bf16.h"
+#include "bang_fp16.h"
+#include "data_type.h"
+#include "device.h"
+
+namespace infini::ops {
+
+// Maps project dtypes to the BANG device-side types.
+// NOTE(review): the specialization argument lists appear stripped by the
+// text extraction (`TypeMap {` with no template argument) -- verify against
+// the original file.
+template <>
+struct TypeMap {
+  using type = __half;
+};
+
+template <>
+struct TypeMap {
+  using type = __bang_bfloat16;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cambricon/gemm/cnblas.h b/src/cambricon/gemm/cnblas.h
new file mode 100644
index 0000000..ac95bd5
--- /dev/null
+++ b/src/cambricon/gemm/cnblas.h
@@ -0,0 +1,159 @@
+#ifndef INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_
+#define INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_
+
+#include
+#include
+#include
+
+// clang-format off
+#include
+#include
+// clang-format on
+
+#include "base/gemm.h"
+#include "cambricon/common.h"
+
+namespace infini::ops {
+
+// Cambricon GEMM backed by cnnlBatchMatMulEx. The constructor builds the
+// CNNL handle, tensor descriptors and matmul heuristic once, so operator()
+// only has to bind the queue and launch.
+template <>
+class Operator : public Gemm {
+ public:
+  Operator(const Tensor a, const Tensor b, std::optional alpha,
+           std::optional beta, std::optional trans_a,
+           std::optional trans_b, Tensor c)
+      : Gemm{a, b, alpha, beta, trans_a, trans_b, c},
+        a_rows_{a.size(-2)},
+        a_cols_{a.size(-1)},
+        b_rows_{b.size(-2)},
+        b_cols_{b.size(-1)},
+        c_rows_{c.size(-2)},
+        c_cols_{c.size(-1)} {
+    assert(!trans_a_ && "`trans_a` is not currently supported");
+    assert(!trans_b_ &&
 "`trans_b` is not currently supported");
+
+    // NOTE(review): none of the cnnl*/cnrt* return statuses below are
+    // checked -- consider a CHECK macro, as done in the MLU kernels.
+    cnnlCreate(&cnnl_handle_);
+
+    cnnlCreateTensorDescriptor(&desc_a_);
+    cnnlCreateTensorDescriptor(&desc_b_);
+    cnnlCreateTensorDescriptor(&desc_c_);
+
+    cnnlCreateMatMulDescriptor(&matmul_desc_);
+    cnnlCreateMatMulAlgo(&matmul_algo_);
+    cnnlCreateMatMulHeuristicResult(&heuristic_result_);
+
+    // Enable strided layouts so the descriptors below may carry explicit
+    // strides instead of assuming dense packing.
+    int32_t use_stride = 1;
+    cnnlSetMatMulDescAttr(matmul_desc_, CNNL_MATMUL_USE_STRIDE, &use_stride,
+                          sizeof(int32_t));
+
+    SetupTensorDescriptor(desc_a_, a_strides_, a_type_, a_rows_, a_cols_,
+                          batch_count_, batch_stride_a_);
+    SetupTensorDescriptor(desc_b_, b_strides_, b_type_, b_rows_, b_cols_,
+                          batch_count_, batch_stride_b_);
+    SetupTensorDescriptor(desc_c_, c_strides_, c_type_, c_rows_, c_cols_,
+                          batch_count_, batch_stride_c_);
+    // Query a single heuristic algo choice for this problem shape.
+    int count = 0;
+    cnnlGetBatchMatMulExAlgoHeuristic(cnnl_handle_, matmul_desc_, desc_a_,
+                                      desc_b_, desc_c_, NULL, 1,
+                                      &heuristic_result_, &count);
+
+    // Pre-allocate a fallback workspace sized for the chosen algorithm.
+    // NOTE(review): this class owns raw CNNL/CNRT resources but copy
+    // operations are not deleted -- copying an instance would double-destroy
+    // the handles in ~Operator. Confirm instances are never copied.
+    cnrtMalloc(&default_workspace_, workspace_size_in_bytes());
+  }
+
+  Operator(const Tensor a, const Tensor b, Tensor c)
+      : Operator{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+                 c} {}
+
+  Operator(const Tensor a, const Tensor b, std::optional alpha,
+           std::optional beta, Tensor c)
+      : Operator{a, b, alpha, beta, std::nullopt, std::nullopt, c} {}
+
+  // Releases workspace, descriptors and the CNNL handle in reverse order of
+  // creation.
+  ~Operator() {
+    cnrtFree(default_workspace_);
+    cnnlDestroyTensorDescriptor(desc_c_);
+    cnnlDestroyTensorDescriptor(desc_b_);
+    cnnlDestroyTensorDescriptor(desc_a_);
+    cnnlDestroyMatMulDescriptor(matmul_desc_);
+    cnnlDestroyMatMulAlgo(matmul_algo_);
+    cnnlDestroyMatMulHeuristicResult(heuristic_result_);
+    cnnlDestroy(cnnl_handle_);
+  }
+
+  // Launches the batched matmul. Per-call alpha/beta override the values
+  // captured at construction; the per-call trans_a/trans_b arguments are
+  // ignored (transposes are unsupported, asserted in the constructor).
+  void operator()(const Tensor a, const Tensor b, std::optional alpha,
+                  std::optional beta, std::optional trans_a,
+                  std::optional trans_b, Tensor c) const override {
+    const auto& alpha_value{alpha.value_or(alpha_)};
+    const auto& beta_value{beta.value_or(beta_)};
+
+    cnnlSetQueue(cnnl_handle_, (cnrtQueue_t)stream_);
+
+    // Prefer the externally supplied workspace; fall back to the one
+    // allocated in the constructor.
+    auto workspace{workspace_ ? workspace_ : default_workspace_};
+    auto workspace_size{workspace_size_in_bytes_ ? workspace_size_in_bytes_
+                                                 : workspace_size_in_bytes()};
+
+    cnnlBatchMatMulEx(cnnl_handle_, matmul_desc_, matmul_algo_, &alpha_value,
+                      desc_a_, a.data(), desc_b_, b.data(), &beta_value,
+                      desc_c_, c.data(), workspace, workspace_size);
+  }
+
+  // Workspace requirement reported by the heuristic chosen at construction.
+  std::size_t workspace_size_in_bytes() const override {
+    std::size_t size{0};
+
+    cnnlGetBatchMatMulExHeuristicResult(heuristic_result_, matmul_algo_, &size);
+
+    return size;
+  }
+
+ private:
+  // Fills a CNNL tensor descriptor: 3D (batch, rows, cols) with explicit
+  // batch stride when batched, plain 2D (rows, cols) otherwise.
+  void SetupTensorDescriptor(cnnlTensorDescriptor_t desc,
+                             const Tensor::Strides& strides, DataType dtype,
+                             Tensor::Size rows, Tensor::Size cols,
+                             Tensor::Size batch, Tensor::Stride batch_stride) {
+    cnnlDataType_t cnnl_dtype = cnnl_utils::GetDataType(dtype);
+
+    if (batch > 1) {
+      std::vector dims = {static_cast(batch), static_cast(rows),
+                          static_cast(cols)};
+      std::vector strides_arr = {
+          static_cast(batch_stride),
+          static_cast(strides[strides.size() - 2]),
+          static_cast(strides[strides.size() - 1])};
+      cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype,
+                                dims.size(), dims.data(), strides_arr.data());
+    } else {
+      std::vector dims = {static_cast(rows), static_cast(cols)};
+      std::vector strides_arr = {
+          static_cast(strides[strides.size() - 2]),
+          static_cast(strides[strides.size() - 1])};
+      cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype,
+                                dims.size(), dims.data(), strides_arr.data());
+    }
+  }
+
+  cnnlHandle_t cnnl_handle_;
+
+  cnnlTensorDescriptor_t desc_a_;
+
+  cnnlTensorDescriptor_t desc_b_;
+
+  cnnlTensorDescriptor_t desc_c_;
+
+  cnnlMatMulDescriptor_t matmul_desc_;
+
+  cnnlMatMulAlgo_t matmul_algo_;
+
+  cnnlMatMulHeuristicResult_t heuristic_result_;
+
+  Tensor::Size a_rows_, a_cols_;
+
+  Tensor::Size b_rows_, b_cols_;
+
+  Tensor::Size c_rows_, c_cols_;
+
+  // TODO: Remove the following member after default workspace mechanism has
+  // been introduced globally.
+  void* default_workspace_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cambricon/rms_norm/kernel.mlu b/src/cambricon/rms_norm/kernel.mlu
new file mode 100644
index 0000000..b4d7e8d
--- /dev/null
+++ b/src/cambricon/rms_norm/kernel.mlu
@@ -0,0 +1,332 @@
+#include "rms_norm.h"
+
+// Single statically-sized NRAM arena per core; partitioned at runtime by the
+// kernel below.
+__nram__ char nram_buffer[NRAM_MAX_SIZE];
+
+namespace infini::ops {
+
+// RMS-norm over the last dimension: T is the input/output element type, TW
+// the weight element type (mixed precision is allowed, see the explicit
+// instantiations at the bottom of this file).
+template
+__mlu_global__ void RmsNorm(const T* input, const TW* weight, T* output,
+                            size_t* shape, ptrdiff_t* output_strides,
+                            ptrdiff_t* input_strides, float epsilon,
+                            int num_dims, int norm_dim_size) {
+  // Calculate problem dimensions.
+  int batch_volume = 1;
+  for (int dim_idx = 0; dim_idx < num_dims - 1; ++dim_idx) {
+    batch_volume *= shape[dim_idx];
+  }
+  int vector_size = shape[num_dims - 1];
+
+  // Task distribution across cores: the first `remaining_tasks` cores take
+  // one extra row each.
+  int remaining_tasks = batch_volume % taskDim;
+  int base_tasks_per_core = batch_volume / taskDim;
+  int actual_tasks = base_tasks_per_core + (taskId < remaining_tasks ? 1 : 0);
+  int task_start_idx =
+      (taskId < remaining_tasks
+           ? taskId * (base_tasks_per_core + 1)
+           : remaining_tasks * (base_tasks_per_core + 1) +
+                 (taskId - remaining_tasks) * base_tasks_per_core);
+
+  // Determine optimal batch size based on vector size. The denominator is
+  // the per-element NRAM footprint: input cache + weight cache + two float
+  // staging buffers (see the partitioning below).
+  int max_batch_size;
+  if (vector_size <= 64) {
+    max_batch_size = vector_size;
+  } else {
+    max_batch_size =
+        (NRAM_MAX_SIZE - 256) / (2 * sizeof(T) + sizeof(TW) + sizeof(float));
+    max_batch_size = std::min(max_batch_size, vector_size);
+    max_batch_size = (max_batch_size / 64) * 64;  // Align to 64 elements
+  }
+
+  constexpr int reduce_buffer_size = 128 / sizeof(float);
+
+  // NRAM buffer allocation with dynamic sizing.
+  // Partition the NRAM arena: 128-byte reduction scratch, raw input cache,
+  // raw weight cache, then two float staging buffers.
+  float* reduction_buffer = (float*)nram_buffer;
+  T* input_cache = (T*)(reduction_buffer + reduce_buffer_size);
+  TW* weight_cache = (TW*)(input_cache + max_batch_size);
+  float* float_buffer = (float*)(weight_cache + max_batch_size);
+  float* weight_float_buffer = (float*)(float_buffer + max_batch_size);
+
+  // Process vectors assigned to current core.
+  for (int task_idx = 0; task_idx < actual_tasks; ++task_idx) {
+    int current_index = task_start_idx + task_idx;
+
+    // Calculate memory offsets for current task (unravel the flat row index
+    // into per-dimension coordinates, applying each tensor's strides).
+    int input_offset = 0;
+    int output_offset = 0;
+    int temp_index = current_index;
+
+    for (int dim = 0; dim < num_dims - 1; ++dim) {
+      int dim_coord = temp_index % shape[dim];
+      input_offset += dim_coord * input_strides[dim];
+      output_offset += dim_coord * output_strides[dim];
+      temp_index /= shape[dim];
+    }
+
+    // Compute sum of squares.
+    float sum_squared = 0.0f;
+
+    if (vector_size <= 128) {
+      // NOTE(review): this copy (like the chunked one below) loads elements
+      // contiguously -- it assumes the last dimension has stride 1; confirm
+      // callers guarantee that.
+      __memcpy(input_cache, input + input_offset, vector_size * sizeof(T),
+               GDRAM2NRAM);
+      if constexpr (std::is_same::value) {
+        __bang_half2float(float_buffer, reinterpret_cast(input_cache),
+                          vector_size);
+      } else if constexpr (std::is_same::value) {
+        __bang_bfloat162float(float_buffer, input_cache, vector_size);
+      } else {
+        __memcpy(float_buffer, input_cache, vector_size * sizeof(float),
+                 NRAM2NRAM);
+      }
+
+      __bang_mul(float_buffer, float_buffer, float_buffer, vector_size);
+
+      // Direct accumulation for small vectors.
+      for (int i = 0; i < vector_size; ++i) {
+        sum_squared += float_buffer[i];
+      }
+    } else {
+      // Large vector processing with chunking.
+      __bang_write_value(reduction_buffer, reduce_buffer_size, 0);
+      size_t processed_elements = 0;
+
+      // NOTE(review): size_t vs int comparison here and below -- fine while
+      // vector_size is positive, but worth normalizing the types.
+      while (processed_elements < vector_size) {
+        size_t current_batch =
+            std::min((size_t)max_batch_size, vector_size - processed_elements);
+
+        __memcpy(input_cache,
+                 input + input_offset +
+                     processed_elements * input_strides[num_dims - 1],
+                 current_batch * sizeof(T), GDRAM2NRAM);
+
+        if constexpr (std::is_same::value) {
+          __bang_half2float(float_buffer, reinterpret_cast(input_cache),
+                            current_batch);
+        } else if constexpr (std::is_same::value) {
+          __bang_bfloat162float(float_buffer, input_cache, current_batch);
+        } else {
+          __memcpy(float_buffer, input_cache, current_batch * sizeof(float),
+                   NRAM2NRAM);
+        }
+
+        __bang_mul(float_buffer, float_buffer, float_buffer, current_batch);
+
+        // Vectorized reduction needs at least one full 128-element tile;
+        // smaller tails are summed scalar-wise.
+        float batch_sum = 0.0f;
+        if (current_batch >= 128) {
+          infini::ops::reduce::SumInternal(reduction_buffer, float_buffer,
+                                           current_batch);
+          batch_sum = reduction_buffer[0];
+        } else {
+          for (size_t i = 0; i < current_batch; ++i) {
+            batch_sum += float_buffer[i];
+          }
+        }
+
+        sum_squared += batch_sum;
+        processed_elements += current_batch;
+      }
+    }
+
+    // Compute normalization factor.
+    float rms_value = sqrtf(sum_squared / vector_size + epsilon);
+    float inv_rms = 1.0f / rms_value;
+
+    // Process vector for normalization.
+    if (vector_size <= max_batch_size) {
+      __memcpy(input_cache, input + input_offset, vector_size * sizeof(T),
+               GDRAM2NRAM);
+      __memcpy(weight_cache, weight, vector_size * sizeof(TW), GDRAM2NRAM);
+
+      if constexpr (std::is_same::value) {
+        __bang_half2float(float_buffer, reinterpret_cast(input_cache),
+                          vector_size);
+      } else if constexpr (std::is_same::value) {
+        __bang_bfloat162float(float_buffer, input_cache, vector_size);
+      } else {
+        __memcpy(float_buffer, input_cache, vector_size * sizeof(float),
+                 NRAM2NRAM);
+      }
+
+      if constexpr (std::is_same::value) {
+        __bang_half2float(weight_float_buffer,
+                          reinterpret_cast(weight_cache), vector_size);
+      } else if constexpr (std::is_same::value) {
+        __bang_bfloat162float(weight_float_buffer, weight_cache, vector_size);
+      } else {
+        __memcpy(weight_float_buffer, weight_cache, vector_size * sizeof(float),
+                 NRAM2NRAM);
+      }
+
+      // Multiply by weight and apply normalization.
+      __bang_mul(float_buffer, float_buffer, weight_float_buffer, vector_size);
+      __bang_mul_scalar(float_buffer, float_buffer, inv_rms, vector_size);
+
+      // Convert back to the output element type in place.
+      if constexpr (std::is_same::value) {
+        __bang_float2half(reinterpret_cast(input_cache), float_buffer,
+                          vector_size);
+      } else if constexpr (std::is_same::value) {
+        __bang_float2bfloat16(input_cache, float_buffer, vector_size);
+      } else {
+        __memcpy(input_cache, float_buffer, vector_size * sizeof(float),
+                 NRAM2NRAM);
+      }
+
+      __memcpy(output + output_offset, input_cache, vector_size * sizeof(T),
+               NRAM2GDRAM);
+    } else {
+      // Large vector processing with chunking.
+      size_t processed_elements = 0;
+      while (processed_elements < vector_size) {
+        size_t current_batch =
+            std::min((size_t)max_batch_size, vector_size - processed_elements);
+
+        // Load input and weight data.
+        __memcpy(input_cache,
+                 input + input_offset +
+                     processed_elements * input_strides[num_dims - 1],
+                 current_batch * sizeof(T), GDRAM2NRAM);
+        __memcpy(weight_cache, weight + processed_elements,
+                 current_batch * sizeof(TW), GDRAM2NRAM);
+
+        if constexpr (std::is_same::value) {
+          __bang_half2float(float_buffer, reinterpret_cast(input_cache),
+                            current_batch);
+        } else if constexpr (std::is_same::value) {
+          __bang_bfloat162float(float_buffer, input_cache, current_batch);
+        } else {
+          __memcpy(float_buffer, input_cache, current_batch * sizeof(float),
+                   NRAM2NRAM);
+        }
+
+        if constexpr (std::is_same::value) {
+          __bang_half2float(weight_float_buffer,
+                            reinterpret_cast(weight_cache),
+                            current_batch);
+        } else if constexpr (std::is_same::value) {
+          __bang_bfloat162float(weight_float_buffer, weight_cache,
+                                current_batch);
+        } else {
+          __memcpy(weight_float_buffer, weight_cache,
+                   current_batch * sizeof(float), NRAM2NRAM);
+        }
+
+        __bang_mul(float_buffer, float_buffer, weight_float_buffer,
+                   current_batch);
+        __bang_mul_scalar(float_buffer, float_buffer, inv_rms, current_batch);
+
+        if constexpr (std::is_same::value) {
+          __bang_float2half(reinterpret_cast(input_cache), float_buffer,
+                            current_batch);
+        } else if constexpr (std::is_same::value) {
+          __bang_float2bfloat16(input_cache, float_buffer, current_batch);
+        } else {
+          __memcpy(input_cache, float_buffer, current_batch * sizeof(float),
+                   NRAM2NRAM);
+        }
+
+        __memcpy(output + output_offset +
+                     processed_elements * output_strides[num_dims - 1],
+                 input_cache, current_batch * sizeof(T), NRAM2GDRAM);
+
+        processed_elements += current_batch;
+      }
+    }
+  }
+}
+
+// Host-side launcher: stages shape/stride metadata into `workspace` on the
+// device, launches the kernel and synchronizes the queue.
+template
+void RmsNormUnion(void* workspace, int core_per_cluster, int cluster_count,
+                  cnrtQueue_t queue, void* y, const void* x, const void* w,
+                  const size_t* shape, const ptrdiff_t* y_strides,
+                  const ptrdiff_t* x_strides, float eps, int ndim) {
+  cnrtDim3_t kernel_dim;
+  cnrtFunctionType_t kernel_type;
+
+  // Configure kernel dimensions.
+  kernel_dim.x = core_per_cluster;
+  kernel_dim.y = cluster_count;
+  kernel_dim.z = 1;
+  kernel_type = cnrtFuncTypeUnion1;  // Can choose others, but must adapt
+                                     // kernel_type accordingly
+  int dimsize = shape[ndim - 1];  // Length of operation dimension
+  int dim_s;  // dim_s is the next power of 2 greater than dimsize
+  // NOTE(review): this power-of-two computation relies on exact float log2 --
+  // an integer bit trick would be safer; also dim_s is passed to the kernel
+  // as `norm_dim_size`, which the kernel body does not appear to use.
+  // Confirm both against the original.
+  float mi = log2(dimsize);
+  if (floor(mi) == mi) {
+    dim_s = dimsize;
+  } else {
+    dim_s = pow(2, floor(mi) + 1);
+  }
+  constexpr int reduce_num =
+      128 / sizeof(float);  // Cambricon __bang_reduce_sum can only reduce 128
+                            // bytes at a time
+  if (dim_s < reduce_num) {
+    dim_s = reduce_num;  // Force dim_s >= reduce_num
+  }
+
+  // Prepare device pointers. Workspace layout: ndim size_t shape entries,
+  // then ndim input strides, then ndim output strides.
+  auto y_ = reinterpret_cast(y);
+  auto x_ = reinterpret_cast(x);
+  auto w_ = reinterpret_cast(w);
+  char* tmp_device = reinterpret_cast(workspace);
+  char* tmp_stride = tmp_device + ndim * sizeof(size_t);
+  size_t* mlu_shape = (size_t*)tmp_device;
+  ptrdiff_t* mlu_x_strides = (ptrdiff_t*)tmp_stride;
+  ptrdiff_t* mlu_y_strides = mlu_x_strides + ndim;
+
+  // Copy shape and stride information to device. The async copies and the
+  // kernel launch share `queue`, so they are ordered; the sync below makes
+  // the whole call blocking.
+  CNRT_CHECK(cnrtMemcpyAsync(mlu_shape, const_cast(shape),
+                             ndim * sizeof(size_t), queue,
+                             cnrtMemcpyHostToDev));  // const not supported
+  CNRT_CHECK(cnrtMemcpyAsync(mlu_x_strides, const_cast(x_strides),
+                             ndim * sizeof(ptrdiff_t), queue,
+                             cnrtMemcpyHostToDev));
+  CNRT_CHECK(cnrtMemcpyAsync(mlu_y_strides, const_cast(y_strides),
+                             ndim * sizeof(ptrdiff_t), queue,
+                             cnrtMemcpyHostToDev));
+
+  RmsNorm<<>>(
+      x_, w_, y_, mlu_shape, mlu_y_strides, mlu_x_strides, eps, ndim, dim_s);
+
+  cnrtQueueSync(queue);
+}
+
+// Explicit instantiations for every supported (element, weight) dtype pair.
+template void RmsNormUnion<__half, __half>(void*, int, int, cnrtQueue_t, void*,
+                                           const void*, const void*,
+                                           const size_t*, const ptrdiff_t*,
+                                           const ptrdiff_t*, float, int);
+
+template void RmsNormUnion<__half, __bang_bfloat16>(
+    void*, int, int, cnrtQueue_t, void*, const void*, const void*,
+    const size_t*, const ptrdiff_t*, const ptrdiff_t*, float, int);
+
+template void RmsNormUnion<__half, float>(void*, int, int, cnrtQueue_t, void*,
+                                          const void*, const void*,
+                                          const size_t*, const ptrdiff_t*,
+                                          const ptrdiff_t*, float, int);
+
+template void RmsNormUnion<__bang_bfloat16, __half>(
+    void*, int, int, cnrtQueue_t, void*, const void*, const void*,
+    const size_t*, const ptrdiff_t*, const ptrdiff_t*, float, int);
+
+template void RmsNormUnion<__bang_bfloat16, __bang_bfloat16>(
+    void*, int, int, cnrtQueue_t, void*, const void*, const void*,
+    const size_t*, const ptrdiff_t*, const ptrdiff_t*, float, int);
+
+template void RmsNormUnion<__bang_bfloat16, float>(
+    void*, int, int, cnrtQueue_t, void*, const void*, const void*,
+    const size_t*, const ptrdiff_t*, const ptrdiff_t*, float, int);
+
+template void RmsNormUnion(void*, int, int, cnrtQueue_t, void*,
+                           const void*, const void*,
+                           const size_t*, const ptrdiff_t*,
+                           const ptrdiff_t*, float, int);
+
+template void RmsNormUnion(
+    void*, int, int, cnrtQueue_t, void*, const void*, const void*,
+    const size_t*, const ptrdiff_t*, const ptrdiff_t*, float, int);
+
+template void RmsNormUnion(void*, int, int,
cnrtQueue_t, void*, + const void*, const void*, + const size_t*, const ptrdiff_t*, + const ptrdiff_t*, float, int); + +} // namespace infini::ops diff --git a/src/cambricon/rms_norm/rms_norm.h b/src/cambricon/rms_norm/rms_norm.h new file mode 100644 index 0000000..852fe66 --- /dev/null +++ b/src/cambricon/rms_norm/rms_norm.h @@ -0,0 +1,66 @@ +#ifndef INFINI_OPS_CAMBRICON_RMS_NORM_H_ +#define INFINI_OPS_CAMBRICON_RMS_NORM_H_ + +#include +#include +#include + +#include "cambricon/common.h" +#include "cambricon/device_.h" +#include "base/rms_norm.h" + +namespace infini::ops { + +// TODO: Remove forward declaration. +template +void RmsNormUnion(void* workspace, int core_per_cluster, int cluster_count, + cnrtQueue_t queue, void* y, const void* x, const void* w, + const size_t* shape, const ptrdiff_t* y_strides, + const ptrdiff_t* x_strides, float eps, int ndim); + +template <> +class Operator : public RmsNorm { + public: + Operator(const Tensor input, const Tensor weight, float eps, Tensor out) + : RmsNorm{input, weight, eps, out} { + cnrt_utils::GetLaunchConfig(input.device(), &core_per_cluster, + &cluster_count); + cnrtMalloc(&default_workspace_, workspace_size_in_bytes()); + } + + void operator()(const Tensor input, const Tensor weight, float eps, + Tensor out) const override { + auto queue = static_cast(stream_ ? stream_ : 0); + auto workspace{workspace_ ? 
workspace_ : default_workspace_}; + + DispatchFunc< + Device::Type::kCambricon, + List, + List>( + {input.dtype(), weight.dtype()}, + [&](auto input_tag, auto weight_tag) { + using InputT = typename decltype(input_tag)::type; + using WeightT = typename decltype(weight_tag)::type; + + RmsNormUnion( + workspace, core_per_cluster, cluster_count, queue, out.data(), + input.data(), weight.data(), out_shape_.data(), + out_strides_.data(), input_strides_.data(), eps, ndim_); + }, + "CambriconRmsNorm::operator() - output dispatch"); + } + + ~Operator() { cnrtFree(default_workspace_); } + + std::size_t workspace_size_in_bytes() const override { + return ndim_ * (sizeof(size_t) + 2 * sizeof(ptrdiff_t)); + } + + void* default_workspace_{nullptr}; + int core_per_cluster = 0; + int cluster_count = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/caster.h b/src/caster.h new file mode 100644 index 0000000..cefd116 --- /dev/null +++ b/src/caster.h @@ -0,0 +1,14 @@ +#ifndef INFINI_OPS_CASTER_H_ +#define INFINI_OPS_CASTER_H_ + +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +template +struct Caster; + +} // namespace infini::ops + +#endif diff --git a/src/common/constexpr_map.h b/src/common/constexpr_map.h new file mode 100644 index 0000000..7454f54 --- /dev/null +++ b/src/common/constexpr_map.h @@ -0,0 +1,32 @@ +#ifndef INFINI_OPS_COMMON_CONSTEXPR_MAP_H_ +#define INFINI_OPS_COMMON_CONSTEXPR_MAP_H_ + +#include +#include +#include +#include + +namespace infini::ops { + +template +struct ConstexprMap { + constexpr ConstexprMap(std::array, size> data) + : data_(data) {} + + constexpr Value at(Key key) const { + for (const auto& pr : data_) { + if (pr.first == key) return pr.second; + } + // TODO(lzm): change to logging. + assert("the key is not found in the `ConstexprMap`"); + // Unreachable, provided to satisfy the compiler's requirement. 
+ std::abort(); + } + + private: + std::array, size> data_; +}; + +} // namespace infini::ops + +#endif diff --git a/src/common/generic_utils.h b/src/common/generic_utils.h new file mode 100644 index 0000000..795f2fb --- /dev/null +++ b/src/common/generic_utils.h @@ -0,0 +1,26 @@ +#ifndef INFINI_OPS_COMMON_GENERIC_UTILS_H_ +#define INFINI_OPS_COMMON_GENERIC_UTILS_H_ + +#include + +namespace infini::ops::utils { + +std::size_t IndexToOffset(std::size_t flat_index, std::size_t ndim, + const std::size_t* shape, + const std::ptrdiff_t* strides) { + std::size_t res = 0; + for (std::size_t i = ndim; i-- > 0;) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +template +constexpr auto CeilDiv(const X& x, const Y& y) { + return (x + y - 1) / y; +} + +} // namespace infini::ops::utils + +#endif diff --git a/src/common/traits.h b/src/common/traits.h new file mode 100644 index 0000000..c746f4c --- /dev/null +++ b/src/common/traits.h @@ -0,0 +1,170 @@ +#ifndef INFINI_OPS_COMMON_TRAITS_H_ +#define INFINI_OPS_COMMON_TRAITS_H_ + +#include +#include + +namespace infini::ops { + +// --------------------- List and TypePack --------------------- +// A generic container for a sequence of compile-time values. +template +struct List {}; + +// `ListGet(List{})` extracts the `i`th value from a `List` +// tag. +template +constexpr auto ListGetImpl(List) { + if constexpr (index == 0) + return head; + else + return ListGetImpl(List{}); +} + +template +constexpr auto ListGet(List list) { + return ListGetImpl(list); +} + +template +struct TypePack {}; + +// ----------------------------------------------------------------------------- +// Tags +// ----------------------------------------------------------------------------- +// Tags are passed as regular function arguments to user functors instead of +// template parameters. This lets users write plain C++17 `[](auto tag)` lambdas +// rather than C++20 template lambdas (`[]()`). 
+ +// `TypeTag`: carries a C++ type. Recover with `typename +// decltype(tag)::type`. +template +struct TypeTag { + using type = T; +}; + +// `ValueTag`: carries a compile-time value. Recover with +// `decltype(tag)::value`. +template +struct ValueTag { + using value_type = decltype(v); + static constexpr auto value = v; +}; + +// ----------------------------------------------------------------------------- +// List Queries +// ----------------------------------------------------------------------------- + +// Check at compile-time if a value exists within a construct (e.g., `List<>`). +// Example: `static_assert(ContainsValue)`; +template +struct Contains; + +template +struct Contains, value> + : std::disjunction...> {}; + +template +inline constexpr bool ContainsValue = Contains::value; + +// Check at compile-time if a type `T` is present in a variadic list of types +// `Ts`. +// Example: `static_assert(IsTypeInList)`; +template +inline constexpr bool IsTypeInList = (std::is_same_v || ...); + +// Trait to detect whether `T` is a `List<...>` specialization. +template +struct IsListType : std::false_type {}; + +template +struct IsListType> : std::true_type {}; + +// ----------------------------------------------------------------------------- +// List Operations +// ----------------------------------------------------------------------------- + +// Concatenates two List types into a single `List`. +// Example: `ConcatType, List<3, 4>>` is `List<1, 2, 3, 4>`. 
+template +struct Concat; + +template +struct Concat, List> { + using type = List; +}; + +template +using ConcatType = typename Concat::type; + +template +struct Flatten; + +template +struct Flatten> { + using type = List; +}; + +template +struct Flatten { + using type = typename Flatten, Rest...>::type; +}; + +// ----------------------------------------------------------------------------- +// Invocability Detection (SFINAE) +// ----------------------------------------------------------------------------- + +// Checks if a `Functor` can be called with a `ValueTag` and `Args...`. +template +struct IsInvocable : std::false_type {}; + +template +struct IsInvocable()( + ValueTag{}, std::declval()...))>, + Args...> : std::true_type {}; + +template +inline constexpr bool IsInvocableValue = + IsInvocable::value; + +// ----------------------------------------------------------------------------- +// Filtering Logic +// ----------------------------------------------------------------------------- + +// Recursive template to filter values based on `Functor` support at +// compile-time. +template +struct Filter; + +// Base case: All values processed. +template +struct Filter, List> { + using type = List; +}; + +// Recursive step: Test the `head` value and accumulate if supported. +template +struct Filter, List, head, tail...> { + using type = typename std::conditional_t< + IsInvocableValue && + !ContainsValue, head>, + Filter, List, tail...>, + Filter, List, tail...>>::type; +}; + +// Interface to filter a `List` type directly. 
+template +struct FilterList; + +template +struct FilterList, List> { + using type = + typename Filter, List<>, items...>::type; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/add/add.h b/src/cpu/add/add.h new file mode 100644 index 0000000..c56d31f --- /dev/null +++ b/src/cpu/add/add.h @@ -0,0 +1,65 @@ +#ifndef INFINI_OPS_CPU_ADD_ADD_H_ +#define INFINI_OPS_CPU_ADD_ADD_H_ + +#include + +#include "base/add.h" +#include "common/generic_utils.h" +#include "cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Add, + Caster { + public: + Operator(const Tensor input, const Tensor other, Tensor out) + : Add{input, other, out} { + // TODO: Check constraints. + } + + void operator()(const Tensor input, const Tensor other, + Tensor out) const override { + DispatchFunc( + out_type_, + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(input, other, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor input, const Tensor other, Tensor out) const { + using ComputeType = std::conditional_t || + IsFP16, + float, T>; + + const auto* input_ptr = static_cast(input.data()); + const auto* other_ptr = static_cast(other.data()); + auto* out_ptr = static_cast(out.data()); + + auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape, + const auto* strides) { + return is_contig ? 
i : utils::IndexToOffset(i, ndim_, shape, strides); + }; + +#pragma omp parallel for + for (Tensor::Size i = 0; i < output_size_; ++i) { + auto input_idx = get_idx(i, is_input_contiguous_, input_shape_.data(), + input_strides_.data()); + auto other_idx = get_idx(i, is_other_contiguous_, other_shape_.data(), + other_strides_.data()); + auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(), + out_strides_.data()); + + out_ptr[out_idx] = Cast(Cast(input_ptr[input_idx]) + + Cast(other_ptr[other_idx])); + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/caster_.h b/src/cpu/caster_.h new file mode 100644 index 0000000..4d2cca6 --- /dev/null +++ b/src/cpu/caster_.h @@ -0,0 +1,74 @@ +#ifndef INFINI_OPS_COMMON_CPU_CASTER_H_ +#define INFINI_OPS_COMMON_CPU_CASTER_H_ + +#include + +#include "caster.h" +#include "cpu/device_.h" + +namespace infini::ops { + +template <> +struct Caster { + template + static Dst Cast(Src&& x) { + static_assert(!std::is_reference_v, + "`Cast` cannot return reference types"); + + using PureDst = std::remove_cv_t>; + using PureSrc = std::remove_cv_t>; + + if constexpr (std::is_same_v) { + return std::forward(x); + } + + constexpr bool src_is_custom = IsBFloat16 || + IsFP16; + constexpr bool dst_is_custom = IsBFloat16 || + IsFP16; + + if constexpr (!src_is_custom && !dst_is_custom) { + return static_cast(std::forward(x)); + } else { + return FromFloatHelper(ToFloatHelper(std::forward(x))); + } + } + + private: + template + struct HasToFloat : std::false_type {}; + + template + struct HasToFloat().ToFloat())>> + : std::true_type {}; + + template + struct HasFromFloat : std::false_type {}; + + template + struct HasFromFloat< + T, std::void_t()))>> + : std::true_type {}; + + template + static constexpr float ToFloatHelper(T&& x) { + if constexpr (HasToFloat::value) { + return std::forward(x).ToFloat(); + } else { + return static_cast(x); + } + } + + template + static constexpr PureDst FromFloatHelper(float f) { + if 
constexpr (HasFromFloat::value) { + return PureDst::FromFloat(f); + } else { + return static_cast(f); + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/causal_softmax/causal_softmax.h b/src/cpu/causal_softmax/causal_softmax.h new file mode 100644 index 0000000..14848ee --- /dev/null +++ b/src/cpu/causal_softmax/causal_softmax.h @@ -0,0 +1,83 @@ +#ifndef INFINI_OPS_CPU_CAUSAL_SOFTMAX_H_ +#define INFINI_OPS_CPU_CAUSAL_SOFTMAX_H_ + +#include + +#include "base/causal_softmax.h" +#include "common/generic_utils.h" +#include "cpu/caster_.h" +#include "data_type.h" +#include "tensor.h" + +namespace infini::ops { + +template <> +class Operator : public CausalSoftmax, + Caster { + public: + Operator(const Tensor input, Tensor out) : CausalSoftmax{input, out} {} + + void operator()(const Tensor input, Tensor out) const override { + DispatchFunc( + out.dtype(), + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(input, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor input, Tensor out) const { + auto* out_ptr = static_cast(out.data()); + const auto* input_ptr = static_cast(input.data()); + + auto out_stride_b = ndim_ == 3 ? out_strides_[0] : 0; + auto out_stride_i = out_strides_[ndim_ - 2]; + auto out_stride_j = out_strides_[ndim_ - 1]; + auto input_stride_b = ndim_ == 3 ? 
input_strides_[0] : 0; + auto input_stride_i = input_strides_[ndim_ - 2]; + auto input_stride_j = input_strides_[ndim_ - 1]; + + for (Tensor::Size bi = 0; bi < batch_size_; ++bi) { + for (Tensor::Size i = 0; i < seq_len_; ++i) { + ptrdiff_t out_offset = bi * out_stride_b + i * out_stride_i; + ptrdiff_t input_offset = bi * input_stride_b + i * input_stride_i; + T* out_row = out_ptr + out_offset; + const T* input_row = input_ptr + input_offset; + + Tensor::Size valid_len = total_seq_len_ - seq_len_ + i + 1; + + for (Tensor::Size j = valid_len; j < total_seq_len_; ++j) { + out_row[j * out_stride_j] = Cast(0.0f); + } + + float max_val = Cast(input_row[0]); + for (Tensor::Size j = 1; j < valid_len; ++j) { + float v = Cast(input_row[j * input_stride_j]); + if (v > max_val) { + max_val = v; + } + } + + float sum = 0.0f; + for (Tensor::Size j = 0; j < valid_len; ++j) { + float v = + std::exp(Cast(input_row[j * input_stride_j]) - max_val); + out_row[j * out_stride_j] = Cast(v); + sum += v; + } + + for (Tensor::Size j = 0; j < valid_len; ++j) { + out_row[j * out_stride_j] = + Cast(Cast(out_row[j * out_stride_j]) / sum); + } + } + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/device_.h b/src/cpu/device_.h new file mode 100644 index 0000000..0d74232 --- /dev/null +++ b/src/cpu/device_.h @@ -0,0 +1,21 @@ +#ifndef INFINI_OPS_CPU_DEVICE__H_ +#define INFINI_OPS_CPU_DEVICE__H_ + +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +template <> +struct TypeMap { + using type = Float16; +}; + +template <> +struct TypeMap { + using type = BFloat16; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/gemm/gemm.h b/src/cpu/gemm/gemm.h new file mode 100644 index 0000000..a4dfb98 --- /dev/null +++ b/src/cpu/gemm/gemm.h @@ -0,0 +1,98 @@ +#ifndef INFINI_OPS_CPU_GEMM_H_ +#define INFINI_OPS_CPU_GEMM_H_ + +#include + +#include "base/gemm.h" +#include "common/generic_utils.h" +#include "cpu/caster_.h" + +namespace infini::ops { + 
+template <> +class Operator : public Gemm, + Caster { + public: + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) + : Gemm{a, b, alpha, beta, trans_a, trans_b, c} { + // TODO: Check constraints. + } + + Operator(const Tensor a, const Tensor b, Tensor c) + : Operator{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + c} {} + + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, Tensor c) + : Operator{a, b, alpha, beta, std::nullopt, std::nullopt, c} {} + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + DispatchFunc( + c.dtype(), + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(a, b, alpha, beta, trans_a, trans_b, c); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const { + const auto* A = static_cast(a.data()); + const auto* B = static_cast(b.data()); + auto* C = static_cast(c.data()); + + const auto& alpha_value{alpha.value_or(alpha_)}; + const auto& beta_value{beta.value_or(beta_)}; + const auto& trans_a_value{trans_a.value_or(trans_a_)}; + const auto& trans_b_value{trans_b.value_or(trans_b_)}; + + Tensor::Stride stride_a_m = trans_a_value + ? a_strides_[a_strides_.size() - 1] + : a_strides_[a_strides_.size() - 2]; + Tensor::Stride stride_a_k = trans_a_value + ? a_strides_[a_strides_.size() - 2] + : a_strides_[a_strides_.size() - 1]; + Tensor::Stride stride_b_k = trans_b_value + ? b_strides_[b_strides_.size() - 1] + : b_strides_[b_strides_.size() - 2]; + Tensor::Stride stride_b_n = trans_b_value + ? 
b_strides_[b_strides_.size() - 2] + : b_strides_[b_strides_.size() - 1]; + Tensor::Stride stride_c_m = c_strides_[c_strides_.size() - 2]; + Tensor::Stride stride_c_n = c_strides_[c_strides_.size() - 1]; + + for (Tensor::Size b = 0; b < batch_count_; ++b) { + const auto* A_batch = A + b * batch_stride_a_; + const auto* B_batch = B + b * batch_stride_b_; + auto* C_batch = C + b * batch_stride_c_; + + for (Tensor::Size i = 0; i < m_; ++i) { + for (Tensor::Size j = 0; j < n_; ++j) { + float sum = 0.0f; + + for (Tensor::Size l = 0; l < k_; ++l) { + float a_val = Cast(A_batch[i * stride_a_m + l * stride_a_k]); + float b_val = Cast(B_batch[l * stride_b_k + j * stride_b_n]); + sum += a_val * b_val; + } + + Tensor::Size idx = i * stride_c_m + j * stride_c_n; + float c_val = beta_value == 0.0f ? 0.0f : Cast(C_batch[idx]); + C_batch[idx] = Cast(alpha_value * sum + beta_value * c_val); + } + } + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/rms_norm/rms_norm.h b/src/cpu/rms_norm/rms_norm.h new file mode 100644 index 0000000..9cae419 --- /dev/null +++ b/src/cpu/rms_norm/rms_norm.h @@ -0,0 +1,70 @@ +#ifndef INFINI_OPS_CPU_RMS_NORM_H_ +#define INFINI_OPS_CPU_RMS_NORM_H_ + +#include + +#include "base/rms_norm.h" +#include "common/generic_utils.h" +#include "cpu/caster_.h" +#include "data_type.h" +#include "tensor.h" + +namespace infini::ops { + +template <> +class Operator : public RmsNorm, + Caster { + public: + using RmsNorm::RmsNorm; + + void operator()(const Tensor input, const Tensor weight, float eps, + Tensor out) const override { + DispatchFunc( + out.dtype(), + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(input, weight, eps, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor input, const Tensor weight, float eps, + Tensor out) const { + auto* out_ptr = static_cast(out.data()); + const auto* input_ptr = static_cast(input.data()); + const auto* weight_ptr = 
static_cast(weight.data()); + + auto stride_input_batch = input_strides_.size() > 1 ? input_strides_[0] : 0; + auto stride_input_nhead = + input_strides_.size() > 1 ? input_strides_[1] : input_strides_[0]; + auto stride_out_batch = out_strides_.size() > 1 ? out_strides_[0] : 0; + auto stride_out_nhead = + out_strides_.size() > 1 ? out_strides_[1] : out_strides_[0]; + + for (Tensor::Size bi = 0; bi < batch_size_; ++bi) { + for (Tensor::Size hi = 0; hi < nhead_; ++hi) { + const T* input_row = + input_ptr + bi * stride_input_batch + hi * stride_input_nhead; + T* out_row = out_ptr + bi * stride_out_batch + hi * stride_out_nhead; + + float ss = 0; + for (Tensor::Size k = 0; k < dim_; ++k) { + float v = Cast(input_row[k]); + ss += v * v; + } + float rms = 1.f / std::sqrt(ss / static_cast(dim_) + eps); + + for (Tensor::Size k = 0; k < dim_; ++k) { + out_row[k] = Cast(Cast(input_row[k]) * + Cast(weight_ptr[k]) * rms); + } + } + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/swiglu/swiglu.h b/src/cpu/swiglu/swiglu.h new file mode 100644 index 0000000..57dccf1 --- /dev/null +++ b/src/cpu/swiglu/swiglu.h @@ -0,0 +1,65 @@ +#ifndef INFINI_OPS_CPU_SWIGLU_SWIGLU_H_ +#define INFINI_OPS_CPU_SWIGLU_SWIGLU_H_ + +#include + +#include "base/swiglu.h" +#include "common/generic_utils.h" +#include "cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Swiglu, + Caster { + public: + using Swiglu::Swiglu; + + void operator()(const Tensor input, const Tensor gate, + Tensor out) const override { + DispatchFunc( + out_type_, + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(input, gate, out); + }, + "Operator::operator()"); + } + + private: + template + void Compute(const Tensor input, const Tensor gate, Tensor out) const { + using ComputeType = std::conditional_t || + IsFP16, + float, T>; + + const auto* input_ptr = static_cast(input.data()); + const auto* gate_ptr = static_cast(gate.data()); + auto* out_ptr = 
static_cast(out.data()); + + auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape, + const auto* strides) { + return is_contig ? i : utils::IndexToOffset(i, ndim_, shape, strides); + }; + +#pragma omp parallel for + for (Tensor::Size i = 0; i < output_size_; ++i) { + auto input_idx = get_idx(i, is_input_contiguous_, input_shape_.data(), + input_strides_.data()); + auto gate_idx = get_idx(i, is_gate_contiguous_, gate_shape_.data(), + gate_strides_.data()); + auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(), + out_strides_.data()); + const ComputeType gate_val = Cast(gate_ptr[gate_idx]); + const ComputeType sigmoid_gate = static_cast( + 1.0 / (1.0 + std::exp(-static_cast(gate_val)))); + const ComputeType swish_gate = gate_val * sigmoid_gate; + out_ptr[out_idx] = + Cast(Cast(input_ptr[input_idx]) * swish_gate); + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/add/kernel.cuh b/src/cuda/add/kernel.cuh new file mode 100644 index 0000000..4928d6b --- /dev/null +++ b/src/cuda/add/kernel.cuh @@ -0,0 +1,54 @@ +#ifndef INFINI_OPS_CUDA_ADD_KERNEL_CUH_ +#define INFINI_OPS_CUDA_ADD_KERNEL_CUH_ + +#include "cuda/kernel_commons.h" + +namespace infini::ops { + +template +struct AddOp { + static constexpr std::size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T& input, + const T& other) const { + if constexpr (IsFP16 || IsBFloat16) { + return __hadd(input, other); + } else if constexpr (std::is_same_v) { + return __fadd_rn(input, other); + } else { + return input + other; + } + } +}; + +template +__global__ void AddKernel(T* __restrict__ out, const T* __restrict__ input, + const T* __restrict__ other, + const size_t* __restrict__ out_shape, + const size_t* __restrict__ input_shape, + const size_t* __restrict__ other_shape, + const ptrdiff_t* __restrict__ out_strides, + const ptrdiff_t* __restrict__ input_strides, + const ptrdiff_t* __restrict__ other_strides, + size_t output_size, size_t ndim, 
bool out_contiguous, + bool input_contiguous, bool other_contiguous) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < output_size) { + size_t out_idx = + out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides); + size_t input_idx = + input_contiguous ? idx + : IndexToOffset(idx, ndim, input_shape, input_strides); + size_t other_idx = + other_contiguous ? idx + : IndexToOffset(idx, ndim, other_shape, other_strides); + + out[out_idx] = AddOp{}(input[input_idx], other[other_idx]); + } +} + +} // namespace infini::ops + +#endif diff --git a/src/cuda/add/kernel.h b/src/cuda/add/kernel.h new file mode 100644 index 0000000..2e0ddb9 --- /dev/null +++ b/src/cuda/add/kernel.h @@ -0,0 +1,96 @@ +#ifndef INFINI_OPS_CUDA_ADD_KERNEL_H_ +#define INFINI_OPS_CUDA_ADD_KERNEL_H_ + +#include + +#include "base/add.h" +#include "common/generic_utils.h" +#include "cuda/add/kernel.cuh" +#include "cuda/kernel_commons.h" + +namespace infini::ops { + +template +class CudaAdd : public Add { + public: + CudaAdd(const Tensor input, const Tensor other, Tensor out) + : Add{input, other, out} { + size_t shape_size = ndim_ * sizeof(*d_input_shape_); + size_t strides_size = ndim_ * sizeof(*d_input_strides_); + + Backend::malloc((void**)&d_input_shape_, shape_size); + Backend::malloc((void**)&d_other_shape_, shape_size); + Backend::malloc((void**)&d_out_shape_, shape_size); + Backend::malloc((void**)&d_input_strides_, strides_size); + Backend::malloc((void**)&d_other_strides_, strides_size); + Backend::malloc((void**)&d_out_strides_, strides_size); + + Backend::memcpy(d_input_shape_, input_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_other_shape_, other_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_out_shape_, out_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_input_strides_, input_strides_.data(), strides_size, + Backend::memcpyH2D); + Backend::memcpy(d_other_strides_, other_strides_.data(), 
strides_size, + Backend::memcpyH2D); + Backend::memcpy(d_out_strides_, out_strides_.data(), strides_size, + Backend::memcpyH2D); + } + + ~CudaAdd() { + Backend::free(d_input_shape_); + Backend::free(d_other_shape_); + Backend::free(d_out_shape_); + Backend::free(d_input_strides_); + Backend::free(d_other_strides_); + Backend::free(d_out_strides_); + } + + void operator()(const Tensor input, const Tensor other, + Tensor out) const override { + int block_size = Backend::GetOptimalBlockSize(); + DispatchFunc( + {static_cast(out_type_), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + auto cuda_stream = + static_cast(stream_ ? stream_ : 0); + dim3 blockDims( + std::min(static_cast(block_size), output_size_)); + dim3 gridDims(utils::CeilDiv(output_size_, blockDims.x)); + + T* d_out = reinterpret_cast(out.data()); + const T* d_input = reinterpret_cast(input.data()); + const T* d_other = reinterpret_cast(other.data()); + + AddKernel + <<>>( + d_out, d_input, d_other, d_out_shape_, d_input_shape_, + d_other_shape_, d_out_strides_, d_input_strides_, + d_other_strides_, output_size_, ndim_, is_out_contiguous_, + is_input_contiguous_, is_other_contiguous_); + }, + "CudaAdd::operator()"); + } + + private: + Tensor::Size* d_input_shape_{nullptr}; + + Tensor::Size* d_other_shape_{nullptr}; + + Tensor::Size* d_out_shape_{nullptr}; + + Tensor::Stride* d_input_strides_{nullptr}; + + Tensor::Stride* d_other_strides_{nullptr}; + + Tensor::Stride* d_out_strides_{nullptr}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/caster_.h b/src/cuda/caster_.h new file mode 100644 index 0000000..4083f28 --- /dev/null +++ b/src/cuda/caster_.h @@ -0,0 +1,98 @@ +#ifndef INFINI_OPS_COMMON_CUDA_CASTER_H_ +#define INFINI_OPS_COMMON_CUDA_CASTER_H_ + +#include "caster.h" + +namespace infini::ops { + +template +struct CudaCasterImpl { + template + __host__ __device__ static Dst Cast(Src&& x) { + 
static_assert(!std::is_reference_v, + "`Cast` cannot return reference types"); + + using PureSrc = std::remove_cv_t>; + using PureDst = std::remove_cv_t>; + + if constexpr (std::is_same_v) { + return std::forward(x); + } else { + return HardwareCast(std::forward(x), PriorityHigh{}); + } + } + + private: + template + using PureType = std::remove_cv_t>; + + template + __host__ __device__ static constexpr float ToFloatHelper(T&& x) { + using PureSrc = PureType; + if constexpr (IsBFloat16) { + return __bfloat162float(x); + } else if constexpr (IsFP16) { + return __half2float(x); + } else { + return static_cast(std::forward(x)); + } + } + + template + __host__ __device__ static constexpr Dst FromFloatHelper(float f) { + using PureDst = PureType; + if constexpr (IsBFloat16) { + return __float2bfloat16(f); + } else if constexpr (IsFP16) { + return __float2half(f); + } else { + return static_cast(f); + } + } + + // Priority tags for overload resolution. + struct PriorityLow {}; + + struct PriorityHigh : PriorityLow {}; + + // Fallback: lowest priority. This always matches if nothing else does. + template + __host__ __device__ static constexpr Dst HardwareCast(Src&& x, PriorityLow) { + return FromFloatHelper(ToFloatHelper(std::forward(x))); + } + +// Usage: `DEFINE_DIRECT_CAST(INTRINSIC, CONDITION)`. +#define DEFINE_DIRECT_CAST(INTRINSIC, ...) 
\ + template \ + __host__ __device__ static auto HardwareCast(Src x, PriorityHigh) \ + -> std::enable_if_t<(__VA_ARGS__), \ + decltype(INTRINSIC(std::declval()))> { \ + return INTRINSIC(x); \ + } + + DEFINE_DIRECT_CAST( + __bfloat162int_rn, + std::is_same_v, int>&& IsBFloat16>) + DEFINE_DIRECT_CAST( + __bfloat162short_rn, + std::is_same_v, short>&& IsBFloat16>) + DEFINE_DIRECT_CAST( + __int2bfloat16_rn, + IsBFloat16>&& std::is_same_v, int>) + DEFINE_DIRECT_CAST( + __int2half_rn, + IsFP16>&& std::is_same_v, int>) + DEFINE_DIRECT_CAST( + __double2bfloat16, + IsBFloat16>&& std::is_same_v, double>) + DEFINE_DIRECT_CAST( + __double2half, + IsFP16>&& std::is_same_v, double>) + DEFINE_DIRECT_CAST( + __half, IsFP16>&& IsBFloat16>) +#undef DEFINE_DIRECT_CAST +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/causal_softmax/kernel.cuh b/src/cuda/causal_softmax/kernel.cuh new file mode 100644 index 0000000..83acbc6 --- /dev/null +++ b/src/cuda/causal_softmax/kernel.cuh @@ -0,0 +1,106 @@ +#ifndef INFINI_OPS_CUDA_CAUSAL_SOFTMAX_KERNEL_CUH_ +#define INFINI_OPS_CUDA_CAUSAL_SOFTMAX_KERNEL_CUH_ + +#include +#include +#include + +#include "cuda/caster_.h" +#include "cuda/kernel_commons.h" + +namespace infini::ops { + +namespace { + +template +__device__ __forceinline__ Data ExpAndCast(Compute x) { + return Caster::template Cast( + expf(Caster::template Cast(x))); +} + +struct BlockMaxOp { + template + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (a > b) ? a : b; + } +}; + +template +__device__ __forceinline__ Data BlockMax(const Data* data_ptr, size_t count) { + Data thread_max = count > 0 ? data_ptr[0] : Data{}; + for (size_t i = threadIdx.x; i < count; i += block_size) { + Data v = data_ptr[i]; + thread_max = (v > thread_max) ? 
v : thread_max; + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + return BlockReduce(temp_storage).Reduce(thread_max, BlockMaxOp()); +} + +template +__device__ __forceinline__ Compute BlockSum(const Data* data_ptr, + size_t count) { + Compute thread_sum = 0; + for (size_t i = threadIdx.x; i < count; i += block_size) { + thread_sum += Caster::template Cast(data_ptr[i]); + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + return BlockReduce(temp_storage).Sum(thread_sum); +} + +} // namespace + +template +__global__ void CausalSoftmaxKernel( + Data* __restrict__ out_ptr, const Data* __restrict__ input_ptr, + size_t batch_size, size_t seq_len, size_t total_seq_len, + int64_t stride_out_batch, int64_t stride_out_row, + int64_t stride_input_batch, int64_t stride_input_row) { + size_t row_idx = blockIdx.x; + size_t batch_idx = blockIdx.y; + + Data* out_row = + out_ptr + batch_idx * stride_out_batch + row_idx * stride_out_row; + const Data* input_row = + input_ptr + batch_idx * stride_input_batch + row_idx * stride_input_row; + + size_t valid_len = total_seq_len - seq_len + row_idx + 1; + + __shared__ Data max_val; + Data block_max = BlockMax(input_row, valid_len); + if (threadIdx.x == 0) { + max_val = block_max; + } + __syncthreads(); + + for (size_t col = threadIdx.x; col < total_seq_len; col += block_size) { + if (col < valid_len) { + Compute diff = Caster::template Cast(input_row[col]) - + Caster::template Cast(max_val); + out_row[col] = ExpAndCast(diff); + } else { + out_row[col] = Caster::template Cast(0.0f); + } + } + __syncthreads(); + + __shared__ Compute sum_val; + Compute block_sum = + BlockSum(out_row, total_seq_len); + if (threadIdx.x == 0) { + sum_val = block_sum; + } + __syncthreads(); + + for (size_t col = threadIdx.x; col < total_seq_len; col += block_size) { + Compute quot = Caster::template Cast(out_row[col]) / sum_val; + out_row[col] = 
Caster::template Cast(quot); + } +} + +} // namespace infini::ops + +#endif diff --git a/src/cuda/causal_softmax/kernel.h b/src/cuda/causal_softmax/kernel.h new file mode 100644 index 0000000..7ca0135 --- /dev/null +++ b/src/cuda/causal_softmax/kernel.h @@ -0,0 +1,57 @@ +#ifndef INFINI_OPS_CUDA_CAUSAL_SOFTMAX_KERNEL_H_ +#define INFINI_OPS_CUDA_CAUSAL_SOFTMAX_KERNEL_H_ + +#include +#include + +#include "base/causal_softmax.h" +#include "cuda/causal_softmax/kernel.cuh" +#include "cuda/kernel_commons.h" +#include "data_type.h" +#include "dispatcher.h" + +namespace infini::ops { + +template +class CudaCausalSoftmax : public CausalSoftmax { + public: + using CausalSoftmax::CausalSoftmax; + + void operator()(const Tensor input, Tensor out) const override { + auto cuda_stream = + static_cast(stream_ ? stream_ : 0); + + auto stride_input_batch = ndim_ == 3 ? input_strides_[0] : 0; + auto stride_input_row = input_strides_[ndim_ - 2]; + auto stride_out_batch = ndim_ == 3 ? out_strides_[0] : 0; + auto stride_out_row = out_strides_[ndim_ - 2]; + + dim3 grid(static_cast(seq_len_), + static_cast(batch_size_)); + + assert(out.dtype() == input.dtype()); + + int block_size = Backend::GetOptimalBlockSize(); + + DispatchFunc, ReducedFloatTypes>, + AllCudaBlockSizes>( + // TODO: Output dtype should use the one passed in during construction. 
+ {static_cast(out.dtype()), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + CausalSoftmaxKernel + <<>>( + reinterpret_cast(out.data()), + reinterpret_cast(input.data()), batch_size_, + seq_len_, total_seq_len_, stride_out_batch, stride_out_row, + stride_input_batch, stride_input_row); + }, + "CudaCausalSoftmax::operator()"); + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/gemm/blas.h b/src/cuda/gemm/blas.h new file mode 100644 index 0000000..fe88a7b --- /dev/null +++ b/src/cuda/gemm/blas.h @@ -0,0 +1,109 @@ +#ifndef INFINI_OPS_CUDA_GEMM_BLAS_H_ +#define INFINI_OPS_CUDA_GEMM_BLAS_H_ + +#include + +#include "base/gemm.h" + +namespace infini::ops { + +template +class Blas : public Gemm { + public: + Blas(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) + : Gemm{a, b, alpha, beta, trans_a, trans_b, c}, + a_is_col_major_{a.stride(-1) == 1}, + b_is_col_major_{b.stride(-1) == 1}, + swap_a_and_b_{c.stride(-1) == 1} { + // TODO: Check constraints. 
+ } + + Blas(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, Tensor c) + : Blas{a, b, alpha, beta, std::nullopt, std::nullopt, c} {} + + Blas(const Tensor a, const Tensor b, Tensor c) + : Blas{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, c} {} + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + Backend::blasSetStream(GetHandle(), + static_cast(stream_)); + + const auto& alpha_value{alpha.value_or(alpha_)}; + const auto& beta_value{beta.value_or(beta_)}; + + const auto& trans_a_value{trans_a.value_or(trans_a_)}; + const auto& trans_b_value{trans_b.value_or(trans_b_)}; + auto op_a{GetOpA(trans_a_value, trans_b_value)}; + auto op_b{GetOpB(trans_a_value, trans_b_value)}; + const void* alpha_ptr{GetAlphaPtr(alpha_value, c.dtype())}; + const void* beta_ptr{GetBetaPtr(beta_value, c.dtype())}; + + Backend::blasGemmStridedBatchedEx( + GetHandle(), op_a, op_b, swap_a_and_b_ ? n_ : m_, + swap_a_and_b_ ? m_ : n_, k_, alpha_ptr, + swap_a_and_b_ ? b.data() : a.data(), + Backend::GetDataType(swap_a_and_b_ ? b.dtype() : a.dtype()), + swap_a_and_b_ ? ldb_ : lda_, + swap_a_and_b_ ? batch_stride_b_ : batch_stride_a_, + swap_a_and_b_ ? a.data() : b.data(), + Backend::GetDataType(swap_a_and_b_ ? a.dtype() : b.dtype()), + swap_a_and_b_ ? lda_ : ldb_, + swap_a_and_b_ ? batch_stride_a_ : batch_stride_b_, beta_ptr, c.data(), + Backend::GetDataType(c.dtype()), ldc_, batch_stride_c_, batch_count_, + Backend::GetComputeType(c.dtype()), Backend::BLAS_GEMM_DEFAULT); + } + + protected: + virtual const void* GetAlphaPtr(const float& alpha, DataType) const { + return α + } + + virtual const void* GetBetaPtr(const float& beta, DataType) const { + return β + } + + private: + auto GetOpA(int trans_a, int trans_b) const { + if (swap_a_and_b_) { + return (b_is_col_major_ == trans_b) ? 
Backend::BLAS_OP_T + : Backend::BLAS_OP_N; + } + return (a_is_col_major_ != trans_a) ? Backend::BLAS_OP_T + : Backend::BLAS_OP_N; + } + + auto GetOpB(int trans_a, int trans_b) const { + if (swap_a_and_b_) { + return (a_is_col_major_ == trans_a) ? Backend::BLAS_OP_T + : Backend::BLAS_OP_N; + } + return (b_is_col_major_ != trans_b) ? Backend::BLAS_OP_T + : Backend::BLAS_OP_N; + } + + // TODO: This static singleton is not thread-safe under concurrent access + // from multiple host threads. Add proper synchronization in the future. + static typename Backend::blasHandle_t& GetHandle() { + static typename Backend::blasHandle_t handle = []() { + typename Backend::blasHandle_t h; + Backend::blasCreate(&h); + return h; + }(); + return handle; + } + + bool a_is_col_major_{false}; + + bool b_is_col_major_{false}; + + bool swap_a_and_b_{false}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/kernel_commons.h b/src/cuda/kernel_commons.h new file mode 100644 index 0000000..bb25fad --- /dev/null +++ b/src/cuda/kernel_commons.h @@ -0,0 +1,33 @@ +#ifndef INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_ +#define INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_ + +#include "caster.h" + +namespace infini::ops { + +using AllCudaBlockSizes = List<128, 256, 512, 1024, 2048>; + +__forceinline__ __device__ __host__ size_t +IndexToOffset(size_t flat_index, size_t ndim, const size_t* shape, + const ptrdiff_t* strides) { + size_t res = 0; + for (size_t i = ndim; i-- > 0;) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +// Selects the largest block size from `AllCudaBlockSizes` that does not exceed +// `max_threads_per_block`. 
+inline int ComputeOptimalBlockSize(int max_threads_per_block) { + if (max_threads_per_block >= 2048) return 2048; + if (max_threads_per_block >= 1024) return 1024; + if (max_threads_per_block >= 512) return 512; + if (max_threads_per_block >= 256) return 256; + return 128; +} + +} // namespace infini::ops + +#endif diff --git a/src/cuda/rms_norm/kernel.cuh b/src/cuda/rms_norm/kernel.cuh new file mode 100644 index 0000000..ccb091b --- /dev/null +++ b/src/cuda/rms_norm/kernel.cuh @@ -0,0 +1,64 @@ +#ifndef INFINI_OPS_CUDA_RMS_NORM_KERNEL_CUH_ +#define INFINI_OPS_CUDA_RMS_NORM_KERNEL_CUH_ + +#include +#include +#include + +#include "cuda/caster_.h" +#include "cuda/kernel_commons.h" + +namespace infini::ops { + +namespace { + +template +__device__ __forceinline__ TCompute SumSquared(const TData* data_ptr, + size_t count) { + TCompute ss = 0; + for (size_t i = threadIdx.x; i < count; i += block_size) { + TCompute value = Caster::template Cast(data_ptr[i]); + ss += value * value; + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + return BlockReduce(temp_storage).Sum(ss); +} + +} // namespace + +template +__global__ void RmsNormKernel(TData* __restrict__ y, int64_t stride_y_batch, + int64_t stride_y_nhead, + const TData* __restrict__ x, + int64_t stride_x_batch, int64_t stride_x_nhead, + const TWeight* __restrict__ w, size_t nhead, + size_t dim, float epsilon) { + size_t batch_idx = blockIdx.x / nhead; + size_t head_idx = blockIdx.x % nhead; + + auto y_ptr = y + batch_idx * stride_y_batch + head_idx * stride_y_nhead; + auto x_ptr = x + batch_idx * stride_x_batch + head_idx * stride_x_nhead; + auto w_ptr = w; + + TCompute ss = SumSquared(x_ptr, dim); + + __shared__ TCompute rms; + if (threadIdx.x == 0) { + rms = Caster::template Cast( + rsqrtf(ss / Caster::template Cast(dim) + epsilon)); + } + __syncthreads(); + + for (size_t i = threadIdx.x; i < dim; i += block_size) { + y_ptr[i] = Caster::template Cast( + 
Caster::template Cast(x_ptr[i]) * + Caster::template Cast(w_ptr[i]) * rms); + } +} + +} // namespace infini::ops + +#endif diff --git a/src/cuda/rms_norm/kernel.h b/src/cuda/rms_norm/kernel.h new file mode 100644 index 0000000..848f8fa --- /dev/null +++ b/src/cuda/rms_norm/kernel.h @@ -0,0 +1,59 @@ +#ifndef INFINI_OPS_CUDA_RMS_NORM_KERNEL_H_ +#define INFINI_OPS_CUDA_RMS_NORM_KERNEL_H_ + +#include +#include + +#include "base/rms_norm.h" +#include "cuda/kernel_commons.h" +#include "cuda/rms_norm/kernel.cuh" +#include "data_type.h" +#include "dispatcher.h" + +namespace infini::ops { + +template +class CudaRmsNorm : public RmsNorm { + public: + using RmsNorm::RmsNorm; + + void operator()(const Tensor input, const Tensor weight, float eps, + Tensor out) const override { + auto cuda_stream = + static_cast(stream_ ? stream_ : 0); + + auto stride_input_batch = input_strides_.size() > 1 ? input_strides_[0] : 0; + auto stride_input_nhead = + input_strides_.size() > 1 ? input_strides_[1] : input_strides_[0]; + auto stride_out_batch = out_strides_.size() > 1 ? out_strides_[0] : 0; + auto stride_out_nhead = + out_strides_.size() > 1 ? 
out_strides_[1] : out_strides_[0]; + + uint32_t num_blocks = static_cast(batch_size_ * nhead_); + + assert(out.dtype() == input.dtype() && out.dtype() == weight.dtype()); + + int block_size = Backend::GetOptimalBlockSize(); + + DispatchFunc, ReducedFloatTypes>, + AllCudaBlockSizes>( + {static_cast(out.dtype()), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + RmsNormKernel + <<>>( + reinterpret_cast(out.data()), stride_out_batch, + stride_out_nhead, reinterpret_cast(input.data()), + stride_input_batch, stride_input_nhead, + reinterpret_cast(weight.data()), nhead_, dim_, + eps_); + }, + "CudaRmsNorm::operator()"); + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/swiglu/kernel.cuh b/src/cuda/swiglu/kernel.cuh new file mode 100644 index 0000000..5c3add3 --- /dev/null +++ b/src/cuda/swiglu/kernel.cuh @@ -0,0 +1,82 @@ +#ifndef INFINI_OPS_CUDA_SWIGLU_KERNEL_CUH_ +#define INFINI_OPS_CUDA_SWIGLU_KERNEL_CUH_ + +#include + +#include "cuda/kernel_commons.h" + +namespace infini::ops { + +// Optimized sigmoid function with support for FP16 and BF16 types. +// TODO: The unified FP16/BF16 branch uses `Caster` and scalar float +// arithmetic instead of native vectorized intrinsics (e.g. `h2rcp`, +// `__hmul2`). Profile and restore specialized paths if needed. +template +__device__ __forceinline__ T Sigmoid(const T& x) { + if constexpr (IsFP16 || IsBFloat16) { + float xf = Caster::template Cast(x); + return Caster::template Cast( + __frcp_rn(__fadd_rn(1.0f, __expf(-xf)))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(__fadd_rn(1.0f, __expf(-x))); + } else { + return 1.0f / (1.0f + expf(-x)); + } +} + +// SwiGLU(x, gate) = Swish(x) * gate = (x * sigmoid(x)) * gate. 
+template +__global__ void SwigluKernel(T* __restrict__ out, const T* __restrict__ a, + const T* __restrict__ b, + const size_t* __restrict__ out_shape, + const size_t* __restrict__ input_shape, + const size_t* __restrict__ gate_shape, + const ptrdiff_t* __restrict__ out_strides, + const ptrdiff_t* __restrict__ input_strides, + const ptrdiff_t* __restrict__ gate_strides, + size_t output_size, size_t ndim, + bool out_contiguous, bool input_contiguous, + bool gate_contiguous) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < output_size) { + size_t out_idx, input_idx, gate_idx; + + if (out_contiguous) { + out_idx = idx; + } else { + out_idx = IndexToOffset(idx, ndim, out_shape, out_strides); + } + + if (input_contiguous) { + input_idx = idx; + } else { + input_idx = IndexToOffset(idx, ndim, input_shape, input_strides); + } + + if (gate_contiguous) { + gate_idx = idx; + } else { + gate_idx = IndexToOffset(idx, ndim, gate_shape, gate_strides); + } + + T up = a[input_idx]; + T gate = b[gate_idx]; + + if constexpr (IsFP16 || IsBFloat16) { + float gatef = Caster::template Cast(gate); + float upf = Caster::template Cast(up); + float sigf = __frcp_rn(__fadd_rn(1.0f, __expf(-gatef))); + out[out_idx] = Caster::template Cast( + __fmul_rn(__fmul_rn(gatef, sigf), upf)); + } else if constexpr (std::is_same_v) { + out[out_idx] = __fmul_rn(__fmul_rn(gate, Sigmoid(gate)), up); + } else { + out[out_idx] = gate * Sigmoid(gate) * up; + } + } +} + +} // namespace infini::ops + +#endif diff --git a/src/cuda/swiglu/kernel.h b/src/cuda/swiglu/kernel.h new file mode 100644 index 0000000..72ff3cc --- /dev/null +++ b/src/cuda/swiglu/kernel.h @@ -0,0 +1,95 @@ +#ifndef INFINI_OPS_CUDA_SWIGLU_KERNEL_H_ +#define INFINI_OPS_CUDA_SWIGLU_KERNEL_H_ + +#include + +#include "base/swiglu.h" +#include "common/generic_utils.h" +#include "cuda/swiglu/kernel.cuh" + +namespace infini::ops { + +template +class CudaSwiglu : public Swiglu { + public: + CudaSwiglu(const Tensor input, const 
Tensor gate, Tensor out) + : Swiglu{input, gate, out} { + size_t shape_size = ndim_ * sizeof(*d_input_shape_); + size_t strides_size = ndim_ * sizeof(*d_input_strides_); + + Backend::malloc((void**)&d_input_shape_, shape_size); + Backend::malloc((void**)&d_gate_shape_, shape_size); + Backend::malloc((void**)&d_out_shape_, shape_size); + Backend::malloc((void**)&d_input_strides_, strides_size); + Backend::malloc((void**)&d_gate_strides_, strides_size); + Backend::malloc((void**)&d_out_strides_, strides_size); + + Backend::memcpy(d_input_shape_, input_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_gate_shape_, gate_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_out_shape_, out_shape_.data(), shape_size, + Backend::memcpyH2D); + Backend::memcpy(d_input_strides_, input_strides_.data(), strides_size, + Backend::memcpyH2D); + Backend::memcpy(d_gate_strides_, gate_strides_.data(), strides_size, + Backend::memcpyH2D); + Backend::memcpy(d_out_strides_, out_strides_.data(), strides_size, + Backend::memcpyH2D); + } + + ~CudaSwiglu() { + Backend::free(d_input_shape_); + Backend::free(d_gate_shape_); + Backend::free(d_out_shape_); + Backend::free(d_input_strides_); + Backend::free(d_gate_strides_); + Backend::free(d_out_strides_); + } + + void operator()(const Tensor input, const Tensor gate, + Tensor out) const override { + int block_size = Backend::GetOptimalBlockSize(); + DispatchFunc( + {static_cast(out_type_), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + auto cuda_stream = + static_cast(stream_ ? 
stream_ : 0); + dim3 blockDims( + std::min(static_cast(block_size), output_size_)); + dim3 gridDims(utils::CeilDiv(output_size_, blockDims.x)); + + T* d_out = reinterpret_cast(out.data()); + const T* d_input = reinterpret_cast(input.data()); + const T* d_gate = reinterpret_cast(gate.data()); + + SwigluKernel + <<>>( + d_out, d_input, d_gate, d_out_shape_, d_input_shape_, + d_gate_shape_, d_out_strides_, d_input_strides_, + d_gate_strides_, output_size_, ndim_, is_out_contiguous_, + is_input_contiguous_, is_gate_contiguous_); + }, + "CudaSwiglu::operator()"); + } + + private: + Tensor::Size* d_input_shape_{nullptr}; + + Tensor::Size* d_gate_shape_{nullptr}; + + Tensor::Size* d_out_shape_{nullptr}; + + Tensor::Stride* d_input_strides_{nullptr}; + + Tensor::Stride* d_gate_strides_{nullptr}; + + Tensor::Stride* d_out_strides_{nullptr}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/data_type.h b/src/data_type.h new file mode 100644 index 0000000..05ea3c3 --- /dev/null +++ b/src/data_type.h @@ -0,0 +1,211 @@ +#ifndef INFINI_OPS_DATA_TYPE_H_ +#define INFINI_OPS_DATA_TYPE_H_ + +#include +#include +#include + +#include "common/constexpr_map.h" +#include "common/traits.h" +#include "device.h" + +namespace infini::ops { + +enum class DataType : std::int8_t { + kInt8, + kInt16, + kInt32, + kInt64, + kUInt8, + kUInt16, + kUInt32, + kUInt64, + kFloat16, + kBFloat16, + kFloat32, + kFloat64 +}; + +constexpr ConstexprMap kDataTypeToSize{{{ + {DataType::kInt8, 1}, + {DataType::kInt16, 2}, + {DataType::kInt32, 4}, + {DataType::kInt64, 8}, + {DataType::kUInt8, 1}, + {DataType::kUInt16, 2}, + {DataType::kUInt32, 4}, + {DataType::kUInt64, 8}, + {DataType::kFloat16, 2}, + {DataType::kBFloat16, 2}, + {DataType::kFloat32, 4}, + {DataType::kFloat64, 8}, +}}}; + +constexpr ConstexprMap kDataTypeToDesc{{{ + {DataType::kInt8, "int8"}, + {DataType::kInt16, "int16"}, + {DataType::kInt32, "int32"}, + {DataType::kInt64, "int64"}, + {DataType::kUInt8, "uint8"}, + {DataType::kUInt16, 
"uint16"}, + {DataType::kUInt32, "uint32"}, + {DataType::kUInt64, "uint64"}, + {DataType::kFloat16, "float16"}, + {DataType::kBFloat16, "bfloat16"}, + {DataType::kFloat32, "float32"}, + {DataType::kFloat64, "float64"}, +}}}; + +constexpr ConstexprMap kStringToDataType{{{ + {"int8", DataType::kInt8}, + {"int16", DataType::kInt16}, + {"int32", DataType::kInt32}, + {"int64", DataType::kInt64}, + {"uint8", DataType::kUInt8}, + {"uint16", DataType::kUInt16}, + {"uint32", DataType::kUInt32}, + {"uint64", DataType::kUInt64}, + {"float16", DataType::kFloat16}, + {"bfloat16", DataType::kBFloat16}, + {"float32", DataType::kFloat32}, + {"float64", DataType::kFloat64}, +}}}; + +struct Float16 { + std::uint16_t bits; + + static inline Float16 FromFloat(float val) { + std::uint32_t f32; + std::memcpy(&f32, &val, sizeof(f32)); + std::uint16_t sign = (f32 >> 16) & 0x8000; + std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + std::uint32_t mantissa = f32 & 0x7FFFFF; + + if (exponent >= 16) { + // NaN + if (exponent == 128 && mantissa != 0) { + return {static_cast(sign | 0x7E00)}; + } + // Inf + return {static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return {static_cast(sign | ((exponent + 15) << 10) | + (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return {static_cast(sign | (mantissa >> 13))}; + } + // Too small for subnormal: return signed zero. 
+ return {sign}; + } + + inline float ToFloat() const { + std::uint32_t sign = (bits & 0x8000) << 16; + std::int32_t exponent = (bits >> 10) & 0x1F; + std::uint32_t mantissa = bits & 0x3FF; + std::uint32_t f32_bits; + + if (exponent == 31) { + f32_bits = sign | 0x7F800000 | (mantissa << 13); + } else if (exponent == 0) { + if (mantissa == 0) { + f32_bits = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + + float result; + std::memcpy(&result, &f32_bits, sizeof(result)); + return result; + } +}; + +struct BFloat16 { + std::uint16_t bits; + + static inline BFloat16 FromFloat(float val) { + std::uint32_t bits32; + std::memcpy(&bits32, &val, sizeof(bits32)); + + const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + std::uint16_t bf16_bits = + static_cast((bits32 + rounding_bias) >> 16); + return {bf16_bits}; + } + + inline float ToFloat() const { + std::uint32_t bits32 = static_cast(bits) << 16; + float result; + std::memcpy(&result, &bits32, sizeof(result)); + return result; + } +}; + +template +struct TypeMap; + +template +using TypeMapType = typename TypeMap::type; + +#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ + template \ + struct TypeMap { \ + using type = CPP_TYPE; \ + }; + +DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t) +DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t) +DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t) +DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t) +DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t) +DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t) +DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t) +DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) +DEFINE_DATA_TYPE_MAPPING(kFloat32, float) +DEFINE_DATA_TYPE_MAPPING(kFloat64, double) +#undef DEFINE_DATA_TYPE_MAPPING + +// Checks whether a C++ 
type is the bfloat16 or float16 type for the given +// device. Full specializations for each device's float16/bfloat16 types are +// provided in the corresponding platform `device_.h` headers. +template +inline constexpr bool IsBFloat16 = + std::is_same_v>; + +template +inline constexpr bool IsFP16 = + std::is_same_v>; + +// Defines the common categories of data types using List. +using FloatTypes = List; +using ReducedFloatTypes = List; +using IntTypes = + List; +using UIntTypes = List; + +using BitTypes8 = List; +using BitTypes16 = List; +using BitTypes32 = + List; +using BitTypes64 = + List; + +using AllFloatTypes = ConcatType; +using AllIntTypes = ConcatType; +using AllTypes = ConcatType; + +} // namespace infini::ops + +#endif diff --git a/src/device.h b/src/device.h new file mode 100644 index 0000000..5d9b3ee --- /dev/null +++ b/src/device.h @@ -0,0 +1,160 @@ +#ifndef INFINI_OPS_DEVICE_H_ +#define INFINI_OPS_DEVICE_H_ + +#include "common/constexpr_map.h" +#include "common/traits.h" +#include "hash.h" + +namespace infini::ops { + +class Device { + public: + enum class Type { + kCpu = 0, + kNvidia = 1, + kCambricon = 2, + kAscend = 3, + kMetax = 4, + kMoore = 5, + kIluvatar = 6, + kKunlun = 7, + kHygon = 8, + kQy = 9, + kCount + }; + + Device() = default; + + Device(const Type& type, const int& index = 0) : type_{type}, index_{index} {} + + static const Type TypeFromString(const std::string& name) { + return kDescToDevice.at(name); + } + + static const std::string_view StringFromType(const Type& type) { + return kDeviceToDesc.at(type); + } + + const Type& type() const { return type_; } + + const int& index() const { return index_; } + + std::string ToString() const { + return std::string{StringFromType(type_)} + ":" + std::to_string(index_); + } + + bool operator==(const Device& other) const { + return type_ == other.type_ && index_ == other.index_; + } + + bool operator!=(const Device& other) const { return !(*this == other); } + + private: + Type 
type_{Type::kCpu}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDeviceToDesc{{{ + {Type::kCpu, "cpu"}, + {Type::kNvidia, "nvidia"}, + {Type::kCambricon, "cambricon"}, + {Type::kAscend, "ascend"}, + {Type::kMetax, "metax"}, + {Type::kMoore, "moore"}, + {Type::kIluvatar, "iluvatar"}, + {Type::kKunlun, "kunlun"}, + {Type::kHygon, "hygon"}, + {Type::kQy, "qy"}, + }}}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDescToDevice{{{ + {"cpu", Type::kCpu}, + {"nvidia", Type::kNvidia}, + {"cambricon", Type::kCambricon}, + {"ascend", Type::kAscend}, + {"metax", Type::kMetax}, + {"moore", Type::kMoore}, + {"iluvatar", Type::kIluvatar}, + {"kunlun", Type::kKunlun}, + {"hygon", Type::kHygon}, + {"qy", Type::kQy}, + }}}; + + int index_{0}; +}; + +struct EnabledDeviceFilter { + // Each block defines a template operator() specialized for a specific + // Device. If the macro is NOT defined, the specialization is not compiled, + // and FilterList will exclude it from ActiveDevices. + +#ifdef WITH_CPU + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_NVIDIA + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_CAMBRICON + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_ASCEND + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_METAX + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_MOORE + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_ILUVATAR + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_KUNLUN + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_HYGON + void operator()(ValueTag) const {} +#endif + +#ifdef WITH_QY + void operator()(ValueTag) const {} +#endif +}; + +// Defines the common categories of devices using List. 
+using AllDeviceTypes = + List; + +using ActiveDevices = + typename infini::ops::FilterList, + AllDeviceTypes>::type; + +} // namespace infini::ops + +template <> +struct std::hash { + std::size_t operator()(const infini::ops::Device& device) const { + std::size_t seed{0}; + + hash_combine(seed, device.type()); + + hash_combine(seed, device.index()); + + return seed; + } +}; + +#endif diff --git a/src/dispatcher.h b/src/dispatcher.h new file mode 100644 index 0000000..c971d0d --- /dev/null +++ b/src/dispatcher.h @@ -0,0 +1,341 @@ +#ifndef INFINI_OPS_DISPATCHER_H_ +#define INFINI_OPS_DISPATCHER_H_ + +#include +#include +#include +#include + +#include "common/traits.h" +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +// ----------------------------------------------------------------------------- +// Core Generic Runtime Dispatchers +// ----------------------------------------------------------------------------- + +namespace detail { + +// Implements the dispatch body over a resolved `List`. +template +auto DispatchFuncImpl(ValueType value, Functor&& func, + std::string_view context_str, List, + Args&&... args) { + using ReturnType = decltype(std::forward(func)( + ValueTag(head)>{}, std::forward(args)...)); + + // Path for void functions. + if constexpr (std::is_void_v) { + bool handled = ((value == static_cast(tail) + ? (std::forward(func)( + ValueTag{}, std::forward(args)...), + true) + : false) || + ... || + (value == static_cast(head) + ? (std::forward(func)( + ValueTag{}, std::forward(args)...), + true) + : false)); + + if (!handled) { + // TODO(lzm): change to logging. + std::cerr << "dispatch error (void): value " << static_cast(value) + << " not supported in the context: " << context_str << "\n"; + std::abort(); + } + } + // Path for non-void functions. + else { + std::optional result; + bool handled = ((value == static_cast(tail) + ? 
(result.emplace(std::forward(func)( + ValueTag{}, std::forward(args)...)), + true) + : false) || + ... || + (value == static_cast(head) + ? (result.emplace(std::forward(func)( + ValueTag{}, std::forward(args)...)), + true) + : false)); + + if (handled) { + return *result; + } + // TODO(lzm): change to logging. + std::cerr << "dispatch error (non-void): value " << static_cast(value) + << " not supported in the context: " << context_str << "\n"; + std::abort(); + return ReturnType{}; + } +} + +// Deduces `head`/`tail` from a `List` type via partial specialization, +// then forwards to `DispatchFuncImpl`. +template +struct DispatchFuncUnwrap; + +template +struct DispatchFuncUnwrap, + std::tuple> { + static auto call(ValueType value, Functor&& func, + std::string_view context_str, Args&&... args) { + return DispatchFuncImpl(value, std::forward(func), context_str, + List{}, std::forward(args)...); + } +}; + +// Empty-list specialization +template +struct DispatchFuncUnwrap, std::tuple> { + static auto call(ValueType value, Functor&&, std::string_view context_str, + Args&&...) { + // TODO(lzm): change to logging. + std::cerr << "dispatch error: no allowed values registered for value " + << static_cast(value) + << " in the context: " << context_str << "\n"; + std::abort(); + } +}; + +} // namespace detail + +// (Single Dispatch) Dispatches a runtime value to a compile-time functor. +template +auto DispatchFunc(ValueType value, Functor&& func, + std::string_view context_str = "", Args&&... args) { + using FilteredPack = typename Filter, List<>, + all_values...>::type; + + return detail::DispatchFuncUnwrap< + ValueType, Functor, FilteredPack, + std::tuple>::call(value, std::forward(func), + context_str, std::forward(args)...); +} + +// (Multi-Dispatch) Dispatches a vector of runtime values to a compile-time +// functor. 
+// Base Case: All Dimensions Resolved +template +auto DispatchFunc(const std::vector& values, size_t /*index*/, + Functor&& func, std::string_view /*context_str*/, + List, Args&&... args) { + return std::forward(func)(List{}, + std::forward(args)...); +} + +// Forward declaration of the recursive multi-dispatch overload. +template +auto DispatchFunc(const std::vector& values, size_t index, + Functor&& func, std::string_view context_str, List, + Args&&... args); + +// Adapter used in the recursive multi-dispatch case: given a resolved value +// `val` recurse into the next dimension. +template +struct MultiDispatchRecurseAdapter; + +template +struct MultiDispatchRecurseAdapter, Functor, items...> { + const std::vector& values; + size_t next_index; + Functor& func; + std::string_view context_str; + + template + auto operator()(ValueTag, Args&&... args) const { + return DispatchFunc(values, next_index, func, context_str, + List{}, + std::forward(args)...); + } +}; + +template +auto MultiDispatchFirstDim(const std::vector& values, size_t index, + Functor& func, std::string_view context_str, + List, List, Args&&... args) { + static_assert(sizeof...(allowed) > 0, + "`DispatchFunc` dimension list is empty"); + using EnumType = std::common_type_t; + + MultiDispatchRecurseAdapter adapter{ + values, index + 1, func, context_str}; + + return DispatchFunc( + static_cast(values.at(index)), adapter, context_str, + std::forward(args)...); +} + +// (Multi-Dispatch) Recursive Case +template +auto DispatchFunc(const std::vector& values, size_t index, + Functor&& func, std::string_view context_str, List, + Args&&... 
args) { + return MultiDispatchFirstDim>( + values, index, func, context_str, List{}, FirstList{}, + std::forward(args)...); +} + +// ----------------------------------------------------------------------------- +// High-Level Specialized Dispatchers +// ----------------------------------------------------------------------------- +// These provide cleaner and more convenient APIs for common InfiniOps types. + +namespace detail { + +// Bridges the generic value dispatch layer to the `DataType`-specific type +// dispatch layer. +template +struct DataTypeAdapter { + Functor& func; + + template + auto operator()(ValueTag, Args&&... args) const { + using T = TypeMapType(dtype)>; + return func(TypeTag{}, std::forward(args)...); + } +}; + +template +struct DataTypeMultiAdapter { + Functor& func; + + template + auto operator()(List, Args&&... args) const { + return func(TypeTag(dtypes)>>{}..., + std::forward(args)...); + } +}; + +template +struct DeviceAdapter { + Functor& func; + + template + auto operator()(ValueTag, Args&&... args) const { + return func(ValueTag{}, std::forward(args)...); + } +}; + +template +struct DeviceMultiAdapter { + Functor& func; + + template + auto operator()(List, Args&&... args) const { + return func(ValueTag{}..., std::forward(args)...); + } +}; + +} // namespace detail + +// `DataType` Dispatch +template +auto DispatchFunc(DataType dtype, Functor&& func, + std::string_view context_str = "", Args&&... args) { + detail::DataTypeAdapter> adapter{func}; + return DispatchFunc(dtype, adapter, context_str, + std::forward(args)...); +} + +// `DataType` Multi-Dispatch +template +auto DispatchFunc(std::initializer_list dtypes, Functor&& func, + std::string_view context_str = "", Args&&... 
args) { + std::vector v; + for (auto d : dtypes) v.push_back(static_cast(d)); + + detail::DataTypeMultiAdapter> adapter{ + func}; + return DispatchFunc(v, 0, adapter, context_str, List<>{}, + std::forward(args)...); +} + +// `Device` Dispatch +template +auto DispatchFunc(Device::Type device, Functor&& func, + std::string_view context_str = "", Args&&... args) { + detail::DeviceAdapter> adapter{func}; + return DispatchFunc(allowed_devices)...>( + device, adapter, context_str, std::forward(args)...); +} + +// `Device` Multi-Dispatch +template +auto DispatchFunc(std::initializer_list devices, Functor&& func, + std::string_view context_str = "", Args&&... args) { + std::vector v; + for (auto d : devices) v.push_back(static_cast(d)); + + detail::DeviceMultiAdapter> adapter{func}; + return DispatchFunc(v, 0, adapter, context_str, List<>{}, + std::forward(args)...); +} + +template +auto DispatchFuncListAliasImpl(ValueType value, Functor&& func, + std::string_view context_str, List, + Args&&... args) { + return DispatchFunc>(items)...>( + value, std::forward(func), context_str, + std::forward(args)...); +} + +template +auto DispatchFuncListAliasImpl(ValueType value, Functor&& func, + std::string_view context_str, List, + Args&&... args) { + return DispatchFunc>(items)...>( + value, std::forward(func), context_str, + std::forward(args)...); +} + +// Interface for Generic `List` Aliases (for non-DataType dispatch, e.g. Device) +template ::value>> +auto DispatchFunc(ValueType value, Functor&& func, + std::string_view context_str = "", Args&&... args) { + return DispatchFuncListAliasImpl(value, std::forward(func), + context_str, ListType{}, + std::forward(args)...); +} + +// Interface for Generic `List` Aliases (for DataType dispatch with device type) +template ::value>> +auto DispatchFunc(ValueType value, Functor&& func, + std::string_view context_str = "", Args&&... 
args) { + return DispatchFuncListAliasImpl(value, std::forward(func), + context_str, ListType{}, + std::forward(args)...); +} + +// Interface for Any `int64_t`-Convertible Types +template +auto DispatchFunc(std::initializer_list keys, Functor&& func, + std::string_view context_str = "", Args&&... args) { + std::vector v_keys(keys); + return DispatchFunc(v_keys, 0, std::forward(func), + context_str, List<>{}, + std::forward(args)...); +} + +} // namespace infini::ops + +#endif diff --git a/src/handle.h b/src/handle.h new file mode 100644 index 0000000..4deeb83 --- /dev/null +++ b/src/handle.h @@ -0,0 +1,36 @@ +#ifndef INFINI_OPS_HANDLE_H_ +#define INFINI_OPS_HANDLE_H_ + +#include + +namespace infini::ops { + +class Handle { + public: + void* stream() const { return stream_; } + + void* workspace() const { return workspace_; } + + std::size_t workspace_size_in_bytes() const { + return workspace_size_in_bytes_; + } + + void set_stream(void* stream) { stream_ = stream; } + + void set_workspace(void* workspace) { workspace_ = workspace; } + + void set_workspace_size_in_bytes(std::size_t workspace_size_in_bytes) { + workspace_size_in_bytes_ = workspace_size_in_bytes; + } + + private: + void* stream_{nullptr}; + + void* workspace_{nullptr}; + + std::size_t workspace_size_in_bytes_{0}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..aced9cf --- /dev/null +++ b/src/hash.h @@ -0,0 +1,12 @@ +#ifndef INFINI_OPS_HASH_H_ +#define INFINI_OPS_HASH_H_ + +#include + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash> hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +#endif diff --git a/src/iluvatar/add/kernel.h b/src/iluvatar/add/kernel.h new file mode 100644 index 0000000..78ccff0 --- /dev/null +++ b/src/iluvatar/add/kernel.h @@ -0,0 +1,44 @@ +#ifndef INFINI_OPS_ILUVATAR_ADD_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_ADD_KERNEL_H_ + +#include + +#include 
"cuda/add/kernel.h" +#include "iluvatar/device_.h" + +namespace infini::ops { + +namespace add { + +struct IluvatarBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; + + static constexpr auto malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = cudaMemcpy; + + static constexpr auto free = cudaFree; + + static constexpr auto memcpyH2D = cudaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace add + +template <> +class Operator + : public CudaAdd { + public: + using CudaAdd::CudaAdd; +}; + +} // namespace infini::ops + +#endif diff --git a/src/iluvatar/causal_softmax/kernel.h b/src/iluvatar/causal_softmax/kernel.h new file mode 100644 index 0000000..0f45118 --- /dev/null +++ b/src/iluvatar/causal_softmax/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_ILUVATAR_CAUSAL_SOFTMAX_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_CAUSAL_SOFTMAX_KERNEL_H_ + +#include + +#include "cuda/causal_softmax/kernel.h" +#include "iluvatar/device_.h" + +namespace infini::ops { + +namespace causal_softmax { + +struct IluvatarBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace causal_softmax + +template <> +class Operator + : public CudaCausalSoftmax { + public: + using CudaCausalSoftmax::CudaCausalSoftmax; +}; + +} // namespace infini::ops + +#endif diff --git a/src/iluvatar/device_.h b/src/iluvatar/device_.h new file mode 100644 index 0000000..387c453 --- /dev/null +++ b/src/iluvatar/device_.h @@ -0,0 +1,69 @@ +#ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ +#define INFINI_OPS_ILUVATAR_DEVICE__H_ + +#include +#include + +// clang-format off +#include +#include +#include +// clang-format on + +#include 
"cuda/caster_.h" +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +using cuda_bfloat16 = nv_bfloat16; + +using cuda_bfloat162 = nv_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __nv_bfloat16; +}; + +// Caches `cudaDeviceProp` per device, initialized once at first access. +class DevicePropertyCache { + public: + static const cudaDeviceProp& GetCurrentDeviceProps() { + int device_id = 0; + cudaGetDevice(&device_id); + return GetDeviceProps(device_id); + } + + static const cudaDeviceProp& GetDeviceProps(int device_id) { + static std::vector cache = []() { + int count = 0; + cudaGetDeviceCount(&count); + if (count == 0) return std::vector{}; + std::vector props(count); + for (int i = 0; i < count; ++i) { + cudaGetDeviceProperties(&props[i], i); + } + return props; + }(); + + assert(device_id >= 0 && device_id < static_cast(cache.size())); + return cache[device_id]; + } +}; + +inline int QueryMaxThreadsPerBlock() { + return DevicePropertyCache::GetCurrentDeviceProps().maxThreadsPerBlock; +} + +template <> +struct Caster + : CudaCasterImpl {}; + +} // namespace infini::ops + +#endif diff --git a/src/iluvatar/gemm/cublas.h b/src/iluvatar/gemm/cublas.h new file mode 100644 index 0000000..310d888 --- /dev/null +++ b/src/iluvatar/gemm/cublas.h @@ -0,0 +1,75 @@ +#ifndef INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_ +#define INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_ + +#include + +// clang-format off +#include "cublas_v2.h" +// clang-format on + +#include "cuda/gemm/blas.h" + +namespace infini::ops { + +namespace gemm { + +struct IluvatarBackend { + using blasHandle_t = cublasHandle_t; + + using stream_t = cudaStream_t; + + static constexpr auto BLAS_OP_N = CUBLAS_OP_N; + + static constexpr auto BLAS_OP_T = CUBLAS_OP_T; + + static constexpr auto R_16F = CUDA_R_16F; + + static constexpr auto R_16BF = CUDA_R_16BF; + + static constexpr auto R_32F = CUDA_R_32F; + + // Iluvatar uses `cudaDataType` for 
`computeType`, so we need to use + // `CUDA_R_32F` instead of `CUBLAS_COMPUTE_32F_FAST_TF32`. + static constexpr auto BLAS_COMPUTE_32F = CUDA_R_32F; + + static constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = CUDA_R_32F; + + // Iluvatar uses `CUBLAS_GEMM_DEFAULT_TENSOR_OP` instead of + // `CUBLAS_GEMM_DEFAULT`. + static constexpr auto BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + + static constexpr auto blasCreate = cublasCreate; + + static constexpr auto blasSetStream = cublasSetStream; + + static constexpr auto blasDestroy = cublasDestroy; + + static constexpr auto blasGemmStridedBatchedEx = [](auto&&... args) { + return cublasGemmStridedBatchedEx(std::forward(args)...); + }; + + static auto GetDataType(DataType dtype) { + if (dtype == DataType::kFloat16) return R_16F; + if (dtype == DataType::kBFloat16) return R_16BF; + return R_32F; + } + + static auto GetComputeType(DataType dtype) { + if (dtype == DataType::kFloat16 || dtype == DataType::kBFloat16) + return BLAS_COMPUTE_32F; + return BLAS_COMPUTE_32F_FAST_TF32; + } +}; + +} // namespace gemm + +template <> +class Operator + : public Blas { + public: + using Blas::Blas; +}; + +} // namespace infini::ops + +#endif diff --git a/src/iluvatar/rms_norm/kernel.h b/src/iluvatar/rms_norm/kernel.h new file mode 100644 index 0000000..470e764 --- /dev/null +++ b/src/iluvatar/rms_norm/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_ILUVATAR_RMS_NORM_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_RMS_NORM_KERNEL_H_ + +#include + +#include "cuda/rms_norm/kernel.h" +#include "iluvatar/device_.h" + +namespace infini::ops { + +namespace rms_norm { + +struct IluvatarBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace rms_norm + +template <> +class Operator + : public CudaRmsNorm { + public: + using CudaRmsNorm::CudaRmsNorm; +}; + +} // namespace 
infini::ops + +#endif diff --git a/src/iluvatar/swiglu/kernel.h b/src/iluvatar/swiglu/kernel.h new file mode 100644 index 0000000..7fc2e16 --- /dev/null +++ b/src/iluvatar/swiglu/kernel.h @@ -0,0 +1,44 @@ +#ifndef INFINI_OPS_ILUVATAR_SWIGLU_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_SWIGLU_KERNEL_H_ + +#include + +#include "cuda/swiglu/kernel.h" +#include "iluvatar/device_.h" + +namespace infini::ops { + +namespace swiglu { + +struct IluvatarBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; + + static constexpr auto malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = cudaMemcpy; + + static constexpr auto free = cudaFree; + + static constexpr auto memcpyH2D = cudaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace swiglu + +template <> +class Operator + : public CudaSwiglu { + public: + using CudaSwiglu::CudaSwiglu; +}; + +} // namespace infini::ops + +#endif diff --git a/src/metax/add/kernel.h b/src/metax/add/kernel.h new file mode 100644 index 0000000..6ef2a09 --- /dev/null +++ b/src/metax/add/kernel.h @@ -0,0 +1,41 @@ +#ifndef INFINI_OPS_METAX_ADD_KERNEL_H_ +#define INFINI_OPS_METAX_ADD_KERNEL_H_ + +#include + +#include "cuda/add/kernel.h" +#include "metax/device_.h" + +namespace infini::ops { + +namespace add { + +struct MetaxBackend { + using stream_t = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static constexpr auto malloc = mcMalloc; + + static constexpr auto memcpy = mcMemcpy; + + static constexpr auto free = mcFree; + + static constexpr auto memcpyH2D = mcMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace add + +template <> +class Operator : public CudaAdd { + public: + using CudaAdd::CudaAdd; +}; + +} // 
namespace infini::ops + +#endif diff --git a/src/metax/causal_softmax/kernel.h b/src/metax/causal_softmax/kernel.h new file mode 100644 index 0000000..5ec32b7 --- /dev/null +++ b/src/metax/causal_softmax/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_METAX_CAUSAL_SOFTMAX_KERNEL_H_ +#define INFINI_OPS_METAX_CAUSAL_SOFTMAX_KERNEL_H_ + +#include + +#include "cuda/causal_softmax/kernel.h" +#include "metax/device_.h" + +namespace infini::ops { + +namespace causal_softmax { + +struct MetaxBackend { + using stream_t = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace causal_softmax + +template <> +class Operator + : public CudaCausalSoftmax { + public: + using CudaCausalSoftmax::CudaCausalSoftmax; +}; + +} // namespace infini::ops + +#endif diff --git a/src/metax/device_.h b/src/metax/device_.h new file mode 100644 index 0000000..6d59c76 --- /dev/null +++ b/src/metax/device_.h @@ -0,0 +1,36 @@ +#ifndef INFINI_OPS_METAX_DEVICE__H_ +#define INFINI_OPS_METAX_DEVICE__H_ + +#include +#include +#include + +#include "cuda/caster_.h" +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +using cuda_bfloat16 = maca_bfloat16; + +using cuda_bfloat162 = maca_bfloat162; + +template <> +struct TypeMap { + using type = __half; +}; + +template <> +struct TypeMap { + using type = __maca_bfloat16; +}; + +// TODO: Add MCR device properties query for Metax. 
+inline int QueryMaxThreadsPerBlock() { return 256; } + +template <> +struct Caster : CudaCasterImpl {}; + +} // namespace infini::ops + +#endif diff --git a/src/metax/gemm/mcblas.h b/src/metax/gemm/mcblas.h new file mode 100644 index 0000000..4d5f313 --- /dev/null +++ b/src/metax/gemm/mcblas.h @@ -0,0 +1,71 @@ +#ifndef INFINI_OPS_METAX_GEMM_MCBLAS_H_ +#define INFINI_OPS_METAX_GEMM_MCBLAS_H_ + +#include + +// clang-format off +#include +// clang-format on + +#include "cuda/gemm/blas.h" + +namespace infini::ops { + +namespace gemm { + +struct MetaxBackend { + using blasHandle_t = mcblasHandle_t; + + using stream_t = mcStream_t; + + static constexpr auto BLAS_OP_N = MCBLAS_OP_N; + + static constexpr auto BLAS_OP_T = MCBLAS_OP_T; + + static constexpr auto R_16F = MACA_R_16F; + + static constexpr auto R_16BF = MACA_R_16BF; + + static constexpr auto R_32F = MACA_R_32F; + + static constexpr auto BLAS_COMPUTE_32F = MCBLAS_COMPUTE_32F; + + static constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = + MCBLAS_COMPUTE_32F_FAST_TF32; + + static constexpr auto BLAS_GEMM_DEFAULT = MCBLAS_GEMM_DEFAULT; + + static constexpr auto blasCreate = mcblasCreate; + + static constexpr auto blasSetStream = mcblasSetStream; + + static constexpr auto blasDestroy = mcblasDestroy; + + static constexpr auto blasGemmStridedBatchedEx = [](auto&&... 
args) { + return mcblasGemmStridedBatchedEx(std::forward(args)...); + }; + + static auto GetDataType(DataType dtype) { + if (dtype == DataType::kFloat16) return R_16F; + if (dtype == DataType::kBFloat16) return R_16BF; + return R_32F; + } + + static auto GetComputeType(DataType dtype) { + if (dtype == DataType::kFloat16 || dtype == DataType::kBFloat16) + return BLAS_COMPUTE_32F; + return BLAS_COMPUTE_32F_FAST_TF32; + } +}; + +} // namespace gemm + +template <> +class Operator : public Blas { + public: + using Blas::Blas; +}; + +} // namespace infini::ops + +#endif diff --git a/src/metax/rms_norm/kernel.h b/src/metax/rms_norm/kernel.h new file mode 100644 index 0000000..5806435 --- /dev/null +++ b/src/metax/rms_norm/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_METAX_RMS_NORM_KERNEL_H_ +#define INFINI_OPS_METAX_RMS_NORM_KERNEL_H_ + +#include + +#include "cuda/rms_norm/kernel.h" +#include "metax/device_.h" + +namespace infini::ops { + +namespace rms_norm { + +struct MetaxBackend { + using stream_t = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace rms_norm + +template <> +class Operator + : public CudaRmsNorm { + public: + using CudaRmsNorm::CudaRmsNorm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/metax/swiglu/kernel.h b/src/metax/swiglu/kernel.h new file mode 100644 index 0000000..75b9c46 --- /dev/null +++ b/src/metax/swiglu/kernel.h @@ -0,0 +1,44 @@ +#ifndef INFINI_OPS_METAX_SWIGLU_KERNEL_H_ +#define INFINI_OPS_METAX_SWIGLU_KERNEL_H_ + +#include + +#include "cuda/swiglu/kernel.h" +#include "metax/device_.h" + +namespace infini::ops { + +namespace swiglu { + +struct MetaxBackend { + using stream_t = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static constexpr auto malloc = [](auto&&... 
args) { + return mcMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = mcMemcpy; + + static constexpr auto free = mcFree; + + static constexpr auto memcpyH2D = mcMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace swiglu + +template <> +class Operator + : public CudaSwiglu { + public: + using CudaSwiglu::CudaSwiglu; +}; + +} // namespace infini::ops + +#endif diff --git a/src/moore/add/kernel.h b/src/moore/add/kernel.h new file mode 100644 index 0000000..1f393dc --- /dev/null +++ b/src/moore/add/kernel.h @@ -0,0 +1,51 @@ +#ifndef INFINI_OPS_MOORE_ADD_KERNEL_H_ +#define INFINI_OPS_MOORE_ADD_KERNEL_H_ + +#include + +// clang-format off +#include "moore/polyfills.cuh" +// clang-format on + +#include "cuda/add/kernel.h" +#include "moore/device_.h" + +namespace infini::ops { + +namespace add { + +struct MooreBackend { + using stream_t = musaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMoore; + + static constexpr auto malloc = [](auto&&... args) { + return musaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = [](auto&&... args) { + return musaMemcpy(std::forward(args)...); + }; + + static constexpr auto free = [](auto&&... 
args) { + return musaFree(std::forward(args)...); + }; + + static constexpr auto memcpyH2D = musaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace add + +template <> +class Operator : public CudaAdd { + public: + using CudaAdd::CudaAdd; +}; + +} // namespace infini::ops + +#endif diff --git a/src/moore/device_.h b/src/moore/device_.h new file mode 100644 index 0000000..d7c7599 --- /dev/null +++ b/src/moore/device_.h @@ -0,0 +1,41 @@ +#ifndef INFINI_OPS_MOORE_DEVICE__H_ +#define INFINI_OPS_MOORE_DEVICE__H_ + +#include +#include +#include + +#include "cuda/caster_.h" +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +using cuda_bfloat16 = __mt_bfloat16; + +using cuda_bfloat162 = __mt_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __mt_bfloat16; +}; + +inline int QueryMaxThreadsPerBlock() { + int device = 0; + musaGetDevice(&device); + musaDeviceProp prop; + musaGetDeviceProperties(&prop, device); + return prop.maxThreadsPerBlock; +} + +template <> +struct Caster : CudaCasterImpl {}; + +} // namespace infini::ops + +#endif diff --git a/src/moore/gemm/mublas.h b/src/moore/gemm/mublas.h new file mode 100644 index 0000000..8ec931f --- /dev/null +++ b/src/moore/gemm/mublas.h @@ -0,0 +1,89 @@ +#ifndef INFINI_OPS_MOORE_GEMM_MUBLAS_H_ +#define INFINI_OPS_MOORE_GEMM_MUBLAS_H_ + +#include +#include + +#include + +#include "cuda/gemm/blas.h" + +namespace infini::ops { + +namespace gemm { + +struct MooreBackend { + using blasHandle_t = mublasHandle_t; + + using stream_t = musaStream_t; + + static constexpr auto BLAS_OP_N = MUBLAS_OP_N; + + static constexpr auto BLAS_OP_T = MUBLAS_OP_T; + + static constexpr auto R_16F = MUSA_R_16F; + + static constexpr auto R_16BF = MUSA_R_16BF; + + static constexpr auto R_32F = MUSA_R_32F; + + static constexpr auto BLAS_GEMM_DEFAULT = MUBLAS_GEMM_DEFAULT; + + 
static constexpr auto blasCreate = mublasCreate; + + static constexpr auto blasSetStream = mublasSetStream; + + static constexpr auto blasDestroy = mublasDestroy; + + static constexpr auto blasGemmStridedBatchedEx = [](auto&&... args) { + return mublasGemmStridedBatchedEx(std::forward(args)...); + }; + + static musaDataType_t GetDataType(DataType dtype) { + if (dtype == DataType::kFloat16) return R_16F; + if (dtype == DataType::kBFloat16) return R_16BF; + return R_32F; + } + + static mublasComputeType_t GetComputeType(DataType dtype) { + if (dtype == DataType::kFloat16) return MUBLAS_COMPUTE_16F; + if (dtype == DataType::kBFloat16) return MUBLAS_COMPUTE_32F; + return MUBLAS_COMPUTE_32F; + } +}; + +} // namespace gemm + +template <> +class Operator : public Blas { + public: + using Blas::Blas; + + protected: + const void* GetAlphaPtr(const float& alpha, DataType dtype) const override { + if (gemm::MooreBackend::GetComputeType(dtype) == MUBLAS_COMPUTE_16F) { + alpha_fp16_ = Float16::FromFloat(alpha); + return &alpha_fp16_; + } + + return α + } + + const void* GetBetaPtr(const float& beta, DataType dtype) const override { + if (gemm::MooreBackend::GetComputeType(dtype) == MUBLAS_COMPUTE_16F) { + beta_fp16_ = Float16::FromFloat(beta); + return &beta_fp16_; + } + + return β + } + + private: + mutable Float16 alpha_fp16_{}; + + mutable Float16 beta_fp16_{}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/moore/polyfills.cuh b/src/moore/polyfills.cuh new file mode 100644 index 0000000..88645a4 --- /dev/null +++ b/src/moore/polyfills.cuh @@ -0,0 +1,42 @@ +#ifndef INFINI_OPS_MOORE_POLYFILLS_CUH_ +#define INFINI_OPS_MOORE_POLYFILLS_CUH_ + +#include + +// clang-format off +#include +#include +// clang-format on + +namespace infini::ops { + +template +__device__ __forceinline__ T __hadd(const T& a, const T& b) { + return a + b; +} + +template +__device__ __forceinline__ auto __high2bfloat16(const T& a) { + return __float2bfloat16_rn(::__high2float(a)); +} + 
+template +__device__ __forceinline__ T __hneg(const T& a) { + return -a; +} + +template +__device__ __forceinline__ auto __low2bfloat16(const T& a) { + return __float2bfloat16_rn(::__low2float(a)); +} + +template +__device__ __forceinline__ T hrcp(const T& a) { + return T(__frcp_rn(static_cast(a))); +} + +} // namespace infini::ops + +#define hrcp infini::ops::hrcp + +#endif diff --git a/src/moore/swiglu/kernel.h b/src/moore/swiglu/kernel.h new file mode 100644 index 0000000..0c6b058 --- /dev/null +++ b/src/moore/swiglu/kernel.h @@ -0,0 +1,52 @@ +#ifndef INFINI_OPS_MOORE_SWIGLU_KERNEL_H_ +#define INFINI_OPS_MOORE_SWIGLU_KERNEL_H_ + +#include + +// clang-format off +#include "moore/polyfills.cuh" +// clang-format on + +#include "cuda/swiglu/kernel.h" +#include "moore/device_.h" + +namespace infini::ops { + +namespace swiglu { + +struct MooreBackend { + using stream_t = musaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMoore; + + static constexpr auto malloc = [](auto&&... args) { + return musaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = [](auto&&... args) { + return musaMemcpy(std::forward(args)...); + }; + + static constexpr auto free = [](auto&&... 
args) { + return musaFree(std::forward(args)...); + }; + + static constexpr auto memcpyH2D = musaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace swiglu + +template <> +class Operator + : public CudaSwiglu { + public: + using CudaSwiglu::CudaSwiglu; +}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/add/kernel.h b/src/nvidia/add/kernel.h new file mode 100644 index 0000000..6e6c2c3 --- /dev/null +++ b/src/nvidia/add/kernel.h @@ -0,0 +1,44 @@ +#ifndef INFINI_OPS_NVIDIA_ADD_KERNEL_H_ +#define INFINI_OPS_NVIDIA_ADD_KERNEL_H_ + +#include + +#include "cuda/add/kernel.h" +#include "nvidia/device_.h" + +namespace infini::ops { + +namespace add { + +struct NvidiaBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static constexpr auto malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = cudaMemcpy; + + static constexpr auto free = cudaFree; + + static constexpr auto memcpyH2D = cudaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace add + +template <> +class Operator + : public CudaAdd { + public: + using CudaAdd::CudaAdd; +}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/causal_softmax/kernel.h b/src/nvidia/causal_softmax/kernel.h new file mode 100644 index 0000000..62fdf8b --- /dev/null +++ b/src/nvidia/causal_softmax/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_NVIDIA_CAUSAL_SOFTMAX_KERNEL_H_ +#define INFINI_OPS_NVIDIA_CAUSAL_SOFTMAX_KERNEL_H_ + +#include + +#include "cuda/causal_softmax/kernel.h" +#include "nvidia/device_.h" + +namespace infini::ops { + +namespace causal_softmax { + +struct NvidiaBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static int 
GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace causal_softmax + +template <> +class Operator + : public CudaCausalSoftmax { + public: + using CudaCausalSoftmax::CudaCausalSoftmax; +}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/device_.h b/src/nvidia/device_.h new file mode 100644 index 0000000..90de446 --- /dev/null +++ b/src/nvidia/device_.h @@ -0,0 +1,68 @@ +#ifndef INFINI_OPS_NVIDIA_DEVICE__H_ +#define INFINI_OPS_NVIDIA_DEVICE__H_ + +#include +#include + +// clang-format off +#include +#include +#include +// clang-format on + +#include "cuda/caster_.h" +#include "data_type.h" +#include "device.h" + +namespace infini::ops { + +using cuda_bfloat16 = nv_bfloat16; + +using cuda_bfloat162 = nv_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __nv_bfloat16; +}; + +// Caches `cudaDeviceProp` per device, initialized once at first access. +class DevicePropertyCache { + public: + static const cudaDeviceProp& GetCurrentDeviceProps() { + int device_id = 0; + cudaGetDevice(&device_id); + return GetDeviceProps(device_id); + } + + static const cudaDeviceProp& GetDeviceProps(int device_id) { + static std::vector cache = []() { + int count = 0; + cudaGetDeviceCount(&count); + if (count == 0) return std::vector{}; + std::vector props(count); + for (int i = 0; i < count; ++i) { + cudaGetDeviceProperties(&props[i], i); + } + return props; + }(); + + assert(device_id >= 0 && device_id < static_cast(cache.size())); + return cache[device_id]; + } +}; + +inline int QueryMaxThreadsPerBlock() { + return DevicePropertyCache::GetCurrentDeviceProps().maxThreadsPerBlock; +} + +template <> +struct Caster : CudaCasterImpl {}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/gemm/cublas.h b/src/nvidia/gemm/cublas.h new file mode 100644 index 0000000..eaf3b40 --- /dev/null +++ b/src/nvidia/gemm/cublas.h @@ -0,0 +1,71 @@ +#ifndef 
INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_ +#define INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_ + +#include + +// clang-format off +#include "cublas_v2.h" +// clang-format on + +#include "cuda/gemm/blas.h" + +namespace infini::ops { + +namespace gemm { + +struct NvidiaBackend { + using blasHandle_t = cublasHandle_t; + + using stream_t = cudaStream_t; + + static constexpr auto BLAS_OP_N = CUBLAS_OP_N; + + static constexpr auto BLAS_OP_T = CUBLAS_OP_T; + + static constexpr auto R_16F = CUDA_R_16F; + + static constexpr auto R_16BF = CUDA_R_16BF; + + static constexpr auto R_32F = CUDA_R_32F; + + static constexpr auto BLAS_COMPUTE_32F = CUBLAS_COMPUTE_32F; + + static constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = + CUBLAS_COMPUTE_32F_FAST_TF32; + + static constexpr auto BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT; + + static constexpr auto blasCreate = cublasCreate; + + static constexpr auto blasSetStream = cublasSetStream; + + static constexpr auto blasDestroy = cublasDestroy; + + static constexpr auto blasGemmStridedBatchedEx = [](auto&&... 
args) { + return cublasGemmStridedBatchedEx(std::forward(args)...); + }; + + static auto GetDataType(DataType dtype) { + if (dtype == DataType::kFloat16) return R_16F; + if (dtype == DataType::kBFloat16) return R_16BF; + return R_32F; + } + + static auto GetComputeType(DataType dtype) { + if (dtype == DataType::kFloat16 || dtype == DataType::kBFloat16) + return BLAS_COMPUTE_32F; + return BLAS_COMPUTE_32F_FAST_TF32; + } +}; + +} // namespace gemm + +template <> +class Operator : public Blas { + public: + using Blas::Blas; +}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/rms_norm/kernel.h b/src/nvidia/rms_norm/kernel.h new file mode 100644 index 0000000..e346a31 --- /dev/null +++ b/src/nvidia/rms_norm/kernel.h @@ -0,0 +1,34 @@ +#ifndef INFINI_OPS_NVIDIA_RMS_NORM_KERNEL_H_ +#define INFINI_OPS_NVIDIA_RMS_NORM_KERNEL_H_ + +#include + +#include "cuda/rms_norm/kernel.h" +#include "nvidia/device_.h" + +namespace infini::ops { + +namespace rms_norm { + +struct NvidiaBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace rms_norm + +template <> +class Operator + : public CudaRmsNorm { + public: + using CudaRmsNorm::CudaRmsNorm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/nvidia/swiglu/kernel.h b/src/nvidia/swiglu/kernel.h new file mode 100644 index 0000000..8ea30f8 --- /dev/null +++ b/src/nvidia/swiglu/kernel.h @@ -0,0 +1,44 @@ +#ifndef INFINI_OPS_NVIDIA_SWIGLU_KERNEL_H_ +#define INFINI_OPS_NVIDIA_SWIGLU_KERNEL_H_ + +#include + +#include "cuda/swiglu/kernel.h" +#include "nvidia/device_.h" + +namespace infini::ops { + +namespace swiglu { + +struct NvidiaBackend { + using stream_t = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static constexpr auto malloc = [](auto&&... 
args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto memcpy = cudaMemcpy; + + static constexpr auto free = cudaFree; + + static constexpr auto memcpyH2D = cudaMemcpyHostToDevice; + + static int GetOptimalBlockSize() { + return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock()); + } +}; + +} // namespace swiglu + +template <> +class Operator + : public CudaSwiglu { + public: + using CudaSwiglu::CudaSwiglu; +}; + +} // namespace infini::ops + +#endif diff --git a/src/operator.h b/src/operator.h new file mode 100644 index 0000000..f482db2 --- /dev/null +++ b/src/operator.h @@ -0,0 +1,167 @@ +#ifndef INFINI_OPS_OPERATOR_H_ +#define INFINI_OPS_OPERATOR_H_ + +#include +#include +#include +#include +#include + +#include "dispatcher.h" +#include "handle.h" +#include "tensor.h" + +namespace infini::ops::detail { + +struct CacheKey { + std::size_t hash; + + std::vector tensors; + + std::size_t scalar_hash; + + template + static CacheKey Build(const Args&... args) { + CacheKey key; + key.hash = 0; + key.scalar_hash = 0; + (key.Absorb(args), ...); + return key; + } + + private: + void Absorb(const Tensor& t) { + hash_combine(hash, t); + tensors.push_back(t); + } + + template + void Absorb(const T& v) { + hash_combine(hash, v); + hash_combine(scalar_hash, v); + } +}; + +} // namespace infini::ops::detail + +template <> +struct std::hash { + std::size_t operator()(const infini::ops::detail::CacheKey& key) const { + return key.hash; + } +}; + +template <> +struct std::equal_to { + bool operator()(const infini::ops::detail::CacheKey& a, + const infini::ops::detail::CacheKey& b) const { + if (a.scalar_hash != b.scalar_hash) return false; + if (a.tensors.size() != b.tensors.size()) return false; + std::equal_to eq; + for (std::size_t i = 0; i < a.tensors.size(); ++i) { + if (!eq(a.tensors[i], b.tensors[i])) return false; + } + return true; + } +}; + +namespace infini::ops { + +class OperatorBase { + public: + virtual ~OperatorBase() = default; + + 
virtual std::size_t workspace_size_in_bytes() const { return 0; } + + void set_handle(const Handle& handle) { handle_ = handle; } + + void set_stream(void* stream) { stream_ = stream; } + + void set_workspace(void* workspace) { workspace_ = workspace; } + + void set_workspace_size_in_bytes(std::size_t workspace_size_in_bytes) { + workspace_size_in_bytes_ = workspace_size_in_bytes; + } + + protected: + Handle handle_; + + void* stream_{nullptr}; + + void* workspace_{nullptr}; + + std::size_t workspace_size_in_bytes_{0}; +}; + +template +class Operator : public OperatorBase { + public: + template + static auto make(const Tensor tensor, Args&&... args) { + std::unique_ptr op_ptr; + + DispatchFunc( + tensor.device().type(), + [&](auto tag) { + constexpr Device::Type kDev = decltype(tag)::value; + if constexpr (std::is_constructible_v, + const Tensor&, Args...>) { + op_ptr = std::make_unique>( + tensor, std::forward(args)...); + } else { + assert(false && "operator is not implemented for this device"); + } + }, + "Operator::make"); + + return op_ptr; + } + + template + static auto call(const Handle& handle, void* stream, void* workspace, + std::size_t workspace_size_in_bytes, Args&&... args) { + static std::unordered_map> + cache; + + auto key = detail::CacheKey::Build(args...); + + auto it{cache.find(key)}; + + if (it == cache.end()) { + it = cache.emplace(std::move(key), make(std::forward(args)...)) + .first; + } + + auto& op{it->second}; + + auto resolved_stream{stream ? stream : handle.stream()}; + auto resolved_workspace{workspace ? workspace : handle.workspace()}; + auto resolved_workspace_size{workspace_size_in_bytes + ? workspace_size_in_bytes + : handle.workspace_size_in_bytes()}; + + op->set_handle(handle); + op->set_stream(resolved_stream); + op->set_workspace(resolved_workspace); + op->set_workspace_size_in_bytes(resolved_workspace_size); + + return (*op)(std::forward(args)...); + } + + template + static auto call(const Tensor tensor, Args&&... 
args) { + return call({}, nullptr, nullptr, 0, tensor, std::forward(args)...); + } + + template + auto operator()(Args&&... args) const { + return (*static_cast(this))(std::forward(args)...); + } + + protected: + static constexpr Device::Type device_type_{device_type}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/pybind11_utils.h b/src/pybind11_utils.h new file mode 100644 index 0000000..de4fa62 --- /dev/null +++ b/src/pybind11_utils.h @@ -0,0 +1,120 @@ +#ifndef INFINI_OPS_PYBIND11_UTILS_H_ +#define INFINI_OPS_PYBIND11_UTILS_H_ + +#include +#include + +#include "tensor.h" + +namespace py = pybind11; + +namespace infini::ops { + +namespace detail { + +template +struct TorchDeviceName; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cpu"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"cuda"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"mlu"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"npu"}; +}; + +template <> +struct TorchDeviceName { + static constexpr std::string_view kValue{"musa"}; +}; + +template +std::unordered_map BuildTorchNameMap( + List) { + std::unordered_map map; + (map.emplace(std::string{TorchDeviceName::kValue}, kDevs), ...); + return map; +} + +} // namespace detail + +inline DataType DataTypeFromString(const std::string& name) { + return kStringToDataType.at(name); +} + +inline Device::Type 
DeviceTypeFromString(const std::string& name) { + static const auto kTorchNameToTypes{ + detail::BuildTorchNameMap(ActiveDevices{})}; + + auto it{kTorchNameToTypes.find(name)}; + + if (it != kTorchNameToTypes.cend()) { + return it->second; + } + + return Device::TypeFromString(name); +} + +inline Tensor TensorFromPybind11Handle(py::handle obj) { + auto data{ + reinterpret_cast(obj.attr("data_ptr")().cast())}; + + auto shape{obj.attr("shape").cast()}; + + auto dtype_str{py::str(obj.attr("dtype")).cast()}; + auto pos{dtype_str.find_last_of('.')}; + auto dtype{DataTypeFromString( + pos == std::string::npos ? dtype_str : dtype_str.substr(pos + 1))}; + + auto device_obj{obj.attr("device")}; + auto device_type_str{device_obj.attr("type").cast()}; + auto device_index_obj{device_obj.attr("index")}; + auto device_index{device_index_obj.is_none() ? 0 + : device_index_obj.cast()}; + Device device{DeviceTypeFromString(device_type_str), device_index}; + + auto strides{obj.attr("stride")().cast()}; + + return Tensor{data, std::move(shape), dtype, device, std::move(strides)}; +} + +} // namespace infini::ops + +#endif diff --git a/src/tensor.cc b/src/tensor.cc new file mode 100644 index 0000000..cd11087 --- /dev/null +++ b/src/tensor.cc @@ -0,0 +1,154 @@ +#include "tensor.h" + +#include +#include +#include + +#include "dispatcher.h" + +namespace infini::ops { + +static Tensor::Index GetEffectiveIndex(Tensor::Index index, Tensor::Size size) { + return index < 0 ? 
index + size : index; +} + +Tensor::Tensor(void* data, std::initializer_list shape, + const DataType& dtype, const Device& device, + std::initializer_list strides) + : Tensor{data, decltype(shape_){shape}, dtype, device, + decltype(strides_){strides}} {} + +Tensor Tensor::operator[](const Index& index) const { + return { + reinterpret_cast( + reinterpret_cast(data_) + + GetEffectiveIndex(index, shape_[0]) * strides_[0] * element_size()), + Shape{shape_.cbegin() + 1, shape_.cend()}, dtype_, device_, + Strides{strides_.cbegin() + 1, strides_.cend()}}; +} + +void*& Tensor::data() { return data_; } + +const void* Tensor::data() const { return data_; } + +const Tensor::Shape& Tensor::shape() const { return shape_; } + +const DataType& Tensor::dtype() const { return dtype_; } + +const Device& Tensor::device() const { return device_; } + +const Tensor::Strides& Tensor::strides() const { return strides_; } + +Tensor::Size Tensor::size(const Index& index) const { + return shape_[GetEffectiveIndex(index, shape_.size())]; +} + +Tensor::Stride Tensor::stride(const Index& index) const { + return strides_[GetEffectiveIndex(index, strides_.size())]; +} + +Tensor::Size Tensor::ndim() const { return shape_.size(); } + +Tensor::Size Tensor::element_size() const { return kDataTypeToSize.at(dtype_); } + +Tensor::Size Tensor::numel() const { + return std::accumulate(shape_.begin(), shape_.end(), + static_cast(1), + [](Tensor::Size a, Tensor::Size b) { return a * b; }); +} + +Tensor Tensor::T() const { + return {data_, + {shape_[1], shape_[0]}, + dtype_, + device_, + {strides_[1], strides_[0]}}; +} + +std::string Tensor::ToString() const { + return "tensor(" + ToStringHelper() + + ", dtype=" + std::string(kDataTypeToDesc.at(dtype_)) + ", device='" + + device_.ToString() + "')"; +} + +bool Tensor::HasBroadcastDim() const { + return std::any_of(shape_.begin(), shape_.end(), + [&, i = 0](const auto&) mutable { + return shape_[i] != 1 && strides_[i++] == 0; + }); +} + +bool 
Tensor::IsContiguous() const { + if (ndim() == 0) { + return true; + } + + if (!IsMergeable(0, ndim() - 1)) { + return false; + } + + return stride(ndim() - 1) == 1; +} + +const DataType Tensor::DefaultDataType() { return DataType::kFloat32; } + +Device Tensor::DefaultDevice() { return Device{Device::Type::kCpu}; } + +Tensor::Strides Tensor::DefaultStrides(const Shape& shape) { + if (shape.empty()) { + return {}; + } + + Strides strides(shape.size()); + + strides.back() = 1; + + for (auto i{shape.size() - 2}; i != -1; --i) { + strides[i] = strides[i + 1] * shape[i + 1]; + } + + return strides; +} + +std::string Tensor::ToStringHelper() const { + if (ndim() == 0) { + return DispatchFunc>( + dtype_, + [&](auto tag) { + using T = typename decltype(tag)::type; + return std::to_string(*static_cast(data_)); + }, + "Tensor::ToStringHelper()"); + } + + std::string result{"["}; + + for (auto i{Index{0}}; i < shape_[0]; ++i) { + result += operator[](i).ToStringHelper() + ", "; + } + + result.pop_back(); + result.back() = ']'; + + return result; +} + +bool Tensor::IsMergeable(Tensor::Size dim_start, Tensor::Size dim_end) const { + if (dim_start == dim_end) { + return true; + } + + for (Tensor::Size i = dim_start; i < dim_end; ++i) { + if (size(i) == 1 && stride(i) == 0) { + return false; + } + if (stride(i) != size(i + 1) * stride(i + 1)) { + return false; + } + } + + return true; +} + +} // namespace infini::ops diff --git a/src/tensor.h b/src/tensor.h new file mode 100644 index 0000000..bbe72f8 --- /dev/null +++ b/src/tensor.h @@ -0,0 +1,157 @@ +#ifndef INFINI_OPS_TENSOR_H_ +#define INFINI_OPS_TENSOR_H_ + +#include +#include +#include + +#include "data_type.h" +#include "device.h" +#include "hash.h" + +namespace infini::ops { + +class Tensor { + public: + using Size = std::size_t; + + using Stride = std::ptrdiff_t; + + using Index = Stride; + + using Shape = std::vector; + + using Strides = std::vector; + + template + Tensor(void* data, const Shape& shape) + : data_{data}, 
+ shape_{shape}, + dtype_{DefaultDataType()}, + device_{DefaultDevice()}, + strides_{DefaultStrides(shape)} {} + + template + Tensor(void* data, const Shape& shape, const DataType& dtype) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{DefaultDevice()}, + strides_{DefaultStrides(shape)} {} + + template + Tensor(void* data, const Shape& shape, const Device& device) + : data_{data}, + shape_{shape}, + dtype_{DefaultDataType()}, + device_{device}, + strides_{DefaultStrides(shape)} {} + + template + Tensor(void* data, const Shape& shape, const DataType& dtype, + const Device& device) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{device}, + strides_{DefaultStrides(shape)} {} + + template + Tensor(void* data, const Shape& shape, const DataType& dtype, + const Device& device, const Strides& strides) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{device}, + strides_{strides} {} + + Tensor(void* data, std::initializer_list shape, const DataType& dtype, + const Device& device, std::initializer_list strides); + + Tensor operator[](const Index& index) const; + + void*& data(); + + const void* data() const; + + const DataType& dtype() const; + + const Device& device() const; + + const Shape& shape() const; + + const Strides& strides() const; + + Size size(const Index& index) const; + + Stride stride(const Index& index) const; + + Size ndim() const; + + Size element_size() const; + + Size numel() const; + + Tensor T() const; + + std::string ToString() const; + + bool HasBroadcastDim() const; + + bool IsContiguous() const; + + private: + static const DataType DefaultDataType(); + + static Device DefaultDevice(); + + static Strides DefaultStrides(const Shape& shape); + + std::string ToStringHelper() const; + + bool IsMergeable(Size dim_start, Size dim_end) const; + + void* data_{nullptr}; + + Shape shape_; + + const DataType dtype_; + + Device device_; + + Strides strides_; +}; + +} // namespace infini::ops + +template <> +struct std::hash 
{ + std::size_t operator()(const infini::ops::Tensor& tensor) const { + std::size_t seed{0}; + + for (const auto& size : tensor.shape()) { + hash_combine(seed, size); + } + + hash_combine(seed, tensor.dtype()); + + hash_combine(seed, tensor.device()); + + for (const auto& stride : tensor.strides()) { + hash_combine(seed, stride); + } + + return seed; + } +}; + +template <> +struct std::equal_to { + bool operator()(const infini::ops::Tensor& a, + const infini::ops::Tensor& b) const { + return a.dtype() == b.dtype() && a.device() == b.device() && + a.shape() == b.shape() && a.strides() == b.strides(); + } +}; + +#endif diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..44654c3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,152 @@ +import hashlib +import random + +import pytest +import torch +import torch.utils.benchmark as benchmark + +from tests.utils import clone_strided, get_available_devices + + +def pytest_addoption(parser): + parser.addoption( + "--benchmark", action="store_true", help="Run performance benchmarks." 
+ ) + + +def pytest_configure(config): + torch.backends.fp32_precision = "tf32" + + config.addinivalue_line( + "markers", + "auto_act_and_assert: automatically perform Act and Assert phases using the return values", + ) + + +def pytest_collectstart(collector): + if isinstance(collector, pytest.Module): + _set_random_seed(_hash(collector.name)) + + +@pytest.fixture(scope="module", autouse=True) +def set_seed_per_module(request): + _set_random_seed(_hash(_module_path_from_request(request))) + + +@pytest.fixture(autouse=True) +def set_seed_per_test(request): + _set_random_seed(_hash(_test_case_path_from_request(request))) + + +def _set_random_seed(seed): + random.seed(seed) + torch.manual_seed(seed) + + +def pytest_generate_tests(metafunc): + already_parametrized = _get_parametrized_args(metafunc) + + if "dtype" in metafunc.fixturenames and "dtype" not in already_parametrized: + metafunc.parametrize( + "dtype, rtol, atol", + ( + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-3, 1e-3), + ), + ) + + if "device" in metafunc.fixturenames and "device" not in already_parametrized: + metafunc.parametrize("device", get_available_devices()) + + +@pytest.hookimpl(tryfirst=True) +def pytest_pyfunc_call(pyfuncitem): + if pyfuncitem.get_closest_marker("auto_act_and_assert"): + func_kwargs = { + arg: pyfuncitem.funcargs[arg] for arg in pyfuncitem._fixtureinfo.argnames + } + + payload = pyfuncitem.obj(**func_kwargs) + + func = payload.func + ref = payload.ref + args = payload.args + kwargs = payload.kwargs + + ref_args = _clone(args) + ref_kwargs = _clone(kwargs) + + output = func(*args, **kwargs) + expected = ref(*ref_args, **ref_kwargs) + + if pyfuncitem.config.getoption("--benchmark"): + stmt = "func(*args, **kwargs)" + + func_timer = benchmark.Timer( + stmt=stmt, + globals={"func": func, "args": args, "kwargs": kwargs}, + label=func.__name__, + description="InfiniOps", + ) + + ref_timer = benchmark.Timer( + stmt=stmt, + globals={"func": ref, 
"args": ref_args, "kwargs": ref_kwargs}, + label=func.__name__, + description="Reference", + ) + + func_measurement = func_timer.blocked_autorange() + ref_measurement = ref_timer.blocked_autorange() + + benchmark.Compare((func_measurement, ref_measurement)).print() + + rtol = payload.rtol + atol = payload.atol + + assert torch.allclose(output, expected, rtol=rtol, atol=atol) + + return True + + +def _get_parametrized_args(metafunc): + parametrized_args = set() + + for marker in metafunc.definition.iter_markers(name="parametrize"): + args = marker.args[0] + + if isinstance(args, str): + parametrized_args.update(x.strip() for x in args.split(",")) + elif isinstance(args, (list, tuple)): + parametrized_args.update(args) + + return parametrized_args + + +def _test_case_path_from_request(request): + return f"{_module_path_from_request(request)}::{request.node.name}" + + +def _module_path_from_request(request): + return f"{request.module.__name__.replace('.', '/')}.py" + + +def _hash(string): + return int(hashlib.sha256(string.encode("utf-8")).hexdigest(), 16) % 2**32 + + +def _clone(obj): + if isinstance(obj, torch.Tensor): + return clone_strided(obj) + + if isinstance(obj, tuple): + return tuple(_clone(a) for a in obj) + + if isinstance(obj, list): + return [_clone(a) for a in obj] + + if isinstance(obj, dict): + return {key: _clone(value) for key, value in obj.items()} + + return obj diff --git a/tests/test_add.py b/tests/test_add.py new file mode 100644 index 0000000..8b8166c --- /dev/null +++ b/tests/test_add.py @@ -0,0 +1,81 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, randint_strided, randn_strided + +_INT_DTYPES = (torch.int16, torch.int32, torch.int64) + +_UINT_DTYPES = tuple( + filter(None, (getattr(torch, f"uint{bits}", None) for bits in (16, 32, 64))) +) + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, other_strides, out_strides", + ( + ((13, 4), None, None, 
None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + ) + + tuple((dtype, 0, 0) for dtype in _INT_DTYPES + _UINT_DTYPES), +) +def test_add( + shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol +): + if device == "musa" and dtype in _UINT_DTYPES: + pytest.skip( + "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`." + ) + + if dtype in _INT_DTYPES or dtype in _UINT_DTYPES: + input = randint_strided( + 0, 100, shape, input_strides, dtype=dtype, device=device + ) + other = randint_strided( + 0, 100, shape, other_strides, dtype=dtype, device=device + ) + else: + input = randn_strided(shape, input_strides, dtype=dtype, device=device) + other = randn_strided(shape, other_strides, dtype=dtype, device=device) + + out = empty_strided(shape, out_strides, dtype=dtype, device=device) + + return Payload(_add, _torch_add, (input, other, out), {}, rtol=rtol, atol=atol) + + +def _add(input, other, out): + infini.ops.add(input, other, out) + + return out + + +def _torch_add(input, other, out): + if input.dtype in _UINT_DTYPES: + input = input.to(torch.int64) + + if other.dtype in _UINT_DTYPES: + other = other.to(torch.int64) + + res = torch.add(input, other) + out.copy_(res.to(out.dtype)) + + return out diff --git a/tests/test_causal_softmax.py b/tests/test_causal_softmax.py new file mode 100644 index 
0000000..8b35457 --- /dev/null +++ b/tests/test_causal_softmax.py @@ -0,0 +1,54 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, out_strides", + ( + ((3, 3), None, None), + ((3, 5), None, None), + ((32, 512), None, None), + ((32, 512), (1024, 1), (1024, 1)), + ((4, 20, 512), None, None), + ((4, 20, 512), (20480, 512, 1), None), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-5, 1e-5), + (torch.float16, 1e-2, 1e-2), + (torch.bfloat16, 1e-2, 1e-2), + ), +) +def test_causal_softmax(shape, input_strides, out_strides, dtype, device, rtol, atol): + input_tensor = randn_strided(shape, input_strides, dtype=dtype, device=device) + out = empty_strided(shape, out_strides, dtype=dtype, device=device) + + return Payload( + _causal_softmax, + _torch_causal_softmax, + (input_tensor, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _causal_softmax(input, out): + infini.ops.causal_softmax(input, out) + + return out + + +def _torch_causal_softmax(input, out): + mask = torch.tril(torch.ones_like(input), diagonal=-1).flip(dims=[-2, -1]) + masked = torch.where(mask == 1, -torch.inf, input.to(torch.float32)) + result = torch.nn.functional.softmax(masked, dim=-1, dtype=input.dtype) + out.copy_(result) + + return out diff --git a/tests/test_gemm.py b/tests/test_gemm.py new file mode 100644 index 0000000..136e991 --- /dev/null +++ b/tests/test_gemm.py @@ -0,0 +1,110 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "a_shape, b_shape, c_shape, a_strides, b_strides, c_strides", + ( + ((1, 2048), (2048, 2048), (1, 2048), None, None, None), + ((2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None), + ((1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)), + 
((6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)), + ((4, 48, 64), (4, 64, 6), (4, 48, 6), None, None, None), + ), +) +@pytest.mark.parametrize("alpha", (-1, -0.5, 0, 0.5, 1)) +@pytest.mark.parametrize("beta", (-1, -0.5, 0, 0.5, 1)) +@pytest.mark.parametrize("trans_a", (False, True)) +@pytest.mark.parametrize("trans_b", (False, True)) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-3, 1e-3), + (torch.float16, 1e-2, 1e-2), + (torch.bfloat16, 1e-2, 1e-2), + ), +) +def test_gemm( + a_shape, + b_shape, + c_shape, + a_strides, + b_strides, + c_strides, + alpha, + beta, + trans_a, + trans_b, + dtype, + device, + rtol, + atol, +): + # Skip transposing test cases for MLU platform as transposing is not currently supported. + if device == "mlu" and (trans_a or trans_b): + pytest.skip("transposing is not currently supported on MLU") + + # `cnnlBatchMatMulEx` does not accept `bfloat16` inputs on MLU. + if device == "mlu" and dtype == torch.bfloat16: + pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`") + + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) + b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) + + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + c = randn_strided(c_shape, c_strides, dtype=dtype, device=device) + + return Payload( + _gemm, + _torch_gemm, + (a, b, alpha, beta, trans_a, trans_b, c), + {}, + rtol=rtol, + atol=atol, + ) + + +def _gemm(a, b, alpha, beta, trans_a, trans_b, c): + infini.ops.gemm(a, b, alpha, beta, trans_a, trans_b, c) + + return c + + +def _torch_gemm(a, b, alpha=1.0, beta=1.0, trans_a=False, trans_b=False, c=None): + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + # PyTorch `baddbmm`/`addmm` ignores `beta` when `alpha=0.0`. + if alpha == 0: + c.mul_(beta) + + return c + + # Some backends (e.g. 
`torch_musa`) may reject `addmm`/`baddbmm(out=...)` + # for certain strided outputs. Fall back to `matmul` plus fused `alpha`/`beta` + # update to keep reference coverage. + try: + if a.ndim == 2: + return torch.addmm(c, a, b, beta=beta, alpha=alpha, out=c) + + return torch.baddbmm(c, a, b, beta=beta, alpha=alpha, out=c) + except RuntimeError: + # Fallback for backends that don't support `addmm`/`baddbmm` (e.g. CPU `float16`/`bfloat16`): + # compute in float32 and cast back. + c_original = c.float() + result = torch.matmul(a.float(), b.float()) + c.copy_((alpha * result + beta * c_original).to(c.dtype)) + + return c diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py new file mode 100644 index 0000000..d6d4dff --- /dev/null +++ b/tests/test_rms_norm.py @@ -0,0 +1,77 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "input_shape, weight_shape, input_strides, weight_strides, out_strides", + ( + ((1, 64), (64,), None, None, None), + ((2, 128), (128,), None, None, None), + ((4, 48, 64), (64,), None, None, None), + ((2, 4, 2048), (2048,), None, None, None), + ((1, 64), (64,), (64, 1), (1,), (64, 1)), + ((4, 48, 64), (64,), (3072, 64, 1), (1,), (3072, 64, 1)), + ), +) +@pytest.mark.parametrize("eps", (1e-6, 1e-5)) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-4, 1e-4), + (torch.float16, 1e-2, 1e-2), + (torch.bfloat16, 2e-2, 1e-2), + ), +) +def test_rms_norm( + input_shape, + weight_shape, + input_strides, + weight_strides, + out_strides, + eps, + dtype, + device, + rtol, + atol, +): + input = randn_strided(input_shape, input_strides, dtype=dtype, device=device) + weight = randn_strided(weight_shape, weight_strides, dtype=dtype, device=device) + out = empty_strided(input_shape, out_strides, dtype=dtype, device=device) + + return Payload( + _rms_norm, + _torch_rms_norm, + (input, weight), + {"eps": 
eps, "out": out}, + rtol=rtol, + atol=atol, + ) + + +def _rms_norm(input, weight, *, eps=1e-6, out=None): + infini.ops.rms_norm(input, weight, eps, out) + + return out + + +def _torch_rms_norm(input, weight, *, eps=1e-6, out=None): + # Fallback for `torch<2.3`: `rms_norm = (x / sqrt(mean(x^2) + eps)) * weight`. + def _fallback(input, _normalized_shape, weight, *, eps=1e-6): + rms = torch.sqrt(torch.mean(input * input, dim=-1, keepdim=True) + eps) + + return (input / rms) * weight + + rms_norm_fn = getattr(torch.nn.functional, "rms_norm", _fallback) + + result = rms_norm_fn(input, input.shape[-1:], weight=weight, eps=eps) + + if out is not None: + out.copy_(result) + else: + out = result + + return out diff --git a/tests/test_swiglu.py b/tests/test_swiglu.py new file mode 100644 index 0000000..89c95f7 --- /dev/null +++ b/tests/test_swiglu.py @@ -0,0 +1,49 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, rand_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, gate_strides, out_strides", + ( + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + ), +) +def test_swiglu( + shape, input_strides, gate_strides, out_strides, dtype, device, rtol, atol +): + input = rand_strided(shape, input_strides, dtype=dtype, device=device) + gate = rand_strided(shape, gate_strides, dtype=dtype, device=device) + out = empty_strided(shape, out_strides, dtype=dtype, device=device) + + return Payload(_swiglu, _torch_swiglu, (input, gate, 
out), {}, rtol=rtol, atol=atol) + + +def _swiglu(input, gate, out): + infini.ops.swiglu(input, gate, out) + + return out + + +def _torch_swiglu(input, gate, out): + swish_x = gate * torch.sigmoid(gate) + + return torch.mul(input, swish_x, out=out) diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..aa4ee42 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,88 @@ +import contextlib +import dataclasses +from collections.abc import Callable + +import torch + + +@dataclasses.dataclass +class Payload: + func: Callable + + ref: Callable + + args: tuple + + kwargs: dict + + rtol: float = 1e-5 + + atol: float = 1e-8 + + +def get_available_devices(): + devices = ["cpu"] + + if torch.cuda.is_available(): + devices.append("cuda") + + if hasattr(torch, "mlu") and torch.mlu.is_available(): + devices.append("mlu") + + if hasattr(torch, "musa") and torch.musa.is_available(): + devices.append("musa") + + return tuple(devices) + + +with contextlib.suppress(ImportError, ModuleNotFoundError): + import torch_mlu # noqa: F401 + + +def empty_strided(shape, strides, *, dtype=None, device=None): + if strides is None: + return torch.empty(shape, dtype=dtype, device=device) + + return torch.empty_strided(shape, strides, dtype=dtype, device=device) + + +def randn_strided(shape, strides, *, dtype=None, device=None): + output = empty_strided(shape, strides, dtype=dtype, device=device) + + output.as_strided( + (output.untyped_storage().size() // output.element_size(),), (1,) + ).normal_() + + return output + + +def rand_strided(shape, strides, *, dtype=None, device=None): + output = empty_strided(shape, strides, dtype=dtype, device=device) + + output.as_strided( + (output.untyped_storage().size() // output.element_size(),), (1,) + ).uniform_(0, 1) + + return output + + +def randint_strided(low, high, shape, strides, *, dtype=None, device=None): + output = empty_strided(shape, strides, dtype=dtype, device=device) + + output.as_strided( + (output.untyped_storage().size() 
// output.element_size(),), (1,) + ).random_(low, high) + + return output + + +def clone_strided(input): + output = empty_strided( + input.size(), input.stride(), dtype=input.dtype, device=input.device + ) + + as_strided_args = (output.untyped_storage().size() // output.element_size(),), (1,) + + output.as_strided(*as_strided_args).copy_(input.as_strided(*as_strided_args)) + + return output