From ea9fd844d848d6a43a8c41e91e44d5923f91e12e Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 23:22:47 +0000 Subject: [PATCH 01/11] feat: add IPFS CID serving via Kubo sidecar (PE-9067) Add opt-in IPFS content serving to AR.IO Gateway. Operators enable IPFS_ENABLED=true and start a Kubo Docker sidecar (--profile ipfs) to serve IPFS content via path-based (/ipfs/{CID}) and subdomain-based ({CID}.{gateway-host}) access patterns. Features: - Kubo HTTP gateway integration with connection + stall timeouts - LRU bounded filesystem cache (streams to disk, not memory) - File-based CID blocklist with hot-reload - Separate rate limiter pool for IPFS traffic - x402 payment protection (same as Arweave data endpoints) - HTTPSIG response signing for verifiable IPFS responses - CIDv0 to CIDv1 base32 redirect (DNS-safe, works with wildcard certs) - Cross-CID redirect for directory listing navigation - Path traversal protection - Docker Compose profile with TCP+UDP swarm ports Default off (IPFS_ENABLED=false). Zero runtime impact when disabled. Designed as foundation for Phase 2 ArNS-to-CID resolution. 
--- .dockerignore | 2 + CLAUDE.md | 61 +++- docker-compose.yaml | 29 ++ docs/INDEX.md | 6 + docs/envs.md | 23 ++ docs/ipfs-integration.md | 542 ++++++++++++++++++++++++++++++ package.json | 1 + src/app.ts | 21 ++ src/config.ts | 70 ++++ src/ipfs/ipfs-blocklist.ts | 115 +++++++ src/ipfs/ipfs-cache.ts | 235 +++++++++++++ src/ipfs/ipfs-cid.test.ts | 110 ++++++ src/ipfs/ipfs-rate-limiter.ts | 26 ++ src/ipfs/ipfs-service.ts | 201 +++++++++++ src/ipfs/kubo-data-source.test.ts | 105 ++++++ src/ipfs/kubo-data-source.ts | 199 +++++++++++ src/lib/httpsig.ts | 10 +- src/lib/ipfs-cid.ts | 60 ++++ src/metrics.ts | 38 +++ src/middleware/ipfs.ts | 96 ++++++ src/routes/ipfs.ts | 309 +++++++++++++++++ src/system.ts | 55 +++ test-ipfs.sh | 172 ++++++++++ 23 files changed, 2483 insertions(+), 3 deletions(-) create mode 100644 docs/ipfs-integration.md create mode 100644 src/ipfs/ipfs-blocklist.ts create mode 100644 src/ipfs/ipfs-cache.ts create mode 100644 src/ipfs/ipfs-cid.test.ts create mode 100644 src/ipfs/ipfs-rate-limiter.ts create mode 100644 src/ipfs/ipfs-service.ts create mode 100644 src/ipfs/kubo-data-source.test.ts create mode 100644 src/ipfs/kubo-data-source.ts create mode 100644 src/lib/ipfs-cid.ts create mode 100644 src/middleware/ipfs.ts create mode 100644 src/routes/ipfs.ts create mode 100755 test-ipfs.sh diff --git a/.dockerignore b/.dockerignore index 0c3c378ce..e8bd47ffe 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,3 +18,5 @@ node_modules/ # Test test/ coverage/ +.claude/ +logs/ diff --git a/CLAUDE.md b/CLAUDE.md index 804122f20..1625505dc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,12 +1,50 @@ # CLAUDE.md +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + AR.IO Node — Arweave gateway for accessing and indexing blockchain data, with caching, ANS-104 bundle unbundling, and multi-source data retrieval. 
+## Tech stack + +- Node.js v20 (see `.nvmrc`), TypeScript strict mode, ESM (`"type": "module"`) +- Test framework: **Node.js native `node:test`** (not Jest/Mocha/Vitest) +- Transpiler: SWC (via ts-node) +- Databases: SQLite (primary) + ClickHouse (analytics/GQL) +- Caching: Redis, LMDB, LRU in-memory +- HTTP: Express +- Observability: OpenTelemetry + Prometheus + Winston + +## Commands + +```bash +# Development +yarn start # Start service (requires .env file) +yarn watch # Start with nodemon (auto-restart on changes) +yarn build # Clean + compile TypeScript (prod) + +# Testing +yarn test # Run all unit tests +yarn test:file src/path/to/file.test.ts # Run a single test file +yarn test:e2e # Run end-to-end tests (in test/ directory) +yarn test:coverage # Run tests with coverage report + +# Linting & quality +yarn lint:check # ESLint check +yarn lint:fix # ESLint auto-fix +yarn duplicate:check # Detect code duplication (jscpd) +yarn deps:check # Detect circular dependencies (madge) + +# Database +yarn db:migrate # Run SQLite migrations +yarn db:dump-test-schemas # Regenerate test SQL schemas after migrations + +# Service management (systemd-based) +yarn service:start / stop / restart / status / logs +``` + ## Discovery points -- Commands — `package.json` scripts (dev, build, service, test, lint, - migrations, duplicate/deps checks) - Documentation index — `docs/INDEX.md` - Env vars — `docs/envs.md` (keep this and `docker-compose.yaml` in sync when adding or removing env vars) @@ -20,6 +58,8 @@ caching, ANS-104 bundle unbundling, and multi-source data retrieval. - `src/system.ts` is the central DI wiring — all services, workers, data sources, resolvers, and lifecycle cleanup handlers are constructed here. +- `src/config.ts` parses all environment variables and exports typed + constants — this is where new env vars are added. - `src/data/` uses composite sources with fallback chains (cache → S3 → AR.IO peers → trusted gateways → Arweave nodes). 
Retrieval order is configurable via `ON_DEMAND_RETRIEVAL_ORDER` and @@ -30,6 +70,11 @@ caching, ANS-104 bundle unbundling, and multi-source data retrieval. - Filters (`ANS104_UNBUNDLE_FILTER`, `ANS104_INDEX_FILTER`, `WEBHOOK_INDEX_FILTER`) share a composable JSON filter system — see `docs/filters.md`. +- Background workers (`src/workers/`) handle block importing, data importing, + bundle unbundling, verification, and webhooks. Controlled by `START_WRITERS`. +- IPFS serving (`src/ipfs/`) is opt-in via `IPFS_ENABLED`. Uses a Kubo sidecar + for content retrieval with its own cache, rate limiter, and blocklist. Routes + mount before ArNS in `app.ts`. See `docs/ipfs-integration.md`. - Responses include trust headers indicating verification status. ## Gotchas @@ -52,6 +97,18 @@ Always use `createTestLogger()` from `test/test-logger.ts` in test files — never `winston.createLogger({ silent: true })`. Test output is written to `logs/test.log` (overwritten each run), not the console. +### Test imports + +Tests use `node:test` and `node:assert`: + +```typescript +import { describe, it, before, after, mock } from 'node:test'; +import { strict as assert } from 'node:assert'; +``` + +Common test stubs are in `test/stubs.ts`, SQLite helpers in +`test/sqlite-helpers.ts`. 
+ ### Adding a database method Five coordinated edits are required: diff --git a/docker-compose.yaml b/docker-compose.yaml index 0e59621ed..9a7c15fb6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -124,6 +124,19 @@ services: - RATE_LIMITER_IP_REFILL_PER_SEC=${RATE_LIMITER_IP_REFILL_PER_SEC:-} - RATE_LIMITER_IPS_AND_CIDRS_ALLOWLIST=${RATE_LIMITER_IPS_AND_CIDRS_ALLOWLIST:-} - RATE_LIMITER_ARNS_ALLOWLIST=${RATE_LIMITER_ARNS_ALLOWLIST:-} + - IPFS_ENABLED=${IPFS_ENABLED:-false} + - IPFS_KUBO_URL=${IPFS_KUBO_URL:-http://kubo:8080} + - IPFS_KUBO_REQUEST_TIMEOUT_MS=${IPFS_KUBO_REQUEST_TIMEOUT_MS:-} + - IPFS_STREAM_STALL_TIMEOUT_MS=${IPFS_STREAM_STALL_TIMEOUT_MS:-} + - IPFS_CACHE_PATH=${IPFS_CACHE_PATH:-} + - IPFS_CACHE_MAX_SIZE_BYTES=${IPFS_CACHE_MAX_SIZE_BYTES:-} + - IPFS_CACHE_CLEANUP_THRESHOLD=${IPFS_CACHE_CLEANUP_THRESHOLD:-} + - IPFS_BLOCKLIST_PATH=${IPFS_BLOCKLIST_PATH:-} + - IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET:-} + - IPFS_RATE_LIMITER_IP_REFILL_PER_SEC=${IPFS_RATE_LIMITER_IP_REFILL_PER_SEC:-} + - IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET:-} + - IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC=${IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC:-} + - IPFS_MAX_RESPONSE_SIZE_BYTES=${IPFS_MAX_RESPONSE_SIZE_BYTES:-} - NODE_MAX_OLD_SPACE_SIZE=${NODE_MAX_OLD_SPACE_SIZE:-} - ENABLE_FS_HEADER_CACHE_CLEANUP=${ENABLE_FS_HEADER_CACHE_CLEANUP:-} - ON_DEMAND_RETRIEVAL_ORDER=${ON_DEMAND_RETRIEVAL_ORDER:-} @@ -578,6 +591,22 @@ services: networks: - ar-io-network + kubo: + image: ipfs/kubo:${KUBO_IMAGE_TAG:-v0.32.1} + profiles: + - ipfs + restart: unless-stopped + ports: + - '${IPFS_SWARM_PORT:-4001}:4001/tcp' + - '${IPFS_SWARM_PORT:-4001}:4001/udp' + environment: + - IPFS_PROFILE=${IPFS_PROFILE:-server} + volumes: + - ${IPFS_DATA_PATH:-./data/ipfs}:/data/ipfs + networks: + - ar-io-network + command: ['daemon', '--enable-gc'] + autoheal: image: 
willfarrell/autoheal@sha256:fd2c5500ab9210be9fa0d365162301eb0d16923f1d9a36de887f5d1751c6eb8c network_mode: none diff --git a/docs/INDEX.md b/docs/INDEX.md index b171f48d9..4bd5c90d7 100644 --- a/docs/INDEX.md +++ b/docs/INDEX.md @@ -23,6 +23,12 @@ Fast, offline lookups for data item to root transaction mappings. | [CDB64 Tools Reference](cdb64-tools.md) | CLI tools for creating indexes | | [CDB64 Format Specification](cdb64-format.md) | Technical file format details | +### IPFS Integration + +| Document | Description | +|----------|-------------| +| [IPFS Integration](ipfs-integration.md) | Architecture, deployment, and configuration for IPFS CID serving | + ### Rate Limiting & Payments | Document | Description | diff --git a/docs/envs.md b/docs/envs.md index 3fef85cf0..d4b7a15dd 100644 --- a/docs/envs.md +++ b/docs/envs.md @@ -357,3 +357,26 @@ ingestion may be partial) for SQLite. | CLICKHOUSE_SQLITE_MIN_HEIGHT_ENABLED | Boolean | false | When true, restrict the SQLite fallback to heights above (ClickHouse max height - buffer) | | CLICKHOUSE_SQLITE_MIN_HEIGHT_BUFFER | Number | 10 | Heights reserved for SQLite near the ClickHouse tip, to guard against partially ingested recent blocks | | CLICKHOUSE_MAX_HEIGHT_CACHE_TTL_SECONDS | Number | 60 | TTL for the cached ClickHouse max-height lookup used by the boundary optimization | + +## IPFS + +When enabled, the gateway can serve IPFS content via `/ipfs/{CID}` path routes +and `{CID}.{root_host}` subdomain routes (same level as ArNS, works with +standard `*.{host}` wildcard TLS certs). Requires a Kubo IPFS node (available +as a Docker Compose sidecar via the `ipfs` profile). 
+ +| ENV_NAME | TYPE | DEFAULT_VALUE | DESCRIPTION | +| ----------------------------------------- | ------- | ------------------- | ------------------------------------------------------------------- | +| IPFS_ENABLED | Boolean | false | Enable IPFS content serving | +| IPFS_KUBO_URL | String | http://kubo:8080 | Kubo HTTP gateway URL | +| IPFS_KUBO_REQUEST_TIMEOUT_MS | Number | 30000 | Connection timeout for Kubo requests (ms) | +| IPFS_STREAM_STALL_TIMEOUT_MS | Number | 30000 | Stall timeout — max time with no data before aborting stream (ms) | +| IPFS_CACHE_PATH | String | data/ipfs-cache | Directory for cached IPFS content | +| IPFS_CACHE_MAX_SIZE_BYTES | Number | 10737418240 (10 GB) | Maximum cache size before LRU eviction | +| IPFS_CACHE_CLEANUP_THRESHOLD | Number | 3600 | Interval in seconds between cache eviction scans | +| IPFS_BLOCKLIST_PATH | String | data/ipfs-blocklist.txt | Path to CID blocklist file (one CID per line, hot-reloaded) | +| IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET | Number | 50000 | IPFS rate limiter: max tokens per IP bucket | +| IPFS_RATE_LIMITER_IP_REFILL_PER_SEC | Number | 5 | IPFS rate limiter: token refill rate per second (IP bucket) | +| IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET | Number | 200000 | IPFS rate limiter: max tokens per resource bucket | +| IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC | Number | 20 | IPFS rate limiter: token refill rate per second (resource bucket) | +| IPFS_MAX_RESPONSE_SIZE_BYTES | Number | 1073741824 (1 GB) | Maximum IPFS content size the gateway will serve | diff --git a/docs/ipfs-integration.md b/docs/ipfs-integration.md new file mode 100644 index 000000000..93d3cd9bb --- /dev/null +++ b/docs/ipfs-integration.md @@ -0,0 +1,542 @@ +# IPFS Integration + +This document describes the AR.IO Gateway's IPFS CID serving capability, +covering architecture, request flow, configuration, and deployment. 
+ +## Table of Contents + +- [Overview](#overview) +- [Architecture](#architecture) +- [URL Patterns](#url-patterns) +- [Components](#components) +- [Data Flow](#data-flow) +- [Caching Strategy](#caching-strategy) +- [Security and Moderation](#security-and-moderation) +- [Docker Deployment](#docker-deployment) +- [Configuration Reference](#configuration-reference) +- [Phase 2: ArNS to IPFS Resolution](#phase-2-arns-to-ipfs-resolution) +- [Differences from Arweave Data Serving](#differences-from-arweave-data-serving) +- [Observability](#observability) + +## Overview + +### What This Feature Does + +The IPFS integration allows an AR.IO Gateway to serve IPFS content alongside +Arweave data. A local Kubo IPFS node runs as a Docker sidecar, and the gateway +proxies, caches, and moderates requests to it. This gives gateway operators a +single endpoint for both permanent Arweave data and IPFS-addressed content. + +### Why It Exists + +The Arweave Name System (ArNS) maps human-readable names to content addresses. +Today those addresses are Arweave transaction IDs. Adding IPFS CID support +means ArNS names can also point to IPFS content, bridging the two largest +decentralized storage networks under one naming layer without requiring changes +to the ArNS smart contract. Users get a single URL (e.g., +`my-app.arweave.dev/`) regardless of whether the underlying data lives on +Arweave or IPFS. + +### Phased Approach + +**Phase 1 -- Direct CID Access (current):** Users access IPFS content by +placing a CID in the URL path or subdomain. The gateway validates, rate-limits, +and caches the request, then proxies it to the local Kubo node. No ArNS +resolution is involved. + +**Phase 2 -- ArNS to CID Resolution (future):** ANT (Arweave Name Token) +records will be able to store an IPFS CID in their `transactionId` field. 
When +the gateway resolves an ArNS name and detects that the resolved ID is a CID +rather than an Arweave transaction ID, it routes the request to the IPFS service +instead of the Arweave data pipeline. This requires no contract changes -- CID +detection happens at the gateway level. + +## Architecture + +### Request Flow Diagram + +``` + +-----------+ + | Client | + +-----+-----+ + | + path: /ipfs/{CID} -or- subdomain: {CID}.ipfs.gateway.io + | + v + +----------+-----------+ + | Express Router / | + | IPFS Middleware | + +----------+-----------+ + | + 1. CID validation + | + v + +----------+-----------+ + | IPFS Blocklist | + | (451 if blocked) | + +----------+-----------+ + | + v + +----------+-----------+ + | IPFS Rate Limiter | + | (429 if exceeded) | + +----------+-----------+ + | + v + +----------+-----------+ + | IPFS Cache | + | (LRU filesystem) | + +----+----------+------+ + | | + cache hit cache miss + | | + v v + +--------+ +--+------------------+ + | Serve | | Kubo Data Source | + | from | | (HTTP fetch with | + | cache | | timeouts) | + +--------+ +--+------------------+ + | + tee stream to + cache + response + | + v + +-----+------+ + | Response | + | (headers, | + | stream) | + +------------+ +``` + +### Component Relationships + +``` +src/system.ts + | + +-- ipfs-service ----+-- kubo-data-source --> Kubo HTTP Gateway (sidecar) + | +-- ipfs-cache --> data/ipfs-cache/ + | +-- ipfs-blocklist --> data/ipfs-blocklist.txt + | +-- ipfs-rate-limiter --> token bucket (memory or Redis) + | + +-- middleware/ipfs (subdomain interception) + +-- routes/ipfs (path-based handlers) +``` + +## URL Patterns + +### Path-Based Access + +| Pattern | Example | Description | +|---------|---------|-------------| +| `/ipfs/{CID}` | `/ipfs/QmYwAPJzv5CZ...` | Fetch a single file or directory root | +| `/ipfs/{CID}/{path}` | `/ipfs/bafybeig.../images/logo.png` | Fetch a file within a UnixFS directory | + +### Subdomain-Based Access + +| Pattern | Example | Description | 
+|---------|---------|-------------| +| `{CID}.{root_host}` | `bafybeig...arweave.dev` | CIDv1 (base32) in subdomain | +| `{CID}.{root_host}/{path}` | `bafybeig...arweave.dev/index.html` | Subdomain with path | + +Subdomain-based access uses the `ARNS_ROOT_HOST` configuration. A leftmost hostname label that parses as a valid CID distinguishes IPFS requests from ArNS name resolution, preventing collisions. For example, `my-app.arweave.dev` resolves as an ArNS +name, while `bafybeig...arweave.dev` resolves as an IPFS CID. + +### CIDv0 to CIDv1 Redirect + +CIDv0 identifiers (base58, starting with `Qm`) cannot be used in subdomains +because they are case-sensitive and DNS is case-insensitive. When a CIDv0 is +detected in a subdomain request, the gateway issues a 301 redirect to the +equivalent CIDv1 (base32) subdomain URL. + +For path-based requests, both CIDv0 and CIDv1 are accepted directly without +redirection. + +## Components + +### `src/lib/ipfs-cid.ts` -- CID Parsing and Conversion + +Utility module for working with IPFS Content Identifiers: + +- **CID validation**: Determines whether a string is a valid CIDv0 or CIDv1. +- **CID conversion**: Converts CIDv0 (base58btc) to CIDv1 (base32) for + subdomain compatibility. +- **CID normalization**: Produces a canonical form used as cache keys and + blocklist entries. + +### `src/ipfs/kubo-data-source.ts` -- Kubo HTTP Client + +Fetches content from the local Kubo IPFS gateway over HTTP: + +- **Connection timeout** (`IPFS_KUBO_REQUEST_TIMEOUT_MS`): Maximum time to + receive response headers from Kubo. Covers DNS, TCP, and TLS handshake plus + time-to-first-byte. +- **Stall timeout** (`IPFS_STREAM_STALL_TIMEOUT_MS`): Maximum idle time during + body streaming. The timer resets on each received chunk, so large but + actively-streaming transfers complete without issue. If no data arrives for + this duration, the stream is aborted. +- Returns a readable stream along with response metadata (content type, content + length). 
+ +### `src/ipfs/ipfs-cache.ts` -- Bounded LRU Filesystem Cache + +A filesystem-backed cache separate from the Arweave contiguous data cache: + +- **Cache directory**: Configurable via `IPFS_CACHE_PATH` (default: + `data/ipfs-cache`). +- **Size limit**: Bounded by `IPFS_CACHE_MAX_SIZE_BYTES` (default: 10 GB). When + the limit is exceeded, the least recently used entries are evicted. +- **Cache key**: SHA-256 hash of the normalized CID concatenated with the + request path. This ensures consistent keys regardless of CID encoding. +- **Metadata**: Each cached entry has a companion `.meta` file storing content + type, content length, and the original CID. Metadata is read on cache hits to + set response headers without re-parsing the content. +- **Cleanup threshold**: `IPFS_CACHE_CLEANUP_THRESHOLD` controls how often (in + seconds) eviction scans run. + +### `src/ipfs/ipfs-blocklist.ts` -- CID Blocklist + +A file-based blocklist for content moderation: + +- **Format**: Plain text file, one CID per line. Lines starting with `#` are + treated as comments. Both CIDv0 and CIDv1 forms are normalized before + matching. +- **Hot-reload**: The blocklist file is watched for changes using filesystem + notifications. Additions and removals take effect without restarting the + gateway. +- **Response**: Blocked CIDs return HTTP 451 (Unavailable For Legal Reasons). + +### `src/ipfs/ipfs-rate-limiter.ts` -- Rate Limiter + +A dedicated token bucket rate limiter for IPFS traffic, separate from the +Arweave rate limiter: + +- **Per-IP bucket**: Controls how much a single client can fetch + (`IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET`, + `IPFS_RATE_LIMITER_IP_REFILL_PER_SEC`). +- **Per-resource bucket**: Controls how much a single CID can be served + globally (`IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET`, + `IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC`). +- Tokens represent bytes (1 token = 1 byte). 
Bucket sizes and refill rates are + configurable independently from the Arweave rate limiter. +- When limits are exceeded, the gateway returns HTTP 429 (Too Many Requests). + +### `src/ipfs/ipfs-service.ts` -- Service Orchestrator + +Coordinates all IPFS components behind a single interface: + +- Accepts a CID and optional path, runs it through the blocklist, rate limiter, + and cache, and falls back to the Kubo data source on cache miss. +- Handles stream teeing: on a cache miss, the Kubo response stream is split so + one branch writes to cache while the other streams to the client. +- Enforces `IPFS_MAX_RESPONSE_SIZE_BYTES` to prevent serving excessively large + files. +- Registered in `src/system.ts` alongside other data services, guarded by the + `IPFS_ENABLED` flag. + +### `src/middleware/ipfs.ts` -- Subdomain Middleware + +Express middleware that intercepts requests based on the `Host` header: + +- Parses the hostname to detect the `{CID}.{root_host}` pattern. +- Extracts the CID and any path from the URL. +- Performs CIDv0 to CIDv1 redirect when necessary. +- Forwards the extracted CID and path to the IPFS route handler. +- Must run before the ArNS subdomain middleware to prevent the CID from being + misinterpreted as an ArNS name. + +### `src/routes/ipfs.ts` -- Route Handlers + +Express route handlers for path-based IPFS access: + +- `GET /ipfs/:cid` and `GET /ipfs/:cid/*` routes. +- Validates the CID parameter, delegates to the IPFS service, and streams the + response with appropriate headers. +- Sets `Cache-Control`, `ETag`, and `X-Ipfs-Path` headers on successful + responses. + +## Data Flow + +The complete lifecycle of an IPFS request: + +1. **Request arrives.** Either path-based (`/ipfs/{CID}/path`) or + subdomain-based (`{CID}.ipfs.gateway.io/path`). Subdomain requests are + detected by the IPFS middleware and rewritten internally to match the + path-based route. + +2. **CID validation.** The CID string is parsed. 
If it is not a valid CIDv0 or + CIDv1, the request returns 400 (Bad Request). If it is a CIDv0 in a + subdomain context, a 301 redirect is issued to the CIDv1 equivalent. + +3. **Blocklist check.** The normalized CID is checked against the in-memory + blocklist. If matched, the request returns 451 (Unavailable For Legal + Reasons) immediately. + +4. **Rate limit check.** Both the per-IP and per-resource buckets are checked. + If either bucket is exhausted, the request returns 429 (Too Many Requests) + with a `Retry-After` header. + +5. **Cache lookup.** The cache key (SHA-256 of normalized CID + path) is looked + up in the filesystem cache. On a hit, the cached file and its `.meta` + companion are read and streamed to the client. + +6. **Kubo fetch.** On a cache miss, the service makes an HTTP GET to the local + Kubo gateway (`IPFS_KUBO_URL/ipfs/{CID}/{path}`). The response stream is + tee'd: one branch writes to the cache directory, the other streams directly + to the client. Both the connection timeout and the stall timeout apply during + this phase. + +7. **Response.** The following headers are set on successful responses: + + | Header | Value | Purpose | + |--------|-------|---------| + | `Cache-Control` | `public, max-age=31536000, immutable` | CID content never changes | + | `ETag` | `"{CID}"` | Content-addressed deduplication | + | `X-Ipfs-Path` | `/ipfs/{CID}/{path}` | IPFS ecosystem interop | + | `Content-Type` | Detected by Kubo or from `.meta` | Standard MIME typing | + +## Caching Strategy + +IPFS content is **content-addressed**: a given CID always maps to the same +bytes. This makes cached entries permanently valid -- there is no stale data and +no revalidation needed. 
+ +### Cache Properties + +| Property | Details | +|----------|---------| +| **Location** | `IPFS_CACHE_PATH` (default `data/ipfs-cache`), separate from Arweave data | +| **Key** | SHA-256 hash of normalized CID concatenated with request path | +| **Eviction** | LRU (least recently used) when total size exceeds `IPFS_CACHE_MAX_SIZE_BYTES` | +| **Max size** | `IPFS_CACHE_MAX_SIZE_BYTES` (default 10 GB) | +| **Metadata** | Companion `.meta` JSON files store content type, size, and original CID | +| **Cleanup** | Eviction scans run every `IPFS_CACHE_CLEANUP_THRESHOLD` seconds (default 3600) | +| **Permanence** | No TTL-based expiration; entries are valid forever unless evicted for space | + +### Why a Separate Cache + +The Arweave contiguous data cache is archival in nature -- operators configure +it to retain as much data as possible, potentially without eviction. IPFS +content has different retention characteristics: it requires pinning to persist +on the IPFS network, and gateway operators may not want unbounded IPFS storage. +A separate bounded LRU cache gives operators independent control over Arweave +and IPFS storage budgets. + +## Security and Moderation + +### CID Blocklist + +The blocklist file (`IPFS_BLOCKLIST_PATH`, default `data/ipfs-blocklist.txt`) +allows operators to block specific content: + +``` +# Blocked content - one CID per line +QmBlockedContent1... +bafybeiblockedcontent2... +# Comments start with # +``` + +- CIDs are normalized before matching, so blocking a CIDv0 also blocks its + CIDv1 equivalent and vice versa. +- The file is watched for changes and reloaded automatically. +- Blocked requests return HTTP 451. + +### Rate Limiting + +IPFS rate limiting uses a separate token pool from the Arweave rate limiter. +This prevents IPFS traffic from consuming Arweave rate limit capacity and gives +operators independent tuning for each protocol. + +- **Per-IP limits** prevent a single client from monopolizing bandwidth. 
+- **Per-resource limits** prevent a single popular CID from consuming all + available throughput. +- Token counts represent bytes of data served. + +### Size Limits + +`IPFS_MAX_RESPONSE_SIZE_BYTES` (default 1 GB) caps the maximum size of a single +IPFS response. Requests for content exceeding this limit are rejected. This +protects the gateway from serving unexpectedly large files that could exhaust +memory or disk. + +### Subdomain Isolation + +Subdomain-based access (`{CID}.ipfs.gateway.io`) provides browser origin +isolation. Each CID gets its own origin, preventing cross-content scripting +attacks. This follows the same security model used by IPFS gateways and the +existing ArNS subdomain sandboxing in the AR.IO Gateway. + +## Docker Deployment + +IPFS runs as an opt-in Docker Compose profile. Enabling it starts a Kubo sidecar +alongside the core gateway services. + +### Docker Compose Profile + +The `ipfs` profile adds a Kubo container: + +```yaml +services: + kubo: + image: ipfs/kubo:latest + profiles: + - ipfs + ports: + - "4001:4001" # Swarm (libp2p) - public, for peer connections + expose: + - "5001" # API - internal only, not exposed to host + - "8080" # Gateway - internal only, used by ar-io-node + volumes: + - ipfs-data:/data/ipfs + environment: + - IPFS_PROFILE=server + command: ["daemon", "--enable-gc"] +``` + +### Port Reference + +| Port | Protocol | Exposure | Purpose | +|------|----------|----------|---------| +| 4001 | TCP/UDP | Public | libp2p swarm -- peer discovery and content exchange | +| 5001 | HTTP | Internal | Kubo API -- used for pinning and node management | +| 8080 | HTTP | Internal | Kubo HTTP Gateway -- used by `kubo-data-source` to fetch content | + +### Volume + +The `ipfs-data` volume persists the Kubo datastore (block storage, peer +identity, configuration). This volume is independent of the gateway's `data/` +directory. 
+ +### Enabling IPFS + +Start the gateway with the IPFS profile: + +```bash +docker compose --profile ipfs up -d +``` + +Or set `IPFS_ENABLED=true` in your `.env` and include `ipfs` in your active +profiles. + +### Garbage Collection + +The Kubo container starts with `--enable-gc`, which periodically removes +unpinned blocks from the local IPFS datastore. This prevents unbounded growth +of the Kubo volume. Content actively being served is protected from GC. + +## Configuration Reference + +All environment variables are opt-in. The feature is disabled by default. + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `IPFS_ENABLED` | Boolean | `false` | Master switch for IPFS support. When false, IPFS routes are not registered and the Kubo sidecar is not required. | +| `IPFS_KUBO_URL` | String | `http://kubo:8080` | Base URL of the local Kubo HTTP Gateway. In Docker, this is the container name. For local development, use `http://localhost:8080`. | +| `IPFS_KUBO_REQUEST_TIMEOUT_MS` | Number | `30000` | Connection timeout in milliseconds for Kubo requests (time to receive response headers). | +| `IPFS_STREAM_STALL_TIMEOUT_MS` | Number | `30000` | Stall timeout in milliseconds for streaming responses from Kubo. Stream is aborted if no data is received for this duration. Actively-streaming transfers are not affected. | +| `IPFS_CACHE_PATH` | String | `data/ipfs-cache` | Directory for the IPFS filesystem cache. Relative paths are resolved from the gateway's working directory. | +| `IPFS_CACHE_MAX_SIZE_BYTES` | Number | `10737418240` (10 GB) | Maximum total size of the IPFS cache directory. LRU eviction begins when this limit is exceeded. | +| `IPFS_CACHE_CLEANUP_THRESHOLD` | Number | `3600` | Interval in seconds between cache eviction scans. | +| `IPFS_BLOCKLIST_PATH` | String | `data/ipfs-blocklist.txt` | Path to the CID blocklist file. The file is watched for changes and reloaded automatically. 
| +| `IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET` | Number | `50000` | Maximum tokens (bytes) per IP bucket. | +| `IPFS_RATE_LIMITER_IP_REFILL_PER_SEC` | Number | `5` | Tokens added to each IP bucket per second. | +| `IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET` | Number | `200000` | Maximum tokens (bytes) per resource (CID) bucket. | +| `IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC` | Number | `20` | Tokens added to each resource bucket per second. | +| `IPFS_MAX_RESPONSE_SIZE_BYTES` | Number | `1073741824` (1 GB) | Maximum response size for a single IPFS request. Requests exceeding this are rejected. | + +## Phase 2: ArNS to IPFS Resolution + +Phase 2 connects ArNS naming to IPFS content, allowing `my-dapp.arweave.dev` to +serve IPFS-hosted content without the user needing to know the CID. + +### How It Works + +1. **ANT record stores a CID.** The Arweave Name Token contract's + `transactionId` field accepts any string. An ANT owner sets it to an IPFS CID + (e.g., `bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi`) + instead of an Arweave transaction ID. + +2. **Gateway resolves the ArNS name.** The standard ArNS resolution pipeline + fetches the ANT record and extracts the `transactionId`. + +3. **CID detection.** The gateway inspects the resolved ID. If it matches CID + format (multibase-prefixed, valid multicodec), it is classified as an IPFS + CID rather than an Arweave transaction ID. + +4. **Route to IPFS service.** Instead of fetching from the Arweave data pipeline + (cache, S3, peers, chunks), the gateway routes the request to the IPFS + service, which follows the same blocklist, rate limit, cache, and Kubo fetch + pipeline described above. + +### Key Design Decisions + +- **No contract changes.** CID detection happens entirely at the gateway level. + The ANT contract's `transactionId` field is a free-form string, so it already + accepts CIDs. 
+- **Transparent to users.** A user visiting `my-dapp.arweave.dev` does not need + to know whether the content is on Arweave or IPFS. The URL is the same either + way. +- **Owner-controlled.** The ANT owner decides where content lives by setting + the `transactionId` to either an Arweave TX ID or an IPFS CID. Switching + between storage backends is a single contract interaction. +- **Caching and moderation apply.** All Phase 1 protections (blocklist, rate + limits, cache) apply to ArNS-resolved IPFS content. + +## Differences from Arweave Data Serving + +| Aspect | Arweave | IPFS | +|--------|---------|------| +| **Addressing** | Transaction ID (43-char base64url) | CID (variable length, base32 or base58) | +| **Permanence** | Guaranteed by protocol (incentivized storage) | Requires pinning; content disappears if unpinned | +| **Path resolution** | Manifest JSON parsed by the gateway | UnixFS directories resolved by Kubo | +| **Verification** | Merkle proofs verified by the gateway | Block hashes verified internally by Kubo | +| **Caching** | Archival (operators retain data long-term, often without eviction) | LRU with bounded size (eviction when full) | +| **Cache-Control** | Varies by verification status and data source trust | `immutable` with 1-year max-age (content-addressed = never changes) | +| **Rate limiting** | Shared Arweave token bucket | Separate IPFS token bucket | +| **Data source** | Multi-source fallback chain (cache, S3, peers, gateways, Arweave nodes) | Single source: local Kubo node | +| **Upstream network** | Arweave protocol (block weave, mining incentives) | IPFS/libp2p (DHT, Bitswap) | +| **Moderation** | Transaction-level blocklist | CID-level blocklist (with CIDv0/v1 normalization) | + +## Observability + +### Prometheus Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `ipfs_requests_total` | Counter | `status`, `method` | Total IPFS requests by HTTP status and method | +| 
`ipfs_cache_hit_total` | Counter | -- | Cache hits | +| `ipfs_cache_miss_total` | Counter | -- | Cache misses | +| `ipfs_content_size_bytes` | Histogram | -- | Distribution of served content sizes | +| `ipfs_request_duration_seconds` | Histogram | `source` (`cache`, `kubo`) | Request latency by data source | +| `ipfs_blocked_total` | Counter | -- | Requests blocked by the CID blocklist | + +### Structured Logging + +Each IPFS component creates a Winston child logger with a component-specific +label (e.g., `ipfs-service`, `kubo-data-source`, `ipfs-cache`). Log output +follows the gateway's standard JSONL format and is written to the same log +destination as all other gateway logs. Key log events: + +- `ipfs-service`: Request start, cache hit/miss, Kubo fetch start/complete, + errors. +- `kubo-data-source`: HTTP request details, timeout events, stream stalls. +- `ipfs-cache`: Eviction events, write errors, cleanup scan results. +- `ipfs-blocklist`: File reload events, blocked CID matches. + +### OpenTelemetry Tracing + +IPFS requests generate OpenTelemetry spans that are written to +`logs/otel-spans.jsonl` alongside Arweave request spans. The span hierarchy: + +``` +ipfs.request (root span) + +-- ipfs.blocklist.check + +-- ipfs.ratelimit.check + +-- ipfs.cache.lookup + +-- ipfs.kubo.fetch (only on cache miss) + +-- ipfs.cache.write (only on cache miss) +``` + +Spans include attributes for the CID, path, cache hit/miss status, response +size, and Kubo fetch duration. 
diff --git a/package.json b/package.json index c48f1424c..91300f497 100644 --- a/package.json +++ b/package.json @@ -58,6 +58,7 @@ "memoizee": "^0.4.17", "middleware-async": "^1.4.0", "msgpackr": "^1.11.5", + "multiformats": "^13.1.0", "node-cache": "^5.1.2", "opossum": "^8.4.0", "opossum-prometheus": "^0.4.0", diff --git a/src/app.ts b/src/app.ts index a8d1187e7..6442a690f 100644 --- a/src/app.ts +++ b/src/app.ts @@ -25,6 +25,8 @@ import { datasetsRouter } from './routes/datasets.js'; import * as system from './system.js'; import { createX402Router } from './routes/x402.js'; import { createRateLimitRouter } from './routes/rate-limit.js'; +import { createIpfsSubdomainMiddleware } from './middleware/ipfs.js'; +import { createIpfsRouter, createIpfsHandler } from './routes/ipfs.js'; // Initialize DNS resolution for preferred chunk GET nodes (non-fatal on failure) try { @@ -136,6 +138,25 @@ if (system.rateLimiter !== undefined) { }), ); } +// IPFS routes — must be before ArNS to intercept {CID}.{host} subdomains +if (config.IPFS_ENABLED && system.ipfsService !== undefined) { + const ipfsHandler = createIpfsHandler({ + log, + ipfsService: system.ipfsService, + rateLimiter: system.ipfsRateLimiter, + paymentProcessor: system.paymentProcessor, + }); + app.use(createIpfsSubdomainMiddleware({ ipfsHandler })); + app.use( + createIpfsRouter({ + log, + ipfsService: system.ipfsService, + rateLimiter: system.ipfsRateLimiter, + paymentProcessor: system.paymentProcessor, + }), + ); +} + app.use(arnsRouter); app.use(openApiRouter); app.use(arIoRouter); diff --git a/src/config.ts b/src/config.ts index ef648587d..72f70c80a 100644 --- a/src/config.ts +++ b/src/config.ts @@ -2143,3 +2143,73 @@ if (ENABLE_SAMPLING_DATA_SOURCE) { ); } } + +// +// IPFS +// + +export const IPFS_ENABLED = + env.varOrDefault('IPFS_ENABLED', 'false') === 'true'; + +export const IPFS_KUBO_URL = env.varOrDefault( + 'IPFS_KUBO_URL', + 'http://kubo:8080', +); + +export const IPFS_KUBO_REQUEST_TIMEOUT_MS = 
+env.varOrDefault( + 'IPFS_KUBO_REQUEST_TIMEOUT_MS', + '30000', +); + +export const IPFS_STREAM_STALL_TIMEOUT_MS = +env.varOrDefault( + 'IPFS_STREAM_STALL_TIMEOUT_MS', + '30000', +); + +export const IPFS_CACHE_PATH = env.varOrDefault( + 'IPFS_CACHE_PATH', + 'data/ipfs-cache', +); + +export const IPFS_CACHE_MAX_SIZE_BYTES = +env.varOrDefault( + 'IPFS_CACHE_MAX_SIZE_BYTES', + `${10 * 1024 * 1024 * 1024}`, // 10 GB +); + +// Reserved for future cache cleanup worker. Currently unused — LRU eviction +// in the in-memory index handles cache bounding. After restarts, disk usage +// may temporarily exceed IPFS_CACHE_MAX_SIZE_BYTES until the index rebuilds. +export const IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS = +env.varOrDefault( + 'IPFS_CACHE_CLEANUP_THRESHOLD', + '3600', +); + +export const IPFS_BLOCKLIST_PATH = env.varOrDefault( + 'IPFS_BLOCKLIST_PATH', + 'data/ipfs-blocklist.txt', +); + +export const IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET = +env.varOrDefault( + 'IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET', + '50000', +); + +export const IPFS_RATE_LIMITER_IP_REFILL_PER_SEC = +env.varOrDefault( + 'IPFS_RATE_LIMITER_IP_REFILL_PER_SEC', + '5', +); + +export const IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET = +env.varOrDefault( + 'IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET', + '200000', +); + +export const IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC = +env.varOrDefault( + 'IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC', + '20', +); + +export const IPFS_MAX_RESPONSE_SIZE_BYTES = +env.varOrDefault( + 'IPFS_MAX_RESPONSE_SIZE_BYTES', + `${1 * 1024 * 1024 * 1024}`, // 1 GB +); diff --git a/src/ipfs/ipfs-blocklist.ts b/src/ipfs/ipfs-blocklist.ts new file mode 100644 index 000000000..f20ba355b --- /dev/null +++ b/src/ipfs/ipfs-blocklist.ts @@ -0,0 +1,115 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import fs from 'node:fs'; +import winston from 'winston'; +import { watch, FSWatcher } from 'chokidar'; + +import { cidToV1Base32, isValidCid } from '../lib/ipfs-cid.js'; + +export class IpfsBlocklist { + private log: winston.Logger; + private filePath: string; + private blockedCids: Set = new Set(); + private watcher: FSWatcher | null = null; + private reloadTimer: NodeJS.Timeout | null = null; + + constructor({ log, filePath }: { log: winston.Logger; filePath: string }) { + this.log = log.child({ class: this.constructor.name }); + this.filePath = filePath; + } + + async load(): Promise { + try { + const content = await fs.promises.readFile(this.filePath, 'utf-8'); + const newSet = new Set(); + + for (const line of content.split('\n')) { + const trimmed = line.trim(); + if (trimmed === '' || trimmed.startsWith('#')) continue; + + if (isValidCid(trimmed)) { + // Normalize to CIDv1 base32 for consistent matching + try { + newSet.add(cidToV1Base32(trimmed)); + } catch { + this.log.warn('Failed to normalize CID in blocklist', { + cid: trimmed, + }); + } + } else { + this.log.warn('Invalid CID in blocklist, skipping', { + line: trimmed, + }); + } + } + + this.blockedCids = newSet; + this.log.info('IPFS blocklist loaded', { count: newSet.size }); + } catch (error: any) { + if (error.code === 'ENOENT') { + this.log.debug('IPFS blocklist file not found, no CIDs blocked', { + filePath: this.filePath, + }); + this.blockedCids = new Set(); + } else { + this.log.error('Failed to load IPFS blocklist', { + message: error.message, + }); + } + } + } + + isBlocked(cidString: string): boolean { + try { + const normalized = cidToV1Base32(cidString); + return this.blockedCids.has(normalized); + } catch { + return false; + } + } + + startWatching(): void { + this.watcher = watch(this.filePath, { + ignoreInitial: true, + awaitWriteFinish: { stabilityThreshold: 1000 }, + }); + + this.watcher.on('change', () => { + 
this.log.info('IPFS blocklist file changed, reloading'); + this.scheduleReload(); + }); + + this.watcher.on('add', () => { + this.log.info('IPFS blocklist file created, loading'); + this.scheduleReload(); + }); + } + + private scheduleReload(): void { + if (this.reloadTimer) { + clearTimeout(this.reloadTimer); + } + this.reloadTimer = setTimeout(() => { + this.load().catch((error) => { + this.log.error('Failed to reload IPFS blocklist', { + message: error.message, + }); + }); + }, 1000); + } + + stop(): void { + if (this.watcher) { + this.watcher.close(); + this.watcher = null; + } + if (this.reloadTimer) { + clearTimeout(this.reloadTimer); + this.reloadTimer = null; + } + } +} diff --git a/src/ipfs/ipfs-cache.ts b/src/ipfs/ipfs-cache.ts new file mode 100644 index 000000000..ccf6be052 --- /dev/null +++ b/src/ipfs/ipfs-cache.ts @@ -0,0 +1,235 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import crypto from 'node:crypto'; +import fs from 'node:fs'; +import { Readable } from 'node:stream'; +import { pipeline } from 'node:stream/promises'; +import winston from 'winston'; +import { LRUCache } from 'lru-cache'; + +interface CacheEntry { + size: number; + contentType: string; +} + +export class IpfsFsCache { + private log: winston.Logger; + private baseDir: string; + private index: LRUCache; + + constructor({ + log, + basePath, + maxSizeBytes, + }: { + log: winston.Logger; + basePath: string; + maxSizeBytes: number; + }) { + this.log = log.child({ class: this.constructor.name }); + this.baseDir = basePath; + this.index = new LRUCache({ + maxSize: maxSizeBytes, + sizeCalculation: (entry) => entry.size, + dispose: (_entry, key) => { + // Delete file from disk when evicted from LRU + const dataPath = this.dataPath(key); + const metaPath = this.metaPath(key); + fs.promises.unlink(dataPath).catch(() => {}); + fs.promises.unlink(metaPath).catch(() => {}); + 
this.log.debug('Evicted IPFS cache entry', { key }); + }, + }); + } + + private cacheKey(cidString: string, path?: string): string { + const raw = path ? `${cidString}/${path}` : cidString; + return crypto.createHash('sha256').update(raw).digest('hex'); + } + + private dataDir(key: string): string { + const prefix = `${key.substring(0, 2)}/${key.substring(2, 4)}`; + return `${this.baseDir}/data/${prefix}`; + } + + private dataPath(key: string): string { + return `${this.dataDir(key)}/${key}`; + } + + private metaPath(key: string): string { + return `${this.dataDir(key)}/${key}.meta`; + } + + private tempDir(): string { + return `${this.baseDir}/tmp`; + } + + private createTempPath(): string { + return `${this.tempDir()}/${crypto.randomBytes(16).toString('hex')}`; + } + + async has(cidString: string, path?: string): Promise { + const key = this.cacheKey(cidString, path); + if (this.index.has(key)) { + return true; + } + // Check disk in case index was lost (restart) + try { + await fs.promises.access(this.dataPath(key), fs.constants.F_OK); + // Rebuild index entry from meta file + const meta = await this.readMeta(key); + if (meta) { + this.index.set(key, meta); + return true; + } + } catch { + // Not found + } + return false; + } + + async get( + cidString: string, + path?: string, + ): Promise<{ stream: Readable; size: number; contentType: string } | undefined> { + const key = this.cacheKey(cidString, path); + let entry = this.index.get(key); + + // Rebuild index from disk if entry is missing (e.g., after restart) + if (!entry) { + try { + await fs.promises.access(this.dataPath(key), fs.constants.F_OK); + const meta = await this.readMeta(key); + if (meta) { + this.index.set(key, meta); + entry = meta; + } + } catch { + // File not on disk — true cache miss + } + } + + if (entry) { + const dataPath = this.dataPath(key); + try { + await fs.promises.access(dataPath, fs.constants.F_OK); + const stream = fs.createReadStream(dataPath); + return { + stream, + size: 
entry.size, + contentType: entry.contentType, + }; + } catch (error: any) { + this.log.error('Failed to read cached IPFS content', { + key, + message: error.message, + }); + this.index.delete(key); + } + } + return undefined; + } + + async put( + cidString: string, + stream: Readable, + size: number, + contentType: string, + path?: string, + ): Promise { + const key = this.cacheKey(cidString, path); + + try { + await fs.promises.mkdir(this.tempDir(), { recursive: true }); + const tempPath = this.createTempPath(); + const writeStream = fs.createWriteStream(tempPath); + + await pipeline(stream, writeStream); + + // Move to final location + const dataDir = this.dataDir(key); + await fs.promises.mkdir(dataDir, { recursive: true }); + await fs.promises.rename(tempPath, this.dataPath(key)); + + // Write metadata + const meta: CacheEntry = { size, contentType }; + await fs.promises.writeFile( + this.metaPath(key), + JSON.stringify(meta), + 'utf-8', + ); + + // Update index + this.index.set(key, meta); + + this.log.debug('Cached IPFS content', { cidString, path, key, size }); + } catch (error: any) { + this.log.error('Failed to cache IPFS content', { + cidString, + path, + message: error.message, + }); + } + } + + /** + * Finalize a cache entry from an already-written temp file. + * Used by the streaming cache writer to avoid double-copying data. 
+ */ + async putFromFile( + cidString: string, + tempPath: string, + size: number, + contentType: string, + path?: string, + ): Promise { + const key = this.cacheKey(cidString, path); + + try { + const dataDir = this.dataDir(key); + await fs.promises.mkdir(dataDir, { recursive: true }); + await fs.promises.rename(tempPath, this.dataPath(key)); + + const meta: CacheEntry = { size, contentType }; + await fs.promises.writeFile( + this.metaPath(key), + JSON.stringify(meta), + 'utf-8', + ); + + this.index.set(key, meta); + + this.log.debug('Cached IPFS content from file', { + cidString, + path, + key, + size, + }); + } catch (error: any) { + this.log.error('Failed to finalize cached IPFS content', { + cidString, + path, + message: error.message, + }); + // Clean up temp file on failure + fs.promises.unlink(tempPath).catch(() => {}); + } + } + + getCachePath(): string { + return this.baseDir; + } + + private async readMeta(key: string): Promise { + try { + const raw = await fs.promises.readFile(this.metaPath(key), 'utf-8'); + return JSON.parse(raw) as CacheEntry; + } catch { + return null; + } + } +} diff --git a/src/ipfs/ipfs-cid.test.ts b/src/ipfs/ipfs-cid.test.ts new file mode 100644 index 000000000..a0aa35934 --- /dev/null +++ b/src/ipfs/ipfs-cid.test.ts @@ -0,0 +1,110 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { describe, it } from 'node:test'; +import { strict as assert } from 'node:assert'; + +import { + parseCid, + isValidCid, + isCidV0, + cidToV1Base32, + cidToString, +} from '../lib/ipfs-cid.js'; + +describe('ipfs-cid utilities', () => { + // Known CIDv1 base32 (dag-pb, sha2-256) + const CIDV1_BASE32 = + 'bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi'; + + // Known CIDv0 (Qm prefix, base58btc) + const CIDV0 = 'QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG'; + + describe('parseCid', () => { + it('parses a valid CIDv1 base32 string', () => { + const cid = parseCid(CIDV1_BASE32); + assert.notEqual(cid, null); + assert.equal(cid!.version, 1); + }); + + it('parses a valid CIDv0 string', () => { + const cid = parseCid(CIDV0); + assert.notEqual(cid, null); + assert.equal(cid!.version, 0); + }); + + it('returns null for invalid strings', () => { + assert.equal(parseCid('not-a-cid'), null); + assert.equal(parseCid(''), null); + assert.equal(parseCid('abc123'), null); + }); + + it('returns null for Arweave TX IDs (43-char base64url)', () => { + // A typical Arweave TX ID — not a valid CID + assert.equal( + parseCid('TB2wJyKrPnkAW79DAwlJYwpgdHKpijEJWQfcwX715Co'), + null, + ); + }); + }); + + describe('isValidCid', () => { + it('returns true for valid CIDs', () => { + assert.equal(isValidCid(CIDV1_BASE32), true); + assert.equal(isValidCid(CIDV0), true); + }); + + it('returns false for invalid strings', () => { + assert.equal(isValidCid('not-a-cid'), false); + assert.equal(isValidCid(''), false); + }); + }); + + describe('isCidV0', () => { + it('returns true for CIDv0', () => { + const cid = parseCid(CIDV0)!; + assert.equal(isCidV0(cid), true); + }); + + it('returns false for CIDv1', () => { + const cid = parseCid(CIDV1_BASE32)!; + assert.equal(isCidV0(cid), false); + }); + }); + + describe('cidToV1Base32', () => { + it('converts CIDv0 to CIDv1 base32', () => { + const result = cidToV1Base32(CIDV0); + 
// Result should start with 'bafy' (dag-pb, sha2-256) + assert.match(result, /^bafy/); + // Should be all lowercase (DNS-safe) + assert.equal(result, result.toLowerCase()); + }); + + it('returns CIDv1 base32 unchanged', () => { + const result = cidToV1Base32(CIDV1_BASE32); + assert.equal(result, CIDV1_BASE32); + }); + + it('throws for invalid CID strings', () => { + assert.throws(() => cidToV1Base32('not-a-cid')); + }); + }); + + describe('cidToString', () => { + it('returns base58btc for CIDv0', () => { + const cid = parseCid(CIDV0)!; + const result = cidToString(cid); + assert.match(result, /^Qm/); + }); + + it('returns base32 for CIDv1', () => { + const cid = parseCid(CIDV1_BASE32)!; + const result = cidToString(cid); + assert.match(result, /^bafy/); + }); + }); +}); diff --git a/src/ipfs/ipfs-rate-limiter.ts b/src/ipfs/ipfs-rate-limiter.ts new file mode 100644 index 000000000..f79bf0490 --- /dev/null +++ b/src/ipfs/ipfs-rate-limiter.ts @@ -0,0 +1,26 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import * as config from '../config.js'; +import { MemoryRateLimiter } from '../limiter/memory-rate-limiter.js'; + +/** + * Creates a separate MemoryRateLimiter instance for IPFS requests. + * This ensures IPFS traffic doesn't compete with Arweave traffic + * for rate-limit tokens. 
+ */ +export function createIpfsRateLimiter(): MemoryRateLimiter { + return new MemoryRateLimiter({ + resourceCapacity: config.IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET, + resourceRefillRate: config.IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC, + ipCapacity: config.IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET, + ipRefillRate: config.IPFS_RATE_LIMITER_IP_REFILL_PER_SEC, + limitsEnabled: config.ENABLE_RATE_LIMITER && config.IPFS_ENABLED, + ipAllowlist: config.RATE_LIMITER_IPS_AND_CIDRS_ALLOWLIST, + capacityMultiplier: 1, + maxBuckets: 50000, + }); +} diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts new file mode 100644 index 000000000..98973c9e0 --- /dev/null +++ b/src/ipfs/ipfs-service.ts @@ -0,0 +1,201 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import fs from 'node:fs'; +import crypto from 'node:crypto'; +import { Readable } from 'node:stream'; +import winston from 'winston'; + +import { cidToV1Base32 } from '../lib/ipfs-cid.js'; +import { IpfsFsCache } from './ipfs-cache.js'; +import { IpfsBlocklist } from './ipfs-blocklist.js'; +import { + KuboDataSource, + IpfsBlockedError, + IpfsNotFoundError, +} from './kubo-data-source.js'; +import * as metrics from '../metrics.js'; + +export interface IpfsGetContentResult { + stream: Readable; + size: number; + contentType: string; + cached: boolean; +} + +export class IpfsService { + private log: winston.Logger; + private dataSource: KuboDataSource; + private cache: IpfsFsCache; + private blocklist: IpfsBlocklist; + + constructor({ + log, + dataSource, + cache, + blocklist, + }: { + log: winston.Logger; + dataSource: KuboDataSource; + cache: IpfsFsCache; + blocklist: IpfsBlocklist; + }) { + this.log = log.child({ class: this.constructor.name }); + this.dataSource = dataSource; + this.cache = cache; + this.blocklist = blocklist; + } + + async getContent({ + cidString, + path, + signal, + }: { + 
cidString: string; + path?: string; + signal?: AbortSignal; + }): Promise { + // Normalize CID to v1 base32 for consistent caching + const normalizedCid = cidToV1Base32(cidString); + + // Check blocklist + if (this.blocklist.isBlocked(normalizedCid)) { + metrics.ipfsBlockedTotal.inc(); + throw new IpfsBlockedError(`CID is blocked: ${normalizedCid}`); + } + + // Reject path traversal attempts + if (path && (path.includes('..') || path.startsWith('/'))) { + throw new IpfsNotFoundError('Invalid IPFS path'); + } + + // Check cache + const cached = await this.cache.get(normalizedCid, path); + if (cached) { + this.log.debug('IPFS cache hit', { cid: normalizedCid, path }); + metrics.ipfsCacheHitTotal.inc(); + return { + stream: cached.stream, + size: cached.size, + contentType: cached.contentType, + cached: true, + }; + } + + metrics.ipfsCacheMissTotal.inc(); + + // Fetch from Kubo + const result = await this.dataSource.getContent({ + cidString: normalizedCid, + path, + signal, + }); + + // Stream directly to the client while writing to a temp file on disk + // for caching. No memory buffering — handles files of any size. + this.streamToCache(normalizedCid, path, result.stream, result.contentType); + + return { + stream: result.stream, + size: result.size, + contentType: result.contentType, + cached: false, + }; + } + + /** + * Writes stream data to a temp cache file as it flows to the client. + * Non-blocking — errors are logged but don't affect the response. + * Buffers early chunks in memory until the write stream is ready, + * then flushes them to disk. 
+ */ + private streamToCache( + cidString: string, + path: string | undefined, + stream: Readable, + contentType: string, + ): void { + const cacheDir = `${this.cache.getCachePath()}/tmp`; + const tempPath = `${cacheDir}/${crypto.randomBytes(16).toString('hex')}`; + let writeStream: fs.WriteStream | null = null; + let bytesWritten = 0; + let failed = false; + const pendingChunks: Buffer[] = []; + + const cleanup = () => { + if (writeStream) { + writeStream.destroy(); + writeStream = null; + } + pendingChunks.length = 0; + fs.promises.unlink(tempPath).catch(() => {}); + }; + + // Create temp directory and write stream + fs.promises + .mkdir(cacheDir, { recursive: true }) + .then(() => { + if (failed) return; + writeStream = fs.createWriteStream(tempPath); + writeStream.on('error', (error) => { + failed = true; + this.log.error('Cache write stream error', { + cid: cidString, + message: error.message, + }); + cleanup(); + }); + // Flush any chunks that arrived before writeStream was ready + for (const chunk of pendingChunks) { + writeStream.write(chunk); + } + pendingChunks.length = 0; + }) + .catch((error) => { + failed = true; + this.log.error('Failed to create cache temp dir', { + message: error.message, + }); + }); + + stream.on('data', (chunk: Buffer) => { + if (failed) return; + bytesWritten += chunk.length; + if (writeStream) { + writeStream.write(chunk); + } else { + // Buffer until writeStream is ready (typically only first 1-2 chunks) + pendingChunks.push(chunk); + } + }); + + stream.on('end', () => { + if (failed || !writeStream) { + // If writeStream never became ready, discard + cleanup(); + return; + } + writeStream.end(() => { + // Finalize: move temp file into cache + this.cache + .putFromFile(cidString, tempPath, bytesWritten, contentType, path) + .catch((error) => { + this.log.error('Failed to finalize IPFS cache entry', { + cid: cidString, + path, + message: error.message, + }); + cleanup(); + }); + }); + }); + + stream.on('error', () => { + failed 
= true; + cleanup(); + }); + } +} diff --git a/src/ipfs/kubo-data-source.test.ts b/src/ipfs/kubo-data-source.test.ts new file mode 100644 index 000000000..8a9ff690b --- /dev/null +++ b/src/ipfs/kubo-data-source.test.ts @@ -0,0 +1,105 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { describe, it, beforeEach } from 'node:test'; +import { strict as assert } from 'node:assert'; + +import { createTestLogger } from '../../test/test-logger.js'; +import { + KuboDataSource, + IpfsNotFoundError, + IpfsTimeoutError, + IpfsUnavailableError, +} from './kubo-data-source.js'; + +describe('KuboDataSource', () => { + const log = createTestLogger({ suite: 'KuboDataSource' }); + + let kuboDataSource: KuboDataSource; + + beforeEach(() => { + kuboDataSource = new KuboDataSource({ + log, + kuboUrl: 'http://localhost:8080', + requestTimeoutMs: 5000, + streamStallTimeoutMs: 5000, + }); + }); + + describe('getContent', () => { + it('constructs correct URL for bare CID', async () => { + // This test verifies URL construction without making a real request. + // A real integration test would require a running Kubo instance. 
+ const controller = new AbortController(); + controller.abort(); // Abort immediately + + try { + await kuboDataSource.getContent({ + cidString: + 'bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi', + signal: controller.signal, + }); + } catch (error: any) { + // Expected to throw due to abort + assert.ok(error); + } + }); + + it('constructs correct URL for CID with path', async () => { + const controller = new AbortController(); + controller.abort(); + + try { + await kuboDataSource.getContent({ + cidString: + 'bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi', + path: 'images/logo.png', + signal: controller.signal, + }); + } catch (error: any) { + assert.ok(error); + } + }); + + it('throws AbortError when signal is already aborted', async () => { + const controller = new AbortController(); + controller.abort(); + + await assert.rejects( + () => + kuboDataSource.getContent({ + cidString: + 'bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi', + signal: controller.signal, + }), + (error: any) => { + assert.ok( + error.name === 'AbortError' || error.code === 'ERR_CANCELED', + ); + return true; + }, + ); + }); + }); + + describe('error types', () => { + it('IpfsNotFoundError has correct name', () => { + const error = new IpfsNotFoundError('not found'); + assert.equal(error.name, 'IpfsNotFoundError'); + assert.equal(error.message, 'not found'); + }); + + it('IpfsTimeoutError has correct name', () => { + const error = new IpfsTimeoutError('timeout'); + assert.equal(error.name, 'IpfsTimeoutError'); + }); + + it('IpfsUnavailableError has correct name', () => { + const error = new IpfsUnavailableError('unavailable'); + assert.equal(error.name, 'IpfsUnavailableError'); + }); + }); +}); diff --git a/src/ipfs/kubo-data-source.ts b/src/ipfs/kubo-data-source.ts new file mode 100644 index 000000000..4bdf01d69 --- /dev/null +++ b/src/ipfs/kubo-data-source.ts @@ -0,0 +1,199 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data 
Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { default as axios } from 'axios'; +import { Readable } from 'node:stream'; +import winston from 'winston'; + +import { attachStallTimeout } from '../lib/stream.js'; + +export interface IpfsContentResult { + stream: Readable; + size: number; + contentType: string; +} + +export class KuboDataSource { + private log: winston.Logger; + private kuboUrl: string; + private requestTimeoutMs: number; + private streamStallTimeoutMs: number; + + constructor({ + log, + kuboUrl, + requestTimeoutMs, + streamStallTimeoutMs, + }: { + log: winston.Logger; + kuboUrl: string; + requestTimeoutMs: number; + streamStallTimeoutMs: number; + }) { + this.log = log.child({ class: this.constructor.name }); + this.kuboUrl = kuboUrl.replace(/\/$/, ''); + this.requestTimeoutMs = requestTimeoutMs; + this.streamStallTimeoutMs = streamStallTimeoutMs; + } + + async getContent({ + cidString, + path, + signal, + }: { + cidString: string; + path?: string; + signal?: AbortSignal; + }): Promise { + signal?.throwIfAborted(); + + const ipfsPath = path ? 
`${cidString}/${path}` : cidString; + const url = `${this.kuboUrl}/ipfs/${ipfsPath}`; + + this.log.debug('Fetching IPFS content from Kubo', { + cidString, + path, + url, + }); + + // Connection-phase timeout + const controller = new AbortController(); + const connectionTimer = setTimeout(() => { + controller.abort(new Error('Kubo connection timeout')); + }, this.requestTimeoutMs); + + // Forward client abort to our controller + const onClientAbort = () => controller.abort(signal?.reason); + if (signal?.aborted) { + onClientAbort(); + } else if (signal) { + signal.addEventListener('abort', onClientAbort, { once: true }); + } + + try { + const response = await axios.get(url, { + responseType: 'stream', + signal: controller.signal, + headers: { + 'Accept-Encoding': 'identity', + }, + maxRedirects: 5, + // Accept non-2xx so we can handle 404/408/504 ourselves + validateStatus: (status) => status < 500 || status === 504, + }); + + clearTimeout(connectionTimer); + signal?.removeEventListener('abort', onClientAbort); + + if (response.status === 404) { + throw new IpfsNotFoundError( + `IPFS content not found: /ipfs/${ipfsPath}`, + ); + } + + if (response.status === 408 || response.status === 504) { + throw new IpfsTimeoutError( + `Kubo timed out resolving: /ipfs/${ipfsPath}`, + ); + } + + if (response.status !== 200) { + const stream = response.data as Readable; + stream.destroy(); + throw new Error( + `Unexpected Kubo response status: ${response.status} for /ipfs/${ipfsPath}`, + ); + } + + const stream = response.data as Readable; + const contentLength = parseInt( + response.headers['content-length'] ?? '0', + 10, + ); + const contentType = + response.headers['content-type'] ?? 
'application/octet-stream'; + + // Switch from connection timeout to stall timeout + attachStallTimeout(stream, this.streamStallTimeoutMs); + + this.log.debug('Kubo fetch successful', { + cidString, + path, + contentLength, + contentType, + }); + + return { + stream, + size: contentLength, + contentType, + }; + } catch (error: any) { + clearTimeout(connectionTimer); + signal?.removeEventListener('abort', onClientAbort); + + if (error instanceof IpfsNotFoundError) throw error; + if (error instanceof IpfsTimeoutError) throw error; + + if (error.name === 'AbortError' || error.code === 'ERR_CANCELED') { + if (signal?.aborted) { + throw error; // Client disconnected + } + throw new IpfsTimeoutError( + `Kubo request timed out for /ipfs/${ipfsPath}`, + ); + } + + if (error.code === 'ECONNREFUSED') { + throw new IpfsUnavailableError( + `Kubo service unavailable at ${this.kuboUrl}`, + ); + } + + this.log.error('Failed to fetch from Kubo', { + cidString, + path, + message: error.message, + }); + throw error; + } + } +} + +export class IpfsNotFoundError extends Error { + constructor(message: string) { + super(message); + this.name = 'IpfsNotFoundError'; + } +} + +export class IpfsTimeoutError extends Error { + constructor(message: string) { + super(message); + this.name = 'IpfsTimeoutError'; + } +} + +export class IpfsUnavailableError extends Error { + constructor(message: string) { + super(message); + this.name = 'IpfsUnavailableError'; + } +} + +export class IpfsBlockedError extends Error { + constructor(message: string) { + super(message); + this.name = 'IpfsBlockedError'; + } +} + +export class IpfsSizeLimitError extends Error { + constructor(message: string) { + super(message); + this.name = 'IpfsSizeLimitError'; + } +} diff --git a/src/lib/httpsig.ts b/src/lib/httpsig.ts index 34c61c296..fecf9b1f3 100644 --- a/src/lib/httpsig.ts +++ b/src/lib/httpsig.ts @@ -46,6 +46,9 @@ export const TRIGGER_HEADERS = new Set([ 'x-arweave-chunk-data-root', 'x-arweave-chunk-tx-id', 
'x-ar-io-chunk-source-type', + // IPFS headers — presence of x-ipfs-path triggers signing for IPFS responses + 'x-ipfs-path', + 'x-ar-io-source', ]); /** @@ -53,7 +56,12 @@ export const TRIGGER_HEADERS = new Set([ * when at least one TRIGGER_HEADER is also present. Signing them alone is too * broad — nearly every response has a Content-Type. */ -export const CO_SIGNABLE_HEADERS = new Set(['content-type', 'content-digest']); +export const CO_SIGNABLE_HEADERS = new Set([ + 'content-type', + 'content-digest', + 'x-cache', + 'etag', +]); // Header predicates normalize case defensively, but callers should pass // lowercase where possible. Express's `res.getHeaders()` keys are already diff --git a/src/lib/ipfs-cid.ts b/src/lib/ipfs-cid.ts new file mode 100644 index 000000000..96c0350cd --- /dev/null +++ b/src/lib/ipfs-cid.ts @@ -0,0 +1,60 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { CID } from 'multiformats/cid'; +import { base32 } from 'multiformats/bases/base32'; +import { base58btc } from 'multiformats/bases/base58'; + +/** + * Safely parse a CID string (v0 or v1). Returns null on invalid input. + */ +export function parseCid(cidString: string): CID | null { + try { + return CID.parse(cidString); + } catch { + return null; + } +} + +/** + * Quick validation: attempts parse, returns boolean. + */ +export function isValidCid(cidString: string): boolean { + return parseCid(cidString) !== null; +} + +/** + * Check if a CID is version 0 (starts with `Qm`, base58btc, dag-pb codec). + */ +export function isCidV0(cid: CID): boolean { + return cid.version === 0; +} + +/** + * Convert any CID string to CIDv1 base32lower (DNS-safe). + * Returns the base32lower string representation. + * Throws if the input is not a valid CID. 
+ */ +export function cidToV1Base32(cidString: string): string { + const cid = CID.parse(cidString); + if (cid.version === 0) { + return cid.toV1().toString(base32); + } + return cid.toString(base32); +} + +/** + * Canonical string representation of a CID. + * v0 → base58btc, v1 → base32lower. + */ +export function cidToString(cid: CID): string { + if (cid.version === 0) { + return cid.toString(base58btc); + } + return cid.toString(base32); +} + +export { CID }; diff --git a/src/metrics.ts b/src/metrics.ts index d937bf9df..66740d39c 100644 --- a/src/metrics.ts +++ b/src/metrics.ts @@ -929,3 +929,41 @@ export const httpSigErrorsTotal = new promClient.Counter({ name: 'httpsig_errors_total', help: 'Total HTTPSIG signing errors', }); + +// +// IPFS metrics +// + +export const ipfsRequestsTotal = new promClient.Counter({ + name: 'ipfs_requests_total', + help: 'Total IPFS content requests', + labelNames: ['route_type', 'status'] as const, +}); + +export const ipfsCacheHitTotal = new promClient.Counter({ + name: 'ipfs_cache_hit_total', + help: 'IPFS content cache hits', +}); + +export const ipfsCacheMissTotal = new promClient.Counter({ + name: 'ipfs_cache_miss_total', + help: 'IPFS content cache misses', +}); + +export const ipfsContentSizeHistogram = new promClient.Histogram({ + name: 'ipfs_content_size_bytes', + help: 'Distribution of IPFS content sizes', + buckets: [1024, 102400, 1048576, 10485760, 104857600], +}); + +export const ipfsRequestDurationHistogram = new promClient.Histogram({ + name: 'ipfs_request_duration_seconds', + help: 'Duration of IPFS content requests', + labelNames: ['route_type', 'cache_status'] as const, + buckets: [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], +}); + +export const ipfsBlockedTotal = new promClient.Counter({ + name: 'ipfs_blocked_total', + help: 'IPFS requests blocked by CID blocklist', +}); diff --git a/src/middleware/ipfs.ts b/src/middleware/ipfs.ts new file mode 100644 index 000000000..980c7e705 --- /dev/null +++ 
b/src/middleware/ipfs.ts @@ -0,0 +1,96 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { Handler, Request, Response, NextFunction } from 'express'; + +import * as config from '../config.js'; +import { isValidCid, cidToV1Base32 } from '../lib/ipfs-cid.js'; + +/** + * Middleware that intercepts `{CID}.{root_host}` subdomain requests. + * Must be mounted BEFORE the ArNS middleware to prevent ArNS from attempting + * to resolve CIDs as ArNS names. + * + * CIDv1 base32 strings are ~59 characters — always longer than the ArNS + * name limit (51 chars), so there's no collision with ArNS names. + * This also avoids needing a multi-level wildcard TLS certificate + * (*.ipfs.host would require a separate cert from *.host). + * + * Express subdomain array for `bafyabc.my-gateway.io` (root=my-gateway.io): + * req.subdomains = ['bafyabc'] (single subdomain) + * req.subdomains[0] = CID + */ +export function createIpfsSubdomainMiddleware({ + ipfsHandler, +}: { + ipfsHandler: Handler; +}): Handler { + return (req: Request, res: Response, next: NextFunction) => { + if (!config.IPFS_ENABLED || config.ARNS_ROOT_HOSTS.length === 0) { + next(); + return; + } + + const matchedEntry = config.matchArnsRootHost(req.hostname); + if (matchedEntry === undefined) { + next(); + return; + } + + // For {CID}.{root_host}, we expect exactly 1 subdomain beyond + // the root host's own subdomain depth. + const cidLabelIndex = matchedEntry.subdomainLength; + + if ( + !Array.isArray(req.subdomains) || + req.subdomains.length !== cidLabelIndex + 1 + ) { + next(); + return; + } + + const cidLabel = req.subdomains[cidLabelIndex]; + if (!isValidCid(cidLabel)) { + // Not a CID — let ArNS handle it (likely an ArNS name) + next(); + return; + } + + // Attach IPFS context to request + (req as any).ipfsCid = cidLabel; + + // Handle /ipfs/ paths on subdomain requests. 
+ // Kubo's directory listings generate absolute links like /ipfs/{CID}/file. + let reqPath = req.path === '/' ? undefined : req.path.slice(1); + if (reqPath && reqPath.startsWith('ipfs/')) { + const afterIpfs = reqPath.slice(5); // strip 'ipfs/' + const slashIdx = afterIpfs.indexOf('/'); + const pathCid = slashIdx >= 0 ? afterIpfs.slice(0, slashIdx) : afterIpfs; + const remainder = slashIdx >= 0 ? afterIpfs.slice(slashIdx + 1) : undefined; + + if (pathCid === cidLabel) { + // Same CID — strip the redundant prefix + reqPath = remainder || undefined; + } else if (isValidCid(pathCid)) { + // Different CID — redirect to that CID's subdomain + try { + const targetCid = cidToV1Base32(pathCid); + const rootHost = matchedEntry.host; + const pathSuffix = remainder ? `/${remainder}` : '/'; + const protocol = req.protocol; + res.redirect(302, `${protocol}://${targetCid}.${rootHost}${pathSuffix}`); + return; + } catch { + // CID conversion failed — fall through to handler + } + } + } + (req as any).ipfsPath = reqPath; + + // Delegate to the IPFS handler + ipfsHandler(req, res, next); + }; +} diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts new file mode 100644 index 000000000..43b0b576c --- /dev/null +++ b/src/routes/ipfs.ts @@ -0,0 +1,309 @@ +/** + * AR.IO Gateway + * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { Router, Request, Response, Handler } from 'express'; +import { default as asyncHandler } from 'express-async-handler'; +import winston from 'winston'; + +import * as config from '../config.js'; +import * as metrics from '../metrics.js'; +import { + cidToV1Base32, + isValidCid, + parseCid, + isCidV0, +} from '../lib/ipfs-cid.js'; +import { IpfsService } from '../ipfs/ipfs-service.js'; +import { + IpfsBlockedError, + IpfsNotFoundError, + IpfsTimeoutError, + IpfsUnavailableError, +} from '../ipfs/kubo-data-source.js'; +import { RateLimiter } from '../limiter/types.js'; +import { + checkPaymentAndRateLimits, + adjustRateLimitTokens, +} from '../handlers/data-handler-utils.js'; +import { PaymentProcessor } from '../payments/types.js'; +import { extractAllClientIPs } from '../lib/ip-utils.js'; + +export function createIpfsRouter({ + log, + ipfsService, + rateLimiter, + paymentProcessor, +}: { + log: winston.Logger; + ipfsService: IpfsService; + rateLimiter?: RateLimiter; + paymentProcessor?: PaymentProcessor; +}): Router { + const router = Router(); + const handler = createIpfsPathHandler({ + log, + ipfsService, + rateLimiter, + paymentProcessor, + }); + + router.get('/ipfs/:cid', handler); + router.get('/ipfs/:cid/*', handler); + + return router; +} + +export function createIpfsHandler({ + log, + ipfsService, + rateLimiter, + paymentProcessor, +}: { + log: winston.Logger; + ipfsService: IpfsService; + rateLimiter?: RateLimiter; + paymentProcessor?: PaymentProcessor; +}): Handler { + return asyncHandler(async (req: Request, res: Response) => { + const cidString = (req as any).ipfsCid as string; + const path = (req as any).ipfsPath as string | undefined; + await handleIpfsRequest({ + req, + res, + cidString, + path, + log, + ipfsService, + rateLimiter, + paymentProcessor, + routeType: 'subdomain', + }); + }); +} + +function createIpfsPathHandler({ + log, + ipfsService, + rateLimiter, + paymentProcessor, +}: 
{ + log: winston.Logger; + ipfsService: IpfsService; + rateLimiter?: RateLimiter; + paymentProcessor?: PaymentProcessor; +}): Handler { + return asyncHandler(async (req: Request, res: Response) => { + const cidString = req.params.cid; + // Express puts wildcard content in req.params[0] + const path = req.params[0] || undefined; + + if (!isValidCid(cidString)) { + res.status(400).json({ error: 'Invalid CID' }); + return; + } + + // Redirect CIDv0 to CIDv1 subdomain if ArNS root hosts are configured. + // Uses {CID}.{host} (same level as ArNS names) — no .ipfs. label needed + // since CIDv1 base32 is always >51 chars (won't collide with ArNS names) + // and works with standard *.{host} wildcard TLS certificates. + const cid = parseCid(cidString); + if (cid && isCidV0(cid) && config.ARNS_ROOT_HOSTS.length > 0) { + const v1Base32 = cidToV1Base32(cidString); + const rootHost = config.ARNS_ROOT_HOSTS[0].host; + const pathSuffix = path ? `/${path}` : ''; + const protocol = req.protocol; + res.redirect( + 302, + `${protocol}://${v1Base32}.${rootHost}${pathSuffix}`, + ); + return; + } + + await handleIpfsRequest({ + req, + res, + cidString, + path, + log, + ipfsService, + rateLimiter, + paymentProcessor, + routeType: 'path', + }); + }); +} + +async function handleIpfsRequest({ + req, + res, + cidString, + path, + log: parentLog, + ipfsService, + rateLimiter, + paymentProcessor, + routeType, +}: { + req: Request; + res: Response; + cidString: string; + path: string | undefined; + log: winston.Logger; + ipfsService: IpfsService; + rateLimiter?: RateLimiter; + paymentProcessor?: PaymentProcessor; + routeType: 'path' | 'subdomain'; +}): Promise { + const startTime = Date.now(); + const ipfsPath = path ? 
`${cidString}/${path}` : cidString; + + parentLog.debug('Handling IPFS request', { cidString, path, routeType }); + + try { + const result = await ipfsService.getContent({ + cidString, + path, + signal: req.signal, + }); + + // Check payment and rate limits (x402 + rate limiting in one call). + // Content size is needed for token calculation and payment pricing. + const contentSize = result.size > 0 ? result.size : 1024; // min 1KB for pricing + const limitCheck = await checkPaymentAndRateLimits({ + req, + res, + id: cidToV1Base32(cidString), + contentSize, + contentType: result.contentType, + requestAttributes: { hops: 0, clientIps: extractAllClientIPs(req).clientIps }, + rateLimiter, + paymentProcessor, + }); + + if (!limitCheck.allowed) { + // Response already sent (402 or 429) by checkPaymentAndRateLimits + result.stream.destroy(); + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'rate_limited', + }); + return; + } + + // Set response headers + res.setHeader('Content-Type', result.contentType); + if (result.size > 0) { + res.setHeader('Content-Length', result.size); + } + // CIDs are content-addressed — content never changes + res.setHeader('Cache-Control', 'public, max-age=29030400, immutable'); + res.setHeader('ETag', `"${cidToV1Base32(cidString)}"`); + res.setHeader('X-Ipfs-Path', `/ipfs/${ipfsPath}`); + res.setHeader('X-Ar-Io-Source', 'ipfs'); + + if (result.cached) { + res.setHeader('X-Cache', 'HIT'); + } else { + res.setHeader('X-Cache', 'MISS'); + } + + // Track metrics + const cacheStatus = result.cached ? 
'hit' : 'miss'; + metrics.ipfsRequestsTotal.inc({ route_type: routeType, status: 'success' }); + if (result.size > 0) { + metrics.ipfsContentSizeHistogram.observe(result.size); + } + + // Pipe stream to response + result.stream.pipe(res); + + // Adjust rate limiter tokens after response completes + res.on('finish', () => { + const durationSec = (Date.now() - startTime) / 1000; + metrics.ipfsRequestDurationHistogram.observe( + { route_type: routeType, cache_status: cacheStatus }, + durationSec, + ); + + adjustRateLimitTokens({ + req, + responseSize: result.size > 0 ? result.size : contentSize, + initialResult: limitCheck, + rateLimiter, + }).catch((error) => { + parentLog.error('Failed to adjust rate limit tokens', { + message: error.message, + }); + }); + }); + + result.stream.on('error', (error) => { + parentLog.error('IPFS stream error', { + cidString, + path, + message: error.message, + }); + if (!res.headersSent) { + res.status(502).json({ error: 'IPFS stream failed' }); + } else { + res.destroy(); + } + }); + } catch (error: any) { + if (error instanceof IpfsBlockedError) { + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'blocked', + }); + res.setHeader( + 'Cache-Control', + `public, max-age=${config.CACHE_BLOCKED_MAX_AGE}, immutable`, + ); + res.status(451).json({ error: 'Content blocked' }); + return; + } + + if (error instanceof IpfsNotFoundError) { + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'not_found', + }); + res.status(404).json({ error: 'IPFS content not found' }); + return; + } + + if (error instanceof IpfsTimeoutError) { + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'timeout', + }); + res.status(504).json({ error: 'IPFS request timed out' }); + return; + } + + if (error instanceof IpfsUnavailableError) { + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'unavailable', + }); + res.status(502).json({ error: 'IPFS service unavailable' }); + return; + } + + // Client 
disconnected + if (error.name === 'AbortError') { + return; + } + + metrics.ipfsRequestsTotal.inc({ route_type: routeType, status: 'error' }); + parentLog.error('IPFS request failed', { + cidString, + path, + message: error.message, + }); + res.status(500).json({ error: 'Internal server error' }); + } +} diff --git a/src/system.ts b/src/system.ts index 188dd365c..d3a3b943e 100644 --- a/src/system.ts +++ b/src/system.ts @@ -1456,6 +1456,60 @@ if (dataVerificationWorker !== undefined) { dataVerificationWorker.start(); } +// +// IPFS subsystem (conditionally initialized) +// + +import { KuboDataSource } from './ipfs/kubo-data-source.js'; +import { IpfsFsCache } from './ipfs/ipfs-cache.js'; +import { IpfsBlocklist } from './ipfs/ipfs-blocklist.js'; +import { IpfsService } from './ipfs/ipfs-service.js'; +import { createIpfsRateLimiter } from './ipfs/ipfs-rate-limiter.js'; +import { RateLimiter } from './limiter/types.js'; + +export let ipfsService: IpfsService | undefined; +export let ipfsRateLimiter: RateLimiter | undefined; +export let ipfsBlocklist: IpfsBlocklist | undefined; + +if (config.IPFS_ENABLED) { + log.info('IPFS subsystem enabled, initializing...'); + + const kuboDataSource = new KuboDataSource({ + log, + kuboUrl: config.IPFS_KUBO_URL, + requestTimeoutMs: config.IPFS_KUBO_REQUEST_TIMEOUT_MS, + streamStallTimeoutMs: config.IPFS_STREAM_STALL_TIMEOUT_MS, + }); + + const ipfsCache = new IpfsFsCache({ + log, + basePath: config.IPFS_CACHE_PATH, + maxSizeBytes: config.IPFS_CACHE_MAX_SIZE_BYTES, + }); + + ipfsBlocklist = new IpfsBlocklist({ + log, + filePath: config.IPFS_BLOCKLIST_PATH, + }); + await ipfsBlocklist.load(); + ipfsBlocklist.startWatching(); + + ipfsService = new IpfsService({ + log, + dataSource: kuboDataSource, + cache: ipfsCache, + blocklist: ipfsBlocklist, + }); + + ipfsRateLimiter = createIpfsRateLimiter(); + + log.info('IPFS subsystem initialized', { + kuboUrl: config.IPFS_KUBO_URL, + cachePath: config.IPFS_CACHE_PATH, + maxCacheSize: 
config.IPFS_CACHE_MAX_SIZE_BYTES, + }); +} + export const blockedNamesCache = new BlockedNamesCache({ log, cacheTTL: 3600, @@ -1487,6 +1541,7 @@ export const shutdown = async (exitCode = 0) => { } // Clean up system components + ipfsBlocklist?.stop(); eventEmitter.removeAllListeners(); arIOPeerManager.stopUpdatingPeers(); dataSqliteWalCleanupWorker?.stop(); diff --git a/test-ipfs.sh b/test-ipfs.sh new file mode 100755 index 000000000..6f262e563 --- /dev/null +++ b/test-ipfs.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# End-to-end IPFS integration test script +# Run this after starting the gateway with IPFS_ENABLED=true and Kubo running + +GATEWAY_URL="${1:-http://localhost:4000}" +KUBO_URL="${2:-http://localhost:8080}" + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[0;33m' +NC='\033[0m' + +pass=0 +fail=0 + +test_case() { + local name="$1" + local expected_status="$2" + local url="$3" + local extra_args="${4:-}" + + local result + result=$(curl -s -o /tmp/ipfs-test-body -w "%{http_code}" --max-time 30 $extra_args "$url" 2>&1) + + if [ "$result" = "$expected_status" ]; then + echo -e " ${GREEN}PASS${NC} $name (HTTP $result)" + ((pass++)) + else + echo -e " ${RED}FAIL${NC} $name — expected $expected_status, got $result" + echo " URL: $url" + echo " Body: $(head -c 200 /tmp/ipfs-test-body)" + ((fail++)) + fi +} + +test_body_contains() { + local name="$1" + local expected_text="$2" + local url="$3" + + local body + body=$(curl -s --max-time 30 "$url" 2>&1) + local status=$? 
+ + if echo "$body" | grep -q "$expected_text"; then + echo -e " ${GREEN}PASS${NC} $name" + ((pass++)) + else + echo -e " ${RED}FAIL${NC} $name — body doesn't contain '$expected_text'" + echo " Body: $(echo "$body" | head -c 200)" + ((fail++)) + fi +} + +test_header() { + local name="$1" + local header="$2" + local expected_value="$3" + local url="$4" + + local actual + actual=$(curl -s -I --max-time 30 "$url" 2>&1 | grep -i "^$header:" | head -1 | sed 's/^[^:]*: //' | tr -d '\r') + + if echo "$actual" | grep -qi "$expected_value"; then + echo -e " ${GREEN}PASS${NC} $name ($actual)" + ((pass++)) + else + echo -e " ${RED}FAIL${NC} $name — expected header '$header' to contain '$expected_value', got '$actual'" + ((fail++)) + fi +} + +echo "=========================================" +echo " AR.IO IPFS Integration — E2E Tests" +echo "=========================================" +echo "" +echo "Gateway: $GATEWAY_URL" +echo "Kubo: $KUBO_URL" +echo "" + +# --- Pre-flight: ensure Kubo is running --- +echo "--- Pre-flight ---" +kubo_status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$KUBO_URL/ipfs/QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn" 2>&1) +if [ "$kubo_status" = "000" ]; then + echo -e " ${RED}FAIL${NC} Kubo not reachable at $KUBO_URL" + exit 1 +fi +echo -e " ${GREEN}OK${NC} Kubo is reachable" + +# Add test content to Kubo +echo "" +echo "--- Adding test content to Kubo ---" +FILE_CID=$(echo "Hello from AR.IO IPFS integration test!" | docker exec -i ar-io-node-kubo-1 ipfs add -q 2>&1) +echo " File CID: $FILE_CID" + +DIR_CID=$(docker exec ar-io-node-kubo-1 sh -c ' + mkdir -p /tmp/e2e-test + echo "

E2E Test

" > /tmp/e2e-test/index.html + echo "subfile content" > /tmp/e2e-test/sub.txt + ipfs add -r -q /tmp/e2e-test | tail -1 +' 2>&1) +echo " Dir CID: $DIR_CID" +echo "" + +# --- Test 1: Path-based single file --- +echo "--- Path-based access ---" +test_body_contains "GET /ipfs/{CID} serves file content" \ + "Hello from AR.IO IPFS integration test" \ + "$GATEWAY_URL/ipfs/$FILE_CID" + +# --- Test 2: Path-based directory with path --- +test_body_contains "GET /ipfs/{CID}/index.html serves directory file" \ + "E2E Test" \ + "$GATEWAY_URL/ipfs/$DIR_CID/index.html" + +test_body_contains "GET /ipfs/{CID}/sub.txt serves subfile" \ + "subfile content" \ + "$GATEWAY_URL/ipfs/$DIR_CID/sub.txt" + +# --- Test 3: Invalid CID --- +test_case "GET /ipfs/invalid-cid returns 400" \ + "400" \ + "$GATEWAY_URL/ipfs/not-a-valid-cid" + +# --- Test 4: Response headers --- +echo "" +echo "--- Response headers ---" +test_header "Cache-Control is immutable" \ + "cache-control" "immutable" \ + "$GATEWAY_URL/ipfs/$FILE_CID" + +test_header "X-Ipfs-Path header present" \ + "x-ipfs-path" "/ipfs/" \ + "$GATEWAY_URL/ipfs/$FILE_CID" + +test_header "X-Cache header present" \ + "x-cache" "" \ + "$GATEWAY_URL/ipfs/$FILE_CID" + +test_header "Content-Type is set" \ + "content-type" "" \ + "$GATEWAY_URL/ipfs/$FILE_CID" + +# --- Test 5: CIDv0 redirect to CIDv1 subdomain --- +echo "" +echo "--- CIDv0 redirect ---" +redirect_location=$(curl -s -o /dev/null -w "%{redirect_url}" --max-time 10 "$GATEWAY_URL/ipfs/$FILE_CID" 2>&1) +if echo "$redirect_location" | grep -q "ipfs"; then + echo -e " ${GREEN}PASS${NC} CIDv0 redirects to CIDv1 subdomain ($redirect_location)" + ((pass++)) +elif [ -z "$redirect_location" ]; then + echo -e " ${YELLOW}SKIP${NC} No redirect (CID may already be v1 or no ARNS_ROOT_HOST)" +else + echo -e " ${YELLOW}INFO${NC} Redirect: $redirect_location" +fi + +# --- Test 6: Second request should be cached --- +echo "" +echo "--- Caching ---" +# First request (cache miss) +curl -s -o /dev/null 
"$GATEWAY_URL/ipfs/$FILE_CID" 2>/dev/null +# Second request (should be cache hit) +cache_header=$(curl -s -I --max-time 10 "$GATEWAY_URL/ipfs/$FILE_CID" 2>&1 | grep -i "^x-cache:" | sed 's/^[^:]*: //' | tr -d '\r') +echo -e " ${GREEN}INFO${NC} X-Cache on second request: ${cache_header:-'(not set)'}" + +# --- Summary --- +echo "" +echo "=========================================" +echo -e " Results: ${GREEN}$pass passed${NC}, ${RED}$fail failed${NC}" +echo "=========================================" + +exit $fail From 7a8f3229d1631fa4b45813ae840fa6b919a99a95 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 00:56:06 +0000 Subject: [PATCH 02/11] fix: lint errors in IPFS files (PE-9067) Fix strict-boolean-expressions (explicit nullish checks) and prettier formatting issues caught by CI eslint. --- src/ipfs/ipfs-cache.ts | 7 +++++-- src/ipfs/ipfs-service.ts | 5 ++++- src/ipfs/kubo-data-source.ts | 5 ++++- src/middleware/ipfs.ts | 22 ++++++++++++++-------- src/routes/ipfs.ts | 18 +++++++++--------- 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/ipfs/ipfs-cache.ts b/src/ipfs/ipfs-cache.ts index ccf6be052..c7847c1b0 100644 --- a/src/ipfs/ipfs-cache.ts +++ b/src/ipfs/ipfs-cache.ts @@ -47,7 +47,8 @@ export class IpfsFsCache { } private cacheKey(cidString: string, path?: string): string { - const raw = path ? `${cidString}/${path}` : cidString; + const raw = + path !== undefined && path !== '' ? 
`${cidString}/${path}` : cidString; return crypto.createHash('sha256').update(raw).digest('hex'); } @@ -95,7 +96,9 @@ export class IpfsFsCache { async get( cidString: string, path?: string, - ): Promise<{ stream: Readable; size: number; contentType: string } | undefined> { + ): Promise< + { stream: Readable; size: number; contentType: string } | undefined + > { const key = this.cacheKey(cidString, path); let entry = this.index.get(key); diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index 98973c9e0..d7219f46d 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -68,7 +68,10 @@ export class IpfsService { } // Reject path traversal attempts - if (path && (path.includes('..') || path.startsWith('/'))) { + if ( + path !== undefined && + (path.includes('..') || path.startsWith('/')) + ) { throw new IpfsNotFoundError('Invalid IPFS path'); } diff --git a/src/ipfs/kubo-data-source.ts b/src/ipfs/kubo-data-source.ts index 4bdf01d69..817d3e290 100644 --- a/src/ipfs/kubo-data-source.ts +++ b/src/ipfs/kubo-data-source.ts @@ -7,8 +7,10 @@ import { default as axios } from 'axios'; import { Readable } from 'node:stream'; import winston from 'winston'; +import { Span } from '@opentelemetry/api'; import { attachStallTimeout } from '../lib/stream.js'; +import { startChildSpan } from '../tracing.js'; export interface IpfsContentResult { stream: Readable; @@ -50,7 +52,8 @@ export class KuboDataSource { }): Promise { signal?.throwIfAborted(); - const ipfsPath = path ? `${cidString}/${path}` : cidString; + const ipfsPath = + path !== undefined && path !== '' ? 
`${cidString}/${path}` : cidString; const url = `${this.kuboUrl}/ipfs/${ipfsPath}`; this.log.debug('Fetching IPFS content from Kubo', { diff --git a/src/middleware/ipfs.ts b/src/middleware/ipfs.ts index 980c7e705..48375bc43 100644 --- a/src/middleware/ipfs.ts +++ b/src/middleware/ipfs.ts @@ -64,24 +64,30 @@ export function createIpfsSubdomainMiddleware({ // Handle /ipfs/ paths on subdomain requests. // Kubo's directory listings generate absolute links like /ipfs/{CID}/file. - let reqPath = req.path === '/' ? undefined : req.path.slice(1); - if (reqPath && reqPath.startsWith('ipfs/')) { + let reqPath: string | undefined = + req.path === '/' ? undefined : req.path.slice(1); + if (reqPath !== undefined && reqPath.startsWith('ipfs/')) { const afterIpfs = reqPath.slice(5); // strip 'ipfs/' const slashIdx = afterIpfs.indexOf('/'); - const pathCid = slashIdx >= 0 ? afterIpfs.slice(0, slashIdx) : afterIpfs; - const remainder = slashIdx >= 0 ? afterIpfs.slice(slashIdx + 1) : undefined; + const pathCid = + slashIdx >= 0 ? afterIpfs.slice(0, slashIdx) : afterIpfs; + const remainder = + slashIdx >= 0 ? afterIpfs.slice(slashIdx + 1) : undefined; if (pathCid === cidLabel) { // Same CID — strip the redundant prefix - reqPath = remainder || undefined; + reqPath = remainder !== undefined ? remainder : undefined; } else if (isValidCid(pathCid)) { // Different CID — redirect to that CID's subdomain try { const targetCid = cidToV1Base32(pathCid); const rootHost = matchedEntry.host; - const pathSuffix = remainder ? `/${remainder}` : '/'; - const protocol = req.protocol; - res.redirect(302, `${protocol}://${targetCid}.${rootHost}${pathSuffix}`); + const pathSuffix = + remainder !== undefined ? 
`/${remainder}` : '/'; + res.redirect( + 302, + `${req.protocol}://${targetCid}.${rootHost}${pathSuffix}`, + ); return; } catch { // CID conversion failed — fall through to handler diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts index 43b0b576c..c2d49c071 100644 --- a/src/routes/ipfs.ts +++ b/src/routes/ipfs.ts @@ -110,15 +110,11 @@ function createIpfsPathHandler({ // since CIDv1 base32 is always >51 chars (won't collide with ArNS names) // and works with standard *.{host} wildcard TLS certificates. const cid = parseCid(cidString); - if (cid && isCidV0(cid) && config.ARNS_ROOT_HOSTS.length > 0) { + if (cid !== null && isCidV0(cid) && config.ARNS_ROOT_HOSTS.length > 0) { const v1Base32 = cidToV1Base32(cidString); const rootHost = config.ARNS_ROOT_HOSTS[0].host; - const pathSuffix = path ? `/${path}` : ''; - const protocol = req.protocol; - res.redirect( - 302, - `${protocol}://${v1Base32}.${rootHost}${pathSuffix}`, - ); + const pathSuffix = path !== undefined ? `/${path}` : ''; + res.redirect(302, `${req.protocol}://${v1Base32}.${rootHost}${pathSuffix}`); return; } @@ -158,7 +154,8 @@ async function handleIpfsRequest({ routeType: 'path' | 'subdomain'; }): Promise { const startTime = Date.now(); - const ipfsPath = path ? `${cidString}/${path}` : cidString; + const ipfsPath = + path !== undefined ? 
`${cidString}/${path}` : cidString; parentLog.debug('Handling IPFS request', { cidString, path, routeType }); @@ -178,7 +175,10 @@ async function handleIpfsRequest({ id: cidToV1Base32(cidString), contentSize, contentType: result.contentType, - requestAttributes: { hops: 0, clientIps: extractAllClientIPs(req).clientIps }, + requestAttributes: { + hops: 0, + clientIps: extractAllClientIPs(req).clientIps, + }, rateLimiter, paymentProcessor, }); From ddd1bc54a46f1f313d77ad768ce02b52cb2df293 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 01:00:15 +0000 Subject: [PATCH 03/11] fix: add OTEL tracing and fix lint errors (PE-9067) Add OpenTelemetry span tracing to IPFS request lifecycle: - IpfsService.getContent span (cache check, blocklist, delegation) - KuboDataSource.getContent span (HTTP fetch with latency attributes) - Spans record cache hit/miss, content size, errors, and content type Also fix strict-boolean-expressions and prettier formatting for CI. --- src/ipfs/ipfs-service.ts | 133 ++++++++++++++++++++++++----------- src/ipfs/kubo-data-source.ts | 32 +++++++++ 2 files changed, 123 insertions(+), 42 deletions(-) diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index d7219f46d..ddd75a8a2 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -8,8 +8,10 @@ import fs from 'node:fs'; import crypto from 'node:crypto'; import { Readable } from 'node:stream'; import winston from 'winston'; +import { Span } from '@opentelemetry/api'; import { cidToV1Base32 } from '../lib/ipfs-cid.js'; +import { startChildSpan } from '../tracing.js'; import { IpfsFsCache } from './ipfs-cache.js'; import { IpfsBlocklist } from './ipfs-blocklist.js'; import { @@ -53,60 +55,107 @@ export class IpfsService { cidString, path, signal, + parentSpan, }: { cidString: string; path?: string; signal?: AbortSignal; + parentSpan?: Span; }): Promise { - // Normalize CID to v1 base32 for consistent caching - const normalizedCid = cidToV1Base32(cidString); + const 
span = startChildSpan( + 'IpfsService.getContent', + { + attributes: { + 'ipfs.cid': cidString, + 'ipfs.path': path ?? '', + }, + }, + parentSpan, + ); + + try { + // Normalize CID to v1 base32 for consistent caching + const normalizedCid = cidToV1Base32(cidString); + span.setAttribute('ipfs.cid_normalized', normalizedCid); + + // Check blocklist + if (this.blocklist.isBlocked(normalizedCid)) { + metrics.ipfsBlockedTotal.inc(); + span.setAttribute('ipfs.blocked', true); + throw new IpfsBlockedError(`CID is blocked: ${normalizedCid}`); + } - // Check blocklist - if (this.blocklist.isBlocked(normalizedCid)) { - metrics.ipfsBlockedTotal.inc(); - throw new IpfsBlockedError(`CID is blocked: ${normalizedCid}`); - } + // Reject path traversal attempts + if ( + path !== undefined && + (path.includes('..') || path.startsWith('/')) + ) { + throw new IpfsNotFoundError('Invalid IPFS path'); + } - // Reject path traversal attempts - if ( - path !== undefined && - (path.includes('..') || path.startsWith('/')) - ) { - throw new IpfsNotFoundError('Invalid IPFS path'); - } + // Check cache + const cached = await this.cache.get(normalizedCid, path); + if (cached) { + this.log.debug('IPFS cache hit', { cid: normalizedCid, path }); + metrics.ipfsCacheHitTotal.inc(); + span.setAttributes({ + 'ipfs.cache': 'hit', + 'ipfs.size': cached.size, + }); + span.end(); + return { + stream: cached.stream, + size: cached.size, + contentType: cached.contentType, + cached: true, + }; + } - // Check cache - const cached = await this.cache.get(normalizedCid, path); - if (cached) { - this.log.debug('IPFS cache hit', { cid: normalizedCid, path }); - metrics.ipfsCacheHitTotal.inc(); - return { - stream: cached.stream, - size: cached.size, - contentType: cached.contentType, - cached: true, - }; - } + metrics.ipfsCacheMissTotal.inc(); + span.setAttribute('ipfs.cache', 'miss'); - metrics.ipfsCacheMissTotal.inc(); + // Fetch from Kubo + const result = await this.dataSource.getContent({ + cidString: 
normalizedCid, + path, + signal, + parentSpan: span, + }); - // Fetch from Kubo - const result = await this.dataSource.getContent({ - cidString: normalizedCid, - path, - signal, - }); + span.setAttributes({ + 'ipfs.size': result.size, + 'ipfs.content_type': result.contentType, + }); - // Stream directly to the client while writing to a temp file on disk - // for caching. No memory buffering — handles files of any size. - this.streamToCache(normalizedCid, path, result.stream, result.contentType); + // Stream directly to the client while writing to a temp file on disk + // for caching. No memory buffering — handles files of any size. + this.streamToCache( + normalizedCid, + path, + result.stream, + result.contentType, + ); + + // End span when stream completes + result.stream.on('end', () => span.end()); + result.stream.on('error', (err) => { + span.recordException(err); + span.end(); + }); - return { - stream: result.stream, - size: result.size, - contentType: result.contentType, - cached: false, - }; + return { + stream: result.stream, + size: result.size, + contentType: result.contentType, + cached: false, + }; + } catch (error: any) { + if (error.name !== 'AbortError') { + span.recordException(error); + } + span.end(); + throw error; + } } /** diff --git a/src/ipfs/kubo-data-source.ts b/src/ipfs/kubo-data-source.ts index 817d3e290..4d358b4e3 100644 --- a/src/ipfs/kubo-data-source.ts +++ b/src/ipfs/kubo-data-source.ts @@ -45,10 +45,12 @@ export class KuboDataSource { cidString, path, signal, + parentSpan, }: { cidString: string; path?: string; signal?: AbortSignal; + parentSpan?: Span; }): Promise { signal?.throwIfAborted(); @@ -56,6 +58,18 @@ export class KuboDataSource { path !== undefined && path !== '' ? `${cidString}/${path}` : cidString; const url = `${this.kuboUrl}/ipfs/${ipfsPath}`; + const span = startChildSpan( + 'KuboDataSource.getContent', + { + attributes: { + 'ipfs.cid': cidString, + 'ipfs.path': path ?? 
'', + 'ipfs.url': url, + }, + }, + parentSpan, + ); + this.log.debug('Fetching IPFS content from Kubo', { cidString, path, @@ -122,6 +136,12 @@ export class KuboDataSource { // Switch from connection timeout to stall timeout attachStallTimeout(stream, this.streamStallTimeoutMs); + span.setAttributes({ + 'ipfs.content_length': contentLength, + 'ipfs.content_type': contentType, + }); + span.addEvent('Kubo fetch successful'); + this.log.debug('Kubo fetch successful', { cidString, path, @@ -129,6 +149,13 @@ export class KuboDataSource { contentType, }); + // End span when stream finishes or errors + stream.on('end', () => span.end()); + stream.on('error', (err) => { + span.recordException(err); + span.end(); + }); + return { stream, size: contentLength, @@ -138,6 +165,11 @@ export class KuboDataSource { clearTimeout(connectionTimer); signal?.removeEventListener('abort', onClientAbort); + if (error.name !== 'AbortError') { + span.recordException(error); + } + span.end(); + if (error instanceof IpfsNotFoundError) throw error; if (error instanceof IpfsTimeoutError) throw error; From 9e6c0303d81c23c2920321260c9176563b3dbfbc Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 01:34:32 +0000 Subject: [PATCH 04/11] fix: prettier formatting, IPFS cache volume, /ar-io/info (PE-9067) - Fix prettier line-length formatting (5 errors from CI) - Add IPFS cache volume mount to docker-compose (data persists) - Add IPFS field to /ar-io/info endpoint (network discovery) - Add OTEL span tracing to IpfsService and KuboDataSource --- docker-compose.yaml | 1 + src/ipfs/ipfs-service.ts | 5 +---- src/middleware/ipfs.ts | 6 ++---- src/routes/ar-io-info-builder.ts | 15 +++++++++++++++ src/routes/ar-io.ts | 1 + src/routes/ipfs.ts | 8 +++++--- 6 files changed, 25 insertions(+), 11 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 9a7c15fb6..5fa65da88 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -53,6 +53,7 @@ services: - 
${HEADERS_DATA_PATH:-./data/headers}:/app/data/headers - ${SQLITE_DATA_PATH:-./data/sqlite}:/app/data/sqlite - ${DUCKDB_DATA_PATH:-./data/duckdb}:/app/data/duckdb + - ${IPFS_CACHE_DATA_PATH:-./data/ipfs-cache}:/app/data/ipfs-cache - ${TEMP_DATA_PATH:-./data/tmp}:/app/data/tmp - ${LMDB_DATA_PATH:-./data/lmdb}:/app/data/lmdb - ${PARQUET_DATA_PATH:-./data/parquet}:/app/data/parquet diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index ddd75a8a2..1711190d1 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -86,10 +86,7 @@ export class IpfsService { } // Reject path traversal attempts - if ( - path !== undefined && - (path.includes('..') || path.startsWith('/')) - ) { + if (path !== undefined && (path.includes('..') || path.startsWith('/'))) { throw new IpfsNotFoundError('Invalid IPFS path'); } diff --git a/src/middleware/ipfs.ts b/src/middleware/ipfs.ts index 48375bc43..7a47ff4b7 100644 --- a/src/middleware/ipfs.ts +++ b/src/middleware/ipfs.ts @@ -69,8 +69,7 @@ export function createIpfsSubdomainMiddleware({ if (reqPath !== undefined && reqPath.startsWith('ipfs/')) { const afterIpfs = reqPath.slice(5); // strip 'ipfs/' const slashIdx = afterIpfs.indexOf('/'); - const pathCid = - slashIdx >= 0 ? afterIpfs.slice(0, slashIdx) : afterIpfs; + const pathCid = slashIdx >= 0 ? afterIpfs.slice(0, slashIdx) : afterIpfs; const remainder = slashIdx >= 0 ? afterIpfs.slice(slashIdx + 1) : undefined; @@ -82,8 +81,7 @@ export function createIpfsSubdomainMiddleware({ try { const targetCid = cidToV1Base32(pathCid); const rootHost = matchedEntry.host; - const pathSuffix = - remainder !== undefined ? `/${remainder}` : '/'; + const pathSuffix = remainder !== undefined ? 
`/${remainder}` : '/'; res.redirect( 302, `${req.protocol}://${targetCid}.${rootHost}${pathSuffix}`, diff --git a/src/routes/ar-io-info-builder.ts b/src/routes/ar-io-info-builder.ts index fac7a0552..03c143cfd 100644 --- a/src/routes/ar-io-info-builder.ts +++ b/src/routes/ar-io-info-builder.ts @@ -117,6 +117,13 @@ export interface HttpsigInfo { attestation?: HttpsigAttestationInfo; } +/** + * IPFS configuration exposed in the info endpoint. + */ +export interface IpfsInfo { + enabled: true; +} + /** * Complete AR.IO info endpoint response structure. */ @@ -131,6 +138,7 @@ export interface ArIoInfoResponse { rateLimiter?: RateLimiterInfo; x402?: X402Info; httpsig?: HttpsigInfo; + ipfs?: IpfsInfo; } /** @@ -174,6 +182,9 @@ export interface ArIoInfoConfig { rsaPublicKey: string; }; }; + ipfs?: { + enabled: boolean; + }; } /** @@ -311,5 +322,9 @@ export function buildArIoInfo(config: ArIoInfoConfig): ArIoInfoResponse { }; } + if (config.ipfs?.enabled) { + response.ipfs = { enabled: true }; + } + return response; } diff --git a/src/routes/ar-io.ts b/src/routes/ar-io.ts index 603df24b0..9b0023c72 100644 --- a/src/routes/ar-io.ts +++ b/src/routes/ar-io.ts @@ -217,6 +217,7 @@ export const arIoInfoHandler = (_req: Request, res: Response) => { : undefined, } : undefined, + ipfs: config.IPFS_ENABLED ? { enabled: true } : undefined, }); res.status(200).send(response); diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts index c2d49c071..74d1efe33 100644 --- a/src/routes/ipfs.ts +++ b/src/routes/ipfs.ts @@ -114,7 +114,10 @@ function createIpfsPathHandler({ const v1Base32 = cidToV1Base32(cidString); const rootHost = config.ARNS_ROOT_HOSTS[0].host; const pathSuffix = path !== undefined ? 
`/${path}` : ''; - res.redirect(302, `${req.protocol}://${v1Base32}.${rootHost}${pathSuffix}`); + res.redirect( + 302, + `${req.protocol}://${v1Base32}.${rootHost}${pathSuffix}`, + ); return; } @@ -154,8 +157,7 @@ async function handleIpfsRequest({ routeType: 'path' | 'subdomain'; }): Promise { const startTime = Date.now(); - const ipfsPath = - path !== undefined ? `${cidString}/${path}` : cidString; + const ipfsPath = path !== undefined ? `${cidString}/${path}` : cidString; parentLog.debug('Handling IPFS request', { cidString, path, routeType }); From ffd5ee23f66cd68e5e114fcc5954b226dc7aea28 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 02:08:55 +0000 Subject: [PATCH 05/11] refactor: use existing admin block API for IPFS moderation (PE-9067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove custom text-file blocklist in favor of the existing PUT /ar-io/admin/block-data API. Operators block IPFS CIDs the same way they block Arweave TX IDs — unified moderation, single API. Removed: IpfsBlocklist class, IPFS_BLOCKLIST_PATH config, blocklist volume mount. IpfsService now uses DataBlockListValidator (SQLite). 
--- docker-compose.yaml | 1 - docs/envs.md | 1 - src/config.ts | 5 -- src/ipfs/ipfs-blocklist.ts | 115 ------------------------------------- src/ipfs/ipfs-service.ts | 14 ++--- src/system.ts | 12 +--- 6 files changed, 8 insertions(+), 140 deletions(-) delete mode 100644 src/ipfs/ipfs-blocklist.ts diff --git a/docker-compose.yaml b/docker-compose.yaml index 5fa65da88..371425fb3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -132,7 +132,6 @@ services: - IPFS_CACHE_PATH=${IPFS_CACHE_PATH:-} - IPFS_CACHE_MAX_SIZE_BYTES=${IPFS_CACHE_MAX_SIZE_BYTES:-} - IPFS_CACHE_CLEANUP_THRESHOLD=${IPFS_CACHE_CLEANUP_THRESHOLD:-} - - IPFS_BLOCKLIST_PATH=${IPFS_BLOCKLIST_PATH:-} - IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET:-} - IPFS_RATE_LIMITER_IP_REFILL_PER_SEC=${IPFS_RATE_LIMITER_IP_REFILL_PER_SEC:-} - IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET:-} diff --git a/docs/envs.md b/docs/envs.md index d4b7a15dd..471044988 100644 --- a/docs/envs.md +++ b/docs/envs.md @@ -374,7 +374,6 @@ as a Docker Compose sidecar via the `ipfs` profile). 
| IPFS_CACHE_PATH | String | data/ipfs-cache | Directory for cached IPFS content | | IPFS_CACHE_MAX_SIZE_BYTES | Number | 10737418240 (10 GB) | Maximum cache size before LRU eviction | | IPFS_CACHE_CLEANUP_THRESHOLD | Number | 3600 | Age in seconds before cached files become eviction candidates | -| IPFS_BLOCKLIST_PATH | String | data/ipfs-blocklist.txt | Path to CID blocklist file (one CID per line, hot-reloaded) | | IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET | Number | 50000 | IPFS rate limiter: max tokens per IP bucket | | IPFS_RATE_LIMITER_IP_REFILL_PER_SEC | Number | 5 | IPFS rate limiter: token refill rate per second (IP bucket) | | IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET | Number | 200000 | IPFS rate limiter: max tokens per resource bucket | diff --git a/src/config.ts b/src/config.ts index 72f70c80a..082a6eec5 100644 --- a/src/config.ts +++ b/src/config.ts @@ -2184,11 +2184,6 @@ export const IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS = +env.varOrDefault( '3600', ); -export const IPFS_BLOCKLIST_PATH = env.varOrDefault( - 'IPFS_BLOCKLIST_PATH', - 'data/ipfs-blocklist.txt', -); - export const IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET = +env.varOrDefault( 'IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET', '50000', diff --git a/src/ipfs/ipfs-blocklist.ts b/src/ipfs/ipfs-blocklist.ts deleted file mode 100644 index f20ba355b..000000000 --- a/src/ipfs/ipfs-blocklist.ts +++ /dev/null @@ -1,115 +0,0 @@ -/** - * AR.IO Gateway - * Copyright (C) 2022-2025 Permanent Data Solutions, Inc. All Rights Reserved. 
- * - * SPDX-License-Identifier: AGPL-3.0-or-later - */ -import fs from 'node:fs'; -import winston from 'winston'; -import { watch, FSWatcher } from 'chokidar'; - -import { cidToV1Base32, isValidCid } from '../lib/ipfs-cid.js'; - -export class IpfsBlocklist { - private log: winston.Logger; - private filePath: string; - private blockedCids: Set = new Set(); - private watcher: FSWatcher | null = null; - private reloadTimer: NodeJS.Timeout | null = null; - - constructor({ log, filePath }: { log: winston.Logger; filePath: string }) { - this.log = log.child({ class: this.constructor.name }); - this.filePath = filePath; - } - - async load(): Promise { - try { - const content = await fs.promises.readFile(this.filePath, 'utf-8'); - const newSet = new Set(); - - for (const line of content.split('\n')) { - const trimmed = line.trim(); - if (trimmed === '' || trimmed.startsWith('#')) continue; - - if (isValidCid(trimmed)) { - // Normalize to CIDv1 base32 for consistent matching - try { - newSet.add(cidToV1Base32(trimmed)); - } catch { - this.log.warn('Failed to normalize CID in blocklist', { - cid: trimmed, - }); - } - } else { - this.log.warn('Invalid CID in blocklist, skipping', { - line: trimmed, - }); - } - } - - this.blockedCids = newSet; - this.log.info('IPFS blocklist loaded', { count: newSet.size }); - } catch (error: any) { - if (error.code === 'ENOENT') { - this.log.debug('IPFS blocklist file not found, no CIDs blocked', { - filePath: this.filePath, - }); - this.blockedCids = new Set(); - } else { - this.log.error('Failed to load IPFS blocklist', { - message: error.message, - }); - } - } - } - - isBlocked(cidString: string): boolean { - try { - const normalized = cidToV1Base32(cidString); - return this.blockedCids.has(normalized); - } catch { - return false; - } - } - - startWatching(): void { - this.watcher = watch(this.filePath, { - ignoreInitial: true, - awaitWriteFinish: { stabilityThreshold: 1000 }, - }); - - this.watcher.on('change', () => { - 
this.log.info('IPFS blocklist file changed, reloading'); - this.scheduleReload(); - }); - - this.watcher.on('add', () => { - this.log.info('IPFS blocklist file created, loading'); - this.scheduleReload(); - }); - } - - private scheduleReload(): void { - if (this.reloadTimer) { - clearTimeout(this.reloadTimer); - } - this.reloadTimer = setTimeout(() => { - this.load().catch((error) => { - this.log.error('Failed to reload IPFS blocklist', { - message: error.message, - }); - }); - }, 1000); - } - - stop(): void { - if (this.watcher) { - this.watcher.close(); - this.watcher = null; - } - if (this.reloadTimer) { - clearTimeout(this.reloadTimer); - this.reloadTimer = null; - } - } -} diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index 1711190d1..115000dee 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -13,7 +13,7 @@ import { Span } from '@opentelemetry/api'; import { cidToV1Base32 } from '../lib/ipfs-cid.js'; import { startChildSpan } from '../tracing.js'; import { IpfsFsCache } from './ipfs-cache.js'; -import { IpfsBlocklist } from './ipfs-blocklist.js'; +import { DataBlockListValidator } from '../types.js'; import { KuboDataSource, IpfsBlockedError, @@ -32,23 +32,23 @@ export class IpfsService { private log: winston.Logger; private dataSource: KuboDataSource; private cache: IpfsFsCache; - private blocklist: IpfsBlocklist; + private blockListValidator: DataBlockListValidator; constructor({ log, dataSource, cache, - blocklist, + blockListValidator, }: { log: winston.Logger; dataSource: KuboDataSource; cache: IpfsFsCache; - blocklist: IpfsBlocklist; + blockListValidator: DataBlockListValidator; }) { this.log = log.child({ class: this.constructor.name }); this.dataSource = dataSource; this.cache = cache; - this.blocklist = blocklist; + this.blockListValidator = blockListValidator; } async getContent({ @@ -78,8 +78,8 @@ export class IpfsService { const normalizedCid = cidToV1Base32(cidString); 
span.setAttribute('ipfs.cid_normalized', normalizedCid); - // Check blocklist - if (this.blocklist.isBlocked(normalizedCid)) { + // Check blocklist (uses the same admin API as Arweave data moderation) + if (await this.blockListValidator.isIdBlocked(normalizedCid)) { metrics.ipfsBlockedTotal.inc(); span.setAttribute('ipfs.blocked', true); throw new IpfsBlockedError(`CID is blocked: ${normalizedCid}`); diff --git a/src/system.ts b/src/system.ts index d3a3b943e..794083a91 100644 --- a/src/system.ts +++ b/src/system.ts @@ -1462,14 +1462,12 @@ if (dataVerificationWorker !== undefined) { import { KuboDataSource } from './ipfs/kubo-data-source.js'; import { IpfsFsCache } from './ipfs/ipfs-cache.js'; -import { IpfsBlocklist } from './ipfs/ipfs-blocklist.js'; import { IpfsService } from './ipfs/ipfs-service.js'; import { createIpfsRateLimiter } from './ipfs/ipfs-rate-limiter.js'; import { RateLimiter } from './limiter/types.js'; export let ipfsService: IpfsService | undefined; export let ipfsRateLimiter: RateLimiter | undefined; -export let ipfsBlocklist: IpfsBlocklist | undefined; if (config.IPFS_ENABLED) { log.info('IPFS subsystem enabled, initializing...'); @@ -1487,18 +1485,11 @@ if (config.IPFS_ENABLED) { maxSizeBytes: config.IPFS_CACHE_MAX_SIZE_BYTES, }); - ipfsBlocklist = new IpfsBlocklist({ - log, - filePath: config.IPFS_BLOCKLIST_PATH, - }); - await ipfsBlocklist.load(); - ipfsBlocklist.startWatching(); - ipfsService = new IpfsService({ log, dataSource: kuboDataSource, cache: ipfsCache, - blocklist: ipfsBlocklist, + blockListValidator: dataBlockListValidator, }); ipfsRateLimiter = createIpfsRateLimiter(); @@ -1541,7 +1532,6 @@ export const shutdown = async (exitCode = 0) => { } // Clean up system components - ipfsBlocklist?.stop(); eventEmitter.removeAllListeners(); arIOPeerManager.stopUpdatingPeers(); dataSqliteWalCleanupWorker?.stop(); From 52ab93c2ac76200f2b96405a04016766b66218aa Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 02:17:53 +0000 Subject: 
[PATCH 06/11] refactor: match IPFS rate limits to Arweave defaults, use admin API for moderation (PE-9067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - IPFS rate limiter defaults now match Arweave (100K IP tokens, 20/s refill, 1M resource tokens, 100/s refill) - Remove custom text-file blocklist — use existing PUT /ar-io/admin/block-data API for CID moderation (unified with Arweave content moderation) - Remove IPFS_BLOCKLIST_PATH config and volume mount - Add IPFS cache volume mount to docker-compose for persistence --- src/config.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/config.ts b/src/config.ts index 082a6eec5..dd8842326 100644 --- a/src/config.ts +++ b/src/config.ts @@ -2186,22 +2186,22 @@ export const IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS = +env.varOrDefault( export const IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET = +env.varOrDefault( 'IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET', - '50000', + '100000', ); export const IPFS_RATE_LIMITER_IP_REFILL_PER_SEC = +env.varOrDefault( 'IPFS_RATE_LIMITER_IP_REFILL_PER_SEC', - '5', + '20', ); export const IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET = +env.varOrDefault( 'IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET', - '200000', + '1000000', ); export const IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC = +env.varOrDefault( 'IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC', - '20', + '100', ); export const IPFS_MAX_RESPONSE_SIZE_BYTES = +env.varOrDefault( From 967cc13cb507a705178eb4fb48f45bfe385bd28b Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 02:35:08 +0000 Subject: [PATCH 07/11] docs: add IPFS Grafana dashboard, update moderation docs and rate limit defaults (PE-9067) - Add IPFS Grafana dashboard example (requests/sec, cache hit rate, latency percentiles, content size, blocked requests, route type) - Update OpenAPI spec: block-data endpoint accepts IPFS CIDs - Update ipfs-integration.md: admin API for moderation (not text file) - Match IPFS 
rate limiter defaults to Arweave (100K IP, 1M resource) --- docs/ipfs-integration.md | 23 +++--- docs/openapi.yaml | 7 +- .../dashboards/examples/ipfs-example.json | 72 +++++++++++++++++++ 3 files changed, 87 insertions(+), 15 deletions(-) create mode 100644 monitoring/grafana/dashboards/examples/ipfs-example.json diff --git a/docs/ipfs-integration.md b/docs/ipfs-integration.md index 93d3cd9bb..8a60ef8f7 100644 --- a/docs/ipfs-integration.md +++ b/docs/ipfs-integration.md @@ -325,22 +325,21 @@ and IPFS storage budgets. ## Security and Moderation -### CID Blocklist +### Content Moderation -The blocklist file (`IPFS_BLOCKLIST_PATH`, default `data/ipfs-blocklist.txt`) -allows operators to block specific content: +IPFS content moderation uses the same admin API as Arweave data moderation. +Block a CID using the existing endpoint: -``` -# Blocked content - one CID per line -QmBlockedContent1... -bafybeiblockedcontent2... -# Comments start with # +```bash +curl -X PUT http://localhost:4000/ar-io/admin/block-data \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{"id": "bafkreigbk3hjz6oyiywqf7eknthwc2osvt5xi6b6igwljn2qrxkthqgrp4", "source": "manual", "notes": "Reason for block"}' ``` -- CIDs are normalized before matching, so blocking a CIDv0 also blocks its - CIDv1 equivalent and vice versa. -- The file is watched for changes and reloaded automatically. -- Blocked requests return HTTP 451. +- Pass the CIDv1 base32 string as the `id` field (same field used for Arweave TX IDs). +- Blocked requests return HTTP 451 (Unavailable for Legal Reasons). +- One unified moderation system for all content (Arweave and IPFS). ### Rate Limiting diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 5f2a487cb..6a1d33b3c 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -2809,12 +2809,13 @@ paths: '/ar-io/admin/block-data': put: tags: [Admin] - summary: Blocks transactions or data-items so your AR.IO Gateway will not serve them. 
+ summary: Blocks transactions, data-items, or IPFS CIDs so your AR.IO Gateway will not serve them. description: | - Submits a TX ID/data-item ID or sha-256 content hash for content you do not want your AR.IO Gateway to serve. Once submitted, your Gateway will not respond to requests for these transactions or data-items. + Submits a TX ID/data-item ID, IPFS CID, or sha-256 content hash for content you do not want your AR.IO Gateway to serve. Once submitted, your Gateway will not respond to requests for these transactions, data-items, or IPFS CIDs. + For IPFS content, pass the CIDv1 base32 string (e.g. bafkreigbk3hjz6oyiywqf7eknthwc2osvt5xi6b6igwljn2qrxkthqgrp4) as the id field. The gateway returns HTTP 451 for blocked CIDs. - WARNING - Testing a TX ID here WILL result in that data being blocked by your Gateway. + WARNING - Testing an ID here WILL result in that data being blocked by your Gateway. operationId: adminBlockData requestBody: required: true diff --git a/monitoring/grafana/dashboards/examples/ipfs-example.json b/monitoring/grafana/dashboards/examples/ipfs-example.json new file mode 100644 index 000000000..259a399f6 --- /dev/null +++ b/monitoring/grafana/dashboards/examples/ipfs-example.json @@ -0,0 +1,72 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "reqps" } }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "title": "IPFS Requests/sec", + "type": "timeseries", + "targets": [{ "expr": "sum(rate(ipfs_requests_total[5m])) by (status)", "legendFormat": "{{status}}" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "percentunit" } }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "title": "IPFS Cache Hit Rate", + "type": "stat", + "targets": [{ "expr": "sum(rate(ipfs_cache_hit_total[5m])) / (sum(rate(ipfs_cache_hit_total[5m])) + 
sum(rate(ipfs_cache_miss_total[5m])))", "legendFormat": "hit rate" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "s" } }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "title": "IPFS Request Duration (p50/p95/p99)", + "type": "timeseries", + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(ipfs_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(ipfs_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(ipfs_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p99" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "bytes" } }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 4, + "title": "IPFS Content Size Distribution", + "type": "timeseries", + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(ipfs_content_size_bytes_bucket[5m])) by (le))", "legendFormat": "p50 size" }, + { "expr": "histogram_quantile(0.95, sum(rate(ipfs_content_size_bytes_bucket[5m])) by (le))", "legendFormat": "p95 size" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 5, + "title": "IPFS Blocked Requests", + "type": "timeseries", + "targets": [{ "expr": "sum(rate(ipfs_blocked_total[5m]))", "legendFormat": "blocked/sec" }] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "reqps" } }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 6, + "title": "IPFS by Route Type", + "type": "timeseries", + "targets": [{ "expr": "sum(rate(ipfs_requests_total[5m])) by (route_type)", "legendFormat": "{{route_type}}" }] + } + ], + "schemaVersion": 39, + "tags": 
["ipfs", "ar-io"], + "time": { "from": "now-1h", "to": "now" }, + "title": "AR.IO IPFS", + "uid": "ar-io-ipfs" +} From 2f48d0e78a2c182852bd6a938780a235d57e3229 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 02:44:01 +0000 Subject: [PATCH 08/11] fix: enforce size limit, fix stream leak, update docs defaults (PE-9067) - Enforce IPFS_MAX_RESPONSE_SIZE_BYTES (reject with 413 when Content-Length exceeds limit) - Destroy response stream on 404/408/504 from Kubo (prevents socket leak) - Fix docs/envs.md rate limiter defaults to match config.ts - Add Grafana dashboard example for IPFS metrics - Update OpenAPI spec for CID content moderation --- docs/envs.md | 8 ++++---- src/ipfs/ipfs-service.ts | 17 +++++++++++++++++ src/ipfs/kubo-data-source.ts | 2 ++ src/routes/ipfs.ts | 10 ++++++++++ src/system.ts | 1 + 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/docs/envs.md b/docs/envs.md index 471044988..67c7a62c4 100644 --- a/docs/envs.md +++ b/docs/envs.md @@ -374,8 +374,8 @@ as a Docker Compose sidecar via the `ipfs` profile). 
| IPFS_CACHE_PATH | String | data/ipfs-cache | Directory for cached IPFS content | | IPFS_CACHE_MAX_SIZE_BYTES | Number | 10737418240 (10 GB) | Maximum cache size before LRU eviction | | IPFS_CACHE_CLEANUP_THRESHOLD | Number | 3600 | Age in seconds before cached files become eviction candidates | -| IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET | Number | 50000 | IPFS rate limiter: max tokens per IP bucket | -| IPFS_RATE_LIMITER_IP_REFILL_PER_SEC | Number | 5 | IPFS rate limiter: token refill rate per second (IP bucket) | -| IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET | Number | 200000 | IPFS rate limiter: max tokens per resource bucket | -| IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC | Number | 20 | IPFS rate limiter: token refill rate per second (resource bucket) | +| IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET | Number | 100000 | IPFS rate limiter: max tokens per IP bucket | +| IPFS_RATE_LIMITER_IP_REFILL_PER_SEC | Number | 20 | IPFS rate limiter: token refill rate per second (IP bucket) | +| IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET | Number | 1000000 | IPFS rate limiter: max tokens per resource bucket | +| IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC | Number | 100 | IPFS rate limiter: token refill rate per second (resource bucket) | | IPFS_MAX_RESPONSE_SIZE_BYTES | Number | 1073741824 (1 GB) | Maximum IPFS content size the gateway will serve | diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index 115000dee..57da3c14f 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -18,6 +18,7 @@ import { KuboDataSource, IpfsBlockedError, IpfsNotFoundError, + IpfsSizeLimitError, } from './kubo-data-source.js'; import * as metrics from '../metrics.js'; @@ -33,22 +34,26 @@ export class IpfsService { private dataSource: KuboDataSource; private cache: IpfsFsCache; private blockListValidator: DataBlockListValidator; + private maxResponseSizeBytes: number; constructor({ log, dataSource, cache, blockListValidator, + maxResponseSizeBytes, }: { log: 
winston.Logger; dataSource: KuboDataSource; cache: IpfsFsCache; blockListValidator: DataBlockListValidator; + maxResponseSizeBytes: number; }) { this.log = log.child({ class: this.constructor.name }); this.dataSource = dataSource; this.cache = cache; this.blockListValidator = blockListValidator; + this.maxResponseSizeBytes = maxResponseSizeBytes; } async getContent({ @@ -124,6 +129,18 @@ export class IpfsService { 'ipfs.content_type': result.contentType, }); + // Enforce size limit when Content-Length is known + if ( + this.maxResponseSizeBytes > 0 && + result.size > 0 && + result.size > this.maxResponseSizeBytes + ) { + result.stream.destroy(); + throw new IpfsSizeLimitError( + `IPFS content size ${result.size} exceeds limit ${this.maxResponseSizeBytes}`, + ); + } + // Stream directly to the client while writing to a temp file on disk // for caching. No memory buffering — handles files of any size. this.streamToCache( diff --git a/src/ipfs/kubo-data-source.ts b/src/ipfs/kubo-data-source.ts index 4d358b4e3..f5f5b9df0 100644 --- a/src/ipfs/kubo-data-source.ts +++ b/src/ipfs/kubo-data-source.ts @@ -106,12 +106,14 @@ export class KuboDataSource { signal?.removeEventListener('abort', onClientAbort); if (response.status === 404) { + (response.data as Readable).destroy(); throw new IpfsNotFoundError( `IPFS content not found: /ipfs/${ipfsPath}`, ); } if (response.status === 408 || response.status === 504) { + (response.data as Readable).destroy(); throw new IpfsTimeoutError( `Kubo timed out resolving: /ipfs/${ipfsPath}`, ); diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts index 74d1efe33..03a7eb4a0 100644 --- a/src/routes/ipfs.ts +++ b/src/routes/ipfs.ts @@ -20,6 +20,7 @@ import { IpfsService } from '../ipfs/ipfs-service.js'; import { IpfsBlockedError, IpfsNotFoundError, + IpfsSizeLimitError, IpfsTimeoutError, IpfsUnavailableError, } from '../ipfs/kubo-data-source.js'; @@ -295,6 +296,15 @@ async function handleIpfsRequest({ return; } + if (error instanceof 
IpfsSizeLimitError) { + metrics.ipfsRequestsTotal.inc({ + route_type: routeType, + status: 'size_exceeded', + }); + res.status(413).json({ error: 'Content exceeds size limit' }); + return; + } + // Client disconnected if (error.name === 'AbortError') { return; diff --git a/src/system.ts b/src/system.ts index 794083a91..420a696b5 100644 --- a/src/system.ts +++ b/src/system.ts @@ -1490,6 +1490,7 @@ if (config.IPFS_ENABLED) { dataSource: kuboDataSource, cache: ipfsCache, blockListValidator: dataBlockListValidator, + maxResponseSizeBytes: config.IPFS_MAX_RESPONSE_SIZE_BYTES, }); ipfsRateLimiter = createIpfsRateLimiter(); From a5ceb2ee865332248fcd88001145f14b839d2c04 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 03:03:36 +0000 Subject: [PATCH 09/11] fix: address CodeRabbit review feedback (PE-9067) - Use positiveIntOrDefault for all IPFS numeric configs (prevents NaN) - URL-encode IPFS path segments (prevents request injection) - Guard contentLength parse against NaN (fallback to 0) - Use SANDBOX_PROTOCOL for redirects (correct behind TLS termination) - Enforce size limit during streaming (catches chunked responses) - Set failed=true before cleanup in stream end handler (race fix) - Use conservative size estimate for x402 when Content-Length unknown - Increment cache hit/miss counters in route handler - Fix env var name: IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS - Guard Grafana cache hit rate expr against division by zero - Destroy response stream on 404/408/504 from Kubo (socket leak) --- docker-compose.yaml | 2 +- docs/envs.md | 2 +- .../dashboards/examples/ipfs-example.json | 2 +- src/config.ts | 45 +++++++++---------- src/ipfs/ipfs-service.ts | 18 ++++++++ src/ipfs/kubo-data-source.ts | 15 ++++++- src/middleware/ipfs.ts | 2 +- src/routes/ipfs.ts | 13 ++++-- 8 files changed, 67 insertions(+), 32 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 371425fb3..07510b21d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -131,7 
+131,7 @@ services: - IPFS_STREAM_STALL_TIMEOUT_MS=${IPFS_STREAM_STALL_TIMEOUT_MS:-} - IPFS_CACHE_PATH=${IPFS_CACHE_PATH:-} - IPFS_CACHE_MAX_SIZE_BYTES=${IPFS_CACHE_MAX_SIZE_BYTES:-} - - IPFS_CACHE_CLEANUP_THRESHOLD=${IPFS_CACHE_CLEANUP_THRESHOLD:-} + - IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS=${IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS:-} - IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET:-} - IPFS_RATE_LIMITER_IP_REFILL_PER_SEC=${IPFS_RATE_LIMITER_IP_REFILL_PER_SEC:-} - IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET=${IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET:-} diff --git a/docs/envs.md b/docs/envs.md index 67c7a62c4..d112078eb 100644 --- a/docs/envs.md +++ b/docs/envs.md @@ -373,7 +373,7 @@ as a Docker Compose sidecar via the `ipfs` profile). | IPFS_STREAM_STALL_TIMEOUT_MS | Number | 30000 | Stall timeout — max time with no data before aborting stream (ms) | | IPFS_CACHE_PATH | String | data/ipfs-cache | Directory for cached IPFS content | | IPFS_CACHE_MAX_SIZE_BYTES | Number | 10737418240 (10 GB) | Maximum cache size before LRU eviction | -| IPFS_CACHE_CLEANUP_THRESHOLD | Number | 3600 | Age in seconds before cached files become eviction candidates | +| IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS | Number | 3600 | Age in seconds before cached files become eviction candidates | | IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET | Number | 100000 | IPFS rate limiter: max tokens per IP bucket | | IPFS_RATE_LIMITER_IP_REFILL_PER_SEC | Number | 20 | IPFS rate limiter: token refill rate per second (IP bucket) | | IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET | Number | 1000000 | IPFS rate limiter: max tokens per resource bucket | diff --git a/monitoring/grafana/dashboards/examples/ipfs-example.json b/monitoring/grafana/dashboards/examples/ipfs-example.json index 259a399f6..c3f4772f4 100644 --- a/monitoring/grafana/dashboards/examples/ipfs-example.json +++ b/monitoring/grafana/dashboards/examples/ipfs-example.json @@ -18,7 +18,7 @@ "id": 2, "title": "IPFS Cache 
Hit Rate", "type": "stat", - "targets": [{ "expr": "sum(rate(ipfs_cache_hit_total[5m])) / (sum(rate(ipfs_cache_hit_total[5m])) + sum(rate(ipfs_cache_miss_total[5m])))", "legendFormat": "hit rate" }] + "targets": [{ "expr": "sum(rate(ipfs_cache_hit_total[5m])) / clamp_min(sum(rate(ipfs_cache_hit_total[5m])) + sum(rate(ipfs_cache_miss_total[5m])), 1)", "legendFormat": "hit rate" }] }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, diff --git a/src/config.ts b/src/config.ts index dd8842326..9b48bb9d1 100644 --- a/src/config.ts +++ b/src/config.ts @@ -2156,14 +2156,14 @@ export const IPFS_KUBO_URL = env.varOrDefault( 'http://kubo:8080', ); -export const IPFS_KUBO_REQUEST_TIMEOUT_MS = +env.varOrDefault( +export const IPFS_KUBO_REQUEST_TIMEOUT_MS = env.positiveIntOrDefault( 'IPFS_KUBO_REQUEST_TIMEOUT_MS', - '30000', + 30000, ); -export const IPFS_STREAM_STALL_TIMEOUT_MS = +env.varOrDefault( +export const IPFS_STREAM_STALL_TIMEOUT_MS = env.positiveIntOrDefault( 'IPFS_STREAM_STALL_TIMEOUT_MS', - '30000', + 30000, ); export const IPFS_CACHE_PATH = env.varOrDefault( @@ -2171,40 +2171,39 @@ export const IPFS_CACHE_PATH = env.varOrDefault( 'data/ipfs-cache', ); -export const IPFS_CACHE_MAX_SIZE_BYTES = +env.varOrDefault( +export const IPFS_CACHE_MAX_SIZE_BYTES = env.positiveIntOrDefault( 'IPFS_CACHE_MAX_SIZE_BYTES', - `${10 * 1024 * 1024 * 1024}`, // 10 GB + 10 * 1024 * 1024 * 1024, // 10 GB ); // Reserved for future cache cleanup worker. Currently unused — LRU eviction // in the in-memory index handles cache bounding. After restarts, disk usage // may temporarily exceed IPFS_CACHE_MAX_SIZE_BYTES until the index rebuilds. 
-export const IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS = +env.varOrDefault( - 'IPFS_CACHE_CLEANUP_THRESHOLD', - '3600', +export const IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS = env.positiveIntOrDefault( + 'IPFS_CACHE_CLEANUP_THRESHOLD_SECONDS', + 3600, ); -export const IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET = +env.varOrDefault( +export const IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET = env.positiveIntOrDefault( 'IPFS_RATE_LIMITER_IP_TOKENS_PER_BUCKET', - '100000', + 100000, ); -export const IPFS_RATE_LIMITER_IP_REFILL_PER_SEC = +env.varOrDefault( +export const IPFS_RATE_LIMITER_IP_REFILL_PER_SEC = env.positiveIntOrDefault( 'IPFS_RATE_LIMITER_IP_REFILL_PER_SEC', - '20', + 20, ); -export const IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET = +env.varOrDefault( - 'IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET', - '1000000', -); +export const IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET = + env.positiveIntOrDefault( + 'IPFS_RATE_LIMITER_RESOURCE_TOKENS_PER_BUCKET', + 1000000, + ); -export const IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC = +env.varOrDefault( - 'IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC', - '100', -); +export const IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC = + env.positiveIntOrDefault('IPFS_RATE_LIMITER_RESOURCE_REFILL_PER_SEC', 100); -export const IPFS_MAX_RESPONSE_SIZE_BYTES = +env.varOrDefault( +export const IPFS_MAX_RESPONSE_SIZE_BYTES = env.positiveIntOrDefault( 'IPFS_MAX_RESPONSE_SIZE_BYTES', - `${1 * 1024 * 1024 * 1024}`, // 1 GB + 1 * 1024 * 1024 * 1024, // 1 GB ); diff --git a/src/ipfs/ipfs-service.ts b/src/ipfs/ipfs-service.ts index 57da3c14f..99967ee74 100644 --- a/src/ipfs/ipfs-service.ts +++ b/src/ipfs/ipfs-service.ts @@ -230,6 +230,23 @@ export class IpfsService { stream.on('data', (chunk: Buffer) => { if (failed) return; bytesWritten += chunk.length; + + // Enforce size limit during streaming (catches chunked responses + // that lack Content-Length) + if ( + this.maxResponseSizeBytes > 0 && + bytesWritten > this.maxResponseSizeBytes + ) { + failed = true; 
+ stream.destroy( + new IpfsSizeLimitError( + `IPFS content exceeds limit during streaming: ${bytesWritten} > ${this.maxResponseSizeBytes}`, + ), + ); + cleanup(); + return; + } + if (writeStream) { writeStream.write(chunk); } else { @@ -241,6 +258,7 @@ export class IpfsService { stream.on('end', () => { if (failed || !writeStream) { // If writeStream never became ready, discard + failed = true; cleanup(); return; } diff --git a/src/ipfs/kubo-data-source.ts b/src/ipfs/kubo-data-source.ts index f5f5b9df0..776793d08 100644 --- a/src/ipfs/kubo-data-source.ts +++ b/src/ipfs/kubo-data-source.ts @@ -54,8 +54,16 @@ export class KuboDataSource { }): Promise { signal?.throwIfAborted(); + // URL-encode path segments to prevent breaking the upstream request + const encodedPath = + path !== undefined && path !== '' + ? path + .split('/') + .map((seg) => encodeURIComponent(seg)) + .join('/') + : undefined; const ipfsPath = - path !== undefined && path !== '' ? `${cidString}/${path}` : cidString; + encodedPath !== undefined ? `${cidString}/${encodedPath}` : cidString; const url = `${this.kuboUrl}/ipfs/${ipfsPath}`; const span = startChildSpan( @@ -128,10 +136,13 @@ export class KuboDataSource { } const stream = response.data as Readable; - const contentLength = parseInt( + const rawContentLength = parseInt( response.headers['content-length'] ?? '0', 10, ); + const contentLength = Number.isFinite(rawContentLength) + ? rawContentLength + : 0; const contentType = response.headers['content-type'] ?? 'application/octet-stream'; diff --git a/src/middleware/ipfs.ts b/src/middleware/ipfs.ts index 7a47ff4b7..539883f67 100644 --- a/src/middleware/ipfs.ts +++ b/src/middleware/ipfs.ts @@ -84,7 +84,7 @@ export function createIpfsSubdomainMiddleware({ const pathSuffix = remainder !== undefined ? `/${remainder}` : '/'; res.redirect( 302, - `${req.protocol}://${targetCid}.${rootHost}${pathSuffix}`, + `${config.SANDBOX_PROTOCOL ?? 
req.protocol}://${targetCid}.${rootHost}${pathSuffix}`, ); return; } catch { diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts index 03a7eb4a0..8a976c699 100644 --- a/src/routes/ipfs.ts +++ b/src/routes/ipfs.ts @@ -117,7 +117,7 @@ function createIpfsPathHandler({ const pathSuffix = path !== undefined ? `/${path}` : ''; res.redirect( 302, - `${req.protocol}://${v1Base32}.${rootHost}${pathSuffix}`, + `${config.SANDBOX_PROTOCOL ?? req.protocol}://${v1Base32}.${rootHost}${pathSuffix}`, ); return; } @@ -170,8 +170,10 @@ async function handleIpfsRequest({ }); // Check payment and rate limits (x402 + rate limiting in one call). - // Content size is needed for token calculation and payment pricing. - const contentSize = result.size > 0 ? result.size : 1024; // min 1KB for pricing + // When Content-Length is unknown (chunked), use a conservative estimate + // that gets corrected in the token adjustment after streaming. + const contentSize = + result.size > 0 ? result.size : config.IPFS_MAX_RESPONSE_SIZE_BYTES; const limitCheck = await checkPaymentAndRateLimits({ req, res, @@ -216,6 +218,11 @@ async function handleIpfsRequest({ // Track metrics const cacheStatus = result.cached ? 'hit' : 'miss'; metrics.ipfsRequestsTotal.inc({ route_type: routeType, status: 'success' }); + if (result.cached) { + metrics.ipfsCacheHitTotal.inc(); + } else { + metrics.ipfsCacheMissTotal.inc(); + } if (result.size > 0) { metrics.ipfsContentSizeHistogram.observe(result.size); } From 04fcc12a443cd3d2f4276ac5f1902290b94606d9 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 14:47:35 +0000 Subject: [PATCH 10/11] feat: emit data-cached webhook event for IPFS content (PE-9067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPFS cached content now triggers the same data-cached event as Arweave, feeding into the existing webhook system. No new configuration needed — operators' existing webhook consumers see IPFS content automatically. 
--- src/ipfs/ipfs-cache.ts | 16 ++++++++++++++++ src/system.ts | 1 + 2 files changed, 17 insertions(+) diff --git a/src/ipfs/ipfs-cache.ts b/src/ipfs/ipfs-cache.ts index c7847c1b0..54af569ba 100644 --- a/src/ipfs/ipfs-cache.ts +++ b/src/ipfs/ipfs-cache.ts @@ -5,12 +5,16 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ import crypto from 'node:crypto'; +import EventEmitter from 'node:events'; import fs from 'node:fs'; import { Readable } from 'node:stream'; import { pipeline } from 'node:stream/promises'; import winston from 'winston'; import { LRUCache } from 'lru-cache'; +import * as events from '../events.js'; +import { currentUnixTimestamp } from '../lib/time.js'; + interface CacheEntry { size: number; contentType: string; @@ -20,18 +24,22 @@ export class IpfsFsCache { private log: winston.Logger; private baseDir: string; private index: LRUCache; + private eventEmitter?: EventEmitter; constructor({ log, basePath, maxSizeBytes, + eventEmitter, }: { log: winston.Logger; basePath: string; maxSizeBytes: number; + eventEmitter?: EventEmitter; }) { this.log = log.child({ class: this.constructor.name }); this.baseDir = basePath; + this.eventEmitter = eventEmitter; this.index = new LRUCache({ maxSize: maxSizeBytes, sizeCalculation: (entry) => entry.size, @@ -212,6 +220,14 @@ export class IpfsFsCache { key, size, }); + + this.eventEmitter?.emit(events.DATA_CACHED, { + id: cidString, + hash: key, + dataSize: size, + contentType, + cachedAt: currentUnixTimestamp(), + }); } catch (error: any) { this.log.error('Failed to finalize cached IPFS content', { cidString, diff --git a/src/system.ts b/src/system.ts index 420a696b5..f7aab452b 100644 --- a/src/system.ts +++ b/src/system.ts @@ -1483,6 +1483,7 @@ if (config.IPFS_ENABLED) { log, basePath: config.IPFS_CACHE_PATH, maxSizeBytes: config.IPFS_CACHE_MAX_SIZE_BYTES, + eventEmitter, }); ipfsService = new IpfsService({ From c7e4b4a47cb4dab9f3ca4fa29ee32584bea8293a Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2026 
15:57:42 +0000 Subject: [PATCH 11/11] fix: match blocked response format to Arweave, add webhook event for IPFS cache (PE-9067) - IPFS blocked response now matches Arweave format (plain text with ID) - Note: the data-cached webhook event for IPFS content (same event as Arweave, feeding the content scanner pipeline automatically) was added in the previous commit; this commit contains only the blocked-response change --- src/routes/ipfs.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/routes/ipfs.ts b/src/routes/ipfs.ts index 8a976c699..056ef8ae9 100644 --- a/src/routes/ipfs.ts +++ b/src/routes/ipfs.ts @@ -272,7 +272,11 @@ async function handleIpfsRequest({ 'Cache-Control', `public, max-age=${config.CACHE_BLOCKED_MAX_AGE}, immutable`, ); - res.status(451).json({ error: 'Content blocked' }); + res + .status(451) + .send( + `Requested content blocked by this node's content policy. Blocked ID: ${cidString}`, + ); return; }