From 1866c4492e4e0021a0af28568b66933118b19c4f Mon Sep 17 00:00:00 2001
From: Nils Schimmelmann <nschimme@gmail.com>
Date: Wed, 4 Mar 2026 06:35:17 -0600
Subject: [PATCH 1/5] CI: add FAAC Benchmark Suite to test quality

---
 .github/workflows/benchmark.yml | 180 +++++++++++
 .gitignore                      |   7 +
 tests/README.md                 |  90 ++++++
 tests/compare_results.py        | 519 ++++++++++++++++++++++++++++++++
 tests/requirements.txt          |   4 +
 tests/run_benchmark.py          | 364 ++++++++++++++++++++++
 tests/setup_datasets.py         | 261 ++++++++++++++++
 7 files changed, 1425 insertions(+)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 tests/README.md
 create mode 100644 tests/compare_results.py
 create mode 100644 tests/requirements.txt
 create mode 100644 tests/run_benchmark.py
 create mode 100644 tests/setup_datasets.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 00000000..4c2ad7e5
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,180 @@
+name: Benchmark
+
+on:
+  pull_request:
+    branches: [ "master" ]
+    paths:
+      - "libfaac/**"
+      - "tests/**"
+
+jobs:
+  benchmark:
+    # NOTE: ViSQOL via visqol-py is currently most reliable on ubuntu-22.04.
+    name: ${{ matrix.arch }} / ${{ matrix.precision }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [amd64]
+        precision: [single, double]
+        include:
+          - arch: amd64
+            os: ubuntu-22.04
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y meson ninja-build bc ffmpeg
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history: the base branch is checked out below
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+          cache-dependency-path: 'tests/requirements.txt'
+
+      - name: Install Python dependencies
+        run: |
+          pip install --upgrade pip setuptools wheel
+          pip install -r tests/requirements.txt
+
+      - name: Restore Datasets
+        id: cache-datasets
+        uses: actions/cache/restore@v4
+        with:
+          path: tests/data/external
+          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}
+
+      - name: Setup Datasets
+        if: steps.cache-datasets.outputs.cache-hit != 'true'  # cache miss only
+        run: |
+          python3 tests/setup_datasets.py
+
+      - name: Save Datasets
+        if: steps.cache-datasets.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v4
+        with:
+          path: tests/data/external
+          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}
+
+      - name: Determine Baseline SHA
+        id: baseline-sha
+        run: |
+          git checkout "${{ github.base_ref || 'master' }}"
+          echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
+          git checkout ${{ github.sha }}
+
+      - name: Restore Baseline Results
+        id: cache-baseline
+        uses: actions/cache/restore@v4
+        with:
+          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
+          key: ${{ runner.os }}-${{ matrix.arch }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}
+
+      - name: Run Benchmark (Baseline)
+        if: steps.cache-baseline.outputs.cache-hit != 'true'
+        run: |
+          git checkout ${{ steps.baseline-sha.outputs.sha }}
+          meson setup build_base -Dfloating-point=${{ matrix.precision }} --buildtype=release
+          ninja -C build_base
+          LIB_PATH="build_base/libfaac/libfaac.so"
+          FAAC_PATH="build_base/frontend/faac"
+          # Restore benchmark scripts and config from PR branch to ensure consistent comparison logic
+          git checkout ${{ github.sha }} -- tests/
+          python3 tests/run_benchmark.py "$FAAC_PATH" "$LIB_PATH" "${{ matrix.arch }}_${{ matrix.precision }}_base" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json" --coverage 100
+
+      - name: Save Baseline Results
+        if: always() && steps.cache-baseline.outputs.cache-hit != 'true'  # also runs after a failed step
+        uses: actions/cache/save@v4
+        with:
+          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
+          key: ${{ runner.os }}-${{ matrix.arch }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}
+
+      - name: Run Benchmark (Candidate)
+        run: |
+          git checkout ${{ github.sha }}
+          mkdir -p tests/results
+          meson setup build_cand -Dfloating-point=${{ matrix.precision }} --buildtype=release
+          ninja -C build_cand
+          LIB_PATH="build_cand/libfaac/libfaac.so"
+          FAAC_PATH="build_cand/frontend/faac"
+          python3 tests/run_benchmark.py "$FAAC_PATH" "$LIB_PATH" "${{ matrix.arch }}_${{ matrix.precision }}_cand" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_cand.json" --coverage 100
+
+      - name: Upload Results
+        uses: actions/upload-artifact@v4
+        with:
+          name: results-${{ matrix.arch }}-${{ matrix.precision }}
+          path: tests/results/*.json
+
+  report:
+    name: Consolidated Report
+    needs: benchmark
+    runs-on: ubuntu-latest
+    env:
+      BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }}
+      CAND_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+    if: always()  # still report when a benchmark matrix job failed
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Download all results
+        uses: actions/download-artifact@v4
+        with:
+          path: tests/results
+          pattern: results-*
+          merge-multiple: true
+
+      - name: Generate Report
+        id: generate
+        run: |
+          # Summary report for PR comment (high-signal only)
+          python3 tests/compare_results.py tests/results --summary-only --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-summary.md || echo "REGRESSION_DETECTED=1" >> "$GITHUB_ENV"
+          # Full report for artifact (all details)
+          python3 tests/compare_results.py tests/results --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-full.md || true
+          if [ ! -s report-summary.md ]; then
+            echo "Error: report-summary.md is empty"
+            exit 1
+          fi
+          cat report-summary.md
+
+      - name: Upload Full Report
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-report-full
+          path: report-full.md
+
+      - name: PR Feedback
+        if: always() && github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            if (fs.existsSync('report-summary.md')) {
+              let report = fs.readFileSync('report-summary.md', 'utf8').trim();
+              if (report.length > 0) {
+                const jobUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
+                const readmeUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/blob/${process.env.GITHUB_SHA}/tests/README.md`;
+                report += `\n\n---\n[View Detailed Job Log and Full Report](${jobUrl}) | [What Is This?](${readmeUrl})`;
+                await github.rest.issues.createComment({  // await so the step waits for the request
+                  issue_number: context.issue.number,
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  body: report
+                });
+              }
+            }
+
+      - name: Check for Regressions
+        run: |
+          if [ "${{ env.REGRESSION_DETECTED }}" == "1" ]; then
+            echo "Regressions or missing data detected. Failing job."
+            exit 1
+          fi
diff --git a/.gitignore b/.gitignore
index 1e20e33b..ec9b9a11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,10 @@ Makefile*
 *.user
 /libfaac/win32_ver.h
 /libfaac/faac.pc
+.DS_Store
+/build*/
+/venv/
+/tests/__pycache__/
+/tests/data/external/
+/tests/output/
+/tests/results/
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..d47c7104
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,90 @@
+# FAAC Benchmark Suite
+
+FAAC is the high-efficiency encoder for the resource-constrained world. From hobbyist projects to professional surveillance (VSS) and embedded VoIP, we prioritize performance where every cycle and byte matters.
+
+This suite provides the objective data necessary to ensure that every change moves us closer to our North Star: the optimal balance of quality, speed, and size.
+
+---
+
+## The "Golden Triangle" Philosophy
+
+We evaluate every contribution against three competing pillars. While high-bitrate encoders like FDK-AAC or Opus target multi-channel, high-fidelity entertainment, FAAC focuses on remaining approachable and distributable for the global open-source community. We prioritize non-patent-encumbered areas and the standard Low Complexity (LC-AAC) profile.
+
+1.  **Audio Fidelity**: We target transparent audio quality for our bitrates. We use objective metrics like ViSQOL (MOS) to ensure psychoacoustic improvements truly benefit the listener without introducing "metallic" ringing or "underwater" artifacts.
+2.  **Computational Efficiency**: FAAC must remain fast. We optimize for low-power cores where encoding speed is a critical requirement. Every CPU cycle saved is a win for our users.
+3.  **Minimal Footprint**: Binary size is a feature. We ensure the library remains small enough to fit within restrictive embedded firmware.
+
+---
+
+## Benchmarking Scenarios
+
+| Scenario | Mode | Source | Config | Project Goal |
+| :--- | :--- | :--- | :--- | :--- |
+| **VoIP** | Speech (16k) | TCD-VoIP | `-b 16` | Clear communication at low bitrates (16kbps). |
+| **VSS** | Speech (16k) | TCD-VoIP | `-b 40` | High-fidelity Video Surveillance Systems recording (40kbps). |
+| **Music** | Audio (48k) | PMLT / SoundExpert | `-b 64-256` | Full-range transparency for storage & streaming. |
+| **Throughput** | Efficiency | Synthetic Signals | Default | Stability test using 10-minute Sine/Sweep/Noise/Silence. |
+
+---
+
+## Metric Definitions
+
+| Metric | Definition | Reference |
+| :--- | :--- | :--- |
+| **MOS** | Mean Opinion Score (LQO). Predicted perceptual quality from 1.0 (Bad) to 5.0 (Excellent), computed via the **ViSQOL** model. | [ITU-T P.800](https://www.itu.int/rec/T-REC-P.800), [ViSQOL](https://github.com/google/visqol) |
+| **Regressions** | Critical failure (e.g. a missing MOS score) or a drop in MOS of more than 0.1 compared to the baseline commit. Significant throughput drops (>10%) or increased binary size also warrant review. | |
+| **Significant Win** | An improvement in MOS of more than 0.1 compared to the baseline commit. | |
+| **Consistency** | Percentage of test cases where bitstreams are MD5-identical to the baseline. | |
+| **Throughput** | Normalized encoding speed improvement against baseline. Higher % indicates faster execution. | |
+| **Library Size** | Binary footprint of `libfaac.so`. Delta measured against baseline. Critical for embedded VSS/IoT targets. | |
+| **Bitrate Δ** | Percentage change in generated file size against baseline. Relative shift in bits used for the same target. | |
+| **Bitrate Accuracy** | The closeness of the achieved bitrate to the specified target (ABR mode). Measures the encoder's ability to respect the user-defined bitrate budget. | |
+
+---
+
+## Dataset Sources
+
+We are grateful to the following projects for providing high-quality research material:
+
+*   **TCD-VoIP (Sigmedia-VoIP)**: [Listener Test Database](https://www.sigmedia.tv/datasets/tcd_voip_ltd/) - Specifically designed for assessing quality in VoIP applications.
+*   **PMLT2014**: [Public Multiformat Listening Test](https://listening-test.coresv.net/) - A community-defined comprehensive multi-codec benchmark.
+*   **SoundExpert**: [Sound Samples](https://soundexpert.org/sound-samples) - High-precision EBU SQAM CD excerpts for transparency testing.
+
+---
+
+## Quick Start
+
+### 1. Install Dependencies
+```bash
+# System (Ubuntu/Debian)
+sudo apt-get update && sudo apt-get install -y meson ninja-build bc ffmpeg
+
+# Python
+python3 -m venv venv
+source venv/bin/activate
+pip install -r tests/requirements.txt
+```
+
+### 2. Prepare Datasets
+Downloads samples and generates 10-minute synthetic throughput signals (Sine, Sweep, Noise, Silence).
+```bash
+python3 tests/setup_datasets.py
+```
+
+### 3. Run a Benchmark
+Perceptual analysis and full test suite coverage are enabled by default. Use `--skip-mos` or `--coverage 10` for faster iteration during local development.
+```bash
+python3 tests/run_benchmark.py build/frontend/faac build/libfaac/libfaac.so my_run tests/results/my_run.json
+```
+
+### 4. Compare Results
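+Generate a high-signal summary comparing your candidate against a baseline. The script pairs files named `<suite>_base.json` and `<suite>_cand.json` in the results directory, so name the output files of your two runs accordingly.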
+```bash
+python3 tests/compare_results.py tests/results/
+```
+
+## Who This Suite Helps
+
+*   **Maintainers**: Provides the confidence to merge PRs by proving that a change improves the encoder—or at least doesn't cause a regression.
+*   **Developers**: Offers standardized, automated feedback during implementation.
+*   **Users**: Ensures that every new version of FAAC remains a reliable choice for their critical firmware and communication projects.
diff --git a/tests/compare_results.py b/tests/compare_results.py
new file mode 100644
index 00000000..edae0175
--- /dev/null
+++ b/tests/compare_results.py
@@ -0,0 +1,519 @@
+"""
+ * FAAC Benchmark Suite
+ * Copyright (C) 2026 Nils Schimmelmann
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library.  If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""
+
+import json
+import sys
+import os
+from collections import defaultdict
+
+
+def analyze_pair(base_file, cand_file):
+    """Compare one baseline/candidate result JSON pair.
+
+    Returns a dict of aggregate stats and per-case rows, or None when the
+    candidate file cannot be loaded; an unreadable baseline is treated as {}.
+    """
+    try:
+        with open(base_file, "r") as f:
+            base = json.load(f)
+    except Exception as e:
+        sys.stderr.write(f"  Warning: Could not load baseline file {base_file}: {e}\n")
+        base = {}
+
+    try:
+        with open(cand_file, "r") as f:
+            cand = json.load(f)
+    except Exception as e:
+        sys.stderr.write(f"  Error: Could not load candidate file {cand_file}: {e}\n")
+        return None
+
+    suite_results = {
+        "has_regression": False,
+        "missing_data": False,
+        "mos_delta_sum": 0,
+        "mos_count": 0,
+        "missing_mos_count": 0,
+        "tp_reduction": 0,
+        "lib_size_chg": 0,
+        "bitrate_chg_sum": 0,
+        "bitrate_count": 0,
+        "bitrate_acc_sum": 0,
+        "bitrate_acc_count": 0,
+        "regressions": [],
+        "new_wins": [],
+        "significant_wins": [],
+        "opportunities": [],
+        "bit_exact_count": 0,
+        "total_cases": 0,
+        "all_cases": [],
+        "scenario_stats": defaultdict(
+            lambda: {"tp_sum_cand": 0, "tp_sum_base": 0, "count": 0}),
+        "base_tp": base.get("throughput", {}),
+        "cand_tp": cand.get("throughput", {})}
+
+    base_m = base.get("matrix", {})
+    cand_m = cand.get("matrix", {})
+
+    if cand_m:
+        suite_results["total_cases"] = len(cand_m)
+        for k in sorted(cand_m.keys()):
+            o = cand_m[k]
+            b = base_m.get(k, {})
+
+            filename = o.get("filename", k)
+            scenario = o.get("scenario", "")
+            display_name = f"{scenario}: {filename}"
+
+            o_mos = o.get("mos")
+            b_mos = b.get("mos")
+            thresh = o.get("thresh", 1.0)
+
+            o_size = o.get("size")
+            b_size = b.get("size")
+
+            o_bitrate = o.get("bitrate")
+            o_target = o.get("bitrate_target")
+
+            if o_bitrate is not None and o_target is not None and o_target > 0:
+                acc = (1.0 - abs(o_bitrate - o_target) / o_target) * 100
+                suite_results["bitrate_acc_sum"] += acc
+                suite_results["bitrate_acc_count"] += 1
+
+            o_time = o.get("time")
+            b_time = b.get("time")
+
+            if o_time is not None and b_time is not None and b_time > 0:
+                suite_results["scenario_stats"][scenario]["tp_sum_cand"] += o_time
+                suite_results["scenario_stats"][scenario]["tp_sum_base"] += b_time
+                suite_results["scenario_stats"][scenario]["count"] += 1
+
+            o_md5 = o.get("md5", "")
+            b_md5 = b.get("md5", "")
+
+            if o_md5 and b_md5 and o_md5 == b_md5:
+                suite_results["bit_exact_count"] += 1
+
+            size_chg = "N/A"
+            # A zero-byte baseline output has no meaningful relative delta.
+            if o_size is not None and b_size is not None and b_size > 0:
+                size_chg_val = (o_size - b_size) / b_size * 100
+                size_chg = f"{size_chg_val:+.2f}%"
+                suite_results["bitrate_chg_sum"] += size_chg_val
+                suite_results["bitrate_count"] += 1
+            elif o_size is None:
+                suite_results["missing_data"] = True
+
+            status = "✅"
+            delta = 0
+            if o_mos is not None:
+                if b_mos is not None:
+                    delta = o_mos - b_mos
+                    suite_results["mos_delta_sum"] += delta
+                    suite_results["mos_count"] += 1
+
+                if o_mos < (thresh - 0.5):
+                    status = "🤮"  # Awful
+                elif o_mos < thresh:
+                    status = "📉"  # Bad/Poor
+
+                if b_mos is not None:
+                    if (o_mos - b_mos) < -0.1:
+                        status = "❌"  # Regression
+                        suite_results["has_regression"] = True
+                    elif (o_mos - b_mos) > 0.1:
+                        status = "🌟"  # Significant Win
+
+                # Check for New Win (Baseline failed, Candidate passed)
+                if b_mos is not None and b_mos < thresh and o_mos >= thresh:
+                    suite_results["new_wins"].append({
+                        "display_name": display_name,
+                        "mos": o_mos,
+                        "b_mos": b_mos,
+                        "delta": delta
+                    })
+            else:
+                status = "❌"  # Missing MOS is a failure
+                suite_results["missing_mos_count"] += 1
+                suite_results["has_regression"] = True
+                suite_results["missing_data"] = True
+                delta = -10.0  # Force to top of regressions
+
+            mos_str = f"{o_mos:.2f}" if o_mos is not None else "N/A"
+            b_mos_str = f"{b_mos:.2f}" if b_mos is not None else "N/A"
+            delta_mos = f"{(o_mos - b_mos):+.2f}" if (
+                o_mos is not None and b_mos is not None) else "N/A"
+
+            case_data = {
+                "display_name": display_name,
+                "status": status,
+                "mos": o_mos,
+                "b_mos": b_mos,
+                "delta": delta,
+                "size_chg": size_chg,
+                "line": f"| {display_name} | {status} | {mos_str} ({b_mos_str}) | {delta_mos} | {size_chg} |"
+            }
+
+            suite_results["all_cases"].append(case_data)
+            if status == "❌":
+                suite_results["regressions"].append(case_data)
+            elif status == "🌟":
+                suite_results["significant_wins"].append(case_data)
+            elif status in ["🤮", "📉"]:
+                suite_results["opportunities"].append(case_data)
+    else:
+        suite_results["missing_data"] = True
+
+    # Sorts
+    suite_results["regressions"].sort(key=lambda x: x["delta"])
+    suite_results["new_wins"].sort(key=lambda x: x["delta"], reverse=True)
+    suite_results["significant_wins"].sort(
+        key=lambda x: x["delta"], reverse=True)
+    suite_results["opportunities"].sort(
+        key=lambda x: x["mos"] if x["mos"] is not None else 6.0)
+
+    # Throughput
+    base_tp = base.get("throughput", {})
+    cand_tp = cand.get("throughput", {})
+    # Exclude "overall" to avoid double-counting in manual summation
+    total_base_t = sum(v for k, v in base_tp.items() if k != "overall")
+    total_cand_t = sum(v for k, v in cand_tp.items() if k != "overall")
+    if total_cand_t > 0 and total_base_t > 0:
+        suite_results["tp_reduction"] = (1 - total_cand_t / total_base_t) * 100
+    else:
+        # If overall throughput is missing, try to aggregate from scenarios
+        stats = suite_results["scenario_stats"].values()
+        cand_t_sum = sum(s["tp_sum_cand"] for s in stats)
+        base_t_sum = sum(s["tp_sum_base"] for s in stats)
+        if cand_t_sum > 0 and base_t_sum > 0:
+            suite_results["tp_reduction"] = (1 - cand_t_sum / base_t_sum) * 100
+        else:
+            suite_results["missing_data"] = True
+
+    # Binary Size
+    base_lib = base.get("lib_size", 0)
+    cand_lib = cand.get("lib_size", 0)
+    if cand_lib > 0 and base_lib > 0:
+        suite_results["lib_size_chg"] = ((cand_lib / base_lib) - 1) * 100
+    else:
+        suite_results["missing_data"] = True
+
+    return suite_results
+
+
+def main():
+    """Build the Markdown benchmark report for every suite in a results dir.
+
+    Usage: compare_results.py [RESULTS_DIR] [--summary-only]
+           [--base-sha SHA] [--cand-sha SHA]
+    Writes the report to stdout; exits 1 on regressions or missing data.
+    """
+    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+    summary_only = "--summary-only" in sys.argv
+    if summary_only:
+        sys.argv.remove("--summary-only")
+
+    def pop_option(flag):
+        """Remove `flag VALUE` from sys.argv; return VALUE, or None if absent."""
+        if flag not in sys.argv:
+            return None
+        idx = sys.argv.index(flag)
+        if idx + 1 >= len(sys.argv):
+            sys.exit(f"Error: {flag} requires a value")
+        value = sys.argv[idx + 1]
+        del sys.argv[idx:idx + 2]
+        return value
+
+    base_sha = pop_option("--base-sha")
+    cand_sha = pop_option("--cand-sha")
+
+    results_dir = (sys.argv[1] if len(sys.argv) > 1
+                   else os.path.join(SCRIPT_DIR, "results"))
+
+    if not os.path.exists(results_dir):
+        sys.stderr.write(f"Results directory not found: {results_dir}\n")
+        sys.exit(1)
+
+    files = os.listdir(results_dir)
+
+    suites = {}
+    for f in files:
+        if f.endswith("_cand.json"):
+            suite_name = f[:-10]
+            base_f = suite_name + "_base.json"
+            if base_f in files:
+                suites[suite_name] = (os.path.join(results_dir, base_f),
+                                      os.path.join(results_dir, f))
+
+    if not suites:
+        sys.stderr.write("No result pairs found in directory.\n")
+        sys.exit(1)
+
+    all_suite_data = {}
+    overall_regression = False
+    overall_missing = False
+    total_mos_delta = 0
+    total_mos_count = 0
+    total_missing_mos = 0
+    total_tp_reduction = 0
+    total_lib_chg = 0
+    total_bitrate_chg = 0
+    total_bitrate_count = 0
+    total_bitrate_acc_sum = 0
+    total_bitrate_acc_count = 0
+
+    total_regressions = 0
+    total_new_wins = 0
+    total_significant_wins = 0
+    total_bit_exact = 0
+    total_cases_all = 0
+
+    # For worst-case scenario throughput
+    scenario_tp_deltas = []
+
+    for name, (base, cand) in sorted(suites.items()):
+        data = analyze_pair(base, cand)
+        if data:
+            all_suite_data[name] = data
+            if data["has_regression"]:
+                overall_regression = True
+            if data["missing_data"]:
+                overall_missing = True
+            total_mos_delta += data["mos_delta_sum"]
+            total_mos_count += data["mos_count"]
+            total_missing_mos += data["missing_mos_count"]
+            total_tp_reduction += data["tp_reduction"]
+            total_lib_chg += data["lib_size_chg"]
+            total_bitrate_chg += data["bitrate_chg_sum"]
+            total_bitrate_count += data["bitrate_count"]
+            total_bitrate_acc_sum += data["bitrate_acc_sum"]
+            total_bitrate_acc_count += data["bitrate_acc_count"]
+
+            total_regressions += len(data["regressions"])
+            total_new_wins += len(data["new_wins"])
+            total_significant_wins += len(data["significant_wins"])
+            total_bit_exact += data["bit_exact_count"]
+            total_cases_all += data["total_cases"]
+
+            for sc_name, sc_data in data["scenario_stats"].items():
+                if sc_data["tp_sum_base"] > 0:
+                    delta = (1 - sc_data["tp_sum_cand"] /
+                             sc_data["tp_sum_base"]) * 100
+                    scenario_tp_deltas.append((f"{name} / {sc_name}", delta))
+
+    avg_mos_delta_str = (f"{total_mos_delta / total_mos_count:+.3f}"
+                         if total_mos_count > 0 else "N/A")
+    avg_tp_reduction = total_tp_reduction / \
+        len(all_suite_data) if all_suite_data else 0
+    avg_lib_chg = total_lib_chg / len(all_suite_data) if all_suite_data else 0
+    avg_bitrate_chg = total_bitrate_chg / \
+        total_bitrate_count if total_bitrate_count > 0 else 0
+    avg_bitrate_acc = total_bitrate_acc_sum / \
+        total_bitrate_acc_count if total_bitrate_acc_count > 0 else 0
+
+    bit_exact_percent = (
+        total_bit_exact / total_cases_all * 100) if total_cases_all > 0 else 0
+
+    # Worst-case throughput
+    worst_tp_scen, worst_tp_delta = (None, 0)
+    if scenario_tp_deltas:
+        worst_tp_scen, worst_tp_delta = min(
+            scenario_tp_deltas, key=lambda x: x[1])
+
+    report = []
+    if overall_regression:
+        report.append("## ❌ Quality Regression Detected")
+    elif worst_tp_delta < -5.0:
+        report.append("## ⚠️ Performance Regression Detected")
+    elif overall_missing:
+        report.append("## ❌ Incomplete/Missing Data Detected")
+    elif bit_exact_percent == 100.0:
+        report.append("## ✅ Refactor Verified (Bit-Identical)")
+    elif total_new_wins > 0 or total_significant_wins > 0 or (total_mos_count > 0 and (total_mos_delta / total_mos_count) > 0.01) or avg_tp_reduction > 5:
+        report.append("## 🚀 Perceptual & Efficiency Improvement")
+    else:
+        report.append("## 📊 Benchmark Summary")
+
+    if not summary_only and (base_sha or cand_sha):
+        report.append("\n### Environment")
+        if base_sha:
+            report.append(f"- **Baseline SHA**: `{base_sha}`")
+        if cand_sha:
+            report.append(f"- **Candidate SHA**: `{cand_sha}`")
+
+    report.append("\n### Summary")
+    report.append("| Metric | Value |")
+    report.append("| :--- | :--- |")
+
+    # Regressions (Always shown)
+    reg_status = "0 ✅" if total_regressions == 0 else f"{total_regressions} ❌"
+    report.append(f"| **Regressions** | {reg_status} |")
+
+    # New Wins (Only if baseline < threshold and candidate >= threshold)
+    if total_new_wins > 0:
+        report.append(f"| **New Wins** | {total_new_wins} 🆕 |")
+
+    # Significant Wins (MOS delta > 0.1)
+    if total_significant_wins > 0:
+        report.append(f"| **Significant Wins** | {total_significant_wins} 🌟 |")
+
+    # Bitstream Consistency (Against baseline)
+    consist_status = f"{bit_exact_percent:.1f}%"
+    if bit_exact_percent == 100.0:
+        consist_status += " (MD5 Match)"
+    report.append(f"| **Consistency** | {consist_status} |")
+
+    # Throughput
+    if abs(avg_tp_reduction) > 0.1:
+        tp_icon = "🚀" if avg_tp_reduction > 1.0 else "📉" if avg_tp_reduction < -1.0 else ""
+        report.append(
+            f"| **Throughput (Avg)** | {avg_tp_reduction:+.1f}% {tp_icon} |")
+
+    # Per-signal throughput deltas if available
+    tp_details = []
+    if all_suite_data:
+        first_data = list(all_suite_data.values())[0]
+        base_tp = first_data.get("base_tp", {})
+        cand_tp = first_data.get("cand_tp", {})
+        for signal in sorted(cand_tp.keys()):
+            if signal == "overall":
+                continue
+            if signal in base_tp and base_tp[signal] > 0:
+                delta = (1 - cand_tp[signal] / base_tp[signal]) * 100
+                icon = "🚀" if delta > 1.0 else "📉" if delta < -1.0 else ""
+                tp_details.append(
+                    f"{signal.split('.')[0]}: {delta:+.1f}% {icon}")
+
+    if tp_details:
+        report.append(f"| **TP Breakdown** | {', '.join(tp_details)} |")
+
+    if worst_tp_delta < -1.0:
+        report.append(
+            f"| **Worst-case TP Δ** | {worst_tp_delta:.1f}% ({worst_tp_scen}) ⚠️ |")
+
+    # Binary Size
+    if abs(avg_lib_chg) > 0.01:
+        size_icon = "📉" if avg_lib_chg < -0.1 else "📈" if avg_lib_chg > 0.1 else ""
+        report.append(
+            f"| **Library Size** | {avg_lib_chg:+.2f}% {size_icon} |")
+
+
+    # Bitrate Δ
+    if abs(avg_bitrate_chg) > 0.1:
+        bitrate_icon = "📉" if avg_bitrate_chg < -1.0 else "📈" if avg_bitrate_chg > 1.0 else ""
+        report.append(
+            f"| **Bitrate Δ** | {avg_bitrate_chg:+.2f}% {bitrate_icon} |")
+
+    # Bitrate Accuracy
+    if total_bitrate_acc_count > 0:
+        acc_icon = "🎯" if avg_bitrate_acc > 95 else "⚠️" if avg_bitrate_acc < 80 else ""
+        report.append(
+            f"| **Bitrate Accuracy** | {avg_bitrate_acc:.1f}% {acc_icon} |")
+
+    # Avg MOS Delta
+    if total_mos_count > 0 and abs(total_mos_delta / total_mos_count) > 0.001:
+        report.append(f"| **Avg MOS Delta** | {avg_mos_delta_str} |")
+
+    if total_missing_mos > 0:
+        report.append(
+            f"\n⚠️ **Warning**: {total_missing_mos} MOS scores were missing/failed (treated as ❌).")
+
+    if not summary_only:
+        # 1. Collapsible Details: Regressions
+        if total_regressions > 0:
+            report.append(
+                "\n<details><summary><b>❌ View Regression Details ({})</b></summary>\n".format(total_regressions))
+            for name, data in sorted(all_suite_data.items()):
+                if data["regressions"]:
+                    report.append(f"\n#### {name}")
+                    report.append(
+                        "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
+                    report.append("| :--- | :---: | :---: | :---: | :---: |")
+                    for r in data["regressions"]:
+                        report.append(r["line"])
+            report.append("\n</details>")
+
+        # 2. Collapsible Additional Details
+        report.append(
+            "\n<details><summary><b>View Additional Suite Details & Wins</b></summary>\n")
+
+        for name, data in sorted(all_suite_data.items()):
+            status_icon = "✅"
+            if data["has_regression"]:
+                status_icon = "❌"
+            elif data["missing_data"]:
+                status_icon = "❌"
+
+            avg_mos_suite = (f"{data['mos_delta_sum'] / data['mos_count']:+.3f}"
+                             if data["mos_count"] > 0 else "N/A")
+            suite_bit_exact_percent = (
+                data["bit_exact_count"] / data["total_cases"] * 100
+            ) if data["total_cases"] > 0 else 0
+
+            report.append(f"\n#### {status_icon} {name}")
+            report.append(
+                f"- MOS Δ: {avg_mos_suite}, TP Δ: {data['tp_reduction']:+.1f}%, Size Δ: {data['lib_size_chg']:+.2f}%")
+            report.append(
+                f"- Bitstream Consistency: {suite_bit_exact_percent:.1f}%")
+
+            if data["new_wins"]:
+                report.append("\n**🆕 New Wins**")
+                report.append("| Test Case | MOS (Base) | Delta |")
+                report.append("| :--- | :---: | :---: |")
+                for w in data["new_wins"]:
+                    report.append("| {} | {:.2f} ({:.2f}) | {:+.2f} |".format(
+                        w["display_name"], w["mos"], w["b_mos"], w["delta"]))
+
+            if data["significant_wins"]:
+                report.append("\n**🌟 Significant Wins**")
+                report.append(
+                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
+                report.append("| :--- | :---: | :---: | :---: | :---: |")
+                for w in data["significant_wins"]:
+                    report.append(w["line"])
+
+            if data["opportunities"]:
+                report.append("\n**💡 Opportunities**")
+                report.append(
+                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
+                report.append("| :--- | :---: | :---: | :---: | :---: |")
+                for o in data["opportunities"]:
+                    report.append(o["line"])
+
+            if data["all_cases"]:
+                report.append(
+                    f"\n<details><summary>View all {len(data['all_cases'])} cases for {name}</summary>\n")
+                report.append(
+                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
+                report.append("| :--- | :---: | :---: | :---: | :---: |")
+                for c in data["all_cases"]:
+                    report.append(c["line"])
+                report.append("\n</details>")
+
+        report.append("\n</details>")
+
+    output = "\n".join(report)
+    sys.stdout.write(output + "\n")
+
+    if overall_regression or overall_missing:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 00000000..2ee1d2a2
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+protobuf==3.20.3
+ffmpeg-python
+git+https://github.com/diggerdu/visqol-py.git@452eb5c4f17fd2404f968ec2eeadfcad74925485
diff --git a/tests/run_benchmark.py b/tests/run_benchmark.py
new file mode 100644
index 00000000..232dbb54
--- /dev/null
+++ b/tests/run_benchmark.py
@@ -0,0 +1,364 @@
+"""
+ * FAAC Benchmark Suite
+ * Copyright (C) 2026 Nils Schimmelmann
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""
+
+import os
+import subprocess
+import time
+import sys
+import json
+import tempfile
+import hashlib
+import concurrent.futures
+import multiprocessing
+
+try:
+    import visqol_py
+    from visqol_py import ViSQOLMode
+    HAS_VISQOL = True
+except ImportError:
+    HAS_VISQOL = False
+
+try:
+    import ffmpeg
+    HAS_FFMPEG = True
+except ImportError:
+    HAS_FFMPEG = False
+
+# Paths relative to script directory
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+EXTERNAL_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external")
+OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
+
+SCENARIOS = {  # scenario name -> settings read by process_sample()/run_benchmark()
+    "voip": {
+        "mode": "speech",  # "speech" -> mono ViSQOL speech mode + speech/ data dir
+        "rate": 16000,
+        "visqol_rate": 16000,  # sample rate (Hz) of the WAVs prepared for ViSQOL
+        "bitrate": 16,  # handed to faac as "-b"
+        "thresh": 2.5},  # copied into each result; presumably a MOS floor — confirm
+    "vss": {
+        "mode": "speech",
+        "rate": 16000,
+        "visqol_rate": 16000,
+        "bitrate": 40,
+        "thresh": 3.0},
+    "music_low": {
+        "mode": "audio",
+        "rate": 48000,
+        "visqol_rate": 48000,
+        "bitrate": 64,
+        "thresh": 3.5},
+    "music_std": {
+        "mode": "audio",
+        "rate": 48000,
+        "visqol_rate": 48000,
+        "bitrate": 128,
+        "thresh": 4.0},
+    "music_high": {
+        "mode": "audio",
+        "rate": 48000,
+        "visqol_rate": 48000,
+        "bitrate": 256,
+        "thresh": 4.3}}
+
+
+def get_visqol_mode(mode_str):  # "speech" -> SPEECH, anything else -> AUDIO
+    if HAS_VISQOL:
+        return ViSQOLMode.AUDIO if mode_str != "speech" else ViSQOLMode.SPEECH
+    return None
+
+
+def get_binary_size(path):
+    """Return the size of *path* in bytes, or 0 when it does not exist."""
+    present = os.path.exists(path)
+    return os.path.getsize(path) if present else 0
+
+
+def get_md5(path):  # hex MD5 of the file's bytes; "" when path is absent
+    if os.path.exists(path):
+        digest = hashlib.md5()
+        with open(path, "rb") as stream:
+            for block in iter(lambda: stream.read(4096), b""):  # 4096-byte reads
+                digest.update(block)
+        return digest.hexdigest()
+    return ""
+
+
+def run_visqol(visqol, ref_wav, deg_wav):
+    """Score deg_wav against ref_wav with *visqol*; None if absent or on error."""
+    score = None
+    if visqol is not None:
+        try:
+            measurement = visqol.measure(ref_wav, deg_wav)
+            score = float(measurement.moslqo)
+        except Exception as err:
+            print(f" ViSQOL API error: {err}")
+    return score
+
+
+# Process-local storage for ViSQOL instances
+_process_visqol_instances = {}
+
+
+def get_process_visqol(mode_str):
+    """Return this process's cached ViSQOL instance for mode_str (None if unusable)."""
+    if not HAS_VISQOL:
+        return None
+    if mode_str not in _process_visqol_instances:
+        try:
+            mode = get_visqol_mode(mode_str)
+            _process_visqol_instances[mode_str] = visqol_py.ViSQOL(mode=mode)
+        except Exception as e:
+            # One-line f-string: a split replacement field needs Python 3.12+.
+            print(f" Failed to initialize ViSQOL in process {os.getpid()}: {e}")
+            _process_visqol_instances[mode_str] = None  # remember the failure
+    return _process_visqol_instances[mode_str]
+
+
+def worker_init(cpu_id_queue):
+    """Pool initializer: take one CPU id off the queue and pin this worker to it."""
+    core = cpu_id_queue.get()  # consumed even where pinning is unsupported
+    if hasattr(os, "sched_setaffinity"):
+        try:
+            os.sched_setaffinity(0, [core])
+        except Exception as err:
+            print(f" Failed to pin process {os.getpid()} to CPU {core}: {err}")
+
+
+def process_sample(faac_bin_path, name, cfg, sample, data_dir, precision, env):
+    """Encode one sample with faac and return (key, metrics), or None on failure."""
+    input_path = os.path.join(data_dir, sample)
+    key = f"{name}_{sample}"
+    output_path = os.path.join(OUTPUT_DIR, f"{key}_{precision}.aac")
+
+    # Determine encoding parameters
+    cmd = [faac_bin_path, "-o", output_path, input_path]
+    cmd.extend(["-b", str(cfg["bitrate"])])
+
+    try:
+        t_start = time.perf_counter()  # monotonic, unlike time.time()
+        subprocess.run(cmd, env=env, check=True, capture_output=True)
+        t_duration = time.perf_counter() - t_start
+
+        mos = None
+        aac_size = os.path.getsize(output_path)
+        actual_bitrate = None
+
+        if HAS_FFMPEG:
+            try:
+                probe = ffmpeg.probe(input_path)
+                duration = float(probe['format']['duration'])
+                if duration > 0:
+                    # kbps = (bytes * 8) / (seconds * 1000)
+                    actual_bitrate = (aac_size * 8) / (duration * 1000)
+            except Exception as e:
+                print(f" Failed to probe duration for {sample}: {e}")
+
+        if HAS_FFMPEG and HAS_VISQOL:  # the decoded WAVs only feed ViSQOL
+            with tempfile.TemporaryDirectory() as tmpdir:
+                v_ref = os.path.join(tmpdir, "vref.wav")
+                v_deg = os.path.join(tmpdir, "vdeg.wav")
+                v_rate = cfg["visqol_rate"]
+                v_channels = 1 if cfg["mode"] == "speech" else 2
+
+                try:
+                    # Use ffmpeg-python to decode AAC and prepare files for
+                    # ViSQOL
+                    ffmpeg.input(input_path).output(
+                        v_ref, ar=v_rate, ac=v_channels, sample_fmt='s16').run(
+                        quiet=True, overwrite_output=True)
+                    ffmpeg.input(output_path).output(
+                        v_deg, ar=v_rate, ac=v_channels, sample_fmt='s16').run(
+                        quiet=True, overwrite_output=True)
+
+                    if os.path.exists(v_ref) and os.path.exists(v_deg):
+                        visqol = get_process_visqol(cfg["mode"])
+                        mos = run_visqol(visqol, v_ref, v_deg)
+                except ffmpeg.Error as e:
+                    detail = e.stderr.decode() if e.stderr else e
+                    print(f" FFmpeg error for {sample}: {detail}")
+
+        return key, {
+            "mos": mos,
+            "size": aac_size,
+            "bitrate": actual_bitrate,
+            "bitrate_target": cfg.get("bitrate"),
+            "time": t_duration,
+            "md5": get_md5(output_path),
+            "thresh": cfg["thresh"],
+            "scenario": name,
+            "filename": sample
+        }
+    except Exception as e:
+        print(f" failed: {e}")
+        return None
+
+
+def run_benchmark(
+        faac_bin_path,
+        lib_path,
+        precision,
+        coverage=100,
+        run_perceptual=True):
+    """Run the optional perceptual pass and the throughput pass; return the results dict."""
+    env = os.environ.copy()
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    results = {
+        "matrix": {},
+        "throughput": {},
+        "lib_size": get_binary_size(lib_path)
+    }
+
+    if run_perceptual:
+        print(f"Starting perceptual benchmark for {precision}...")
+        # Detect number of CPUs for parallelization
+        num_cpus = os.cpu_count() or 1
+        print(f"Parallelizing across {num_cpus} threads.")
+
+        for name, cfg in SCENARIOS.items():
+            data_subdir = "speech" if cfg["mode"] == "speech" else "audio"
+            data_dir = os.path.join(EXTERNAL_DATA_DIR, data_subdir)
+            if not os.path.exists(data_dir):
+                print(
+                    f"  [Scenario: {name}] Data directory {data_dir} not found, skipping.")
+                continue
+
+            all_samples = sorted(
+                [f for f in os.listdir(data_dir) if f.endswith(".wav")])
+            total = len(all_samples)  # clamp below: never more runs than files
+            num_to_run = min(total, max(1, int(total * coverage / 100.0)))
+            step = total / num_to_run if num_to_run > 0 else 1
+            samples = [all_samples[int(i * step)] for i in range(num_to_run)]
+
+            print(
+                f"  [Scenario: {name}] Processing {len(samples)} samples "
+                f"(coverage {coverage}%)...")
+
+            # Pin each process to a unique CPU core
+            manager = multiprocessing.Manager()
+            cpu_id_queue = manager.Queue()
+            for cpu_id in range(num_cpus):
+                cpu_id_queue.put(cpu_id)
+
+            with concurrent.futures.ProcessPoolExecutor(
+                max_workers=num_cpus,
+                initializer=worker_init,
+                initargs=(cpu_id_queue,)
+            ) as executor:
+                futures = {
+                    executor.submit(
+                        process_sample,
+                        faac_bin_path,
+                        name,
+                        cfg,
+                        sample,
+                        data_dir,
+                        precision,
+                        env): sample for sample in samples}
+                for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)):
+                    result = future.result()
+                    if result:
+                        key, data = result
+                        results["matrix"][key] = data
+                        mos = data["mos"]
+                        mos_str = f"{mos:.2f}" if mos is not None else "N/A"
+                        print(
+                            f"    ({i + 1}/{len(samples)}) {data['filename']} done. (MOS: {mos_str})")
+
+    print(f"Measuring throughput for {precision}...")
+    # Pin current process to a single core for accurate throughput measurement
+    if hasattr(os, "sched_setaffinity"):
+        try:
+            os.sched_setaffinity(0, [0])
+        except Exception:
+            pass  # pinning is best-effort; measure unpinned if it is refused
+
+    tp_dir = os.path.join(EXTERNAL_DATA_DIR, "throughput")
+    if os.path.exists(tp_dir):
+        tp_samples = sorted(
+            [f for f in os.listdir(tp_dir) if f.endswith(".wav")])
+        if tp_samples:
+            overall_durations = []
+            for sample in tp_samples:
+                input_path = os.path.join(tp_dir, sample)
+                output_path = os.path.join(
+                    OUTPUT_DIR, f"tp_{sample}_{precision}.aac")
+
+                print(f"  Benchmarking throughput with {sample}...")
+                try:
+                    # Warmup
+                    subprocess.run([faac_bin_path,
+                                    "-o",
+                                    output_path,
+                                    input_path],
+                                   env=env,
+                                   check=True,
+                                   capture_output=True)
+
+                    # Multiple runs to average noise
+                    durations = []
+                    for _ in range(3):
+                        start_time = time.perf_counter()
+                        subprocess.run([faac_bin_path,
+                                        "-o",
+                                        output_path,
+                                        input_path],
+                                       env=env,
+                                       check=True,
+                                       capture_output=True)
+                        durations.append(time.perf_counter() - start_time)
+
+                    avg_dur = sum(durations) / len(durations)
+                    results["throughput"][sample] = avg_dur
+                    overall_durations.append(avg_dur)
+                except Exception as e:  # Ctrl-C / SystemExit must still abort
+                    print(f"    Throughput benchmark failed for {sample}: {e}")
+
+            if overall_durations:
+                results["throughput"]["overall"] = sum(
+                    overall_durations) / len(overall_durations)
+
+    return results
+
+
+if __name__ == "__main__":
+    usage = (
+        "Usage: python3 tests/run_benchmark.py <faac_bin_path> <lib_path> <precision_name> <output_json> [--skip-mos] [--coverage 100]")
+    if len(sys.argv) < 5:
+        print(usage)
+        sys.exit(1)
+
+    do_perc = "--skip-mos" not in sys.argv
+    coverage = 100
+    if "--coverage" in sys.argv:
+        try:
+            coverage = int(sys.argv[sys.argv.index("--coverage") + 1])
+        except (IndexError, ValueError):  # flag given without a usable integer
+            print(usage)
+            sys.exit(1)
+
+    data = run_benchmark(sys.argv[1], sys.argv[2], sys.argv[3],
+                         coverage=coverage, run_perceptual=do_perc)
+
+    # Ensure results directory exists
+    output_json = os.path.abspath(sys.argv[4])
+    os.makedirs(os.path.dirname(output_json), exist_ok=True)
+    with open(output_json, "w") as f:
+        json.dump(data, f, indent=2)
diff --git a/tests/setup_datasets.py b/tests/setup_datasets.py
new file mode 100644
index 00000000..735ce3b3
--- /dev/null
+++ b/tests/setup_datasets.py
@@ -0,0 +1,261 @@
+"""
+ * FAAC Benchmark Suite
+ * Copyright (C) 2026 Nils Schimmelmann
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""
+
+import os
+import urllib.request
+import zipfile
+import shutil
+import wave
+import re
+import ffmpeg
+
+DATASETS = {  # dataset key -> archive "url" to download and display "name"
+    "PMLT2014": {
+        "url": "https://github.com/nschimme/PMLT2014/archive/refs/tags/PMLT2014.zip",
+        "name": "Public Multiformat Listening Test @ 96 kbps (July 2014)"
+    },
+    "TCD-VOIP": {
+        "url": "https://github.com/nschimme/TCD-VOIP/archive/refs/tags/harte2015tcd.zip",
+        "name": "TCD-VoIP (Sigmedia-VoIP) Listener Test Database"
+    },
+    "SoundExpert": {
+        "url": "https://github.com/nschimme/SoundExpert/archive/refs/tags/SoundExpert.zip",
+        "name": "SoundExpert Sound samples"
+    }
+}
+
+# Paths relative to script directory
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+BASE_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external")
+TEMP_DIR = os.path.join(SCRIPT_DIR, "data", "temp")
+
+
+def download_and_extract(name, url):  # cache <name>.zip in TEMP_DIR, then unpack it
+    os.makedirs(TEMP_DIR, exist_ok=True)
+    zip_path = os.path.join(TEMP_DIR, f"{name}.zip")
+    if not os.path.exists(zip_path):
+        print(f"Downloading {name}...")
+        urllib.request.urlretrieve(url, zip_path + ".part")
+        os.replace(zip_path + ".part", zip_path)  # only complete files get the name
+    print(f"Extracting {name}...")
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(TEMP_DIR)
+
+
+def get_info(wav_path):
+    """Return (duration_seconds, channels) for a WAV; (0, 2) if it is unreadable."""
+    try:
+        with wave.open(wav_path, 'rb') as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            return frames / float(rate), f.getnchannels()
+    except Exception:  # not BaseException: Ctrl-C must still abort the setup
+        return 0, 2
+
+
+def resample(
+        input_path,
+        output_path,
+        rate,
+        channels,
+        start=None,
+        duration=None,
+        loop=False):
+    """Convert input_path to s16 audio at output_path via ffmpeg.
+
+    start/duration are passed as ffmpeg's ss/t; loop repeats the input first.
+    """
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    try:
+        input_args = {}
+        output_args = {}
+
+        if loop:
+            # Loop input indefinitely, then trim to requested duration
+            input_args['stream_loop'] = -1
+
+        if start is not None:
+            output_args['ss'] = start
+        if duration is not None:
+            output_args['t'] = duration
+
+        stream = ffmpeg.input(input_path, **input_args)
+        stream = stream.output(output_path, ar=rate, ac=channels,
+                               sample_fmt='s16', **output_args)
+        stream.run(quiet=True, overwrite_output=True)
+    except ffmpeg.Error as e:  # best-effort: report and carry on
+        detail = e.stderr.decode() if e.stderr else e
+        print(f" FFmpeg error during setup: {detail}")
+
+
+def get_tier_params(dur):
+    """
+    Pick (start, duration, loop) per the ViSQOL 5-10 s recommendation.
+    - shorter than 5 s: (0, 5, True), i.e. loop from the start up to 5 s
+    - 5 s to 10 s inclusive: (None, None, False), i.e. keep the whole clip
+    - longer than 10 s: the centred 10 s segment, without looping
+    """
+    if dur < 5.0:
+        return 0, 5, True
+    fits = dur <= 10.0
+    centred = ((dur - 10) / 2, 10, False)
+    return (None, None, False) if fits else centred
+
+
+def setup_pmlt():
+    """Resample the PMLT2014 "*48k.wav" files from TEMP_DIR into the audio set."""
+    dataset_info = DATASETS["PMLT2014"]
+    src_dir = os.path.join(TEMP_DIR, "PMLT2014-PMLT2014")
+    dest_dir = os.path.join(BASE_DATA_DIR, "audio")
+
+    wav_files = [
+        os.path.join(folder, fname)
+        for folder, _subdirs, fnames in os.walk(src_dir)
+        for fname in fnames
+        if fname.endswith("48k.wav") and not re.search(r"48k\.\d+\.wav$", fname)
+    ]
+
+    total = len(wav_files)
+    print(f"Found {total} valid samples for {dataset_info['name']}.")
+    for index, wav in enumerate(wav_files, start=1):
+        filename = os.path.basename(wav)
+        print(f"  [{index}/{total}] Processing {filename}...")
+        dur, chans = get_info(wav)
+        start, duration, loop = get_tier_params(dur)
+        # 48 kHz output, channel count taken from the source file
+        resample(
+            wav,
+            os.path.join(dest_dir, filename),
+            48000,
+            chans,
+            start=start, duration=duration, loop=loop)
+
+
+def setup_tcd_voip():
+    """Resample the TCD-VoIP WAVs from TEMP_DIR into the 16 kHz mono speech set."""
+    dataset_info = DATASETS["TCD-VOIP"]
+    src_dir = os.path.join(TEMP_DIR, "TCD-VOIP-harte2015tcd")
+    dest_dir = os.path.join(BASE_DATA_DIR, "speech")
+
+    def wanted(folder):
+        # Do not use any wave files if they're in a "ref" folder
+        if "ref" in folder.split(os.sep):
+            return False
+        return "Test Set" in folder or "chop" in folder
+
+    wav_files = [
+        os.path.join(folder, fname)
+        for folder, _subdirs, fnames in os.walk(src_dir)
+        if wanted(folder)
+        for fname in fnames
+        if fname.endswith(".wav")
+    ]
+
+    total = len(wav_files)
+    print(f"Found {total} valid samples for {dataset_info['name']}.")
+    for index, wav in enumerate(wav_files, start=1):
+        filename = os.path.basename(wav)
+        print(f"  [{index}/{total}] Processing {filename}...")
+        dur, chans = get_info(wav)
+        start, duration, loop = get_tier_params(dur)
+        # ViSQOL speech mode requires 16k mono
+        resample(
+            wav, os.path.join(dest_dir, filename),
+            16000, 1,
+            start=start, duration=duration, loop=loop)
+
+
+def setup_soundexpert():
+    """Resample every SoundExpert WAV found under TEMP_DIR into the audio set."""
+    dataset_info = DATASETS["SoundExpert"]
+    src_dir = os.path.join(TEMP_DIR, "SoundExpert-SoundExpert")
+    dest_dir = os.path.join(BASE_DATA_DIR, "audio")
+
+    wav_files = [
+        os.path.join(folder, fname)
+        for folder, _subdirs, fnames in os.walk(src_dir)
+        for fname in fnames
+        if fname.endswith(".wav")
+    ]
+
+    total = len(wav_files)
+    print(f"Found {total} valid samples for {dataset_info['name']}.")
+    for index, wav in enumerate(wav_files, start=1):
+        filename = os.path.basename(wav)
+        print(f"  [{index}/{total}] Processing {filename}...")
+        dur, chans = get_info(wav)
+        start, duration, loop = get_tier_params(dur)
+        # 48 kHz output, channel count taken from the source file
+        resample(
+            wav,
+            os.path.join(dest_dir, filename),
+            48000,
+            chans,
+            start=start, duration=duration, loop=loop)
+
+
+def setup_throughput_signals():
+    """Generate 10-minute test signals for throughput measurement."""
+    dest_dir = os.path.join(BASE_DATA_DIR, "throughput")
+    os.makedirs(dest_dir, exist_ok=True)
+
+    signals = {
+        "sine": "sine=f=440:d=600",
+        "sweep": "aevalsrc='sin(2*PI*(100+(20000-100)/(2*600)*t)*t)':d=600",
+        "noise": "anoisesrc=d=600",
+        "silence": "anullsrc=d=600"
+    }
+
+    print("Generating 10-minute throughput signals...")
+    for name, filter_str in signals.items():
+        output_path = os.path.join(dest_dir, f"{name}.wav")
+        if not os.path.exists(output_path):
+            print(f"  Generating {name}.wav...")
+            try:
+                # Note: aevalsrc is also a lavfi filter
+                (
+                    ffmpeg
+                    .input(filter_str, format='lavfi')
+                    .output(output_path, ar=48000, ac=2, sample_fmt='s16')
+                    .run(quiet=True, overwrite_output=True)
+                )
+            except ffmpeg.Error as e:
+                # One-line f-string: a split replacement field needs Python 3.12+.
+                detail = e.stderr.decode() if e.stderr else e
+                print(f" FFmpeg error during signal generation: {detail}")
+
+
+if __name__ == "__main__":
+    if os.path.exists(BASE_DATA_DIR):
+        # Always check for throughput signals as they are vital for stable
+        # metrics
+        setup_throughput_signals()
+        print("Datasets already setup.")
+    else:
+        for name, info in DATASETS.items():
+            download_and_extract(name, info["url"])
+
+        # Build each data set in turn, then the synthetic throughput signals
+        for build_step in (setup_pmlt, setup_tcd_voip, setup_soundexpert,
+                           setup_throughput_signals):
+            build_step()
+
+        if os.path.exists(TEMP_DIR):
+            shutil.rmtree(TEMP_DIR)
+    print("Done.")

From ce9c4a4f784c5864d3886a4feb828cdc8fb4d4b9 Mon Sep 17 00:00:00 2001
From: Nils Schimmelmann <nschimme@gmail.com>
Date: Wed, 4 Mar 2026 21:13:40 -0600
Subject: [PATCH 2/5] Revert "CI: add FAAC Benchmark Suite to test quality"

This reverts commit 1866c4492e4e0021a0af28568b66933118b19c4f.
---
 .github/workflows/benchmark.yml | 180 -----------
 .gitignore                      |   7 -
 tests/README.md                 |  90 ------
 tests/compare_results.py        | 519 --------------------------------
 tests/requirements.txt          |   4 -
 tests/run_benchmark.py          | 364 ----------------------
 tests/setup_datasets.py         | 261 ----------------
 7 files changed, 1425 deletions(-)
 delete mode 100644 .github/workflows/benchmark.yml
 delete mode 100644 tests/README.md
 delete mode 100644 tests/compare_results.py
 delete mode 100644 tests/requirements.txt
 delete mode 100644 tests/run_benchmark.py
 delete mode 100644 tests/setup_datasets.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index 4c2ad7e5..00000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,180 +0,0 @@
-name: Benchmark
-
-on:
-  pull_request:
-    branches: [ "master" ]
-    paths:
-      - "libfaac/**"
-      - "tests/**"
-
-jobs:
-  benchmark:
-    # NOTE: ViSQOL via visqol-py is currently most reliable on ubuntu-22.04.
-    name: ${{ matrix.arch }} / ${{ matrix.precision }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: [amd64]
-        precision: [single, double]
-        include:
-          - arch: amd64
-            os: ubuntu-22.04
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y meson ninja-build bc ffmpeg
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-          cache-dependency-path: 'tests/requirements.txt'
-
-      - name: Install Python dependencies
-        run: |
-          pip install --upgrade pip setuptools wheel
-          pip install -r tests/requirements.txt
-
-      - name: Restore Datasets
-        id: cache-datasets
-        uses: actions/cache/restore@v4
-        with:
-          path: tests/data/external
-          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}
-
-      - name: Setup Datasets
-        if: steps.cache-datasets.outputs.cache-hit != 'true'
-        run: |
-          python3 tests/setup_datasets.py
-
-      - name: Save Datasets
-        if: steps.cache-datasets.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v4
-        with:
-          path: tests/data/external
-          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}
-
-      - name: Determine Baseline SHA
-        id: baseline-sha
-        run: |
-          git checkout ${{ github.base_ref || 'master' }}
-          echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
-          git checkout ${{ github.sha }}
-
-      - name: Restore Baseline Results
-        id: cache-baseline
-        uses: actions/cache/restore@v4
-        with:
-          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
-          key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}
-
-      - name: Run Benchmark (Baseline)
-        if: steps.cache-baseline.outputs.cache-hit != 'true'
-        run: |
-          git checkout ${{ steps.baseline-sha.outputs.sha }}
-          meson setup build_base -Dfloating-point=${{ matrix.precision }} --buildtype=release
-          ninja -C build_base
-          LIB_PATH="build_base/libfaac/libfaac.so"
-          FAAC_PATH="build_base/frontend/faac"
-          # Restore benchmark scripts and config from PR branch to ensure consistent comparison logic
-          git checkout ${{ github.sha }} -- tests/
-          python3 tests/run_benchmark.py $FAAC_PATH $LIB_PATH "${{ matrix.arch }}_${{ matrix.precision }}_base" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json" --coverage 100
-
-      - name: Save Baseline Results
-        if: always() && steps.cache-baseline.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v4
-        with:
-          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
-          key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}
-
-      - name: Run Benchmark (Candidate)
-        run: |
-          git checkout ${{ github.sha }}
-          mkdir -p tests/results
-          meson setup build_cand -Dfloating-point=${{ matrix.precision }} --buildtype=release
-          ninja -C build_cand
-          LIB_PATH="build_cand/libfaac/libfaac.so"
-          FAAC_PATH="build_cand/frontend/faac"
-          python3 tests/run_benchmark.py $FAAC_PATH $LIB_PATH "${{ matrix.arch }}_${{ matrix.precision }}_cand" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_cand.json" --coverage 100
-
-      - name: Upload Results
-        uses: actions/upload-artifact@v4
-        with:
-          name: results-${{ matrix.arch }}-${{ matrix.precision }}
-          path: tests/results/*.json
-
-  report:
-    name: Consolidated Report
-    needs: benchmark
-    runs-on: ubuntu-latest
-    env:
-      BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }}
-      CAND_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-    if: always()
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Download all results
-        uses: actions/download-artifact@v4
-        with:
-          path: tests/results
-          pattern: results-*
-          merge-multiple: true
-
-      - name: Generate Report
-        id: generate
-        run: |
-          # Summary report for PR comment (high-signal only)
-          python3 tests/compare_results.py tests/results --summary-only --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-summary.md || echo "REGRESSION_DETECTED=1" >> $GITHUB_ENV
-          # Full report for artifact (all details)
-          python3 tests/compare_results.py tests/results --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-full.md || true
-          if [ ! -s report-summary.md ]; then
-            echo "Error: report-summary.md is empty"
-            exit 1
-          fi
-          cat report-summary.md
-
-      - name: Upload Full Report
-        uses: actions/upload-artifact@v4
-        with:
-          name: benchmark-report-full
-          path: report-full.md
-
-      - name: PR Feedback
-        if: always() && github.event_name == 'pull_request'
-        continue-on-error: true
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const fs = require('fs');
-            if (fs.existsSync('report-summary.md')) {
-              let report = fs.readFileSync('report-summary.md', 'utf8').trim();
-              if (report.length > 0) {
-                const jobUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
-                const readmeUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/blob/${process.env.GITHUB_SHA}/tests/README.md`;
-                report += `\n\n---\n[View Detailed Job Log and Full Report](${jobUrl}) | [What Is This?](${readmeUrl})`;
-                github.rest.issues.createComment({
-                  issue_number: context.issue.number,
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  body: report
-                })
-              }
-            }
-
-      - name: Check for Regressions
-        run: |
-          if [ "${{ env.REGRESSION_DETECTED }}" == "1" ]; then
-            echo "Regressions or missing data detected. Failing job."
-            exit 1
-          fi
diff --git a/.gitignore b/.gitignore
index ec9b9a11..1e20e33b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,10 +38,3 @@ Makefile*
 *.user
 /libfaac/win32_ver.h
 /libfaac/faac.pc
-.DS_Store
-/build*/
-/venv/
-/tests/__pycache__/
-/tests/data/external/
-/tests/output/
-/tests/results/
diff --git a/tests/README.md b/tests/README.md
deleted file mode 100644
index d47c7104..00000000
--- a/tests/README.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# FAAC Benchmark Suite
-
-FAAC is the high-efficiency encoder for the resource-constrained world. From hobbyist projects to professional surveillance (VSS) and embedded VoIP, we prioritize performance where every cycle and byte matters.
-
-This suite provides the objective data necessary to ensure that every change moves us closer to our Northstar: the optimal balance of quality, speed, and size.
-
----
-
-## The "Golden Triangle" Philosophy
-
-We evaluate every contribution against three competing pillars. While high-bitrate encoders like FDK-AAC or Opus target multi-channel, high-fidelity entertainment, FAAC focuses on remaining approachable and distributable for the global open-source community. We prioritize non-patent encumbered areas and the standard Low Complexity (LC-AAC) profile.
-
-1.  **Audio Fidelity**: We target transparent audio quality for our bitrates. We use objective metrics like ViSQOL (MOS) to ensure psychoacoustic improvements truly benefit the listener without introducing "metallic" ringing or "underwater" artifacts.
-2.  **Computational Efficiency**: FAAC must remain fast. We optimize for low-power cores where encoding speed is a critical requirement. Every CPU cycle saved is a win for our users.
-3.  **Minimal Footprint**: Binary size is a feature. We ensure the library remains small enough to fit within restrictive embedded firmware.
-
----
-
-## Benchmarking Scenarios
-
-| Scenario | Mode | Source | Config | Project Goal |
-| :--- | :--- | :--- | :--- | :--- |
-| **VoIP** | Speech (16k) | TCD-VOIP | `-b 16` | Clear communication at low bitrates (16kbps). |
-| **VSS** | Speech (16k) | TCD-VOIP | `-b 40` | High-fidelity Video Surveillance Systems recording (40kbps). |
-| **Music** | Audio (48k) | PMLT / SoundExpert | `-b 64-256` | Full-range transparency for storage & streaming. |
-| **Throughput** | Efficiency | Synthetic Signals | Default | Stability test using 10-minute Sine/Sweep/Noise/Silence. |
-
----
-
-## Metric Definitions
-
-| Metric | Definition | Reference |
-| :--- | :--- | :--- |
-| **MOS** | Mean Opinion Score (LQO). Predicted perceptual quality from 1.0 (Bad) to 5.0 (Excellent), computed via the **ViSQOL** model. | [ITU-T P.800](https://www.itu.int/rec/T-REC-P.800), [ViSQOL](https://github.com/google/visqol) |
-| **Regressions** | Critical failure or a drop in MOS ≥ 0.1 compared to the baseline commit. Significant throughput drops (>10%) or increased binary size also warrant review. | |
-| **Significant Win** | An improvement in MOS ≥ 0.1 compared to the baseline commit. | |
-| **Consistency** | Percentage of test cases where bitstreams are MD5-identical to the baseline. | |
-| **Throughput** | Normalized encoding speed improvement against baseline. Higher % indicates faster execution. | |
-| **Library Size** | Binary footprint of `libfaac.so`. Delta measured against baseline. Critical for embedded VSS/IoT targets. | |
-| **Bitrate Δ** | Percentage change in generated file size against baseline. Relative shift in bits used for the same target. | |
-| **Bitrate Accuracy** | The closeness of the achieved bitrate to the specified target (ABR mode). Measures the encoder's ability to respect the user-defined bitrate budget. | |
-
----
-
-## Dataset Sources
-
-We are grateful to the following projects for providing high-quality research material:
-
-*   **TCD-VoIP (Sigmedia-VoIP)**: [Listener Test Database](https://www.sigmedia.tv/datasets/tcd_voip_ltd/) - Specifically designed for assessing quality in VoIP applications.
-*   **PMLT2014**: [Public Multiformat Listening Test](https://listening-test.coresv.net/) - A community-defined comprehensive multi-codec benchmark.
-*   **SoundExpert**: [Sound Samples](https://soundexpert.org/sound-samples) - High-precision EBU SQAM CD excerpts for transparency testing.
-
----
-
-## Quick Start
-
-### 1. Install Dependencies
-```bash
-# System (Ubuntu/Debian)
-sudo apt-get update && sudo apt-get install -y meson ninja-build bc ffmpeg
-
-# Python
-python3 -m venv venv
-source venv/bin/activate
-pip install -r tests/requirements.txt
-```
-
-### 2. Prepare Datasets
-Downloads samples and generates 10-minute synthetic throughput signals (Sine, Sweep, Noise, Silence).
-```bash
-python3 tests/setup_datasets.py
-```
-
-### 3. Run a Benchmark
-Perceptual analysis and full test suite coverage are enabled by default. Use `--skip-mos` or `--coverage 10` for faster iteration during local development.
-```bash
-python3 tests/run_benchmark.py build/frontend/faac build/libfaac/libfaac.so my_run tests/results/my_run.json
-```
-
-### 4. Compare Results
-Generate a high-signal summary comparing your candidate against a baseline.
-```bash
-python3 tests/compare_results.py tests/results/
-```
-
-## Who This Suite Helps
-
-*   **Maintainers**: Provides the confidence to merge PRs by proving that a change improves the encoder—or at least doesn't cause a regression.
-*   **Developers**: Offers standardized, automated feedback during implementation.
-*   **Users**: Ensures that every new version of FAAC remains a reliable choice for their critical firmware and communication projects.
diff --git a/tests/compare_results.py b/tests/compare_results.py
deleted file mode 100644
index edae0175..00000000
--- a/tests/compare_results.py
+++ /dev/null
@@ -1,519 +0,0 @@
-"""
- * FAAC Benchmark Suite
- * Copyright (C) 2026 Nils Schimmelmann
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
-
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-"""
-
-import json
-import sys
-import os
-from collections import defaultdict
-
-
-def analyze_pair(base_file, cand_file):
-    try:
-        with open(base_file, "r") as f:
-            base = json.load(f)
-    except Exception as e:
-        sys.stderr.write(
-            f"  Warning: Could not load baseline file {base_file}: {e}\n")
-        base = {}
-
-    try:
-        with open(cand_file, "r") as f:
-            cand = json.load(f)
-    except Exception as e:
-        sys.stderr.write(
-            f"  Error: Could not load candidate file {cand_file}: {e}\n")
-        return None
-
-    suite_results = {
-        "has_regression": False,
-        "missing_data": False,
-        "mos_delta_sum": 0,
-        "mos_count": 0,
-        "missing_mos_count": 0,
-        "tp_reduction": 0,
-        "lib_size_chg": 0,
-        "bitrate_chg_sum": 0,
-        "bitrate_count": 0,
-        "bitrate_acc_sum": 0,
-        "bitrate_acc_count": 0,
-        "regressions": [],
-        "new_wins": [],
-        "significant_wins": [],
-        "opportunities": [],
-        "bit_exact_count": 0,
-        "total_cases": 0,
-        "all_cases": [],
-        "scenario_stats": defaultdict(
-            lambda: {
-                "tp_sum_cand": 0,
-                "tp_sum_base": 0,
-                "count": 0}),
-        "base_tp": base.get("throughput", {}),
-        "cand_tp": cand.get("throughput", {})}
-
-    base_m = base.get("matrix", {})
-    cand_m = cand.get("matrix", {})
-
-    if cand_m:
-        suite_results["total_cases"] = len(cand_m)
-        for k in sorted(cand_m.keys()):
-            o = cand_m[k]
-            b = base_m.get(k, {})
-
-            filename = o.get("filename", k)
-            scenario = o.get("scenario", "")
-            display_name = f"{scenario}: {filename}"
-
-            o_mos = o.get("mos")
-            b_mos = b.get("mos")
-            thresh = o.get("thresh", 1.0)
-
-            o_size = o.get("size")
-            b_size = b.get("size")
-
-            o_bitrate = o.get("bitrate")
-            o_target = o.get("bitrate_target")
-
-            if o_bitrate is not None and o_target is not None and o_target > 0:
-                acc = (1.0 - abs(o_bitrate - o_target) / o_target) * 100
-                suite_results["bitrate_acc_sum"] += acc
-                suite_results["bitrate_acc_count"] += 1
-
-            o_time = o.get("time")
-            b_time = b.get("time")
-
-            if o_time is not None and b_time is not None and b_time > 0:
-                suite_results["scenario_stats"][scenario]["tp_sum_cand"] += o_time
-                suite_results["scenario_stats"][scenario]["tp_sum_base"] += b_time
-                suite_results["scenario_stats"][scenario]["count"] += 1
-
-            o_md5 = o.get("md5", "")
-            b_md5 = b.get("md5", "")
-
-            if o_md5 and b_md5 and o_md5 == b_md5:
-                suite_results["bit_exact_count"] += 1
-
-            size_chg = "N/A"
-            if o_size is not None and b_size is not None:
-                size_chg_val = (o_size - b_size) / b_size * 100
-                size_chg = f"{size_chg_val:+.2f}%"
-                suite_results["bitrate_chg_sum"] += size_chg_val
-                suite_results["bitrate_count"] += 1
-            elif o_size is None:
-                suite_results["missing_data"] = True
-
-            status = "✅"
-            delta = 0
-            if o_mos is not None:
-                if b_mos is not None:
-                    delta = o_mos - b_mos
-                    suite_results["mos_delta_sum"] += delta
-                    suite_results["mos_count"] += 1
-
-                if o_mos < (thresh - 0.5):
-                    status = "🤮"  # Awful
-                elif o_mos < thresh:
-                    status = "📉"  # Bad/Poor
-
-                if b_mos is not None:
-                    if (o_mos - b_mos) < -0.1:
-                        status = "❌"  # Regression
-                        suite_results["has_regression"] = True
-                    elif (o_mos - b_mos) > 0.1:
-                        status = "🌟"  # Significant Win
-
-                # Check for New Win (Baseline failed, Candidate passed)
-                if b_mos is not None and b_mos < thresh and o_mos >= thresh:
-                    suite_results["new_wins"].append({
-                        "display_name": display_name,
-                        "mos": o_mos,
-                        "b_mos": b_mos,
-                        "delta": delta
-                    })
-            else:
-                status = "❌"  # Missing MOS is a failure
-                suite_results["missing_mos_count"] += 1
-                suite_results["has_regression"] = True
-                suite_results["missing_data"] = True
-                delta = -10.0  # Force to top of regressions
-
-            mos_str = f"{o_mos:.2f}" if o_mos is not None else "N/A"
-            b_mos_str = f"{b_mos:.2f}" if b_mos is not None else "N/A"
-            delta_mos = f"{(o_mos - b_mos):+.2f}" if (
-                o_mos is not None and b_mos is not None) else "N/A"
-
-            case_data = {
-                "display_name": display_name,
-                "status": status,
-                "mos": o_mos,
-                "b_mos": b_mos,
-                "delta": delta,
-                "size_chg": size_chg,
-                "line": f"| {display_name} | {status} | {mos_str} ({b_mos_str}) | {delta_mos} | {size_chg} |"
-            }
-
-            suite_results["all_cases"].append(case_data)
-            if status == "❌":
-                suite_results["regressions"].append(case_data)
-            elif status == "🌟":
-                suite_results["significant_wins"].append(case_data)
-            elif status in ["🤮", "📉"]:
-                suite_results["opportunities"].append(case_data)
-    else:
-        suite_results["missing_data"] = True
-
-    # Sorts
-    suite_results["regressions"].sort(key=lambda x: x["delta"])
-    suite_results["new_wins"].sort(key=lambda x: x["delta"], reverse=True)
-    suite_results["significant_wins"].sort(
-        key=lambda x: x["delta"], reverse=True)
-    suite_results["opportunities"].sort(
-        key=lambda x: x["mos"] if x["mos"] is not None else 6.0)
-
-    # Throughput
-    base_tp = base.get("throughput", {})
-    cand_tp = cand.get("throughput", {})
-    # Exclude "overall" to avoid double-counting in manual summation
-    total_base_t = sum(v for k, v in base_tp.items() if k != "overall")
-    total_cand_t = sum(v for k, v in cand_tp.items() if k != "overall")
-    if total_cand_t > 0 and total_base_t > 0:
-        suite_results["tp_reduction"] = (1 - total_cand_t / total_base_t) * 100
-    else:
-        # If overall throughput is missing, try to aggregate from scenarios
-        cand_t_sum = sum(s["tp_sum_cand"]
-                         for s in suite_results["scenario_stats"].values())
-        base_t_sum = sum(s["tp_sum_base"]
-                         for s in suite_results["scenario_stats"].values())
-        if cand_t_sum > 0 and base_t_sum > 0:
-            suite_results["tp_reduction"] = (1 - cand_t_sum / base_t_sum) * 100
-        else:
-            suite_results["missing_data"] = True
-
-    # Binary Size
-    base_lib = base.get("lib_size", 0)
-    cand_lib = cand.get("lib_size", 0)
-    if cand_lib > 0 and base_lib > 0:
-        suite_results["lib_size_chg"] = ((cand_lib / base_lib) - 1) * 100
-    else:
-        suite_results["missing_data"] = True
-
-    return suite_results
-
-
-def main():
-    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-
-    summary_only = "--summary-only" in sys.argv
-    if summary_only:
-        sys.argv.remove("--summary-only")
-
-    base_sha = None
-    if "--base-sha" in sys.argv:
-        idx = sys.argv.index("--base-sha")
-        base_sha = sys.argv[idx + 1]
-        sys.argv.pop(idx + 1)
-        sys.argv.pop(idx)
-
-    cand_sha = None
-    if "--cand-sha" in sys.argv:
-        idx = sys.argv.index("--cand-sha")
-        cand_sha = sys.argv[idx + 1]
-        sys.argv.pop(idx + 1)
-        sys.argv.pop(idx)
-
-    results_dir = sys.argv[1] if len(
-        sys.argv) > 1 else os.path.join(
-        SCRIPT_DIR, "results")
-
-    if not os.path.exists(results_dir):
-        sys.exit(1)
-
-    files = os.listdir(results_dir)
-
-    suites = {}
-    for f in files:
-        if f.endswith("_cand.json"):
-            suite_name = f[:-10]
-            base_f = suite_name + "_base.json"
-            if base_f in files:
-                suites[suite_name] = (
-                    os.path.join(
-                        results_dir, base_f), os.path.join(
-                        results_dir, f))
-
-    if not suites:
-        sys.stderr.write("No result pairs found in directory.\n")
-        sys.exit(1)
-
-    all_suite_data = {}
-    overall_regression = False
-    overall_missing = False
-    total_mos_delta = 0
-    total_mos_count = 0
-    total_missing_mos = 0
-    total_tp_reduction = 0
-    total_lib_chg = 0
-    total_bitrate_chg = 0
-    total_bitrate_count = 0
-    total_bitrate_acc_sum = 0
-    total_bitrate_acc_count = 0
-
-    total_regressions = 0
-    total_new_wins = 0
-    total_significant_wins = 0
-    total_bit_exact = 0
-    total_cases_all = 0
-
-    # For worst-case scenario throughput
-    scenario_tp_deltas = []
-
-    for name, (base, cand) in sorted(suites.items()):
-        data = analyze_pair(base, cand)
-        if data:
-            all_suite_data[name] = data
-            if data["has_regression"]:
-                overall_regression = True
-            if data["missing_data"]:
-                overall_missing = True
-            total_mos_delta += data["mos_delta_sum"]
-            total_mos_count += data["mos_count"]
-            total_missing_mos += data["missing_mos_count"]
-            total_tp_reduction += data["tp_reduction"]
-            total_lib_chg += data["lib_size_chg"]
-            total_bitrate_chg += data["bitrate_chg_sum"]
-            total_bitrate_count += data["bitrate_count"]
-            total_bitrate_acc_sum += data["bitrate_acc_sum"]
-            total_bitrate_acc_count += data["bitrate_acc_count"]
-
-            total_regressions += len(data["regressions"])
-            total_new_wins += len(data["new_wins"])
-            total_significant_wins += len(data["significant_wins"])
-            total_bit_exact += data["bit_exact_count"]
-            total_cases_all += data["total_cases"]
-
-            for sc_name, sc_data in data["scenario_stats"].items():
-                if sc_data["tp_sum_base"] > 0:
-                    delta = (1 - sc_data["tp_sum_cand"] /
-                             sc_data["tp_sum_base"]) * 100
-                    scenario_tp_deltas.append((f"{name} / {sc_name}", delta))
-
-    avg_mos_delta_str = f"{(total_mos_delta /
-                            total_mos_count):+.3f}" if total_mos_count > 0 else "N/A"
-    avg_tp_reduction = total_tp_reduction / \
-        len(all_suite_data) if all_suite_data else 0
-    avg_lib_chg = total_lib_chg / len(all_suite_data) if all_suite_data else 0
-    avg_bitrate_chg = total_bitrate_chg / \
-        total_bitrate_count if total_bitrate_count > 0 else 0
-    avg_bitrate_acc = total_bitrate_acc_sum / \
-        total_bitrate_acc_count if total_bitrate_acc_count > 0 else 0
-
-    bit_exact_percent = (
-        total_bit_exact /
-        total_cases_all *
-        100) if total_cases_all > 0 else 0
-
-    # Worst-case throughput
-    worst_tp_scen, worst_tp_delta = (None, 0)
-    if scenario_tp_deltas:
-        worst_tp_scen, worst_tp_delta = min(
-            scenario_tp_deltas, key=lambda x: x[1])
-
-    report = []
-    if overall_regression:
-        report.append("## ❌ Quality Regression Detected")
-    elif worst_tp_delta < -5.0:
-        report.append("## ⚠️ Performance Regression Detected")
-    elif overall_missing:
-        report.append("## ❌ Incomplete/Missing Data Detected")
-    elif bit_exact_percent == 100.0:
-        report.append("## ✅ Refactor Verified (Bit-Identical)")
-    elif total_new_wins > 0 or total_significant_wins > 0 or (total_mos_count > 0 and (total_mos_delta / total_mos_count) > 0.01) or avg_tp_reduction > 5:
-        report.append("## 🚀 Perceptual & Efficiency Improvement")
-    else:
-        report.append("## 📊 Benchmark Summary")
-
-    if not summary_only and (base_sha or cand_sha):
-        report.append("\n### Environment")
-        if base_sha:
-            report.append(f"- **Baseline SHA**: `{base_sha}`")
-        if cand_sha:
-            report.append(f"- **Candidate SHA**: `{cand_sha}`")
-
-    report.append("\n### Summary")
-    report.append("| Metric | Value |")
-    report.append("| :--- | :--- |")
-
-    # Regressions (Always shown)
-    reg_status = "0 ✅" if total_regressions == 0 else f"{total_regressions} ❌"
-    report.append(f"| **Regressions** | {reg_status} |")
-
-    # New Wins (Only if baseline < threshold and candidate >= threshold)
-    if total_new_wins > 0:
-        report.append(f"| **New Wins** | {total_new_wins} 🆕 |")
-
-    # Significant Wins (MOS delta > 0.1)
-    if total_significant_wins > 0:
-        report.append(f"| **Significant Wins** | {total_significant_wins} 🌟 |")
-
-    # Bitstream Consistency (Against baseline)
-    consist_status = f"{bit_exact_percent:.1f}%"
-    if bit_exact_percent == 100.0:
-        consist_status += " (MD5 Match)"
-    report.append(f"| **Consistency** | {consist_status} |")
-
-    # Throughput
-    if abs(avg_tp_reduction) > 0.1:
-        tp_icon = "🚀" if avg_tp_reduction > 1.0 else "📉" if avg_tp_reduction < -1.0 else ""
-        report.append(
-            f"| **Throughput (Avg)** | {avg_tp_reduction:+.1f}% {tp_icon} |")
-
-    # Per-signal throughput deltas if available
-    tp_details = []
-    if all_suite_data:
-        first_data = list(all_suite_data.values())[0]
-        base_tp = first_data.get("base_tp", {})
-        cand_tp = first_data.get("cand_tp", {})
-        for signal in sorted(cand_tp.keys()):
-            if signal == "overall":
-                continue
-            if signal in base_tp and base_tp[signal] > 0:
-                delta = (1 - cand_tp[signal] / base_tp[signal]) * 100
-                icon = "🚀" if delta > 1.0 else "📉" if delta < -1.0 else ""
-                tp_details.append(
-                    f"{signal.split('.')[0]}: {delta:+.1f}% {icon}")
-
-    if tp_details:
-        report.append(f"| **TP Breakdown** | {', '.join(tp_details)} |")
-
-    if worst_tp_delta < -1.0:
-        report.append(
-            f"| **Worst-case TP Δ** | {worst_tp_delta:.1f}% ({worst_tp_scen}) ⚠️ |")
-
-    # Binary Size
-    if abs(avg_lib_chg) > 0.01:
-        size_icon = "📉" if avg_lib_chg < -0.1 else "📈" if avg_lib_chg > 0.1 else ""
-        report.append(
-            f"| **Library Size** | {avg_lib_chg:+.2f}% {size_icon} |")
-
-
-    # Bitrate Δ
-    if abs(avg_bitrate_chg) > 0.1:
-        bitrate_icon = "📉" if avg_bitrate_chg < - \
-            1.0 else "📈" if avg_bitrate_chg > 1.0 else ""
-        report.append(
-            f"| **Bitrate Δ** | {avg_bitrate_chg:+.2f}% {bitrate_icon} |")
-
-    # Bitrate Accuracy
-    if total_bitrate_acc_count > 0:
-        acc_icon = "🎯" if avg_bitrate_acc > 95 else "⚠️" if avg_bitrate_acc < 80 else ""
-        report.append(
-            f"| **Bitrate Accuracy** | {avg_bitrate_acc:.1f}% {acc_icon} |")
-
-    # Avg MOS Delta
-    if total_mos_count > 0 and abs(total_mos_delta / total_mos_count) > 0.001:
-        report.append(f"| **Avg MOS Delta** | {avg_mos_delta_str} |")
-
-    if total_missing_mos > 0:
-        report.append(
-            f"\n⚠️ **Warning**: {total_missing_mos} MOS scores were missing/failed (treated as ❌).")
-
-    if not summary_only:
-        # 1. Collapsible Details: Regressions
-        if total_regressions > 0:
-            report.append(
-                "\n<details><summary><b>❌ View Regression Details ({})</b></summary>\n".format(total_regressions))
-            for name, data in sorted(all_suite_data.items()):
-                if data["regressions"]:
-                    report.append(f"\n#### {name}")
-                    report.append(
-                        "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
-                    report.append("| :--- | :---: | :---: | :---: | :---: |")
-                    for r in data["regressions"]:
-                        report.append(r["line"])
-            report.append("\n</details>")
-
-        # 2. Collapsible Additional Details
-        report.append(
-            "\n<details><summary><b>View Additional Suite Details & Wins</b></summary>\n")
-
-        for name, data in sorted(all_suite_data.items()):
-            status_icon = "✅"
-            if data["has_regression"]:
-                status_icon = "❌"
-            elif data["missing_data"]:
-                status_icon = "❌"
-
-            avg_mos_suite = f"{(data['mos_delta_sum'] /
-                                data['mos_count']):+.3f}" if data["mos_count"] > 0 else "N/A"
-            suite_bit_exact_percent = (
-                data["bit_exact_count"] /
-                data["total_cases"] *
-                100) if data["total_cases"] > 0 else 0
-
-            report.append(f"\n#### {status_icon} {name}")
-            report.append(
-                f"- MOS Δ: {avg_mos_suite}, TP Δ: {data['tp_reduction']:+.1f}%, Size Δ: {data['lib_size_chg']:+.2f}%")
-            report.append(
-                f"- Bitstream Consistency: {suite_bit_exact_percent:.1f}%")
-
-            if data["new_wins"]:
-                report.append("\n**🆕 New Wins**")
-                report.append("| Test Case | MOS (Base) | Delta |")
-                report.append("| :--- | :---: | :---: |")
-                for w in data["new_wins"]:
-                    report.append("| {} | {:.2f} ({:.2f}) | {:+.2f} |".format(
-                        w["display_name"], w["mos"], w["b_mos"], w["delta"]))
-
-            if data["significant_wins"]:
-                report.append("\n**🌟 Significant Wins**")
-                report.append(
-                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
-                report.append("| :--- | :---: | :---: | :---: | :---: |")
-                for w in data["significant_wins"]:
-                    report.append(w["line"])
-
-            if data["opportunities"]:
-                report.append("\n**💡 Opportunities**")
-                report.append(
-                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
-                report.append("| :--- | :---: | :---: | :---: | :---: |")
-                for o in data["opportunities"]:
-                    report.append(o["line"])
-
-            if data["all_cases"]:
-                report.append(
-                    f"\n<details><summary>View all {len(data['all_cases'])} cases for {name}</summary>\n")
-                report.append(
-                    "| Test Case | Status | MOS (Base) | Delta | Size Δ |")
-                report.append("| :--- | :---: | :---: | :---: | :---: |")
-                for c in data["all_cases"]:
-                    report.append(c["line"])
-                report.append("\n</details>")
-
-        report.append("\n</details>")
-
-    output = "\n".join(report)
-    sys.stdout.write(output + "\n")
-
-    if overall_regression or overall_missing:
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/requirements.txt b/tests/requirements.txt
deleted file mode 100644
index 2ee1d2a2..00000000
--- a/tests/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-numpy
-protobuf==3.20.3
-ffmpeg-python
-git+https://github.com/diggerdu/visqol-py.git@452eb5c4f17fd2404f968ec2eeadfcad74925485
diff --git a/tests/run_benchmark.py b/tests/run_benchmark.py
deleted file mode 100644
index 232dbb54..00000000
--- a/tests/run_benchmark.py
+++ /dev/null
@@ -1,364 +0,0 @@
-"""
- * FAAC Benchmark Suite
- * Copyright (C) 2026 Nils Schimmelmann
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
-
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-"""
-
-import os
-import subprocess
-import time
-import sys
-import json
-import tempfile
-import hashlib
-import concurrent.futures
-import multiprocessing
-
-try:
-    import visqol_py
-    from visqol_py import ViSQOLMode
-    HAS_VISQOL = True
-except ImportError:
-    HAS_VISQOL = False
-
-try:
-    import ffmpeg
-    HAS_FFMPEG = True
-except ImportError:
-    HAS_FFMPEG = False
-
-# Paths relative to script directory
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-EXTERNAL_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external")
-OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
-
-SCENARIOS = {
-    "voip": {
-        "mode": "speech",
-        "rate": 16000,
-        "visqol_rate": 16000,
-        "bitrate": 16,
-        "thresh": 2.5},
-    "vss": {
-        "mode": "speech",
-        "rate": 16000,
-        "visqol_rate": 16000,
-        "bitrate": 40,
-        "thresh": 3.0},
-    "music_low": {
-        "mode": "audio",
-        "rate": 48000,
-        "visqol_rate": 48000,
-        "bitrate": 64,
-        "thresh": 3.5},
-    "music_std": {
-        "mode": "audio",
-        "rate": 48000,
-        "visqol_rate": 48000,
-        "bitrate": 128,
-        "thresh": 4.0},
-    "music_high": {
-        "mode": "audio",
-        "rate": 48000,
-        "visqol_rate": 48000,
-        "bitrate": 256,
-        "thresh": 4.3}}
-
-
-def get_visqol_mode(mode_str):
-    if not HAS_VISQOL:
-        return None
-    return ViSQOLMode.SPEECH if mode_str == "speech" else ViSQOLMode.AUDIO
-
-
-def get_binary_size(path):
-    if os.path.exists(path):
-        return os.path.getsize(path)
-    return 0
-
-
-def get_md5(path):
-    if not os.path.exists(path):
-        return ""
-    hash_md5 = hashlib.md5()
-    with open(path, "rb") as f:
-        for chunk in iter(lambda: f.read(4096), b""):
-            hash_md5.update(chunk)
-    return hash_md5.hexdigest()
-
-
-def run_visqol(visqol, ref_wav, deg_wav):
-    """Run ViSQOL via provided API instance and return MOS score."""
-    if visqol is None:
-        return None
-    try:
-        result = visqol.measure(ref_wav, deg_wav)
-        return float(result.moslqo)
-    except Exception as e:
-        print(f" ViSQOL API error: {e}")
-    return None
-
-
-# Process-local storage for ViSQOL instances
-_process_visqol_instances = {}
-
-
-def get_process_visqol(mode_str):
-    if not HAS_VISQOL:
-        return None
-    if mode_str not in _process_visqol_instances:
-        try:
-            mode = get_visqol_mode(mode_str)
-            _process_visqol_instances[mode_str] = visqol_py.ViSQOL(mode=mode)
-        except Exception as e:
-            print(
-                f" Failed to initialize ViSQOL in process {
-                    os.getpid()}: {e}")
-            _process_visqol_instances[mode_str] = None
-    return _process_visqol_instances[mode_str]
-
-
-def worker_init(cpu_id_queue):
-    """Pin the worker process to a specific CPU core for consistent benchmarks."""
-    cpu_id = cpu_id_queue.get()
-    if hasattr(os, "sched_setaffinity"):
-        try:
-            os.sched_setaffinity(0, [cpu_id])
-        except Exception as e:
-            print(f" Failed to pin process {os.getpid()} to CPU {cpu_id}: {e}")
-
-
-def process_sample(faac_bin_path, name, cfg, sample, data_dir, precision, env):
-    input_path = os.path.join(data_dir, sample)
-    key = f"{name}_{sample}"
-    output_path = os.path.join(OUTPUT_DIR, f"{key}_{precision}.aac")
-
-    # Determine encoding parameters
-    cmd = [faac_bin_path, "-o", output_path, input_path]
-    cmd.extend(["-b", str(cfg["bitrate"])])
-
-    try:
-        t_start = time.time()
-        subprocess.run(cmd, env=env, check=True, capture_output=True)
-        t_duration = time.time() - t_start
-
-        mos = None
-        aac_size = os.path.getsize(output_path)
-        actual_bitrate = None
-
-        if HAS_FFMPEG:
-            try:
-                probe = ffmpeg.probe(input_path)
-                duration = float(probe['format']['duration'])
-                if duration > 0:
-                    # kbps = (bytes * 8) / (seconds * 1000)
-                    actual_bitrate = (aac_size * 8) / (duration * 1000)
-            except Exception as e:
-                print(f" Failed to probe duration for {sample}: {e}")
-
-        if HAS_FFMPEG:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                v_ref = os.path.join(tmpdir, "vref.wav")
-                v_deg = os.path.join(tmpdir, "vdeg.wav")
-                v_rate = cfg["visqol_rate"]
-                v_channels = 1 if cfg["mode"] == "speech" else 2
-
-                try:
-                    # Use ffmpeg-python to decode AAC and prepare files for
-                    # ViSQOL
-                    ffmpeg.input(input_path).output(
-                        v_ref, ar=v_rate, ac=v_channels, sample_fmt='s16').run(
-                        quiet=True, overwrite_output=True)
-                    ffmpeg.input(output_path).output(
-                        v_deg, ar=v_rate, ac=v_channels, sample_fmt='s16').run(
-                        quiet=True, overwrite_output=True)
-
-                    if os.path.exists(v_ref) and os.path.exists(v_deg):
-                        visqol = get_process_visqol(cfg["mode"])
-                        mos = run_visqol(visqol, v_ref, v_deg)
-                except ffmpeg.Error as e:
-                    print(
-                        f" FFmpeg error for {sample}: {
-                            e.stderr.decode() if e.stderr else e}")
-
-        return key, {
-            "mos": mos,
-            "size": aac_size,
-            "bitrate": actual_bitrate,
-            "bitrate_target": cfg.get("bitrate"),
-            "time": t_duration,
-            "md5": get_md5(output_path),
-            "thresh": cfg["thresh"],
-            "scenario": name,
-            "filename": sample
-        }
-    except Exception as e:
-        print(f" failed: {e}")
-        return None
-
-
-def run_benchmark(
-        faac_bin_path,
-        lib_path,
-        precision,
-        coverage=100,
-        run_perceptual=True):
-    env = os.environ.copy()
-
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-    results = {
-        "matrix": {},
-        "throughput": {},
-        "lib_size": get_binary_size(lib_path)
-    }
-
-    if run_perceptual:
-        print(f"Starting perceptual benchmark for {precision}...")
-        # Detect number of CPUs for parallelization
-        num_cpus = os.cpu_count() or 1
-        print(f"Parallelizing across {num_cpus} threads.")
-
-        for name, cfg in SCENARIOS.items():
-            data_subdir = "speech" if cfg["mode"] == "speech" else "audio"
-            data_dir = os.path.join(EXTERNAL_DATA_DIR, data_subdir)
-            if not os.path.exists(data_dir):
-                print(
-                    f"  [Scenario: {name}] Data directory {data_dir} not found, skipping.")
-                continue
-
-            all_samples = sorted(
-                [f for f in os.listdir(data_dir) if f.endswith(".wav")])
-            num_to_run = max(1, int(len(all_samples) * coverage / 100.0))
-            step = len(all_samples) / num_to_run if num_to_run > 0 else 1
-            samples = [all_samples[int(i * step)] for i in range(num_to_run)]
-
-            print(
-                f"  [Scenario: {name}] Processing {
-                    len(samples)} samples (coverage {coverage}%)...")
-
-            # Pin each process to a unique CPU core
-            manager = multiprocessing.Manager()
-            cpu_id_queue = manager.Queue()
-            for cpu_id in range(num_cpus):
-                cpu_id_queue.put(cpu_id)
-
-            with concurrent.futures.ProcessPoolExecutor(
-                max_workers=num_cpus,
-                initializer=worker_init,
-                initargs=(cpu_id_queue,)
-            ) as executor:
-                futures = {
-                    executor.submit(
-                        process_sample,
-                        faac_bin_path,
-                        name,
-                        cfg,
-                        sample,
-                        data_dir,
-                        precision,
-                        env): sample for sample in samples}
-                for i, future in enumerate(
-                        concurrent.futures.as_completed(futures)):
-                    result = future.result()
-                    if result:
-                        key, data = result
-                        results["matrix"][key] = data
-                        mos_str = f"{
-                            data['mos']:.2f}" if data['mos'] is not None else "N/A"
-                        print(
-                            f"    ({i + 1}/{len(samples)}) {data['filename']} done. (MOS: {mos_str})")
-
-    print(f"Measuring throughput for {precision}...")
-    # Pin current process to a single core for accurate throughput measurement
-    if hasattr(os, "sched_setaffinity"):
-        try:
-            os.sched_setaffinity(0, [0])
-        except BaseException:
-            pass
-
-    tp_dir = os.path.join(EXTERNAL_DATA_DIR, "throughput")
-    if os.path.exists(tp_dir):
-        tp_samples = sorted(
-            [f for f in os.listdir(tp_dir) if f.endswith(".wav")])
-        if tp_samples:
-            overall_durations = []
-            for sample in tp_samples:
-                input_path = os.path.join(tp_dir, sample)
-                output_path = os.path.join(
-                    OUTPUT_DIR, f"tp_{sample}_{precision}.aac")
-
-                print(f"  Benchmarking throughput with {sample}...")
-                try:
-                    # Warmup
-                    subprocess.run([faac_bin_path,
-                                    "-o",
-                                    output_path,
-                                    input_path],
-                                   env=env,
-                                   check=True,
-                                   capture_output=True)
-
-                    # Multiple runs to average noise
-                    durations = []
-                    for _ in range(3):
-                        start_time = time.perf_counter()
-                        subprocess.run([faac_bin_path,
-                                        "-o",
-                                        output_path,
-                                        input_path],
-                                       env=env,
-                                       check=True,
-                                       capture_output=True)
-                        durations.append(time.perf_counter() - start_time)
-
-                    avg_dur = sum(durations) / len(durations)
-                    results["throughput"][sample] = avg_dur
-                    overall_durations.append(avg_dur)
-                except BaseException as e:
-                    print(f"    Throughput benchmark failed for {sample}: {e}")
-                    pass
-
-            if overall_durations:
-                results["throughput"]["overall"] = sum(
-                    overall_durations) / len(overall_durations)
-
-    return results
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 5:
-        print(
-            "Usage: python3 tests/run_benchmark.py <faac_bin_path> <lib_path> <precision_name> <output_json> [--skip-mos] [--coverage 100]")
-        sys.exit(1)
-
-    do_perc = "--skip-mos" not in sys.argv
-    coverage = 100
-    if "--coverage" in sys.argv:
-        idx = sys.argv.index("--coverage")
-        coverage = int(sys.argv[idx + 1])
-
-    data = run_benchmark(
-        sys.argv[1],
-        sys.argv[2],
-        sys.argv[3],
-        coverage=coverage,
-        run_perceptual=do_perc)
-
-    # Ensure results directory exists
-    output_json = os.path.abspath(sys.argv[4])
-    os.makedirs(os.path.dirname(output_json), exist_ok=True)
-    with open(output_json, "w") as f:
-        json.dump(data, f, indent=2)
diff --git a/tests/setup_datasets.py b/tests/setup_datasets.py
deleted file mode 100644
index 735ce3b3..00000000
--- a/tests/setup_datasets.py
+++ /dev/null
@@ -1,261 +0,0 @@
-"""
- * FAAC Benchmark Suite
- * Copyright (C) 2026 Nils Schimmelmann
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
-
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-"""
-
-import os
-import urllib.request
-import zipfile
-import shutil
-import wave
-import re
-import ffmpeg
-
-DATASETS = {
-    "PMLT2014": {
-        "url": "https://github.com/nschimme/PMLT2014/archive/refs/tags/PMLT2014.zip",
-        "name": "Public Multiformat Listening Test @ 96 kbps (July 2014)"
-    },
-    "TCD-VOIP": {
-        "url": "https://github.com/nschimme/TCD-VOIP/archive/refs/tags/harte2015tcd.zip",
-        "name": "TCD-VoIP (Sigmedia-VoIP) Listener Test Database"
-    },
-    "SoundExpert": {
-        "url": "https://github.com/nschimme/SoundExpert/archive/refs/tags/SoundExpert.zip",
-        "name": "SoundExpert Sound samples"
-    }
-}
-
-# Paths relative to script directory
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-BASE_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external")
-TEMP_DIR = os.path.join(SCRIPT_DIR, "data", "temp")
-
-
-def download_and_extract(name, url):
-    os.makedirs(TEMP_DIR, exist_ok=True)
-    zip_path = os.path.join(TEMP_DIR, f"{name}.zip")
-    if not os.path.exists(zip_path):
-        print(f"Downloading {name}...")
-        urllib.request.urlretrieve(url, zip_path)
-
-    print(f"Extracting {name}...")
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(TEMP_DIR)
-
-
-def get_info(wav_path):
-    try:
-        with wave.open(wav_path, 'rb') as f:
-            frames = f.getnframes()
-            rate = f.getframerate()
-            channels = f.getnchannels()
-            return frames / float(rate), channels
-    except BaseException:
-        return 0, 2
-
-
-def resample(
-        input_path,
-        output_path,
-        rate,
-        channels,
-        start=None,
-        duration=None,
-        loop=False):
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    try:
-        input_args = {}
-        output_args = {}
-
-        if loop:
-            # Loop input indefinitely, then trim to requested duration
-            input_args['stream_loop'] = -1
-
-        if start is not None:
-            output_args['ss'] = start
-        if duration is not None:
-            output_args['t'] = duration
-
-        (ffmpeg .input(input_path,
-                       **input_args) .output(output_path,
-                                             ar=rate,
-                                             ac=channels,
-                                             sample_fmt='s16',
-                                             **output_args) .run(quiet=True,
-                                                                 overwrite_output=True))
-    except ffmpeg.Error as e:
-        print(
-            f" FFmpeg error during setup: {
-                e.stderr.decode() if e.stderr else e}")
-
-
-def get_tier_params(dur):
-    """
-    Determine resampling parameters based on ViSQOL recommendations (5-10s).
-    1. < 5s: loop to 5s
-    2. 5-10s: use full sample
-    3. > 10s: trim to 10s center segment
-    """
-    if dur < 5.0:
-        return 0, 5, True
-    if dur <= 10.0:
-        return None, None, False
-    return (dur - 10) / 2, 10, False
-
-
-def setup_pmlt():
-    dataset_info = DATASETS["PMLT2014"]
-    src_dir = os.path.join(TEMP_DIR, "PMLT2014-PMLT2014")
-    dest_dir = os.path.join(BASE_DATA_DIR, "audio")
-
-    wav_files = []
-    for root, dirs, files in os.walk(src_dir):
-        for f in files:
-            if f.endswith("48k.wav") and not re.search(r"48k\.\d+\.wav$", f):
-                wav_files.append(os.path.join(root, f))
-
-    print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.")
-    for i, wav in enumerate(wav_files):
-        print(f"  [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...")
-        dur, chans = get_info(wav)
-        start, duration, loop = get_tier_params(dur)
-
-        filename = os.path.basename(wav)
-        output = os.path.join(dest_dir, filename)
-        resample(
-            wav,
-            output,
-            48000,
-            chans,
-            start=start,
-            duration=duration,
-            loop=loop)
-
-
-def setup_tcd_voip():
-    dataset_info = DATASETS["TCD-VOIP"]
-    src_dir = os.path.join(TEMP_DIR, "TCD-VOIP-harte2015tcd")
-    dest_dir = os.path.join(BASE_DATA_DIR, "speech")
-
-    wav_files = []
-    for root, dirs, files in os.walk(src_dir):
-        # Do not use any wave files if they're in a "ref" folder
-        if "ref" in root.split(os.sep):
-            continue
-
-        for f in files:
-            if f.endswith(".wav") and ("Test Set" in root or "chop" in root):
-                wav_files.append(os.path.join(root, f))
-
-    print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.")
-    for i, wav in enumerate(wav_files):
-        print(f"  [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...")
-        dur, chans = get_info(wav)
-        start, duration, loop = get_tier_params(dur)
-
-        filename = os.path.basename(wav)
-        output = os.path.join(dest_dir, filename)
-        # ViSQOL speech mode requires 16k mono
-        resample(
-            wav,
-            output,
-            16000,
-            1,
-            start=start,
-            duration=duration,
-            loop=loop)
-
-
-def setup_soundexpert():
-    dataset_info = DATASETS["SoundExpert"]
-    src_dir = os.path.join(TEMP_DIR, "SoundExpert-SoundExpert")
-    dest_dir = os.path.join(BASE_DATA_DIR, "audio")
-
-    wav_files = []
-    for root, dirs, files in os.walk(src_dir):
-        for f in files:
-            if f.endswith(".wav"):
-                wav_files.append(os.path.join(root, f))
-
-    print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.")
-    for i, wav in enumerate(wav_files):
-        print(f"  [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...")
-        dur, chans = get_info(wav)
-        start, duration, loop = get_tier_params(dur)
-
-        filename = os.path.basename(wav)
-        output = os.path.join(dest_dir, filename)
-        resample(
-            wav,
-            output,
-            48000,
-            chans,
-            start=start,
-            duration=duration,
-            loop=loop)
-
-
-def setup_throughput_signals():
-    """Generate 10-minute test signals for throughput measurement."""
-    dest_dir = os.path.join(BASE_DATA_DIR, "throughput")
-    os.makedirs(dest_dir, exist_ok=True)
-
-    signals = {
-        "sine": "sine=f=440:d=600",
-        "sweep": "aevalsrc='sin(2*PI*(100+(20000-100)/(2*600)*t)*t)':d=600",
-        "noise": "anoisesrc=d=600",
-        "silence": "anullsrc=d=600"
-    }
-
-    print(f"Generating 10-minute throughput signals...")
-    for name, filter_str in signals.items():
-        output_path = os.path.join(dest_dir, f"{name}.wav")
-        if not os.path.exists(output_path):
-            print(f"  Generating {name}.wav...")
-            try:
-                # Note: aevalsrc is also a lavfi filter
-                (
-                    ffmpeg
-                    .input(filter_str, format='lavfi')
-                    .output(output_path, ar=48000, ac=2, sample_fmt='s16')
-                    .run(quiet=True, overwrite_output=True)
-                )
-            except ffmpeg.Error as e:
-                print(
-                    f" FFmpeg error during signal generation: {
-                        e.stderr.decode() if e.stderr else e}")
-
-
-if __name__ == "__main__":
-    if not os.path.exists(BASE_DATA_DIR):
-        for name, info in DATASETS.items():
-            download_and_extract(name, info["url"])
-
-        setup_pmlt()
-        setup_tcd_voip()
-        setup_soundexpert()
-        setup_throughput_signals()
-
-        if os.path.exists(TEMP_DIR):
-            shutil.rmtree(TEMP_DIR)
-    else:
-        # Always check for throughput signals as they are vital for stable
-        # metrics
-        setup_throughput_signals()
-        print("Datasets already setup.")
-    print("Done.")

From aeebe747b8dac4508ee44e46a856e1178cc4142b Mon Sep 17 00:00:00 2001
From: Nils Schimmelmann <nschimme@gmail.com>
Date: Wed, 4 Mar 2026 21:15:16 -0600
Subject: [PATCH 3/5] CI: add FAAC Benchmark Suite GitHub Action for automated
 regression testing

---
 .github/workflows/benchmark.yml | 136 ++++++++++++++++++++++++++++++++
 README                          |   9 +++
 2 files changed, 145 insertions(+)
 create mode 100644 .github/workflows/benchmark.yml

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 00000000..2950bbd6
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,136 @@
+name: Continuous Integration
+
+on:
+  push:  # pushes to master only build and cache the baseline results
+    branches: [ "master" ]
+    paths:
+      - "libfaac/**"
+      - ".github/workflows/benchmark.yml"
+  pull_request:  # pull requests additionally benchmark the candidate build
+    branches: [ "**" ]  # unlike "*", "**" also matches names containing "/"
+    paths:
+      - "libfaac/**"
+      - ".github/workflows/benchmark.yml"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  benchmark:
+    name: Benchmark ${{ matrix.arch }} / ${{ matrix.precision }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [amd64]
+        precision: [single, double]
+
+    steps:
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y meson ninja-build bc ffmpeg
+
+      - name: Checkout Candidate
+        uses: actions/checkout@v4
+        with:
+          path: candidate
+
+      - name: Build Candidate
+        run: |
+          cd candidate
+          meson setup build_cand -Dfloating-point=${{ matrix.precision }} --buildtype=release
+          ninja -C build_cand
+
+      - name: Determine Baseline SHA  # exposes the commit to benchmark as outputs.sha
+        id: baseline-sha
+        run: |
+          if [ "${{ github.event_name }}" == "push" ]; then  # push: the pushed commit is the baseline
+            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
+          else  # pull_request: the baseline is the PR's base commit
+            echo "sha=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Restore Baseline Results
+        id: cache-baseline
+        uses: actions/cache/restore@v4
+        with:
+          path: results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
+          key: ${{ runner.os }}-baseline-${{ matrix.arch }}-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}
+
+      - name: Checkout Baseline
+        if: steps.cache-baseline.outputs.cache-hit != 'true'
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.baseline-sha.outputs.sha }}
+          path: baseline
+
+      - name: Build Baseline
+        if: steps.cache-baseline.outputs.cache-hit != 'true'
+        run: |
+          cd baseline
+          meson setup build_base -Dfloating-point=${{ matrix.precision }} --buildtype=release
+          ninja -C build_base
+
+      - name: Run Benchmark (Baseline)
+        if: steps.cache-baseline.outputs.cache-hit != 'true'
+        uses: nschimme/faac-benchmark@master
+        with:
+          faac-bin: ./baseline/build_base/frontend/faac
+          libfaac-so: ./baseline/build_base/libfaac/libfaac.so
+          run-name: ${{ matrix.arch }}_${{ matrix.precision }}_base
+          output-json: ./results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
+          visqol-image: ghcr.io/nschimme/faac-benchmark-visqol:latest
+
+      - name: Save Baseline Results
+        if: success() && steps.cache-baseline.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v4
+        with:
+          path: results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
+          key: ${{ runner.os }}-baseline-${{ matrix.arch }}-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}
+
+      - name: Run Benchmark (Candidate)
+        if: github.event_name == 'pull_request'
+        uses: nschimme/faac-benchmark@master
+        with:
+          faac-bin: ./candidate/build_cand/frontend/faac
+          libfaac-so: ./candidate/build_cand/libfaac/libfaac.so
+          run-name: ${{ matrix.arch }}_${{ matrix.precision }}_cand
+          output-json: ./results/${{ matrix.arch }}_${{ matrix.precision }}_cand.json
+          visqol-image: ghcr.io/nschimme/faac-benchmark-visqol:latest
+
+      - name: Upload Results
+        if: github.event_name == 'pull_request'
+        uses: actions/upload-artifact@v4
+        with:
+          name: results-${{ matrix.arch }}-${{ matrix.precision }}
+          path: results/*.json
+
+  report:
+    name: Consolidated Report
+    needs: benchmark
+    runs-on: ubuntu-latest
+    if: always() && github.event_name == 'pull_request'
+    steps:
+      - name: Download all results
+        uses: actions/download-artifact@v4
+        with:
+          path: results
+          pattern: results-*
+          merge-multiple: true
+
+      - name: Generate Report
+        id: generate
+        uses: nschimme/faac-benchmark/report@master
+        with:
+          results-path: ./results
+          base-sha: ${{ github.event.pull_request.base.sha }}
+          cand-sha: ${{ github.event.pull_request.head.sha }}
+          summary-only: false
+
+      - name: Upload Full Report
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-report-full
+          path: report.md
diff --git a/README b/README
index 2c30f925..ccce8022 100644
--- a/README
+++ b/README
@@ -79,3 +79,12 @@ General FAAC compiling instructions
 	cd build
 	meson setup ..
 	meson install
+
+___________________________________
+Benchmarking
+
+FAAC uses a dedicated benchmark suite to ensure quality and performance.
+The suite is hosted in a separate repository: https://github.com/nschimme/faac-benchmark
+
+Automated benchmarks run on every pull request. For instructions on how to
+run benchmarks locally, please refer to the README in the benchmark repository.

From 4f0e73cf47ff117c21171290d7c6fcda71da9567 Mon Sep 17 00:00:00 2001
From: Nils Schimmelmann <nschimme@gmail.com>
Date: Thu, 5 Mar 2026 10:11:27 -0600
Subject: [PATCH 4/5] pin faac-benchmark to v1

---
 .github/workflows/benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2950bbd6..1c798371 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -75,7 +75,7 @@ jobs:
 
       - name: Run Benchmark (Baseline)
         if: steps.cache-baseline.outputs.cache-hit != 'true'
-        uses: nschimme/faac-benchmark@master
+        uses: nschimme/faac-benchmark@v1
         with:
           faac-bin: ./baseline/build_base/frontend/faac
           libfaac-so: ./baseline/build_base/libfaac/libfaac.so
@@ -92,7 +92,7 @@ jobs:
 
       - name: Run Benchmark (Candidate)
         if: github.event_name == 'pull_request'
-        uses: nschimme/faac-benchmark@master
+        uses: nschimme/faac-benchmark@v1
         with:
           faac-bin: ./candidate/build_cand/frontend/faac
           libfaac-so: ./candidate/build_cand/libfaac/libfaac.so
@@ -122,7 +122,7 @@ jobs:
 
       - name: Generate Report
         id: generate
-        uses: nschimme/faac-benchmark/report@master
+        uses: nschimme/faac-benchmark/report@v1
         with:
           results-path: ./results
           base-sha: ${{ github.event.pull_request.base.sha }}

From 8ab42d5219eb50bfac9ad3666ca760e52fe0e03f Mon Sep 17 00:00:00 2001
From: Nils Schimmelmann <nschimme@gmail.com>
Date: Fri, 13 Mar 2026 18:04:35 -0500
Subject: [PATCH 5/5] post comment again

---
 .github/workflows/benchmark.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 1c798371..084871cb 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -112,7 +112,13 @@ jobs:
     needs: benchmark
     runs-on: ubuntu-latest
     if: always() && github.event_name == 'pull_request'
+    permissions:  # NOTE(review): only pull-requests is granted; confirm the checkout step in this job works without contents: read
+      pull-requests: write  # needed so the job's token can comment on the PR
     steps:
+      - name: Checkout Code
+        if: github.event_name == 'pull_request'
+        uses: actions/checkout@v4
+
       - name: Download all results
         uses: actions/download-artifact@v4
         with:
@@ -134,3 +140,13 @@ jobs:
         with:
           name: benchmark-report-full
           path: report.md
+
+      - name: Post Summary to PR  # posts summary.md as a comment on the pull request
+        if: always() && github.event_name == 'pull_request'  # attempt the comment even if an earlier step failed
+        continue-on-error: true  # a failed comment does not fail the report job
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # token for the gh CLI call below
+        run: |  # NOTE(review): no visible step creates summary.md; presumably written by the Generate Report action - confirm
+          gh pr comment ${{ github.event.pull_request.number }} --body-file summary.md
+