name: Benchmark

on:
  pull_request:
    branches: ["master"]
    paths:
      - "libfaac/**"
      - "tests/**"

# Read-only token by default; the report job widens this for its PR comment.
permissions:
  contents: read

jobs:
  benchmark:
    # NOTE: ViSQOL via visqol-py is currently most reliable on ubuntu-22.04.
    name: ${{ matrix.arch }} / ${{ matrix.precision }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        arch: [amd64]
        precision: [single, double]
        include:
          - arch: amd64
            os: ubuntu-22.04

    steps:
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y meson ninja-build bc ffmpeg

      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history so the base branch can be checked out below.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'
          cache-dependency-path: 'tests/requirements.txt'

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip setuptools wheel
          pip install -r tests/requirements.txt

      - name: Restore Datasets
        id: cache-datasets
        uses: actions/cache/restore@v4
        with:
          path: tests/data/external
          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}

      - name: Setup Datasets
        if: steps.cache-datasets.outputs.cache-hit != 'true'
        run: |
          python3 tests/setup_datasets.py

      - name: Save Datasets
        if: steps.cache-datasets.outputs.cache-hit != 'true'
        uses: actions/cache/save@v4
        with:
          path: tests/data/external
          key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }}

      - name: Determine Baseline SHA
        id: baseline-sha
        # Context values are passed through the environment instead of being
        # pasted into the script text, so a ref name can never be parsed as shell.
        env:
          BASE_REF: ${{ github.base_ref || 'master' }}
          HEAD_SHA: ${{ github.sha }}
        run: |
          git checkout "$BASE_REF"
          echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
          git checkout "$HEAD_SHA"

      - name: Restore Baseline Results
        id: cache-baseline
        uses: actions/cache/restore@v4
        with:
          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
          key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}

      - name: Run Benchmark (Baseline)
        if: steps.cache-baseline.outputs.cache-hit != 'true'
        env:
          BASELINE_SHA: ${{ steps.baseline-sha.outputs.sha }}
          HEAD_SHA: ${{ github.sha }}
          RUN_NAME: ${{ matrix.arch }}_${{ matrix.precision }}_base
        run: |
          git checkout "$BASELINE_SHA"
          meson setup build_base -Dfloating-point=${{ matrix.precision }} --buildtype=release
          ninja -C build_base
          LIB_PATH="build_base/libfaac/libfaac.so"
          FAAC_PATH="build_base/frontend/faac"
          # Restore benchmark scripts and config from PR branch to ensure consistent comparison logic
          git checkout "$HEAD_SHA" -- tests/
          # The candidate step creates this directory too, but the baseline runs first.
          mkdir -p tests/results
          python3 tests/run_benchmark.py "$FAAC_PATH" "$LIB_PATH" "$RUN_NAME" "tests/results/${RUN_NAME}.json" --coverage 100

      - name: Save Baseline Results
        if: always() && steps.cache-baseline.outputs.cache-hit != 'true'
        uses: actions/cache/save@v4
        with:
          path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json
          key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }}

      - name: Run Benchmark (Candidate)
        env:
          HEAD_SHA: ${{ github.sha }}
          RUN_NAME: ${{ matrix.arch }}_${{ matrix.precision }}_cand
        run: |
          git checkout "$HEAD_SHA"
          mkdir -p tests/results
          meson setup build_cand -Dfloating-point=${{ matrix.precision }} --buildtype=release
          ninja -C build_cand
          LIB_PATH="build_cand/libfaac/libfaac.so"
          FAAC_PATH="build_cand/frontend/faac"
          python3 tests/run_benchmark.py "$FAAC_PATH" "$LIB_PATH" "$RUN_NAME" "tests/results/${RUN_NAME}.json" --coverage 100

      - name: Upload Results
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.arch }}-${{ matrix.precision }}
          path: tests/results/*.json

  report:
    name: Consolidated Report
    needs: benchmark
    # Run even when a benchmark leg failed so partial results still get reported.
    if: always()
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Required by the "PR Feedback" step to post the summary comment.
      pull-requests: write
    env:
      BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }}
      CAND_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Pin the interpreter to the version the benchmark job uses rather than
      # depending on whatever python3 the runner image happens to ship.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Download all results
        uses: actions/download-artifact@v4
        with:
          path: tests/results
          pattern: results-*
          merge-multiple: true

      - name: Generate Report
        id: generate
        run: |
          # Summary report for PR comment (high-signal only)
          python3 tests/compare_results.py tests/results --summary-only --base-sha "$BASE_SHA" --cand-sha "$CAND_SHA" > report-summary.md || echo "REGRESSION_DETECTED=1" >> "$GITHUB_ENV"
          # Full report for artifact (all details)
          python3 tests/compare_results.py tests/results --base-sha "$BASE_SHA" --cand-sha "$CAND_SHA" > report-full.md || true
          if [ ! -s report-summary.md ]; then
            echo "Error: report-summary.md is empty"
            exit 1
          fi
          cat report-summary.md

      - name: Upload Full Report
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-report-full
          path: report-full.md

      - name: PR Feedback
        if: always() && github.event_name == 'pull_request'
        # Best effort: commenting must never fail the job (e.g. a read-only token).
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            if (fs.existsSync('report-summary.md')) {
              let report = fs.readFileSync('report-summary.md', 'utf8').trim();
              if (report.length > 0) {
                const jobUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
                const readmeUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/blob/${process.env.GITHUB_SHA}/tests/README.md`;
                report += `\n\n---\n[View Detailed Job Log and Full Report](${jobUrl}) | [What Is This?](${readmeUrl})`;
                // Await the request so the step only finishes once the comment
                // is posted and any API error is reported by this step.
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: report
                });
              }
            }

      - name: Check for Regressions
        run: |
          if [ "${REGRESSION_DETECTED:-}" = "1" ]; then
            echo "Regressions or missing data detected. Failing job."
            exit 1
          fi
+ exit 1 + fi diff --git a/.gitignore b/.gitignore index 1e20e33b..ec9b9a11 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,10 @@ Makefile* *.user /libfaac/win32_ver.h /libfaac/faac.pc +.DS_Store +/build*/ +/venv/ +/tests/__pycache__/ +/tests/data/external/ +/tests/output/ +/tests/results/ diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..d47c7104 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,90 @@ +# FAAC Benchmark Suite + +FAAC is the high-efficiency encoder for the resource-constrained world. From hobbyist projects to professional surveillance (VSS) and embedded VoIP, we prioritize performance where every cycle and byte matters. + +This suite provides the objective data necessary to ensure that every change moves us closer to our Northstar: the optimal balance of quality, speed, and size. + +--- + +## The "Golden Triangle" Philosophy + +We evaluate every contribution against three competing pillars. While high-bitrate encoders like FDK-AAC or Opus target multi-channel, high-fidelity entertainment, FAAC focuses on remaining approachable and distributable for the global open-source community. We prioritize non-patent encumbered areas and the standard Low Complexity (LC-AAC) profile. + +1. **Audio Fidelity**: We target transparent audio quality for our bitrates. We use objective metrics like ViSQOL (MOS) to ensure psychoacoustic improvements truly benefit the listener without introducing "metallic" ringing or "underwater" artifacts. +2. **Computational Efficiency**: FAAC must remain fast. We optimize for low-power cores where encoding speed is a critical requirement. Every CPU cycle saved is a win for our users. +3. **Minimal Footprint**: Binary size is a feature. We ensure the library remains small enough to fit within restrictive embedded firmware. 
+ +--- + +## Benchmarking Scenarios + +| Scenario | Mode | Source | Config | Project Goal | +| :--- | :--- | :--- | :--- | :--- | +| **VoIP** | Speech (16k) | TCD-VoIP | `-b 16` | Clear communication at low bitrates (16kbps). | +| **VSS** | Speech (16k) | TCD-VoIP | `-b 40` | High-fidelity Video Surveillance Systems recording (40kbps). | +| **Music** | Audio (48k) | PMLT / SoundExpert | `-b 64`, `-b 128`, `-b 256` | Full-range transparency for storage & streaming. | +| **Throughput** | Efficiency | Synthetic Signals | Default | Stability test using 10-minute Sine/Sweep/Noise/Silence. | + +--- + +## Metric Definitions + +| Metric | Definition | Reference | +| :--- | :--- | :--- | +| **MOS** | Mean Opinion Score (LQO). Predicted perceptual quality from 1.0 (Bad) to 5.0 (Excellent), computed via the **ViSQOL** model. | [ITU-T P.800](https://www.itu.int/rec/T-REC-P.800), [ViSQOL](https://github.com/google/visqol) | +| **Regressions** | A critical failure (such as a missing MOS score) or a drop in MOS of more than 0.1 compared to the baseline commit. Significant throughput drops (>10%) or increased binary size also warrant review. | | +| **Significant Win** | An improvement in MOS of more than 0.1 compared to the baseline commit. | | +| **Consistency** | Percentage of test cases where bitstreams are MD5-identical to the baseline. | | +| **Throughput** | Normalized encoding speed improvement against baseline. Higher % indicates faster execution. | | +| **Library Size** | Binary footprint of `libfaac.so`. Delta measured against baseline. Critical for embedded VSS/IoT targets. | | +| **Bitrate Δ** | Percentage change in generated file size against baseline. Relative shift in bits used for the same target. | | +| **Bitrate Accuracy** | The closeness of the achieved bitrate to the specified target (ABR mode). Measures the encoder's ability to respect the user-defined bitrate budget. 
| | + +--- + +## Dataset Sources + +We are grateful to the following projects for providing high-quality research material: + +* **TCD-VoIP (Sigmedia-VoIP)**: [Listener Test Database](https://www.sigmedia.tv/datasets/tcd_voip_ltd/) - Specifically designed for assessing quality in VoIP applications. +* **PMLT2014**: [Public Multiformat Listening Test](https://listening-test.coresv.net/) - A community-defined comprehensive multi-codec benchmark. +* **SoundExpert**: [Sound Samples](https://soundexpert.org/sound-samples) - High-precision EBU SQAM CD excerpts for transparency testing. + +--- + +## Quick Start + +### 1. Install Dependencies +```bash +# System (Ubuntu/Debian) +sudo apt-get update && sudo apt-get install -y meson ninja-build bc ffmpeg + +# Python +python3 -m venv venv +source venv/bin/activate +pip install -r tests/requirements.txt +``` + +### 2. Prepare Datasets +Downloads samples and generates 10-minute synthetic throughput signals (Sine, Sweep, Noise, Silence). +```bash +python3 tests/setup_datasets.py +``` + +### 3. Run a Benchmark +Perceptual analysis and full test suite coverage are enabled by default. Use `--skip-mos` or `--coverage 10` for faster iteration during local development. +```bash +python3 tests/run_benchmark.py build/frontend/faac build/libfaac/libfaac.so my_run tests/results/my_run.json +``` + +### 4. Compare Results +Generate a high-signal summary comparing your candidate against a baseline. +```bash +python3 tests/compare_results.py tests/results/ +``` + +## Who This Suite Helps + +* **Maintainers**: Provides the confidence to merge PRs by proving that a change improves the encoder—or at least doesn't cause a regression. +* **Developers**: Offers standardized, automated feedback during implementation. +* **Users**: Ensures that every new version of FAAC remains a reliable choice for their critical firmware and communication projects. 
def analyze_pair(base_file, cand_file):
    """Compare one baseline/candidate result pair and aggregate the deltas.

    Args:
        base_file: Path to the baseline JSON results. If it cannot be read or
            parsed, a warning goes to stderr and an empty baseline is used, so
            every case is reported without baseline values.
        cand_file: Path to the candidate JSON results.

    Returns:
        A dict of per-suite statistics (MOS deltas, bitrate change and
        accuracy sums, throughput and library-size change, per-case report
        rows grouped into regressions / wins / opportunities), or ``None``
        when the candidate file cannot be loaded.
    """
    try:
        with open(base_file, "r") as f:
            base = json.load(f)
    except Exception as e:
        sys.stderr.write(
            f" Warning: Could not load baseline file {base_file}: {e}\n")
        base = {}

    try:
        with open(cand_file, "r") as f:
            cand = json.load(f)
    except Exception as e:
        sys.stderr.write(
            f" Error: Could not load candidate file {cand_file}: {e}\n")
        return None

    suite_results = {
        "has_regression": False,
        "missing_data": False,
        "mos_delta_sum": 0,
        "mos_count": 0,
        "missing_mos_count": 0,
        "tp_reduction": 0,
        "lib_size_chg": 0,
        "bitrate_chg_sum": 0,
        "bitrate_count": 0,
        "bitrate_acc_sum": 0,
        "bitrate_acc_count": 0,
        "regressions": [],
        "new_wins": [],
        "significant_wins": [],
        "opportunities": [],
        "bit_exact_count": 0,
        "total_cases": 0,
        "all_cases": [],
        "scenario_stats": defaultdict(
            lambda: {"tp_sum_cand": 0, "tp_sum_base": 0, "count": 0}),
        "base_tp": base.get("throughput", {}),
        "cand_tp": cand.get("throughput", {}),
    }

    base_m = base.get("matrix", {})
    cand_m = cand.get("matrix", {})

    if cand_m:
        suite_results["total_cases"] = len(cand_m)
        for k in sorted(cand_m.keys()):
            o = cand_m[k]
            b = base_m.get(k, {})

            filename = o.get("filename", k)
            scenario = o.get("scenario", "")
            display_name = f"{scenario}: {filename}"

            o_mos = o.get("mos")
            b_mos = b.get("mos")
            thresh = o.get("thresh", 1.0)

            o_size = o.get("size")
            b_size = b.get("size")

            o_bitrate = o.get("bitrate")
            o_target = o.get("bitrate_target")

            # Bitrate accuracy: 100% means the achieved bitrate hit the target.
            if o_bitrate is not None and o_target is not None and o_target > 0:
                acc = (1.0 - abs(o_bitrate - o_target) / o_target) * 100
                suite_results["bitrate_acc_sum"] += acc
                suite_results["bitrate_acc_count"] += 1

            o_time = o.get("time")
            b_time = b.get("time")

            # Per-scenario encode-time totals, used for throughput deltas.
            if o_time is not None and b_time is not None and b_time > 0:
                stats = suite_results["scenario_stats"][scenario]
                stats["tp_sum_cand"] += o_time
                stats["tp_sum_base"] += b_time
                stats["count"] += 1

            o_md5 = o.get("md5", "")
            b_md5 = b.get("md5", "")

            if o_md5 and b_md5 and o_md5 == b_md5:
                suite_results["bit_exact_count"] += 1

            size_chg = "N/A"
            # A zero-byte baseline has no meaningful relative change; treat it
            # like a missing baseline size instead of dividing by zero.
            if o_size is not None and b_size is not None and b_size > 0:
                size_chg_val = (o_size - b_size) / b_size * 100
                size_chg = f"{size_chg_val:+.2f}%"
                suite_results["bitrate_chg_sum"] += size_chg_val
                suite_results["bitrate_count"] += 1
            elif o_size is None:
                suite_results["missing_data"] = True

            status = "✅"
            delta = 0
            if o_mos is not None:
                if b_mos is not None:
                    delta = o_mos - b_mos
                    suite_results["mos_delta_sum"] += delta
                    suite_results["mos_count"] += 1

                # Absolute quality against the scenario threshold.
                if o_mos < (thresh - 0.5):
                    status = "🤮"  # Awful
                elif o_mos < thresh:
                    status = "📉"  # Bad/Poor

                # Relative quality against the baseline overrides the above.
                if b_mos is not None:
                    if (o_mos - b_mos) < -0.1:
                        status = "❌"  # Regression
                        suite_results["has_regression"] = True
                    elif (o_mos - b_mos) > 0.1:
                        status = "🌟"  # Significant Win

                # Check for New Win (Baseline failed, Candidate passed)
                if b_mos is not None and b_mos < thresh and o_mos >= thresh:
                    suite_results["new_wins"].append({
                        "display_name": display_name,
                        "mos": o_mos,
                        "b_mos": b_mos,
                        "delta": delta
                    })
            else:
                status = "❌"  # Missing MOS is a failure
                suite_results["missing_mos_count"] += 1
                suite_results["has_regression"] = True
                suite_results["missing_data"] = True
                delta = -10.0  # Force to top of regressions

            mos_str = f"{o_mos:.2f}" if o_mos is not None else "N/A"
            b_mos_str = f"{b_mos:.2f}" if b_mos is not None else "N/A"
            if o_mos is not None and b_mos is not None:
                delta_mos = f"{(o_mos - b_mos):+.2f}"
            else:
                delta_mos = "N/A"

            case_data = {
                "display_name": display_name,
                "status": status,
                "mos": o_mos,
                "b_mos": b_mos,
                "delta": delta,
                "size_chg": size_chg,
                "line": f"| {display_name} | {status} | {mos_str} ({b_mos_str}) | {delta_mos} | {size_chg} |"
            }

            suite_results["all_cases"].append(case_data)
            if status == "❌":
                suite_results["regressions"].append(case_data)
            elif status == "🌟":
                suite_results["significant_wins"].append(case_data)
            elif status in ["🤮", "📉"]:
                suite_results["opportunities"].append(case_data)
    else:
        # A candidate without any matrix entries cannot be evaluated.
        suite_results["missing_data"] = True

    # Sorts: worst regressions first, biggest wins first, lowest MOS first.
    suite_results["regressions"].sort(key=lambda x: x["delta"])
    suite_results["new_wins"].sort(key=lambda x: x["delta"], reverse=True)
    suite_results["significant_wins"].sort(
        key=lambda x: x["delta"], reverse=True)
    suite_results["opportunities"].sort(
        key=lambda x: x["mos"] if x["mos"] is not None else 6.0)

    # Throughput
    base_tp = base.get("throughput", {})
    cand_tp = cand.get("throughput", {})
    # Exclude "overall" to avoid double-counting in manual summation
    total_base_t = sum(v for k, v in base_tp.items() if k != "overall")
    total_cand_t = sum(v for k, v in cand_tp.items() if k != "overall")
    if total_cand_t > 0 and total_base_t > 0:
        # Positive value: the candidate needed less time than the baseline.
        suite_results["tp_reduction"] = (1 - total_cand_t / total_base_t) * 100
    else:
        # If overall throughput is missing, try to aggregate from scenarios
        cand_t_sum = sum(s["tp_sum_cand"]
                         for s in suite_results["scenario_stats"].values())
        base_t_sum = sum(s["tp_sum_base"]
                         for s in suite_results["scenario_stats"].values())
        if cand_t_sum > 0 and base_t_sum > 0:
            suite_results["tp_reduction"] = (1 - cand_t_sum / base_t_sum) * 100
        else:
            suite_results["missing_data"] = True

    # Binary Size
    base_lib = base.get("lib_size", 0)
    cand_lib = cand.get("lib_size", 0)
    if cand_lib > 0 and base_lib > 0:
        suite_results["lib_size_chg"] = ((cand_lib / base_lib) - 1) * 100
    else:
        suite_results["missing_data"] = True

    return suite_results
+ total_bitrate_count = 0 + total_bitrate_acc_sum = 0 + total_bitrate_acc_count = 0 + + total_regressions = 0 + total_new_wins = 0 + total_significant_wins = 0 + total_bit_exact = 0 + total_cases_all = 0 + + # For worst-case scenario throughput + scenario_tp_deltas = [] + + for name, (base, cand) in sorted(suites.items()): + data = analyze_pair(base, cand) + if data: + all_suite_data[name] = data + if data["has_regression"]: + overall_regression = True + if data["missing_data"]: + overall_missing = True + total_mos_delta += data["mos_delta_sum"] + total_mos_count += data["mos_count"] + total_missing_mos += data["missing_mos_count"] + total_tp_reduction += data["tp_reduction"] + total_lib_chg += data["lib_size_chg"] + total_bitrate_chg += data["bitrate_chg_sum"] + total_bitrate_count += data["bitrate_count"] + total_bitrate_acc_sum += data["bitrate_acc_sum"] + total_bitrate_acc_count += data["bitrate_acc_count"] + + total_regressions += len(data["regressions"]) + total_new_wins += len(data["new_wins"]) + total_significant_wins += len(data["significant_wins"]) + total_bit_exact += data["bit_exact_count"] + total_cases_all += data["total_cases"] + + for sc_name, sc_data in data["scenario_stats"].items(): + if sc_data["tp_sum_base"] > 0: + delta = (1 - sc_data["tp_sum_cand"] / + sc_data["tp_sum_base"]) * 100 + scenario_tp_deltas.append((f"{name} / {sc_name}", delta)) + + avg_mos_delta_str = f"{(total_mos_delta / + total_mos_count):+.3f}" if total_mos_count > 0 else "N/A" + avg_tp_reduction = total_tp_reduction / \ + len(all_suite_data) if all_suite_data else 0 + avg_lib_chg = total_lib_chg / len(all_suite_data) if all_suite_data else 0 + avg_bitrate_chg = total_bitrate_chg / \ + total_bitrate_count if total_bitrate_count > 0 else 0 + avg_bitrate_acc = total_bitrate_acc_sum / \ + total_bitrate_acc_count if total_bitrate_acc_count > 0 else 0 + + bit_exact_percent = ( + total_bit_exact / + total_cases_all * + 100) if total_cases_all > 0 else 0 + + # Worst-case throughput 
+ worst_tp_scen, worst_tp_delta = (None, 0) + if scenario_tp_deltas: + worst_tp_scen, worst_tp_delta = min( + scenario_tp_deltas, key=lambda x: x[1]) + + report = [] + if overall_regression: + report.append("## ❌ Quality Regression Detected") + elif worst_tp_delta < -5.0: + report.append("## ⚠️ Performance Regression Detected") + elif overall_missing: + report.append("## ❌ Incomplete/Missing Data Detected") + elif bit_exact_percent == 100.0: + report.append("## ✅ Refactor Verified (Bit-Identical)") + elif total_new_wins > 0 or total_significant_wins > 0 or (total_mos_count > 0 and (total_mos_delta / total_mos_count) > 0.01) or avg_tp_reduction > 5: + report.append("## 🚀 Perceptual & Efficiency Improvement") + else: + report.append("## 📊 Benchmark Summary") + + if not summary_only and (base_sha or cand_sha): + report.append("\n### Environment") + if base_sha: + report.append(f"- **Baseline SHA**: `{base_sha}`") + if cand_sha: + report.append(f"- **Candidate SHA**: `{cand_sha}`") + + report.append("\n### Summary") + report.append("| Metric | Value |") + report.append("| :--- | :--- |") + + # Regressions (Always shown) + reg_status = "0 ✅" if total_regressions == 0 else f"{total_regressions} ❌" + report.append(f"| **Regressions** | {reg_status} |") + + # New Wins (Only if baseline < threshold and candidate >= threshold) + if total_new_wins > 0: + report.append(f"| **New Wins** | {total_new_wins} 🆕 |") + + # Significant Wins (MOS delta > 0.1) + if total_significant_wins > 0: + report.append(f"| **Significant Wins** | {total_significant_wins} 🌟 |") + + # Bitstream Consistency (Against baseline) + consist_status = f"{bit_exact_percent:.1f}%" + if bit_exact_percent == 100.0: + consist_status += " (MD5 Match)" + report.append(f"| **Consistency** | {consist_status} |") + + # Throughput + if abs(avg_tp_reduction) > 0.1: + tp_icon = "🚀" if avg_tp_reduction > 1.0 else "📉" if avg_tp_reduction < -1.0 else "" + report.append( + f"| **Throughput (Avg)** | {avg_tp_reduction:+.1f}% 
{tp_icon} |") + + # Per-signal throughput deltas if available + tp_details = [] + if all_suite_data: + first_data = list(all_suite_data.values())[0] + base_tp = first_data.get("base_tp", {}) + cand_tp = first_data.get("cand_tp", {}) + for signal in sorted(cand_tp.keys()): + if signal == "overall": + continue + if signal in base_tp and base_tp[signal] > 0: + delta = (1 - cand_tp[signal] / base_tp[signal]) * 100 + icon = "🚀" if delta > 1.0 else "📉" if delta < -1.0 else "" + tp_details.append( + f"{signal.split('.')[0]}: {delta:+.1f}% {icon}") + + if tp_details: + report.append(f"| **TP Breakdown** | {', '.join(tp_details)} |") + + if worst_tp_delta < -1.0: + report.append( + f"| **Worst-case TP Δ** | {worst_tp_delta:.1f}% ({worst_tp_scen}) ⚠️ |") + + # Binary Size + if abs(avg_lib_chg) > 0.01: + size_icon = "📉" if avg_lib_chg < -0.1 else "📈" if avg_lib_chg > 0.1 else "" + report.append( + f"| **Library Size** | {avg_lib_chg:+.2f}% {size_icon} |") + + + # Bitrate Δ + if abs(avg_bitrate_chg) > 0.1: + bitrate_icon = "📉" if avg_bitrate_chg < - \ + 1.0 else "📈" if avg_bitrate_chg > 1.0 else "" + report.append( + f"| **Bitrate Δ** | {avg_bitrate_chg:+.2f}% {bitrate_icon} |") + + # Bitrate Accuracy + if total_bitrate_acc_count > 0: + acc_icon = "🎯" if avg_bitrate_acc > 95 else "⚠️" if avg_bitrate_acc < 80 else "" + report.append( + f"| **Bitrate Accuracy** | {avg_bitrate_acc:.1f}% {acc_icon} |") + + # Avg MOS Delta + if total_mos_count > 0 and abs(total_mos_delta / total_mos_count) > 0.001: + report.append(f"| **Avg MOS Delta** | {avg_mos_delta_str} |") + + if total_missing_mos > 0: + report.append( + f"\n⚠️ **Warning**: {total_missing_mos} MOS scores were missing/failed (treated as ❌).") + + if not summary_only: + # 1. Collapsible Details: Regressions + if total_regressions > 0: + report.append( + "\n
❌ View Regression Details ({})\n".format(total_regressions)) + for name, data in sorted(all_suite_data.items()): + if data["regressions"]: + report.append(f"\n#### {name}") + report.append( + "| Test Case | Status | MOS (Base) | Delta | Size Δ |") + report.append("| :--- | :---: | :---: | :---: | :---: |") + for r in data["regressions"]: + report.append(r["line"]) + report.append("\n
") + + # 2. Collapsible Additional Details + report.append( + "\n
View Additional Suite Details & Wins\n") + + for name, data in sorted(all_suite_data.items()): + status_icon = "✅" + if data["has_regression"]: + status_icon = "❌" + elif data["missing_data"]: + status_icon = "❌" + + avg_mos_suite = f"{(data['mos_delta_sum'] / + data['mos_count']):+.3f}" if data["mos_count"] > 0 else "N/A" + suite_bit_exact_percent = ( + data["bit_exact_count"] / + data["total_cases"] * + 100) if data["total_cases"] > 0 else 0 + + report.append(f"\n#### {status_icon} {name}") + report.append( + f"- MOS Δ: {avg_mos_suite}, TP Δ: {data['tp_reduction']:+.1f}%, Size Δ: {data['lib_size_chg']:+.2f}%") + report.append( + f"- Bitstream Consistency: {suite_bit_exact_percent:.1f}%") + + if data["new_wins"]: + report.append("\n**🆕 New Wins**") + report.append("| Test Case | MOS (Base) | Delta |") + report.append("| :--- | :---: | :---: |") + for w in data["new_wins"]: + report.append("| {} | {:.2f} ({:.2f}) | {:+.2f} |".format( + w["display_name"], w["mos"], w["b_mos"], w["delta"])) + + if data["significant_wins"]: + report.append("\n**🌟 Significant Wins**") + report.append( + "| Test Case | Status | MOS (Base) | Delta | Size Δ |") + report.append("| :--- | :---: | :---: | :---: | :---: |") + for w in data["significant_wins"]: + report.append(w["line"]) + + if data["opportunities"]: + report.append("\n**💡 Opportunities**") + report.append( + "| Test Case | Status | MOS (Base) | Delta | Size Δ |") + report.append("| :--- | :---: | :---: | :---: | :---: |") + for o in data["opportunities"]: + report.append(o["line"]) + + if data["all_cases"]: + report.append( + f"\n
View all {len(data['all_cases'])} cases for {name}\n") + report.append( + "| Test Case | Status | MOS (Base) | Delta | Size Δ |") + report.append("| :--- | :---: | :---: | :---: | :---: |") + for c in data["all_cases"]: + report.append(c["line"]) + report.append("\n
") + + report.append("\n
") + + output = "\n".join(report) + sys.stdout.write(output + "\n") + + if overall_regression or overall_missing: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 00000000..2ee1d2a2 --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,4 @@ +numpy +protobuf==3.20.3 +ffmpeg-python +git+https://github.com/diggerdu/visqol-py.git@452eb5c4f17fd2404f968ec2eeadfcad74925485 diff --git a/tests/run_benchmark.py b/tests/run_benchmark.py new file mode 100644 index 00000000..232dbb54 --- /dev/null +++ b/tests/run_benchmark.py @@ -0,0 +1,364 @@ +""" + * FAAC Benchmark Suite + * Copyright (C) 2026 Nils Schimmelmann + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program. 
def get_visqol_mode(mode_str):
    """Translate a scenario mode string into a ``ViSQOLMode`` member.

    Returns ``None`` when ``visqol_py`` could not be imported; otherwise
    ``ViSQOLMode.SPEECH`` for ``"speech"`` and ``ViSQOLMode.AUDIO`` for any
    other value.
    """
    if HAS_VISQOL:
        if mode_str == "speech":
            return ViSQOLMode.SPEECH
        return ViSQOLMode.AUDIO
    return None


def get_binary_size(path):
    """Return the size of ``path`` in bytes, or 0 when it does not exist."""
    return os.path.getsize(path) if os.path.exists(path) else 0


def get_md5(path):
    """Return the hex MD5 digest of the file at ``path``.

    A missing file yields an empty string. The file is read in 4096-byte
    blocks so it is never held in memory as a whole.
    """
    if os.path.exists(path):
        digest = hashlib.md5()
        with open(path, "rb") as stream:
            while True:
                block = stream.read(4096)
                if not block:
                    break
                digest.update(block)
        return digest.hexdigest()
    return ""
def run_visqol(visqol, ref_wav, deg_wav):
    """Run ViSQOL via provided API instance and return MOS score.

    Args:
        visqol: Object exposing ``measure(ref, deg)``, or ``None`` to skip.
        ref_wav: Path to the reference WAV file.
        deg_wav: Path to the degraded (encoded then decoded) WAV file.

    Returns:
        ``result.moslqo`` converted to ``float``, or ``None`` when no instance
        was supplied or the measurement raised.
    """
    if visqol is None:
        return None
    try:
        result = visqol.measure(ref_wav, deg_wav)
        return float(result.moslqo)
    except Exception as e:
        # Best effort: a failed measurement is reported as a missing score.
        print(f" ViSQOL API error: {e}")
        return None


# Process-local storage for ViSQOL instances
_process_visqol_instances = {}


def get_process_visqol(mode_str):
    """Return this process's cached ViSQOL instance for ``mode_str``.

    The instance is created on first use and stored in
    ``_process_visqol_instances``. A failed construction is cached as ``None``
    so it is not retried for every sample. Returns ``None`` when ``visqol_py``
    is unavailable.
    """
    if not HAS_VISQOL:
        return None
    if mode_str not in _process_visqol_instances:
        try:
            mode = get_visqol_mode(mode_str)
            _process_visqol_instances[mode_str] = visqol_py.ViSQOL(mode=mode)
        except Exception as e:
            # The replacement field stays on one line: a line break inside
            # the braces of a single-quoted f-string only parses on 3.12+.
            pid = os.getpid()
            print(f" Failed to initialize ViSQOL in process {pid}: {e}")
            _process_visqol_instances[mode_str] = None
    return _process_visqol_instances[mode_str]


def worker_init(cpu_id_queue):
    """Pin the worker process to a specific CPU core for consistent benchmarks.

    Takes one CPU id from ``cpu_id_queue`` and, where the platform provides
    ``os.sched_setaffinity``, restricts the current process to that core.
    A pinning failure is printed and otherwise ignored.
    """
    cpu_id = cpu_id_queue.get()
    if hasattr(os, "sched_setaffinity"):
        try:
            os.sched_setaffinity(0, [cpu_id])
        except Exception as e:
            print(f" Failed to pin process {os.getpid()} to CPU {cpu_id}: {e}")
# Use ffmpeg-python to decode AAC and prepare files for + # ViSQOL + ffmpeg.input(input_path).output( + v_ref, ar=v_rate, ac=v_channels, sample_fmt='s16').run( + quiet=True, overwrite_output=True) + ffmpeg.input(output_path).output( + v_deg, ar=v_rate, ac=v_channels, sample_fmt='s16').run( + quiet=True, overwrite_output=True) + + if os.path.exists(v_ref) and os.path.exists(v_deg): + visqol = get_process_visqol(cfg["mode"]) + mos = run_visqol(visqol, v_ref, v_deg) + except ffmpeg.Error as e: + print( + f" FFmpeg error for {sample}: { + e.stderr.decode() if e.stderr else e}") + + return key, { + "mos": mos, + "size": aac_size, + "bitrate": actual_bitrate, + "bitrate_target": cfg.get("bitrate"), + "time": t_duration, + "md5": get_md5(output_path), + "thresh": cfg["thresh"], + "scenario": name, + "filename": sample + } + except Exception as e: + print(f" failed: {e}") + return None + + +def run_benchmark( + faac_bin_path, + lib_path, + precision, + coverage=100, + run_perceptual=True): + env = os.environ.copy() + + os.makedirs(OUTPUT_DIR, exist_ok=True) + results = { + "matrix": {}, + "throughput": {}, + "lib_size": get_binary_size(lib_path) + } + + if run_perceptual: + print(f"Starting perceptual benchmark for {precision}...") + # Detect number of CPUs for parallelization + num_cpus = os.cpu_count() or 1 + print(f"Parallelizing across {num_cpus} threads.") + + for name, cfg in SCENARIOS.items(): + data_subdir = "speech" if cfg["mode"] == "speech" else "audio" + data_dir = os.path.join(EXTERNAL_DATA_DIR, data_subdir) + if not os.path.exists(data_dir): + print( + f" [Scenario: {name}] Data directory {data_dir} not found, skipping.") + continue + + all_samples = sorted( + [f for f in os.listdir(data_dir) if f.endswith(".wav")]) + num_to_run = max(1, int(len(all_samples) * coverage / 100.0)) + step = len(all_samples) / num_to_run if num_to_run > 0 else 1 + samples = [all_samples[int(i * step)] for i in range(num_to_run)] + + print( + f" [Scenario: {name}] Processing { + 
len(samples)} samples (coverage {coverage}%)...") + + # Pin each process to a unique CPU core + manager = multiprocessing.Manager() + cpu_id_queue = manager.Queue() + for cpu_id in range(num_cpus): + cpu_id_queue.put(cpu_id) + + with concurrent.futures.ProcessPoolExecutor( + max_workers=num_cpus, + initializer=worker_init, + initargs=(cpu_id_queue,) + ) as executor: + futures = { + executor.submit( + process_sample, + faac_bin_path, + name, + cfg, + sample, + data_dir, + precision, + env): sample for sample in samples} + for i, future in enumerate( + concurrent.futures.as_completed(futures)): + result = future.result() + if result: + key, data = result + results["matrix"][key] = data + mos_str = f"{ + data['mos']:.2f}" if data['mos'] is not None else "N/A" + print( + f" ({i + 1}/{len(samples)}) {data['filename']} done. (MOS: {mos_str})") + + print(f"Measuring throughput for {precision}...") + # Pin current process to a single core for accurate throughput measurement + if hasattr(os, "sched_setaffinity"): + try: + os.sched_setaffinity(0, [0]) + except BaseException: + pass + + tp_dir = os.path.join(EXTERNAL_DATA_DIR, "throughput") + if os.path.exists(tp_dir): + tp_samples = sorted( + [f for f in os.listdir(tp_dir) if f.endswith(".wav")]) + if tp_samples: + overall_durations = [] + for sample in tp_samples: + input_path = os.path.join(tp_dir, sample) + output_path = os.path.join( + OUTPUT_DIR, f"tp_{sample}_{precision}.aac") + + print(f" Benchmarking throughput with {sample}...") + try: + # Warmup + subprocess.run([faac_bin_path, + "-o", + output_path, + input_path], + env=env, + check=True, + capture_output=True) + + # Multiple runs to average noise + durations = [] + for _ in range(3): + start_time = time.perf_counter() + subprocess.run([faac_bin_path, + "-o", + output_path, + input_path], + env=env, + check=True, + capture_output=True) + durations.append(time.perf_counter() - start_time) + + avg_dur = sum(durations) / len(durations) + results["throughput"][sample] 
= avg_dur + overall_durations.append(avg_dur) + except BaseException as e: + print(f" Throughput benchmark failed for {sample}: {e}") + pass + + if overall_durations: + results["throughput"]["overall"] = sum( + overall_durations) / len(overall_durations) + + return results + + +if __name__ == "__main__": + if len(sys.argv) < 5: + print( + "Usage: python3 tests/run_benchmark.py [--skip-mos] [--coverage 100]") + sys.exit(1) + + do_perc = "--skip-mos" not in sys.argv + coverage = 100 + if "--coverage" in sys.argv: + idx = sys.argv.index("--coverage") + coverage = int(sys.argv[idx + 1]) + + data = run_benchmark( + sys.argv[1], + sys.argv[2], + sys.argv[3], + coverage=coverage, + run_perceptual=do_perc) + + # Ensure results directory exists + output_json = os.path.abspath(sys.argv[4]) + os.makedirs(os.path.dirname(output_json), exist_ok=True) + with open(output_json, "w") as f: + json.dump(data, f, indent=2) diff --git a/tests/setup_datasets.py b/tests/setup_datasets.py new file mode 100644 index 00000000..735ce3b3 --- /dev/null +++ b/tests/setup_datasets.py @@ -0,0 +1,261 @@ +""" + * FAAC Benchmark Suite + * Copyright (C) 2026 Nils Schimmelmann + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +""" + +import os +import urllib.request +import zipfile +import shutil +import wave +import re +import ffmpeg + +DATASETS = { + "PMLT2014": { + "url": "https://github.com/nschimme/PMLT2014/archive/refs/tags/PMLT2014.zip", + "name": "Public Multiformat Listening Test @ 96 kbps (July 2014)" + }, + "TCD-VOIP": { + "url": "https://github.com/nschimme/TCD-VOIP/archive/refs/tags/harte2015tcd.zip", + "name": "TCD-VoIP (Sigmedia-VoIP) Listener Test Database" + }, + "SoundExpert": { + "url": "https://github.com/nschimme/SoundExpert/archive/refs/tags/SoundExpert.zip", + "name": "SoundExpert Sound samples" + } +} + +# Paths relative to script directory +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +BASE_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external") +TEMP_DIR = os.path.join(SCRIPT_DIR, "data", "temp") + + +def download_and_extract(name, url): + os.makedirs(TEMP_DIR, exist_ok=True) + zip_path = os.path.join(TEMP_DIR, f"{name}.zip") + if not os.path.exists(zip_path): + print(f"Downloading {name}...") + urllib.request.urlretrieve(url, zip_path) + + print(f"Extracting {name}...") + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(TEMP_DIR) + + +def get_info(wav_path): + try: + with wave.open(wav_path, 'rb') as f: + frames = f.getnframes() + rate = f.getframerate() + channels = f.getnchannels() + return frames / float(rate), channels + except BaseException: + return 0, 2 + + +def resample( + input_path, + output_path, + rate, + channels, + start=None, + duration=None, + loop=False): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + try: + input_args = {} + output_args = {} + + if loop: + # Loop input indefinitely, then trim to requested duration + input_args['stream_loop'] = -1 + + if start is not None: + output_args['ss'] = start + if duration is not None: + output_args['t'] = duration + + (ffmpeg .input(input_path, + 
**input_args) .output(output_path, + ar=rate, + ac=channels, + sample_fmt='s16', + **output_args) .run(quiet=True, + overwrite_output=True)) + except ffmpeg.Error as e: + print( + f" FFmpeg error during setup: { + e.stderr.decode() if e.stderr else e}") + + +def get_tier_params(dur): + """ + Determine resampling parameters based on ViSQOL recommendations (5-10s). + 1. < 5s: loop to 5s + 2. 5-10s: use full sample + 3. > 10s: trim to 10s center segment + """ + if dur < 5.0: + return 0, 5, True + if dur <= 10.0: + return None, None, False + return (dur - 10) / 2, 10, False + + +def setup_pmlt(): + dataset_info = DATASETS["PMLT2014"] + src_dir = os.path.join(TEMP_DIR, "PMLT2014-PMLT2014") + dest_dir = os.path.join(BASE_DATA_DIR, "audio") + + wav_files = [] + for root, dirs, files in os.walk(src_dir): + for f in files: + if f.endswith("48k.wav") and not re.search(r"48k\.\d+\.wav$", f): + wav_files.append(os.path.join(root, f)) + + print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") + for i, wav in enumerate(wav_files): + print(f" [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") + dur, chans = get_info(wav) + start, duration, loop = get_tier_params(dur) + + filename = os.path.basename(wav) + output = os.path.join(dest_dir, filename) + resample( + wav, + output, + 48000, + chans, + start=start, + duration=duration, + loop=loop) + + +def setup_tcd_voip(): + dataset_info = DATASETS["TCD-VOIP"] + src_dir = os.path.join(TEMP_DIR, "TCD-VOIP-harte2015tcd") + dest_dir = os.path.join(BASE_DATA_DIR, "speech") + + wav_files = [] + for root, dirs, files in os.walk(src_dir): + # Do not use any wave files if they're in a "ref" folder + if "ref" in root.split(os.sep): + continue + + for f in files: + if f.endswith(".wav") and ("Test Set" in root or "chop" in root): + wav_files.append(os.path.join(root, f)) + + print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") + for i, wav in enumerate(wav_files): + print(f" [{i + 
1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") + dur, chans = get_info(wav) + start, duration, loop = get_tier_params(dur) + + filename = os.path.basename(wav) + output = os.path.join(dest_dir, filename) + # ViSQOL speech mode requires 16k mono + resample( + wav, + output, + 16000, + 1, + start=start, + duration=duration, + loop=loop) + + +def setup_soundexpert(): + dataset_info = DATASETS["SoundExpert"] + src_dir = os.path.join(TEMP_DIR, "SoundExpert-SoundExpert") + dest_dir = os.path.join(BASE_DATA_DIR, "audio") + + wav_files = [] + for root, dirs, files in os.walk(src_dir): + for f in files: + if f.endswith(".wav"): + wav_files.append(os.path.join(root, f)) + + print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") + for i, wav in enumerate(wav_files): + print(f" [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") + dur, chans = get_info(wav) + start, duration, loop = get_tier_params(dur) + + filename = os.path.basename(wav) + output = os.path.join(dest_dir, filename) + resample( + wav, + output, + 48000, + chans, + start=start, + duration=duration, + loop=loop) + + +def setup_throughput_signals(): + """Generate 10-minute test signals for throughput measurement.""" + dest_dir = os.path.join(BASE_DATA_DIR, "throughput") + os.makedirs(dest_dir, exist_ok=True) + + signals = { + "sine": "sine=f=440:d=600", + "sweep": "aevalsrc='sin(2*PI*(100+(20000-100)/(2*600)*t)*t)':d=600", + "noise": "anoisesrc=d=600", + "silence": "anullsrc=d=600" + } + + print(f"Generating 10-minute throughput signals...") + for name, filter_str in signals.items(): + output_path = os.path.join(dest_dir, f"{name}.wav") + if not os.path.exists(output_path): + print(f" Generating {name}.wav...") + try: + # Note: aevalsrc is also a lavfi filter + ( + ffmpeg + .input(filter_str, format='lavfi') + .output(output_path, ar=48000, ac=2, sample_fmt='s16') + .run(quiet=True, overwrite_output=True) + ) + except ffmpeg.Error as e: + print( + f" FFmpeg error 
during signal generation: { + e.stderr.decode() if e.stderr else e}") + + +if __name__ == "__main__": + if not os.path.exists(BASE_DATA_DIR): + for name, info in DATASETS.items(): + download_and_extract(name, info["url"]) + + setup_pmlt() + setup_tcd_voip() + setup_soundexpert() + setup_throughput_signals() + + if os.path.exists(TEMP_DIR): + shutil.rmtree(TEMP_DIR) + else: + # Always check for throughput signals as they are vital for stable + # metrics + setup_throughput_signals() + print("Datasets already setup.") + print("Done.") From ce9c4a4f784c5864d3886a4feb828cdc8fb4d4b9 Mon Sep 17 00:00:00 2001 From: Nils Schimmelmann Date: Wed, 4 Mar 2026 21:13:40 -0600 Subject: [PATCH 2/5] Revert "CI: add FAAC Benchmark Suite to test quality" This reverts commit 1866c4492e4e0021a0af28568b66933118b19c4f. --- .github/workflows/benchmark.yml | 180 ----------- .gitignore | 7 - tests/README.md | 90 ------ tests/compare_results.py | 519 -------------------------------- tests/requirements.txt | 4 - tests/run_benchmark.py | 364 ---------------------- tests/setup_datasets.py | 261 ---------------- 7 files changed, 1425 deletions(-) delete mode 100644 .github/workflows/benchmark.yml delete mode 100644 tests/README.md delete mode 100644 tests/compare_results.py delete mode 100644 tests/requirements.txt delete mode 100644 tests/run_benchmark.py delete mode 100644 tests/setup_datasets.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index 4c2ad7e5..00000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,180 +0,0 @@ -name: Benchmark - -on: - pull_request: - branches: [ "master" ] - paths: - - "libfaac/**" - - "tests/**" - -jobs: - benchmark: - # NOTE: ViSQOL via visqol-py is currently most reliable on ubuntu-22.04. 
- name: ${{ matrix.arch }} / ${{ matrix.precision }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - arch: [amd64] - precision: [single, double] - include: - - arch: amd64 - os: ubuntu-22.04 - - steps: - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y meson ninja-build bc ffmpeg - - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - cache-dependency-path: 'tests/requirements.txt' - - - name: Install Python dependencies - run: | - pip install --upgrade pip setuptools wheel - pip install -r tests/requirements.txt - - - name: Restore Datasets - id: cache-datasets - uses: actions/cache/restore@v4 - with: - path: tests/data/external - key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }} - - - name: Setup Datasets - if: steps.cache-datasets.outputs.cache-hit != 'true' - run: | - python3 tests/setup_datasets.py - - - name: Save Datasets - if: steps.cache-datasets.outputs.cache-hit != 'true' - uses: actions/cache/save@v4 - with: - path: tests/data/external - key: ${{ runner.os }}-datasets-${{ hashFiles('tests/setup_datasets.py') }} - - - name: Determine Baseline SHA - id: baseline-sha - run: | - git checkout ${{ github.base_ref || 'master' }} - echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT - git checkout ${{ github.sha }} - - - name: Restore Baseline Results - id: cache-baseline - uses: actions/cache/restore@v4 - with: - path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json - key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }} - - - name: Run Benchmark (Baseline) - if: steps.cache-baseline.outputs.cache-hit != 'true' - run: | - git checkout ${{ steps.baseline-sha.outputs.sha }} - meson setup build_base -Dfloating-point=${{ matrix.precision }} 
--buildtype=release - ninja -C build_base - LIB_PATH="build_base/libfaac/libfaac.so" - FAAC_PATH="build_base/frontend/faac" - # Restore benchmark scripts and config from PR branch to ensure consistent comparison logic - git checkout ${{ github.sha }} -- tests/ - python3 tests/run_benchmark.py $FAAC_PATH $LIB_PATH "${{ matrix.arch }}_${{ matrix.precision }}_base" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json" --coverage 100 - - - name: Save Baseline Results - if: always() && steps.cache-baseline.outputs.cache-hit != 'true' - uses: actions/cache/save@v4 - with: - path: tests/results/${{ matrix.arch }}_${{ matrix.precision }}_base.json - key: ${{ runner.os }}-baseline-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }}-${{ hashFiles('tests/*.py', 'tests/requirements.txt') }} - - - name: Run Benchmark (Candidate) - run: | - git checkout ${{ github.sha }} - mkdir -p tests/results - meson setup build_cand -Dfloating-point=${{ matrix.precision }} --buildtype=release - ninja -C build_cand - LIB_PATH="build_cand/libfaac/libfaac.so" - FAAC_PATH="build_cand/frontend/faac" - python3 tests/run_benchmark.py $FAAC_PATH $LIB_PATH "${{ matrix.arch }}_${{ matrix.precision }}_cand" "tests/results/${{ matrix.arch }}_${{ matrix.precision }}_cand.json" --coverage 100 - - - name: Upload Results - uses: actions/upload-artifact@v4 - with: - name: results-${{ matrix.arch }}-${{ matrix.precision }} - path: tests/results/*.json - - report: - name: Consolidated Report - needs: benchmark - runs-on: ubuntu-latest - env: - BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }} - CAND_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - if: always() - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Download all results - uses: actions/download-artifact@v4 - with: - path: tests/results - pattern: results-* - merge-multiple: true - - - name: Generate Report - id: generate - run: | - # Summary report for PR comment 
(high-signal only) - python3 tests/compare_results.py tests/results --summary-only --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-summary.md || echo "REGRESSION_DETECTED=1" >> $GITHUB_ENV - # Full report for artifact (all details) - python3 tests/compare_results.py tests/results --base-sha "${{ env.BASE_SHA }}" --cand-sha "${{ env.CAND_SHA }}" > report-full.md || true - if [ ! -s report-summary.md ]; then - echo "Error: report-summary.md is empty" - exit 1 - fi - cat report-summary.md - - - name: Upload Full Report - uses: actions/upload-artifact@v4 - with: - name: benchmark-report-full - path: report-full.md - - - name: PR Feedback - if: always() && github.event_name == 'pull_request' - continue-on-error: true - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - if (fs.existsSync('report-summary.md')) { - let report = fs.readFileSync('report-summary.md', 'utf8').trim(); - if (report.length > 0) { - const jobUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`; - const readmeUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/blob/${process.env.GITHUB_SHA}/tests/README.md`; - report += `\n\n---\n[View Detailed Job Log and Full Report](${jobUrl}) | [What Is This?](${readmeUrl})`; - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: report - }) - } - } - - - name: Check for Regressions - run: | - if [ "${{ env.REGRESSION_DETECTED }}" == "1" ]; then - echo "Regressions or missing data detected. Failing job." 
- exit 1 - fi diff --git a/.gitignore b/.gitignore index ec9b9a11..1e20e33b 100644 --- a/.gitignore +++ b/.gitignore @@ -38,10 +38,3 @@ Makefile* *.user /libfaac/win32_ver.h /libfaac/faac.pc -.DS_Store -/build*/ -/venv/ -/tests/__pycache__/ -/tests/data/external/ -/tests/output/ -/tests/results/ diff --git a/tests/README.md b/tests/README.md deleted file mode 100644 index d47c7104..00000000 --- a/tests/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# FAAC Benchmark Suite - -FAAC is the high-efficiency encoder for the resource-constrained world. From hobbyist projects to professional surveillance (VSS) and embedded VoIP, we prioritize performance where every cycle and byte matters. - -This suite provides the objective data necessary to ensure that every change moves us closer to our Northstar: the optimal balance of quality, speed, and size. - ---- - -## The "Golden Triangle" Philosophy - -We evaluate every contribution against three competing pillars. While high-bitrate encoders like FDK-AAC or Opus target multi-channel, high-fidelity entertainment, FAAC focuses on remaining approachable and distributable for the global open-source community. We prioritize non-patent encumbered areas and the standard Low Complexity (LC-AAC) profile. - -1. **Audio Fidelity**: We target transparent audio quality for our bitrates. We use objective metrics like ViSQOL (MOS) to ensure psychoacoustic improvements truly benefit the listener without introducing "metallic" ringing or "underwater" artifacts. -2. **Computational Efficiency**: FAAC must remain fast. We optimize for low-power cores where encoding speed is a critical requirement. Every CPU cycle saved is a win for our users. -3. **Minimal Footprint**: Binary size is a feature. We ensure the library remains small enough to fit within restrictive embedded firmware. 
- ---- - -## Benchmarking Scenarios - -| Scenario | Mode | Source | Config | Project Goal | -| :--- | :--- | :--- | :--- | :--- | -| **VoIP** | Speech (16k) | TCD-VOIP | `-b 16` | Clear communication at low bitrates (16kbps). | -| **VSS** | Speech (16k) | TCD-VOIP | `-b 40` | High-fidelity Video Surveillance Systems recording (40kbps). | -| **Music** | Audio (48k) | PMLT / SoundExpert | `-b 64-256` | Full-range transparency for storage & streaming. | -| **Throughput** | Efficiency | Synthetic Signals | Default | Stability test using 10-minute Sine/Sweep/Noise/Silence. | - ---- - -## Metric Definitions - -| Metric | Definition | Reference | -| :--- | :--- | :--- | -| **MOS** | Mean Opinion Score (LQO). Predicted perceptual quality from 1.0 (Bad) to 5.0 (Excellent), computed via the **ViSQOL** model. | [ITU-T P.800](https://www.itu.int/rec/T-REC-P.800), [ViSQOL](https://github.com/google/visqol) | -| **Regressions** | Critical failure or a drop in MOS ≥ 0.1 compared to the baseline commit. Significant throughput drops (>10%) or increased binary size also warrant review. | | -| **Significant Win** | An improvement in MOS ≥ 0.1 compared to the baseline commit. | | -| **Consistency** | Percentage of test cases where bitstreams are MD5-identical to the baseline. | | -| **Throughput** | Normalized encoding speed improvement against baseline. Higher % indicates faster execution. | | -| **Library Size** | Binary footprint of `libfaac.so`. Delta measured against baseline. Critical for embedded VSS/IoT targets. | | -| **Bitrate Δ** | Percentage change in generated file size against baseline. Relative shift in bits used for the same target. | | -| **Bitrate Accuracy** | The closeness of the achieved bitrate to the specified target (ABR mode). Measures the encoder's ability to respect the user-defined bitrate budget. 
| | - ---- - -## Dataset Sources - -We are grateful to the following projects for providing high-quality research material: - -* **TCD-VoIP (Sigmedia-VoIP)**: [Listener Test Database](https://www.sigmedia.tv/datasets/tcd_voip_ltd/) - Specifically designed for assessing quality in VoIP applications. -* **PMLT2014**: [Public Multiformat Listening Test](https://listening-test.coresv.net/) - A community-defined comprehensive multi-codec benchmark. -* **SoundExpert**: [Sound Samples](https://soundexpert.org/sound-samples) - High-precision EBU SQAM CD excerpts for transparency testing. - ---- - -## Quick Start - -### 1. Install Dependencies -```bash -# System (Ubuntu/Debian) -sudo apt-get update && sudo apt-get install -y meson ninja-build bc ffmpeg - -# Python -python3 -m venv venv -source venv/bin/activate -pip install -r tests/requirements.txt -``` - -### 2. Prepare Datasets -Downloads samples and generates 10-minute synthetic throughput signals (Sine, Sweep, Noise, Silence). -```bash -python3 tests/setup_datasets.py -``` - -### 3. Run a Benchmark -Perceptual analysis and full test suite coverage are enabled by default. Use `--skip-mos` or `--coverage 10` for faster iteration during local development. -```bash -python3 tests/run_benchmark.py build/frontend/faac build/libfaac/libfaac.so my_run tests/results/my_run.json -``` - -### 4. Compare Results -Generate a high-signal summary comparing your candidate against a baseline. -```bash -python3 tests/compare_results.py tests/results/ -``` - -## Who This Suite Helps - -* **Maintainers**: Provides the confidence to merge PRs by proving that a change improves the encoder—or at least doesn't cause a regression. -* **Developers**: Offers standardized, automated feedback during implementation. -* **Users**: Ensures that every new version of FAAC remains a reliable choice for their critical firmware and communication projects. 
diff --git a/tests/compare_results.py b/tests/compare_results.py deleted file mode 100644 index edae0175..00000000 --- a/tests/compare_results.py +++ /dev/null @@ -1,519 +0,0 @@ -""" - * FAAC Benchmark Suite - * Copyright (C) 2026 Nils Schimmelmann - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program. If not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -""" - -import json -import sys -import os -from collections import defaultdict - - -def analyze_pair(base_file, cand_file): - try: - with open(base_file, "r") as f: - base = json.load(f) - except Exception as e: - sys.stderr.write( - f" Warning: Could not load baseline file {base_file}: {e}\n") - base = {} - - try: - with open(cand_file, "r") as f: - cand = json.load(f) - except Exception as e: - sys.stderr.write( - f" Error: Could not load candidate file {cand_file}: {e}\n") - return None - - suite_results = { - "has_regression": False, - "missing_data": False, - "mos_delta_sum": 0, - "mos_count": 0, - "missing_mos_count": 0, - "tp_reduction": 0, - "lib_size_chg": 0, - "bitrate_chg_sum": 0, - "bitrate_count": 0, - "bitrate_acc_sum": 0, - "bitrate_acc_count": 0, - "regressions": [], - "new_wins": [], - "significant_wins": [], - "opportunities": [], - "bit_exact_count": 0, - "total_cases": 0, - "all_cases": [], - "scenario_stats": defaultdict( - lambda: { - "tp_sum_cand": 0, - "tp_sum_base": 0, - 
"count": 0}), - "base_tp": base.get("throughput", {}), - "cand_tp": cand.get("throughput", {})} - - base_m = base.get("matrix", {}) - cand_m = cand.get("matrix", {}) - - if cand_m: - suite_results["total_cases"] = len(cand_m) - for k in sorted(cand_m.keys()): - o = cand_m[k] - b = base_m.get(k, {}) - - filename = o.get("filename", k) - scenario = o.get("scenario", "") - display_name = f"{scenario}: {filename}" - - o_mos = o.get("mos") - b_mos = b.get("mos") - thresh = o.get("thresh", 1.0) - - o_size = o.get("size") - b_size = b.get("size") - - o_bitrate = o.get("bitrate") - o_target = o.get("bitrate_target") - - if o_bitrate is not None and o_target is not None and o_target > 0: - acc = (1.0 - abs(o_bitrate - o_target) / o_target) * 100 - suite_results["bitrate_acc_sum"] += acc - suite_results["bitrate_acc_count"] += 1 - - o_time = o.get("time") - b_time = b.get("time") - - if o_time is not None and b_time is not None and b_time > 0: - suite_results["scenario_stats"][scenario]["tp_sum_cand"] += o_time - suite_results["scenario_stats"][scenario]["tp_sum_base"] += b_time - suite_results["scenario_stats"][scenario]["count"] += 1 - - o_md5 = o.get("md5", "") - b_md5 = b.get("md5", "") - - if o_md5 and b_md5 and o_md5 == b_md5: - suite_results["bit_exact_count"] += 1 - - size_chg = "N/A" - if o_size is not None and b_size is not None: - size_chg_val = (o_size - b_size) / b_size * 100 - size_chg = f"{size_chg_val:+.2f}%" - suite_results["bitrate_chg_sum"] += size_chg_val - suite_results["bitrate_count"] += 1 - elif o_size is None: - suite_results["missing_data"] = True - - status = "✅" - delta = 0 - if o_mos is not None: - if b_mos is not None: - delta = o_mos - b_mos - suite_results["mos_delta_sum"] += delta - suite_results["mos_count"] += 1 - - if o_mos < (thresh - 0.5): - status = "🤮" # Awful - elif o_mos < thresh: - status = "📉" # Bad/Poor - - if b_mos is not None: - if (o_mos - b_mos) < -0.1: - status = "❌" # Regression - suite_results["has_regression"] = True - 
elif (o_mos - b_mos) > 0.1: - status = "🌟" # Significant Win - - # Check for New Win (Baseline failed, Candidate passed) - if b_mos is not None and b_mos < thresh and o_mos >= thresh: - suite_results["new_wins"].append({ - "display_name": display_name, - "mos": o_mos, - "b_mos": b_mos, - "delta": delta - }) - else: - status = "❌" # Missing MOS is a failure - suite_results["missing_mos_count"] += 1 - suite_results["has_regression"] = True - suite_results["missing_data"] = True - delta = -10.0 # Force to top of regressions - - mos_str = f"{o_mos:.2f}" if o_mos is not None else "N/A" - b_mos_str = f"{b_mos:.2f}" if b_mos is not None else "N/A" - delta_mos = f"{(o_mos - b_mos):+.2f}" if ( - o_mos is not None and b_mos is not None) else "N/A" - - case_data = { - "display_name": display_name, - "status": status, - "mos": o_mos, - "b_mos": b_mos, - "delta": delta, - "size_chg": size_chg, - "line": f"| {display_name} | {status} | {mos_str} ({b_mos_str}) | {delta_mos} | {size_chg} |" - } - - suite_results["all_cases"].append(case_data) - if status == "❌": - suite_results["regressions"].append(case_data) - elif status == "🌟": - suite_results["significant_wins"].append(case_data) - elif status in ["🤮", "📉"]: - suite_results["opportunities"].append(case_data) - else: - suite_results["missing_data"] = True - - # Sorts - suite_results["regressions"].sort(key=lambda x: x["delta"]) - suite_results["new_wins"].sort(key=lambda x: x["delta"], reverse=True) - suite_results["significant_wins"].sort( - key=lambda x: x["delta"], reverse=True) - suite_results["opportunities"].sort( - key=lambda x: x["mos"] if x["mos"] is not None else 6.0) - - # Throughput - base_tp = base.get("throughput", {}) - cand_tp = cand.get("throughput", {}) - # Exclude "overall" to avoid double-counting in manual summation - total_base_t = sum(v for k, v in base_tp.items() if k != "overall") - total_cand_t = sum(v for k, v in cand_tp.items() if k != "overall") - if total_cand_t > 0 and total_base_t > 0: - 
suite_results["tp_reduction"] = (1 - total_cand_t / total_base_t) * 100 - else: - # If overall throughput is missing, try to aggregate from scenarios - cand_t_sum = sum(s["tp_sum_cand"] - for s in suite_results["scenario_stats"].values()) - base_t_sum = sum(s["tp_sum_base"] - for s in suite_results["scenario_stats"].values()) - if cand_t_sum > 0 and base_t_sum > 0: - suite_results["tp_reduction"] = (1 - cand_t_sum / base_t_sum) * 100 - else: - suite_results["missing_data"] = True - - # Binary Size - base_lib = base.get("lib_size", 0) - cand_lib = cand.get("lib_size", 0) - if cand_lib > 0 and base_lib > 0: - suite_results["lib_size_chg"] = ((cand_lib / base_lib) - 1) * 100 - else: - suite_results["missing_data"] = True - - return suite_results - - -def main(): - SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) - - summary_only = "--summary-only" in sys.argv - if summary_only: - sys.argv.remove("--summary-only") - - base_sha = None - if "--base-sha" in sys.argv: - idx = sys.argv.index("--base-sha") - base_sha = sys.argv[idx + 1] - sys.argv.pop(idx + 1) - sys.argv.pop(idx) - - cand_sha = None - if "--cand-sha" in sys.argv: - idx = sys.argv.index("--cand-sha") - cand_sha = sys.argv[idx + 1] - sys.argv.pop(idx + 1) - sys.argv.pop(idx) - - results_dir = sys.argv[1] if len( - sys.argv) > 1 else os.path.join( - SCRIPT_DIR, "results") - - if not os.path.exists(results_dir): - sys.exit(1) - - files = os.listdir(results_dir) - - suites = {} - for f in files: - if f.endswith("_cand.json"): - suite_name = f[:-10] - base_f = suite_name + "_base.json" - if base_f in files: - suites[suite_name] = ( - os.path.join( - results_dir, base_f), os.path.join( - results_dir, f)) - - if not suites: - sys.stderr.write("No result pairs found in directory.\n") - sys.exit(1) - - all_suite_data = {} - overall_regression = False - overall_missing = False - total_mos_delta = 0 - total_mos_count = 0 - total_missing_mos = 0 - total_tp_reduction = 0 - total_lib_chg = 0 - total_bitrate_chg = 0 
- total_bitrate_count = 0 - total_bitrate_acc_sum = 0 - total_bitrate_acc_count = 0 - - total_regressions = 0 - total_new_wins = 0 - total_significant_wins = 0 - total_bit_exact = 0 - total_cases_all = 0 - - # For worst-case scenario throughput - scenario_tp_deltas = [] - - for name, (base, cand) in sorted(suites.items()): - data = analyze_pair(base, cand) - if data: - all_suite_data[name] = data - if data["has_regression"]: - overall_regression = True - if data["missing_data"]: - overall_missing = True - total_mos_delta += data["mos_delta_sum"] - total_mos_count += data["mos_count"] - total_missing_mos += data["missing_mos_count"] - total_tp_reduction += data["tp_reduction"] - total_lib_chg += data["lib_size_chg"] - total_bitrate_chg += data["bitrate_chg_sum"] - total_bitrate_count += data["bitrate_count"] - total_bitrate_acc_sum += data["bitrate_acc_sum"] - total_bitrate_acc_count += data["bitrate_acc_count"] - - total_regressions += len(data["regressions"]) - total_new_wins += len(data["new_wins"]) - total_significant_wins += len(data["significant_wins"]) - total_bit_exact += data["bit_exact_count"] - total_cases_all += data["total_cases"] - - for sc_name, sc_data in data["scenario_stats"].items(): - if sc_data["tp_sum_base"] > 0: - delta = (1 - sc_data["tp_sum_cand"] / - sc_data["tp_sum_base"]) * 100 - scenario_tp_deltas.append((f"{name} / {sc_name}", delta)) - - avg_mos_delta_str = f"{(total_mos_delta / - total_mos_count):+.3f}" if total_mos_count > 0 else "N/A" - avg_tp_reduction = total_tp_reduction / \ - len(all_suite_data) if all_suite_data else 0 - avg_lib_chg = total_lib_chg / len(all_suite_data) if all_suite_data else 0 - avg_bitrate_chg = total_bitrate_chg / \ - total_bitrate_count if total_bitrate_count > 0 else 0 - avg_bitrate_acc = total_bitrate_acc_sum / \ - total_bitrate_acc_count if total_bitrate_acc_count > 0 else 0 - - bit_exact_percent = ( - total_bit_exact / - total_cases_all * - 100) if total_cases_all > 0 else 0 - - # Worst-case throughput 
- worst_tp_scen, worst_tp_delta = (None, 0) - if scenario_tp_deltas: - worst_tp_scen, worst_tp_delta = min( - scenario_tp_deltas, key=lambda x: x[1]) - - report = [] - if overall_regression: - report.append("## ❌ Quality Regression Detected") - elif worst_tp_delta < -5.0: - report.append("## ⚠️ Performance Regression Detected") - elif overall_missing: - report.append("## ❌ Incomplete/Missing Data Detected") - elif bit_exact_percent == 100.0: - report.append("## ✅ Refactor Verified (Bit-Identical)") - elif total_new_wins > 0 or total_significant_wins > 0 or (total_mos_count > 0 and (total_mos_delta / total_mos_count) > 0.01) or avg_tp_reduction > 5: - report.append("## 🚀 Perceptual & Efficiency Improvement") - else: - report.append("## 📊 Benchmark Summary") - - if not summary_only and (base_sha or cand_sha): - report.append("\n### Environment") - if base_sha: - report.append(f"- **Baseline SHA**: `{base_sha}`") - if cand_sha: - report.append(f"- **Candidate SHA**: `{cand_sha}`") - - report.append("\n### Summary") - report.append("| Metric | Value |") - report.append("| :--- | :--- |") - - # Regressions (Always shown) - reg_status = "0 ✅" if total_regressions == 0 else f"{total_regressions} ❌" - report.append(f"| **Regressions** | {reg_status} |") - - # New Wins (Only if baseline < threshold and candidate >= threshold) - if total_new_wins > 0: - report.append(f"| **New Wins** | {total_new_wins} 🆕 |") - - # Significant Wins (MOS delta > 0.1) - if total_significant_wins > 0: - report.append(f"| **Significant Wins** | {total_significant_wins} 🌟 |") - - # Bitstream Consistency (Against baseline) - consist_status = f"{bit_exact_percent:.1f}%" - if bit_exact_percent == 100.0: - consist_status += " (MD5 Match)" - report.append(f"| **Consistency** | {consist_status} |") - - # Throughput - if abs(avg_tp_reduction) > 0.1: - tp_icon = "🚀" if avg_tp_reduction > 1.0 else "📉" if avg_tp_reduction < -1.0 else "" - report.append( - f"| **Throughput (Avg)** | {avg_tp_reduction:+.1f}% 
{tp_icon} |") - - # Per-signal throughput deltas if available - tp_details = [] - if all_suite_data: - first_data = list(all_suite_data.values())[0] - base_tp = first_data.get("base_tp", {}) - cand_tp = first_data.get("cand_tp", {}) - for signal in sorted(cand_tp.keys()): - if signal == "overall": - continue - if signal in base_tp and base_tp[signal] > 0: - delta = (1 - cand_tp[signal] / base_tp[signal]) * 100 - icon = "🚀" if delta > 1.0 else "📉" if delta < -1.0 else "" - tp_details.append( - f"{signal.split('.')[0]}: {delta:+.1f}% {icon}") - - if tp_details: - report.append(f"| **TP Breakdown** | {', '.join(tp_details)} |") - - if worst_tp_delta < -1.0: - report.append( - f"| **Worst-case TP Δ** | {worst_tp_delta:.1f}% ({worst_tp_scen}) ⚠️ |") - - # Binary Size - if abs(avg_lib_chg) > 0.01: - size_icon = "📉" if avg_lib_chg < -0.1 else "📈" if avg_lib_chg > 0.1 else "" - report.append( - f"| **Library Size** | {avg_lib_chg:+.2f}% {size_icon} |") - - - # Bitrate Δ - if abs(avg_bitrate_chg) > 0.1: - bitrate_icon = "📉" if avg_bitrate_chg < - \ - 1.0 else "📈" if avg_bitrate_chg > 1.0 else "" - report.append( - f"| **Bitrate Δ** | {avg_bitrate_chg:+.2f}% {bitrate_icon} |") - - # Bitrate Accuracy - if total_bitrate_acc_count > 0: - acc_icon = "🎯" if avg_bitrate_acc > 95 else "⚠️" if avg_bitrate_acc < 80 else "" - report.append( - f"| **Bitrate Accuracy** | {avg_bitrate_acc:.1f}% {acc_icon} |") - - # Avg MOS Delta - if total_mos_count > 0 and abs(total_mos_delta / total_mos_count) > 0.001: - report.append(f"| **Avg MOS Delta** | {avg_mos_delta_str} |") - - if total_missing_mos > 0: - report.append( - f"\n⚠️ **Warning**: {total_missing_mos} MOS scores were missing/failed (treated as ❌).") - - if not summary_only: - # 1. Collapsible Details: Regressions - if total_regressions > 0: - report.append( - "\n
❌ View Regression Details ({})\n".format(total_regressions)) - for name, data in sorted(all_suite_data.items()): - if data["regressions"]: - report.append(f"\n#### {name}") - report.append( - "| Test Case | Status | MOS (Base) | Delta | Size Δ |") - report.append("| :--- | :---: | :---: | :---: | :---: |") - for r in data["regressions"]: - report.append(r["line"]) - report.append("\n
") - - # 2. Collapsible Additional Details - report.append( - "\n
View Additional Suite Details & Wins\n") - - for name, data in sorted(all_suite_data.items()): - status_icon = "✅" - if data["has_regression"]: - status_icon = "❌" - elif data["missing_data"]: - status_icon = "❌" - - avg_mos_suite = f"{(data['mos_delta_sum'] / - data['mos_count']):+.3f}" if data["mos_count"] > 0 else "N/A" - suite_bit_exact_percent = ( - data["bit_exact_count"] / - data["total_cases"] * - 100) if data["total_cases"] > 0 else 0 - - report.append(f"\n#### {status_icon} {name}") - report.append( - f"- MOS Δ: {avg_mos_suite}, TP Δ: {data['tp_reduction']:+.1f}%, Size Δ: {data['lib_size_chg']:+.2f}%") - report.append( - f"- Bitstream Consistency: {suite_bit_exact_percent:.1f}%") - - if data["new_wins"]: - report.append("\n**🆕 New Wins**") - report.append("| Test Case | MOS (Base) | Delta |") - report.append("| :--- | :---: | :---: |") - for w in data["new_wins"]: - report.append("| {} | {:.2f} ({:.2f}) | {:+.2f} |".format( - w["display_name"], w["mos"], w["b_mos"], w["delta"])) - - if data["significant_wins"]: - report.append("\n**🌟 Significant Wins**") - report.append( - "| Test Case | Status | MOS (Base) | Delta | Size Δ |") - report.append("| :--- | :---: | :---: | :---: | :---: |") - for w in data["significant_wins"]: - report.append(w["line"]) - - if data["opportunities"]: - report.append("\n**💡 Opportunities**") - report.append( - "| Test Case | Status | MOS (Base) | Delta | Size Δ |") - report.append("| :--- | :---: | :---: | :---: | :---: |") - for o in data["opportunities"]: - report.append(o["line"]) - - if data["all_cases"]: - report.append( - f"\n
View all {len(data['all_cases'])} cases for {name}\n") - report.append( - "| Test Case | Status | MOS (Base) | Delta | Size Δ |") - report.append("| :--- | :---: | :---: | :---: | :---: |") - for c in data["all_cases"]: - report.append(c["line"]) - report.append("\n
") - - report.append("\n
") - - output = "\n".join(report) - sys.stdout.write(output + "\n") - - if overall_regression or overall_missing: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/tests/requirements.txt b/tests/requirements.txt deleted file mode 100644 index 2ee1d2a2..00000000 --- a/tests/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy -protobuf==3.20.3 -ffmpeg-python -git+https://github.com/diggerdu/visqol-py.git@452eb5c4f17fd2404f968ec2eeadfcad74925485 diff --git a/tests/run_benchmark.py b/tests/run_benchmark.py deleted file mode 100644 index 232dbb54..00000000 --- a/tests/run_benchmark.py +++ /dev/null @@ -1,364 +0,0 @@ -""" - * FAAC Benchmark Suite - * Copyright (C) 2026 Nils Schimmelmann - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program. 
If not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -""" - -import os -import subprocess -import time -import sys -import json -import tempfile -import hashlib -import concurrent.futures -import multiprocessing - -try: - import visqol_py - from visqol_py import ViSQOLMode - HAS_VISQOL = True -except ImportError: - HAS_VISQOL = False - -try: - import ffmpeg - HAS_FFMPEG = True -except ImportError: - HAS_FFMPEG = False - -# Paths relative to script directory -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -EXTERNAL_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external") -OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output") - -SCENARIOS = { - "voip": { - "mode": "speech", - "rate": 16000, - "visqol_rate": 16000, - "bitrate": 16, - "thresh": 2.5}, - "vss": { - "mode": "speech", - "rate": 16000, - "visqol_rate": 16000, - "bitrate": 40, - "thresh": 3.0}, - "music_low": { - "mode": "audio", - "rate": 48000, - "visqol_rate": 48000, - "bitrate": 64, - "thresh": 3.5}, - "music_std": { - "mode": "audio", - "rate": 48000, - "visqol_rate": 48000, - "bitrate": 128, - "thresh": 4.0}, - "music_high": { - "mode": "audio", - "rate": 48000, - "visqol_rate": 48000, - "bitrate": 256, - "thresh": 4.3}} - - -def get_visqol_mode(mode_str): - if not HAS_VISQOL: - return None - return ViSQOLMode.SPEECH if mode_str == "speech" else ViSQOLMode.AUDIO - - -def get_binary_size(path): - if os.path.exists(path): - return os.path.getsize(path) - return 0 - - -def get_md5(path): - if not os.path.exists(path): - return "" - hash_md5 = hashlib.md5() - with open(path, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - - -def run_visqol(visqol, ref_wav, deg_wav): - """Run ViSQOL via provided API instance and return MOS score.""" - if visqol is None: - return None - try: - result = visqol.measure(ref_wav, deg_wav) - return float(result.moslqo) - except Exception as e: - 
print(f" ViSQOL API error: {e}") - return None - - -# Process-local storage for ViSQOL instances -_process_visqol_instances = {} - - -def get_process_visqol(mode_str): - if not HAS_VISQOL: - return None - if mode_str not in _process_visqol_instances: - try: - mode = get_visqol_mode(mode_str) - _process_visqol_instances[mode_str] = visqol_py.ViSQOL(mode=mode) - except Exception as e: - print( - f" Failed to initialize ViSQOL in process { - os.getpid()}: {e}") - _process_visqol_instances[mode_str] = None - return _process_visqol_instances[mode_str] - - -def worker_init(cpu_id_queue): - """Pin the worker process to a specific CPU core for consistent benchmarks.""" - cpu_id = cpu_id_queue.get() - if hasattr(os, "sched_setaffinity"): - try: - os.sched_setaffinity(0, [cpu_id]) - except Exception as e: - print(f" Failed to pin process {os.getpid()} to CPU {cpu_id}: {e}") - - -def process_sample(faac_bin_path, name, cfg, sample, data_dir, precision, env): - input_path = os.path.join(data_dir, sample) - key = f"{name}_{sample}" - output_path = os.path.join(OUTPUT_DIR, f"{key}_{precision}.aac") - - # Determine encoding parameters - cmd = [faac_bin_path, "-o", output_path, input_path] - cmd.extend(["-b", str(cfg["bitrate"])]) - - try: - t_start = time.time() - subprocess.run(cmd, env=env, check=True, capture_output=True) - t_duration = time.time() - t_start - - mos = None - aac_size = os.path.getsize(output_path) - actual_bitrate = None - - if HAS_FFMPEG: - try: - probe = ffmpeg.probe(input_path) - duration = float(probe['format']['duration']) - if duration > 0: - # kbps = (bytes * 8) / (seconds * 1000) - actual_bitrate = (aac_size * 8) / (duration * 1000) - except Exception as e: - print(f" Failed to probe duration for {sample}: {e}") - - if HAS_FFMPEG: - with tempfile.TemporaryDirectory() as tmpdir: - v_ref = os.path.join(tmpdir, "vref.wav") - v_deg = os.path.join(tmpdir, "vdeg.wav") - v_rate = cfg["visqol_rate"] - v_channels = 1 if cfg["mode"] == "speech" else 2 - - try: - 
# Use ffmpeg-python to decode AAC and prepare files for - # ViSQOL - ffmpeg.input(input_path).output( - v_ref, ar=v_rate, ac=v_channels, sample_fmt='s16').run( - quiet=True, overwrite_output=True) - ffmpeg.input(output_path).output( - v_deg, ar=v_rate, ac=v_channels, sample_fmt='s16').run( - quiet=True, overwrite_output=True) - - if os.path.exists(v_ref) and os.path.exists(v_deg): - visqol = get_process_visqol(cfg["mode"]) - mos = run_visqol(visqol, v_ref, v_deg) - except ffmpeg.Error as e: - print( - f" FFmpeg error for {sample}: { - e.stderr.decode() if e.stderr else e}") - - return key, { - "mos": mos, - "size": aac_size, - "bitrate": actual_bitrate, - "bitrate_target": cfg.get("bitrate"), - "time": t_duration, - "md5": get_md5(output_path), - "thresh": cfg["thresh"], - "scenario": name, - "filename": sample - } - except Exception as e: - print(f" failed: {e}") - return None - - -def run_benchmark( - faac_bin_path, - lib_path, - precision, - coverage=100, - run_perceptual=True): - env = os.environ.copy() - - os.makedirs(OUTPUT_DIR, exist_ok=True) - results = { - "matrix": {}, - "throughput": {}, - "lib_size": get_binary_size(lib_path) - } - - if run_perceptual: - print(f"Starting perceptual benchmark for {precision}...") - # Detect number of CPUs for parallelization - num_cpus = os.cpu_count() or 1 - print(f"Parallelizing across {num_cpus} threads.") - - for name, cfg in SCENARIOS.items(): - data_subdir = "speech" if cfg["mode"] == "speech" else "audio" - data_dir = os.path.join(EXTERNAL_DATA_DIR, data_subdir) - if not os.path.exists(data_dir): - print( - f" [Scenario: {name}] Data directory {data_dir} not found, skipping.") - continue - - all_samples = sorted( - [f for f in os.listdir(data_dir) if f.endswith(".wav")]) - num_to_run = max(1, int(len(all_samples) * coverage / 100.0)) - step = len(all_samples) / num_to_run if num_to_run > 0 else 1 - samples = [all_samples[int(i * step)] for i in range(num_to_run)] - - print( - f" [Scenario: {name}] Processing { - 
len(samples)} samples (coverage {coverage}%)...") - - # Pin each process to a unique CPU core - manager = multiprocessing.Manager() - cpu_id_queue = manager.Queue() - for cpu_id in range(num_cpus): - cpu_id_queue.put(cpu_id) - - with concurrent.futures.ProcessPoolExecutor( - max_workers=num_cpus, - initializer=worker_init, - initargs=(cpu_id_queue,) - ) as executor: - futures = { - executor.submit( - process_sample, - faac_bin_path, - name, - cfg, - sample, - data_dir, - precision, - env): sample for sample in samples} - for i, future in enumerate( - concurrent.futures.as_completed(futures)): - result = future.result() - if result: - key, data = result - results["matrix"][key] = data - mos_str = f"{ - data['mos']:.2f}" if data['mos'] is not None else "N/A" - print( - f" ({i + 1}/{len(samples)}) {data['filename']} done. (MOS: {mos_str})") - - print(f"Measuring throughput for {precision}...") - # Pin current process to a single core for accurate throughput measurement - if hasattr(os, "sched_setaffinity"): - try: - os.sched_setaffinity(0, [0]) - except BaseException: - pass - - tp_dir = os.path.join(EXTERNAL_DATA_DIR, "throughput") - if os.path.exists(tp_dir): - tp_samples = sorted( - [f for f in os.listdir(tp_dir) if f.endswith(".wav")]) - if tp_samples: - overall_durations = [] - for sample in tp_samples: - input_path = os.path.join(tp_dir, sample) - output_path = os.path.join( - OUTPUT_DIR, f"tp_{sample}_{precision}.aac") - - print(f" Benchmarking throughput with {sample}...") - try: - # Warmup - subprocess.run([faac_bin_path, - "-o", - output_path, - input_path], - env=env, - check=True, - capture_output=True) - - # Multiple runs to average noise - durations = [] - for _ in range(3): - start_time = time.perf_counter() - subprocess.run([faac_bin_path, - "-o", - output_path, - input_path], - env=env, - check=True, - capture_output=True) - durations.append(time.perf_counter() - start_time) - - avg_dur = sum(durations) / len(durations) - results["throughput"][sample] 
= avg_dur - overall_durations.append(avg_dur) - except BaseException as e: - print(f" Throughput benchmark failed for {sample}: {e}") - pass - - if overall_durations: - results["throughput"]["overall"] = sum( - overall_durations) / len(overall_durations) - - return results - - -if __name__ == "__main__": - if len(sys.argv) < 5: - print( - "Usage: python3 tests/run_benchmark.py [--skip-mos] [--coverage 100]") - sys.exit(1) - - do_perc = "--skip-mos" not in sys.argv - coverage = 100 - if "--coverage" in sys.argv: - idx = sys.argv.index("--coverage") - coverage = int(sys.argv[idx + 1]) - - data = run_benchmark( - sys.argv[1], - sys.argv[2], - sys.argv[3], - coverage=coverage, - run_perceptual=do_perc) - - # Ensure results directory exists - output_json = os.path.abspath(sys.argv[4]) - os.makedirs(os.path.dirname(output_json), exist_ok=True) - with open(output_json, "w") as f: - json.dump(data, f, indent=2) diff --git a/tests/setup_datasets.py b/tests/setup_datasets.py deleted file mode 100644 index 735ce3b3..00000000 --- a/tests/setup_datasets.py +++ /dev/null @@ -1,261 +0,0 @@ -""" - * FAAC Benchmark Suite - * Copyright (C) 2026 Nils Schimmelmann - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program. 
If not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -""" - -import os -import urllib.request -import zipfile -import shutil -import wave -import re -import ffmpeg - -DATASETS = { - "PMLT2014": { - "url": "https://github.com/nschimme/PMLT2014/archive/refs/tags/PMLT2014.zip", - "name": "Public Multiformat Listening Test @ 96 kbps (July 2014)" - }, - "TCD-VOIP": { - "url": "https://github.com/nschimme/TCD-VOIP/archive/refs/tags/harte2015tcd.zip", - "name": "TCD-VoIP (Sigmedia-VoIP) Listener Test Database" - }, - "SoundExpert": { - "url": "https://github.com/nschimme/SoundExpert/archive/refs/tags/SoundExpert.zip", - "name": "SoundExpert Sound samples" - } -} - -# Paths relative to script directory -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -BASE_DATA_DIR = os.path.join(SCRIPT_DIR, "data", "external") -TEMP_DIR = os.path.join(SCRIPT_DIR, "data", "temp") - - -def download_and_extract(name, url): - os.makedirs(TEMP_DIR, exist_ok=True) - zip_path = os.path.join(TEMP_DIR, f"{name}.zip") - if not os.path.exists(zip_path): - print(f"Downloading {name}...") - urllib.request.urlretrieve(url, zip_path) - - print(f"Extracting {name}...") - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(TEMP_DIR) - - -def get_info(wav_path): - try: - with wave.open(wav_path, 'rb') as f: - frames = f.getnframes() - rate = f.getframerate() - channels = f.getnchannels() - return frames / float(rate), channels - except BaseException: - return 0, 2 - - -def resample( - input_path, - output_path, - rate, - channels, - start=None, - duration=None, - loop=False): - os.makedirs(os.path.dirname(output_path), exist_ok=True) - try: - input_args = {} - output_args = {} - - if loop: - # Loop input indefinitely, then trim to requested duration - input_args['stream_loop'] = -1 - - if start is not None: - output_args['ss'] = start - if duration is not None: - output_args['t'] = duration - - (ffmpeg .input(input_path, - 
**input_args) .output(output_path, - ar=rate, - ac=channels, - sample_fmt='s16', - **output_args) .run(quiet=True, - overwrite_output=True)) - except ffmpeg.Error as e: - print( - f" FFmpeg error during setup: { - e.stderr.decode() if e.stderr else e}") - - -def get_tier_params(dur): - """ - Determine resampling parameters based on ViSQOL recommendations (5-10s). - 1. < 5s: loop to 5s - 2. 5-10s: use full sample - 3. > 10s: trim to 10s center segment - """ - if dur < 5.0: - return 0, 5, True - if dur <= 10.0: - return None, None, False - return (dur - 10) / 2, 10, False - - -def setup_pmlt(): - dataset_info = DATASETS["PMLT2014"] - src_dir = os.path.join(TEMP_DIR, "PMLT2014-PMLT2014") - dest_dir = os.path.join(BASE_DATA_DIR, "audio") - - wav_files = [] - for root, dirs, files in os.walk(src_dir): - for f in files: - if f.endswith("48k.wav") and not re.search(r"48k\.\d+\.wav$", f): - wav_files.append(os.path.join(root, f)) - - print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") - for i, wav in enumerate(wav_files): - print(f" [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") - dur, chans = get_info(wav) - start, duration, loop = get_tier_params(dur) - - filename = os.path.basename(wav) - output = os.path.join(dest_dir, filename) - resample( - wav, - output, - 48000, - chans, - start=start, - duration=duration, - loop=loop) - - -def setup_tcd_voip(): - dataset_info = DATASETS["TCD-VOIP"] - src_dir = os.path.join(TEMP_DIR, "TCD-VOIP-harte2015tcd") - dest_dir = os.path.join(BASE_DATA_DIR, "speech") - - wav_files = [] - for root, dirs, files in os.walk(src_dir): - # Do not use any wave files if they're in a "ref" folder - if "ref" in root.split(os.sep): - continue - - for f in files: - if f.endswith(".wav") and ("Test Set" in root or "chop" in root): - wav_files.append(os.path.join(root, f)) - - print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") - for i, wav in enumerate(wav_files): - print(f" [{i + 
1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") - dur, chans = get_info(wav) - start, duration, loop = get_tier_params(dur) - - filename = os.path.basename(wav) - output = os.path.join(dest_dir, filename) - # ViSQOL speech mode requires 16k mono - resample( - wav, - output, - 16000, - 1, - start=start, - duration=duration, - loop=loop) - - -def setup_soundexpert(): - dataset_info = DATASETS["SoundExpert"] - src_dir = os.path.join(TEMP_DIR, "SoundExpert-SoundExpert") - dest_dir = os.path.join(BASE_DATA_DIR, "audio") - - wav_files = [] - for root, dirs, files in os.walk(src_dir): - for f in files: - if f.endswith(".wav"): - wav_files.append(os.path.join(root, f)) - - print(f"Found {len(wav_files)} valid samples for {dataset_info['name']}.") - for i, wav in enumerate(wav_files): - print(f" [{i + 1}/{len(wav_files)}] Processing {os.path.basename(wav)}...") - dur, chans = get_info(wav) - start, duration, loop = get_tier_params(dur) - - filename = os.path.basename(wav) - output = os.path.join(dest_dir, filename) - resample( - wav, - output, - 48000, - chans, - start=start, - duration=duration, - loop=loop) - - -def setup_throughput_signals(): - """Generate 10-minute test signals for throughput measurement.""" - dest_dir = os.path.join(BASE_DATA_DIR, "throughput") - os.makedirs(dest_dir, exist_ok=True) - - signals = { - "sine": "sine=f=440:d=600", - "sweep": "aevalsrc='sin(2*PI*(100+(20000-100)/(2*600)*t)*t)':d=600", - "noise": "anoisesrc=d=600", - "silence": "anullsrc=d=600" - } - - print(f"Generating 10-minute throughput signals...") - for name, filter_str in signals.items(): - output_path = os.path.join(dest_dir, f"{name}.wav") - if not os.path.exists(output_path): - print(f" Generating {name}.wav...") - try: - # Note: aevalsrc is also a lavfi filter - ( - ffmpeg - .input(filter_str, format='lavfi') - .output(output_path, ar=48000, ac=2, sample_fmt='s16') - .run(quiet=True, overwrite_output=True) - ) - except ffmpeg.Error as e: - print( - f" FFmpeg error 
during signal generation: { - e.stderr.decode() if e.stderr else e}") - - -if __name__ == "__main__": - if not os.path.exists(BASE_DATA_DIR): - for name, info in DATASETS.items(): - download_and_extract(name, info["url"]) - - setup_pmlt() - setup_tcd_voip() - setup_soundexpert() - setup_throughput_signals() - - if os.path.exists(TEMP_DIR): - shutil.rmtree(TEMP_DIR) - else: - # Always check for throughput signals as they are vital for stable - # metrics - setup_throughput_signals() - print("Datasets already setup.") - print("Done.") From aeebe747b8dac4508ee44e46a856e1178cc4142b Mon Sep 17 00:00:00 2001 From: Nils Schimmelmann Date: Wed, 4 Mar 2026 21:15:16 -0600 Subject: [PATCH 3/5] CI: add FAAC Benchmark Suite GitHub Action for automated regression testing --- .github/workflows/benchmark.yml | 136 ++++++++++++++++++++++++++++++++ README | 9 +++ 2 files changed, 145 insertions(+) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..2950bbd6 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,136 @@ +name: Continuous Integration + +on: + push: + branches: [ "master" ] + paths: + - "libfaac/**" + - ".github/workflows/benchmark.yml" + pull_request: + branches: [ "*" ] + paths: + - "libfaac/**" + - ".github/workflows/benchmark.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + benchmark: + name: Benchmark ${{ matrix.arch }} / ${{ matrix.precision }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + arch: [amd64] + precision: [single, double] + + steps: + - name: Install build dependencies + run: | + sudo apt-get update + sudo apt-get install -y meson ninja-build bc ffmpeg + + - name: Checkout Candidate + uses: actions/checkout@v4 + with: + path: candidate + + - name: Build Candidate + run: | + cd candidate + meson setup build_cand -Dfloating-point=${{ matrix.precision }} 
--buildtype=release + ninja -C build_cand + + - name: Determine Baseline SHA + id: baseline-sha + run: | + if [ "${{ github.event_name }}" == "push" ]; then + echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT + else + echo "sha=${{ github.event.pull_request.base.sha }}" >> $GITHUB_OUTPUT + fi + + - name: Restore Baseline Results + id: cache-baseline + uses: actions/cache/restore@v4 + with: + path: results/${{ matrix.arch }}_${{ matrix.precision }}_base.json + key: ${{ runner.os }}-baseline-${{ matrix.arch }}-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }} + + - name: Checkout Baseline + if: steps.cache-baseline.outputs.cache-hit != 'true' + uses: actions/checkout@v4 + with: + ref: ${{ steps.baseline-sha.outputs.sha }} + path: baseline + + - name: Build Baseline + if: steps.cache-baseline.outputs.cache-hit != 'true' + run: | + cd baseline + meson setup build_base -Dfloating-point=${{ matrix.precision }} --buildtype=release + ninja -C build_base + + - name: Run Benchmark (Baseline) + if: steps.cache-baseline.outputs.cache-hit != 'true' + uses: nschimme/faac-benchmark@master + with: + faac-bin: ./baseline/build_base/frontend/faac + libfaac-so: ./baseline/build_base/libfaac/libfaac.so + run-name: ${{ matrix.arch }}_${{ matrix.precision }}_base + output-json: ./results/${{ matrix.arch }}_${{ matrix.precision }}_base.json + visqol-image: ghcr.io/nschimme/faac-benchmark-visqol:latest + + - name: Save Baseline Results + if: success() && steps.cache-baseline.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: results/${{ matrix.arch }}_${{ matrix.precision }}_base.json + key: ${{ runner.os }}-baseline-${{ matrix.arch }}-${{ matrix.precision }}-${{ steps.baseline-sha.outputs.sha }} + + - name: Run Benchmark (Candidate) + if: github.event_name == 'pull_request' + uses: nschimme/faac-benchmark@master + with: + faac-bin: ./candidate/build_cand/frontend/faac + libfaac-so: ./candidate/build_cand/libfaac/libfaac.so + run-name: ${{ matrix.arch 
}}_${{ matrix.precision }}_cand + output-json: ./results/${{ matrix.arch }}_${{ matrix.precision }}_cand.json + visqol-image: ghcr.io/nschimme/faac-benchmark-visqol:latest + + - name: Upload Results + if: github.event_name == 'pull_request' + uses: actions/upload-artifact@v4 + with: + name: results-${{ matrix.arch }}-${{ matrix.precision }} + path: results/*.json + + report: + name: Consolidated Report + needs: benchmark + runs-on: ubuntu-latest + if: always() && github.event_name == 'pull_request' + steps: + - name: Download all results + uses: actions/download-artifact@v4 + with: + path: results + pattern: results-* + merge-multiple: true + + - name: Generate Report + id: generate + uses: nschimme/faac-benchmark/report@master + with: + results-path: ./results + base-sha: ${{ github.event.pull_request.base.sha }} + cand-sha: ${{ github.event.pull_request.head.sha }} + summary-only: false + + - name: Upload Full Report + uses: actions/upload-artifact@v4 + with: + name: benchmark-report-full + path: report.md diff --git a/README b/README index 2c30f925..ccce8022 100644 --- a/README +++ b/README @@ -79,3 +79,12 @@ General FAAC compiling instructions cd build meson setup .. meson install + +___________________________________ +Benchmarking + +FAAC uses a dedicated benchmark suite to ensure quality and performance. +The suite is hosted in a separate repository: https://github.com/nschimme/faac-benchmark + +Automated benchmarks run on every pull request. For instructions on how to +run benchmarks locally, please refer to the README in the benchmark repository. 
From 4f0e73cf47ff117c21171290d7c6fcda71da9567 Mon Sep 17 00:00:00 2001 From: Nils Schimmelmann Date: Thu, 5 Mar 2026 10:11:27 -0600 Subject: [PATCH 4/5] pin faac-benchmark to v1 --- .github/workflows/benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2950bbd6..1c798371 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -75,7 +75,7 @@ jobs: - name: Run Benchmark (Baseline) if: steps.cache-baseline.outputs.cache-hit != 'true' - uses: nschimme/faac-benchmark@master + uses: nschimme/faac-benchmark@v1 with: faac-bin: ./baseline/build_base/frontend/faac libfaac-so: ./baseline/build_base/libfaac/libfaac.so @@ -92,7 +92,7 @@ jobs: - name: Run Benchmark (Candidate) if: github.event_name == 'pull_request' - uses: nschimme/faac-benchmark@master + uses: nschimme/faac-benchmark@v1 with: faac-bin: ./candidate/build_cand/frontend/faac libfaac-so: ./candidate/build_cand/libfaac/libfaac.so @@ -122,7 +122,7 @@ jobs: - name: Generate Report id: generate - uses: nschimme/faac-benchmark/report@master + uses: nschimme/faac-benchmark/report@v1 with: results-path: ./results base-sha: ${{ github.event.pull_request.base.sha }} From 8ab42d5219eb50bfac9ad3666ca760e52fe0e03f Mon Sep 17 00:00:00 2001 From: Nils Schimmelmann Date: Fri, 13 Mar 2026 18:04:35 -0500 Subject: [PATCH 5/5] post comment again --- .github/workflows/benchmark.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 1c798371..084871cb 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -112,7 +112,13 @@ jobs: needs: benchmark runs-on: ubuntu-latest if: always() && github.event_name == 'pull_request' + permissions: + pull-requests: write steps: + - name: Checkout Code + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + - name: Download all 
results uses: actions/download-artifact@v4 with: @@ -134,3 +140,13 @@ jobs: with: name: benchmark-report-full path: report.md + + - name: Post Summary to PR + if: always() && github.event_name == 'pull_request' + continue-on-error: true + shell: bash + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment ${{ github.event.pull_request.number }} --body-file summary.md +