From cdb2585edbc96aa7f33742a06724eedc492f59f9 Mon Sep 17 00:00:00 2001 From: sunjqa1 Date: Thu, 26 Mar 2026 08:46:17 +0000 Subject: [PATCH] Split the visualization module and optimize the operator page --- dashboard/pages/operator.py | 38 +- dashboard/pages/training.py | 2 +- dashboard/utils/visualizations/__init__.py | 76 +++- dashboard/utils/visualizations/base.py | 385 +----------------- .../utils/visualizations/communication.py | 186 +++++++++ dashboard/utils/visualizations/inference.py | 140 +++++++ dashboard/utils/visualizations/operator.py | 159 ++++++++ .../utils/visualizations/summary_tables.py | 183 +++++++++ .../training.py} | 2 +- 9 files changed, 768 insertions(+), 403 deletions(-) create mode 100644 dashboard/utils/visualizations/communication.py create mode 100644 dashboard/utils/visualizations/inference.py create mode 100644 dashboard/utils/visualizations/operator.py create mode 100644 dashboard/utils/visualizations/summary_tables.py rename dashboard/utils/{training_plots.py => visualizations/training.py} (99%) diff --git a/dashboard/pages/operator.py b/dashboard/pages/operator.py index 5fdfd78..0078213 100644 --- a/dashboard/pages/operator.py +++ b/dashboard/pages/operator.py @@ -9,6 +9,7 @@ from utils.visualizations import ( create_summary_table_ops, plot_timeseries_auto, + render_operator_performance_charts, ) init_page("算子测试分析 | InfiniMetrics", "⚡") @@ -37,6 +38,10 @@ def main(): only_success = st.checkbox("仅显示成功测试", value=True) y_log = st.checkbox("Y轴对数刻度(可选)", value=False) + st.markdown("---") + st.markdown("### 📊 图表选项") + show_performance_charts = st.checkbox("显示性能仪表盘", value=True) + filtered = [r for r in ops_runs if (not only_success or r.get("success"))] st.caption(f"找到 {len(filtered)} 个算子测试") @@ -63,9 +68,15 @@ def main(): ri["data"] = data selected_runs.append(ri) - tab1, tab2 = st.tabs(["📌 概览", "📈 曲线/原始数据"]) + tab1, tab2, tab3 = st.tabs(["📈 性能图表", "📌 概览", "📊 原始数据"]) with tab1: + # Use the new performance chart function + 
render_operator_performance_charts( + selected_runs, y_log, show_performance_charts + ) + + with tab2: for run in selected_runs: with st.expander(f"{run.get('run_id')} - 概览"): st.dataframe( @@ -73,16 +84,28 @@ def main(): use_container_width=True, hide_index=True, ) - st.markdown("**config**") + st.markdown("**完整配置**") st.json(run["data"].get("config", {})) - with tab2: - # If operators have timeseries CSVs, automatically plot them + env = run["data"].get("environment", {}) + if env: + st.markdown("**环境信息**") + try: + acc = env["cluster"][0]["machine"]["accelerators"][0] + st.write(f"- 加速卡: {acc.get('model', 'Unknown')}") + st.write(f"- 显存: {acc.get('memory_gb_per_card', '?')} GB") + st.write(f"- CUDA版本: {acc.get('cuda', 'Unknown')}") + except: + st.json(env) + + with tab3: + # Original data for run in selected_runs: - with st.expander(f"{run.get('run_id')} - metrics"): + with st.expander(f"{run.get('run_id')} - 原始数据"): for m in run["data"].get("metrics", []): df = m.get("data") if df is not None and len(df.columns) >= 2: + st.markdown(f"**{m.get('name', 'metric')}**") fig = plot_timeseries_auto( df, title=m.get("name", "metric"), y_log_scale=y_log ) @@ -90,8 +113,9 @@ def main(): else: # scalar if m.get("type") == "scalar": - st.write( - f"- {m.get('name')}: {m.get('value')} {m.get('unit','')}" + st.metric( + label=m.get("name", ""), + value=f"{m.get('value', '')} {m.get('unit', '')}", ) diff --git a/dashboard/pages/training.py b/dashboard/pages/training.py index 1db0a07..9da5c01 100644 --- a/dashboard/pages/training.py +++ b/dashboard/pages/training.py @@ -12,7 +12,7 @@ load_selected_runs, create_training_summary, ) -from utils.training_plots import ( +from utils.visualizations import ( render_performance_curves, render_throughput_comparison, render_data_tables, diff --git a/dashboard/utils/visualizations/__init__.py b/dashboard/utils/visualizations/__init__.py index f897fcb..4dcdbff 100644 --- a/dashboard/utils/visualizations/__init__.py +++ 
b/dashboard/utils/visualizations/__init__.py @@ -4,37 +4,91 @@ This package provides visualization utilities organized by test type: - base: Common/legacy visualization functions - hardware: Hardware test visualizations (memory sweep, cache bandwidth) -- (future) communication: Communication test visualizations -- (future) inference: Inference test visualizations -- (future) operator: Operator test visualizations +- communication: Communication test visualizations +- inference: Inference test visualizations +- operator: Operator test visualizations +- training: Training test visualizations +- summary_tables: Summary tables for different test types """ +# Base functions (common) from .base import ( - plot_metric_vs_size, - plot_comparison_matrix, - create_summary_table, create_gauge_chart, plot_timeseries_auto, - create_summary_table_infer, - create_summary_table_ops, ) + +# Communication functions +from .communication import ( + plot_metric_vs_size, + plot_comparison_matrix, +) + +# Inference functions +from .inference import ( + render_inference_metrics, + render_memory_gauge, +) + +# Summary tables +from .summary_tables import ( + create_comm_summary_table, + create_infer_summary_table, + create_ops_summary_table, +) + +# Hardware functions from .hardware import ( create_summary_table_hw, plot_hw_mem_sweep, plot_hw_cache, ) +# Operator functions +from .operator import ( + extract_operator_metrics, + render_operator_performance_charts, +) + +# Training functions +from .training import ( + render_performance_curves, + render_throughput_comparison, + render_data_tables, + render_config_details, +) + +# Backward-compatible aliases +create_summary_table = create_comm_summary_table +create_summary_table_infer = create_infer_summary_table +create_summary_table_ops = create_ops_summary_table + __all__ = [ - # Base (common/legacy) + # Base + "create_gauge_chart", + "plot_timeseries_auto", + # Communication "plot_metric_vs_size", "plot_comparison_matrix", + # Inference + 
"render_inference_metrics", + "render_memory_gauge", + # Summary tables + "create_comm_summary_table", + "create_infer_summary_table", + "create_ops_summary_table", "create_summary_table", - "create_gauge_chart", - "plot_timeseries_auto", "create_summary_table_infer", "create_summary_table_ops", # Hardware "create_summary_table_hw", "plot_hw_mem_sweep", "plot_hw_cache", + # Operator + "extract_operator_metrics", + "render_operator_performance_charts", + # Training + "render_performance_curves", + "render_throughput_comparison", + "render_data_tables", + "render_config_details", ] diff --git a/dashboard/utils/visualizations/base.py b/dashboard/utils/visualizations/base.py index 8ba1bc2..f2de7f0 100644 --- a/dashboard/utils/visualizations/base.py +++ b/dashboard/utils/visualizations/base.py @@ -2,283 +2,10 @@ """Visualization functions for InfiniMetrics dashboard.""" import plotly.graph_objects as go -import plotly.express as px import pandas as pd -import numpy as np -from typing import Dict, List, Any, Optional, Literal +from typing import Optional import streamlit as st -from utils.data_loader import get_friendly_size - - -def plot_metric_vs_size( - df: pd.DataFrame, - metric_type: Literal["bandwidth", "latency"], - title: Optional[str] = None, - y_log_scale: bool = False, -) -> go.Figure: - """Generic plot for metric vs message size.""" - - fig = go.Figure() - - # Define metric-specific configurations - metric_configs = { - "bandwidth": { - "y_column": "bandwidth_gbs", - "y_title": "Bandwidth (GB/s)", - "line_color": "royalblue", - "name": "Bandwidth", - "default_title": "带宽 vs 数据大小", - }, - "latency": { - "y_column": "latency_us", - "y_title": "Latency (microseconds)", - "line_color": "firebrick", - "name": "Latency", - "default_title": "延迟 vs 数据大小", - }, - } - - config = metric_configs.get(metric_type) - if not config: - raise ValueError(f"Unsupported metric_type: {metric_type}") - - # Check if required columns exist - if config["y_column"] not in df.columns: - 
st.warning(f"DataFrame missing required column: {config['y_column']}") - fig.update_layout(title=f"{title or config['default_title']} (no data)") - return fig - - # Add friendly size column for hover - df = df.copy() - df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) - - # Add metric line - fig.add_trace( - go.Scatter( - x=df["size_bytes"], - y=df[config["y_column"]], - mode="lines+markers", - name=config["name"], - line=dict(color=config["line_color"], width=3), - marker=dict(size=8), - hovertext=df["size_friendly"], - hoverinfo="text+y+x", - ) - ) - - # Update layout - layout = { - "title": title or config["default_title"], - "xaxis_title": "Data Size", - "yaxis_title": config["y_title"], - "xaxis_type": "log", - "template": "plotly_white", - "hovermode": "x unified", - "height": 500, - } - - if y_log_scale: - layout["yaxis_type"] = "log" - - fig.update_layout(**layout) - - # Add grid - fig.update_xaxes( - showgrid=True, - gridwidth=1, - gridcolor="LightGray", - tickvals=df["size_bytes"].tolist(), - ticktext=df["size_friendly"].tolist(), - ) - fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor="LightGray") - - return fig - - -def plot_comparison_matrix( - test_runs: List[Dict[str, Any]], - metric: str = "bandwidth", - y_log_scale: bool = False, -) -> go.Figure: - """Create comparison matrix for multiple test runs.""" - fig = go.Figure() - - colors = px.colors.qualitative.Set2 - - for i, run in enumerate(test_runs): - if i >= len(colors): - break - - data = run.get("data", {}) - metrics = data.get("metrics", []) - - for metric_data in metrics: - metric_name = metric_data.get("name", "") - - if ( - metric == "bandwidth" - and "bandwidth" in metric_name - and metric_data.get("data") is not None - ): - df = metric_data["data"].copy() - df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) - - # Get run info for legend - device_used = run.get("device_used", "?") - operation = run.get("operation", "Test") - - fig.add_trace( - go.Scatter( - 
x=df["size_bytes"], - y=df["bandwidth_gbs"], - mode="lines+markers", - name=f"{operation} ({device_used} GPUs)", - line=dict(color=colors[i % len(colors)], width=2), - marker=dict(size=6), - hovertext=df["size_friendly"], - hoverinfo="text+y+name", - ) - ) - break # Found bandwidth metric - - elif ( - metric == "latency" - and "latency" in metric_name - and metric_data.get("data") is not None - ): - df = metric_data["data"].copy() - df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) - - device_used = run.get("device_used", "?") - operation = run.get("operation", "Test") - - fig.add_trace( - go.Scatter( - x=df["size_bytes"], - y=df["latency_us"], - mode="lines+markers", - name=f"{operation} ({device_used} GPUs)", - line=dict(color=colors[i % len(colors)], width=2), - marker=dict(size=6), - hovertext=df["size_friendly"], - hoverinfo="text+y+name", - ) - ) - break # Found latency metric - - metric_title = "带宽 (GB/s)" if metric == "bandwidth" else "延迟 (µs)" - layout = { - "title": f"多测试对比 - {metric_title}", - "xaxis_title": "Data Size", - "yaxis_title": metric_title, - "xaxis_type": "log", - "template": "plotly_white", - "hovermode": "x unified", - "height": 600, - "legend": dict(yanchor="top", y=0.99, xanchor="left", x=0.01), - } - - if y_log_scale: - layout["yaxis_type"] = "log" - - fig.update_layout(**layout) - - # Set x-axis tick labels - if test_runs and len(test_runs[0].get("data", {}).get("metrics", [])) > 0: - first_metric = test_runs[0]["data"]["metrics"][0] - if first_metric.get("data") is not None: - df = first_metric["data"] - fig.update_xaxes( - tickvals=df["size_bytes"].tolist(), - ticktext=df["size_bytes"].apply(get_friendly_size).tolist(), - ) - - return fig - - -def create_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame: - """Create summary table from test result.""" - summary_data = [] - - # Hardware summary - if "environment" in test_result: - env = test_result["environment"] - if "cluster" in env and len(env["cluster"]) > 0: 
- machine = env["cluster"][0]["machine"] - accelerators = machine.get("accelerators", []) - if accelerators: - acc = accelerators[0] - summary_data.append( - {"指标": "GPU型号", "数值": str(acc.get("model", "Unknown"))} - ) - summary_data.append( - {"指标": "GPU数量", "数值": str(acc.get("count", "Unknown"))} - ) - summary_data.append( - { - "指标": "显存/卡", - "数值": f"{acc.get('memory_gb_per_card', 'Unknown')} GB", - } - ) - summary_data.append( - {"指标": "CUDA版本", "数值": str(acc.get("cuda", "Unknown"))} - ) - - # Test config summary - config = test_result.get("config", {}) - resolved = test_result.get("resolved", {}) - - # Device info - device_used = ( - resolved.get("device_used") - or config.get("device_used") - or config.get("device_involved", "Unknown") - ) - nodes = resolved.get("nodes") or config.get("nodes", 1) - - summary_data.append({"指标": "算子", "数值": str(config.get("operator", "Unknown"))}) - summary_data.append({"指标": "设备数", "数值": str(device_used)}) - summary_data.append({"指标": "节点数", "数值": str(nodes)}) - summary_data.append( - {"指标": "预热迭代", "数值": str(config.get("warmup_iterations", "Unknown"))} - ) - summary_data.append( - {"指标": "测量迭代", "数值": str(config.get("measured_iterations", "Unknown"))} - ) - - # Performance summary (extract from metrics if available) - for metric in test_result.get("metrics", []): - if metric.get("name") == "comm.bandwidth" and metric.get("data") is not None: - df = metric["data"] - if "bandwidth_gbs" in df.columns: - avg_bw = df["bandwidth_gbs"].mean() - max_bw = df["bandwidth_gbs"].max() - summary_data.append({"指标": "平均带宽", "数值": f"{avg_bw:.2f} GB/s"}) - summary_data.append({"指标": "峰值带宽", "数值": f"{max_bw:.2f} GB/s"}) - - if metric.get("name") == "comm.latency" and metric.get("data") is not None: - df = metric["data"] - if "latency_us" in df.columns: - avg_lat = df["latency_us"].mean() - min_lat = df["latency_us"].min() - summary_data.append({"指标": "平均延迟", "数值": f"{avg_lat:.2f} µs"}) - summary_data.append({"指标": "最小延迟", "数值": f"{min_lat:.2f} 
µs"}) - - # Duration - duration = next( - ( - m["value"] - for m in test_result.get("metrics", []) - if m.get("name") == "comm.duration" - ), - None, - ) - if duration: - summary_data.append({"指标": "测试耗时", "数值": f"{duration:.2f} ms"}) - - return pd.DataFrame(summary_data) - def create_gauge_chart( value: float, @@ -333,11 +60,7 @@ def create_gauge_chart( def plot_timeseries_auto( df: pd.DataFrame, title: str = "Timeseries", y_log_scale: bool = False ) -> go.Figure: - """ - Generic plot for 2-column timeseries CSV: - - infer: (timestamp, latency_ms/ttft_ms/throughput) - - future ops: (step, value) etc. - """ + """Generic plot for 2-column timeseries CSV.""" fig = go.Figure() if df is None or df.empty or len(df.columns) < 2: fig.update_layout(title=f"{title} (no data)") @@ -360,107 +83,3 @@ def plot_timeseries_auto( if y_log_scale: fig.update_yaxes(type="log") return fig - - -def create_summary_table_infer(test_result: dict) -> pd.DataFrame: - rows = [] - - # env brief - env = test_result.get("environment", {}) - try: - acc = env["cluster"][0]["machine"]["accelerators"][0] - rows += [ - {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))}, - {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))}, - {"指标": "显存/卡", "数值": f"{acc.get('memory_gb_per_card','?')} GB"}, - {"指标": "CUDA", "数值": str(acc.get("cuda", "Unknown"))}, - {"指标": "平台", "数值": str(acc.get("type", "nvidia"))}, - ] - except Exception: - pass - - cfg = test_result.get("config", {}) - rows += [ - {"指标": "框架", "数值": str(cfg.get("framework", "unknown"))}, - {"指标": "模型", "数值": str(cfg.get("model", ""))}, - { - "指标": "batch", - "数值": str( - (cfg.get("infer_args", {}) or {}).get("static_batch_size", "unknown") - ), - }, - { - "指标": "prompt_tok", - "数值": str( - (cfg.get("infer_args", {}) or {}).get("prompt_token_num", "unknown") - ), - }, - { - "指标": "output_tok", - "数值": str( - (cfg.get("infer_args", {}) or {}).get("output_token_num", "unknown") - ), - }, - {"指标": "warmup", "数值": str(cfg.get("warmup_iterations", 
"unknown"))}, - {"指标": "measured", "数值": str(cfg.get("measured_iterations", "unknown"))}, - ] - - # scalar metrics quick view - for m in test_result.get("metrics", []): - if m.get("type") == "scalar": - rows.append( - { - "指标": str(m.get("name", "")), - "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(), - } - ) - - return pd.DataFrame(rows) - - -def create_summary_table_ops(test_result: dict) -> pd.DataFrame: - rows = [] - cfg = test_result.get("config", {}) - - rows.append({"指标": "testcase", "数值": str(test_result.get("testcase", ""))}) - # Try to get operator name from config - rows.append( - { - "指标": "算子", - "数值": str(cfg.get("operator", cfg.get("op_name", "Unknown"))), - } - ) - - # Environment info - env = test_result.get("environment", {}) - try: - acc = env["cluster"][0]["machine"]["accelerators"][0] - rows += [ - {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))}, - {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))}, - ] - except Exception: - pass - - # Scalar metrics summary - scalars = [m for m in test_result.get("metrics", []) if m.get("type") == "scalar"] - for m in scalars: - rows.append( - { - "指标": str(m.get("name", "")), - "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(), - } - ) - - # Common config fields fallback - for k in [ - "dtype", - "shape", - "batch_size", - "warmup_iterations", - "measured_iterations", - ]: - if k in cfg: - rows.append({"指标": k, "数值": str(cfg.get(k))}) - - return pd.DataFrame(rows) diff --git a/dashboard/utils/visualizations/communication.py b/dashboard/utils/visualizations/communication.py new file mode 100644 index 0000000..681fca9 --- /dev/null +++ b/dashboard/utils/visualizations/communication.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +"""Communication-specific visualization functions.""" + +import plotly.graph_objects as go +import plotly.express as px +import pandas as pd +from typing import Dict, List, Any, Optional, Literal +import streamlit as st + +from utils.data_loader import 
get_friendly_size + + +def plot_metric_vs_size( + df: pd.DataFrame, + metric_type: Literal["bandwidth", "latency"], + title: Optional[str] = None, + y_log_scale: bool = False, +) -> go.Figure: + """Generic plot for metric vs message size.""" + fig = go.Figure() + + metric_configs = { + "bandwidth": { + "y_column": "bandwidth_gbs", + "y_title": "Bandwidth (GB/s)", + "line_color": "royalblue", + "name": "Bandwidth", + "default_title": "带宽 vs 数据大小", + }, + "latency": { + "y_column": "latency_us", + "y_title": "Latency (microseconds)", + "line_color": "firebrick", + "name": "Latency", + "default_title": "延迟 vs 数据大小", + }, + } + + config = metric_configs.get(metric_type) + if not config: + raise ValueError(f"Unsupported metric_type: {metric_type}") + + if config["y_column"] not in df.columns: + st.warning(f"DataFrame missing required column: {config['y_column']}") + fig.update_layout(title=f"{title or config['default_title']} (no data)") + return fig + + df = df.copy() + df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) + + fig.add_trace( + go.Scatter( + x=df["size_bytes"], + y=df[config["y_column"]], + mode="lines+markers", + name=config["name"], + line=dict(color=config["line_color"], width=3), + marker=dict(size=8), + hovertext=df["size_friendly"], + hoverinfo="text+y+x", + ) + ) + + layout = { + "title": title or config["default_title"], + "xaxis_title": "Data Size", + "yaxis_title": config["y_title"], + "xaxis_type": "log", + "template": "plotly_white", + "hovermode": "x unified", + "height": 500, + } + + if y_log_scale: + layout["yaxis_type"] = "log" + + fig.update_layout(**layout) + + fig.update_xaxes( + showgrid=True, + gridwidth=1, + gridcolor="LightGray", + tickvals=df["size_bytes"].tolist(), + ticktext=df["size_friendly"].tolist(), + ) + fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor="LightGray") + + return fig + + +def plot_comparison_matrix( + test_runs: List[Dict[str, Any]], + metric: str = "bandwidth", + y_log_scale: bool = False, +) 
-> go.Figure: + """Create comparison matrix for multiple test runs.""" + fig = go.Figure() + colors = px.colors.qualitative.Set2 + + for i, run in enumerate(test_runs): + if i >= len(colors): + break + + data = run.get("data", {}) + metrics = data.get("metrics", []) + + for metric_data in metrics: + metric_name = metric_data.get("name", "") + + if ( + metric == "bandwidth" + and "bandwidth" in metric_name + and metric_data.get("data") is not None + ): + df = metric_data["data"].copy() + df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) + + device_used = run.get("device_used", "?") + operation = run.get("operation", "Test") + + fig.add_trace( + go.Scatter( + x=df["size_bytes"], + y=df["bandwidth_gbs"], + mode="lines+markers", + name=f"{operation} ({device_used} GPUs)", + line=dict(color=colors[i % len(colors)], width=2), + marker=dict(size=6), + hovertext=df["size_friendly"], + hoverinfo="text+y+name", + ) + ) + break + + elif ( + metric == "latency" + and "latency" in metric_name + and metric_data.get("data") is not None + ): + df = metric_data["data"].copy() + df["size_friendly"] = df["size_bytes"].apply(get_friendly_size) + + device_used = run.get("device_used", "?") + operation = run.get("operation", "Test") + + fig.add_trace( + go.Scatter( + x=df["size_bytes"], + y=df["latency_us"], + mode="lines+markers", + name=f"{operation} ({device_used} GPUs)", + line=dict(color=colors[i % len(colors)], width=2), + marker=dict(size=6), + hovertext=df["size_friendly"], + hoverinfo="text+y+name", + ) + ) + break + + metric_title = "带宽 (GB/s)" if metric == "bandwidth" else "延迟 (µs)" + layout = { + "title": f"多测试对比 - {metric_title}", + "xaxis_title": "Data Size", + "yaxis_title": metric_title, + "xaxis_type": "log", + "template": "plotly_white", + "hovermode": "x unified", + "height": 600, + "legend": dict(yanchor="top", y=0.99, xanchor="left", x=0.01), + } + + if y_log_scale: + layout["yaxis_type"] = "log" + + fig.update_layout(**layout) + + if test_runs and 
len(test_runs[0].get("data", {}).get("metrics", [])) > 0: + first_metric = test_runs[0]["data"]["metrics"][0] + if first_metric.get("data") is not None: + df = first_metric["data"] + fig.update_xaxes( + tickvals=df["size_bytes"].tolist(), + ticktext=df["size_bytes"].apply(get_friendly_size).tolist(), + ) + + return fig diff --git a/dashboard/utils/visualizations/inference.py b/dashboard/utils/visualizations/inference.py new file mode 100644 index 0000000..396c25c --- /dev/null +++ b/dashboard/utils/visualizations/inference.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""Inference-specific visualization functions.""" + +import streamlit as st +import pandas as pd +import plotly.graph_objects as go + +from .base import plot_timeseries_auto + + +def render_inference_metrics(selected_runs, y_log): + """Render inference metrics charts (Latency / TTFT / Throughput).""" + st.markdown("### 指标曲线(Latency / TTFT / Throughput)") + + # layout: 3 columns + c1, c2, c3 = st.columns(3) + + def _plot_metric(metric_name_contains: str, container, title: str = None): + with container: + if len(selected_runs) == 1: + # Single run + run = selected_runs[0] + metrics = run["data"].get("metrics", []) + hit = next( + ( + m + for m in metrics + if metric_name_contains in (m.get("name", "")) + and m.get("data") is not None + ), + None, + ) + if hit: + df = hit["data"] + fig = plot_timeseries_auto( + df, + title=f"{hit['name']} - {run.get('config',{}).get('framework','')}", + y_log_scale=y_log, + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info(f"未找到 {title or metric_name_contains} 对应的 CSV") + else: + # Multi-run comparison: overlay lines + st.markdown(f"**对比:{title or metric_name_contains}**") + lines = [] + for run in selected_runs: + hit = next( + ( + m + for m in run["data"].get("metrics", []) + if metric_name_contains in (m.get("name", "")) + and m.get("data") is not None + ), + None, + ) + if not hit: + continue + lines.append((run, hit)) + + if not lines: + 
st.info("选中的运行中没有可用数据") + return + + fig = go.Figure() + colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"] + + for i, (run, hit) in enumerate(lines): + df = hit["data"] + xcol = df.columns[0] + ycol = df.columns[1] if len(df.columns) > 1 else None + if ycol is None: + continue + + # Get mode from testcase + tc = run.get("testcase", "") + mode = ( + "service" + if ("Service" in tc or "service" in tc.lower()) + else "direct" + ) + + label = f"{run.get('config',{}).get('framework','unknown')}|{mode}|{run.get('device_used','?')}GPU" + fig.add_trace( + go.Scatter( + x=df[xcol], + y=df[ycol], + mode="lines+markers", + name=label, + line=dict(color=colors[i % len(colors)], width=2), + marker=dict(size=6), + ) + ) + + fig.update_layout( + title=f"{title or metric_name_contains} 对比", + xaxis_title="step", + yaxis_title=title or metric_name_contains, + template="plotly_white", + height=420, + hovermode="x unified", + ) + if y_log: + fig.update_yaxes(type="log") + st.plotly_chart(fig, use_container_width=True) + + _plot_metric("infer.compute_latency", c1, "Latency") + _plot_metric("infer.ttft", c2, "TTFT") + _plot_metric("infer.direct_throughput_tps", c3, "Throughput") + + +def render_memory_gauge(run): + """Render memory usage gauge for inference.""" + memory_metric = next( + ( + m + for m in run["data"].get("metrics", []) + if m.get("name") == "infer.peak_memory_usage" + ), + None, + ) + + if memory_metric and memory_metric.get("value"): + from .base import create_gauge_chart + + st.markdown("#### 💾 显存使用") + value = memory_metric["value"] + unit = memory_metric.get("unit", "GB") + + # Try to get max memory from environment + max_value = value * 1.5 + try: + env = run["data"].get("environment", {}) + acc = env["cluster"][0]["machine"]["accelerators"][0] + max_value = float(acc.get("memory_gb_per_card", 80)) + except: + pass + + fig = create_gauge_chart(value, max_value, "峰值显存使用", "green", unit) + st.plotly_chart(fig, use_container_width=True) diff --git 
a/dashboard/utils/visualizations/operator.py b/dashboard/utils/visualizations/operator.py new file mode 100644 index 0000000..f0b6a80 --- /dev/null +++ b/dashboard/utils/visualizations/operator.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +"""Operator-specific visualization functions.""" + +import streamlit as st +import pandas as pd +import plotly.graph_objects as go +import plotly.express as px + +from .base import create_gauge_chart + + +def extract_operator_metrics(data: dict) -> dict: + """Extract operator performance metrics.""" + metrics = data.get("metrics", []) + + result = { + "latency": None, + "flops": None, + "bandwidth": None, + "accuracy": None, + } + + for m in metrics: + name = m.get("name", "").lower() + value = m.get("value") + + if "latency" in name and value is not None: + result["latency"] = float(value) + elif "flops" in name and value is not None: + result["flops"] = float(value) + elif "bandwidth" in name and value is not None: + result["bandwidth"] = float(value) + elif "accuracy" in name or "tensor_accuracy" in name: + result["accuracy"] = value + + return result + + +def render_operator_performance_charts(selected_runs, y_log, show_performance_charts): + """Render operator performance charts (simplified like communication page).""" + + if len(selected_runs) == 1: + run = selected_runs[0] + metrics_data = extract_operator_metrics(run["data"]) + + if metrics_data: + st.markdown("#### 📊 核心指标") + cols = st.columns(3) + + if metrics_data.get("latency"): + cols[0].metric("延迟", f"{metrics_data['latency']:.2f} ms", help="算子执行延迟") + if metrics_data.get("flops"): + cols[1].metric( + "计算性能", f"{metrics_data['flops']:.2f} TFLOPS", help="每秒浮点运算次数" + ) + if metrics_data.get("bandwidth"): + cols[2].metric( + "带宽", f"{metrics_data['bandwidth']:.2f} GB/s", help="内存带宽" + ) + + # dashboard + if show_performance_charts: + gauge_cols = st.columns(3) + + with gauge_cols[0]: + if metrics_data.get("latency"): + max_latency = max(metrics_data["latency"] * 2, 
100) + fig = create_gauge_chart( + metrics_data["latency"], max_latency, "延迟", "red", "ms" + ) + st.plotly_chart(fig, use_container_width=True) + + with gauge_cols[1]: + if metrics_data.get("flops"): + max_flops = max(metrics_data["flops"] * 1.5, 10) + fig = create_gauge_chart( + metrics_data["flops"], max_flops, "计算性能", "blue", "TFLOPS" + ) + st.plotly_chart(fig, use_container_width=True) + + with gauge_cols[2]: + if metrics_data.get("bandwidth"): + max_bandwidth = max(metrics_data["bandwidth"] * 1.5, 100) + fig = create_gauge_chart( + metrics_data["bandwidth"], + max_bandwidth, + "带宽", + "green", + "GB/s", + ) + st.plotly_chart(fig, use_container_width=True) + + # Display operator configuration information + st.markdown("#### 🔧 算子配置") + config = run["data"].get("config", {}) + inputs = config.get("inputs", []) + if inputs: + st.markdown("**输入张量**") + for inp in inputs: + shape = inp.get("shape", []) + dtype = inp.get("dtype", "unknown") + st.write( + f"- {inp.get('name', 'input')}: shape={shape}, dtype={dtype}" + ) + + outputs = config.get("outputs", []) + if outputs: + st.markdown("**输出张量**") + for out in outputs: + shape = out.get("shape", []) + dtype = out.get("dtype", "unknown") + st.write( + f"- {out.get('name', 'output')}: shape={shape}, dtype={dtype}" + ) + + else: + st.markdown("#### 📊 性能对比") + + all_metrics = [] + for run in selected_runs: + metrics = extract_operator_metrics(run["data"]) + if metrics: + config = run["data"].get("config", {}) + op_name = config.get("operator", run.get("operation", "unknown")) + all_metrics.append( + { + "运行": f"{op_name} ({run.get('device_used', '?')}设备)", + "延迟 (ms)": metrics.get("latency", 0), + "计算性能 (TFLOPS)": metrics.get("flops", 0), + "带宽 (GB/s)": metrics.get("bandwidth", 0), + } + ) + + if all_metrics: + df = pd.DataFrame(all_metrics) + st.dataframe(df, use_container_width=True, hide_index=True) + + fig = go.Figure() + for metric in ["延迟 (ms)", "计算性能 (TFLOPS)", "带宽 (GB/s)"]: + fig.add_trace( + go.Bar( + 
name=metric, + x=[m["运行"] for m in all_metrics], + y=[m[metric] for m in all_metrics], + text=[f"{m[metric]:.2f}" for m in all_metrics], + textposition="auto", + ) + ) + + fig.update_layout( + title="算子性能对比", + barmode="group", + template="plotly_white", + height=500, + yaxis_title="性能指标", + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("无法提取性能指标进行对比") diff --git a/dashboard/utils/visualizations/summary_tables.py b/dashboard/utils/visualizations/summary_tables.py new file mode 100644 index 0000000..27cb372 --- /dev/null +++ b/dashboard/utils/visualizations/summary_tables.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Summary table functions for different test types.""" + +import pandas as pd +from typing import Dict, Any + + +def create_comm_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame: + """Create summary table for communication tests.""" + summary_data = [] + + # Hardware summary + if "environment" in test_result: + env = test_result["environment"] + if "cluster" in env and len(env["cluster"]) > 0: + machine = env["cluster"][0]["machine"] + accelerators = machine.get("accelerators", []) + if accelerators: + acc = accelerators[0] + summary_data.append( + {"指标": "GPU型号", "数值": str(acc.get("model", "Unknown"))} + ) + summary_data.append( + {"指标": "GPU数量", "数值": str(acc.get("count", "Unknown"))} + ) + summary_data.append( + { + "指标": "显存/卡", + "数值": f"{acc.get('memory_gb_per_card', 'Unknown')} GB", + } + ) + summary_data.append( + {"指标": "CUDA版本", "数值": str(acc.get("cuda", "Unknown"))} + ) + + # Test config summary + config = test_result.get("config", {}) + resolved = test_result.get("resolved", {}) + + device_used = ( + resolved.get("device_used") + or config.get("device_used") + or config.get("device_involved", "Unknown") + ) + nodes = resolved.get("nodes") or config.get("nodes", 1) + + summary_data.append({"指标": "算子", "数值": str(config.get("operator", "Unknown"))}) + summary_data.append({"指标": "设备数", "数值": str(device_used)}) + 
def _first_accelerator(test_result: dict):
    """Return the first accelerator record of the first cluster entry.

    The lookup path is ``environment -> cluster[0] -> machine ->
    accelerators[0]``.

    Args:
        test_result: Parsed test result dictionary.

    Returns:
        The accelerator mapping, or ``None`` when any level of the path is
        missing, empty, or of an unexpected type.
    """
    env = test_result.get("environment") or {}
    try:
        acc = env["cluster"][0]["machine"]["accelerators"][0]
    except (KeyError, IndexError, TypeError):
        # Hardware information is optional: a missing key, an empty list or
        # a wrongly-typed level just means there is nothing to report.
        return None
    # Callers query the record with ``.get``; anything else is unusable.
    return acc if hasattr(acc, "get") else None


def _scalar_metric_rows(test_result: dict) -> list:
    """Build one summary row for every metric whose ``type`` is ``"scalar"``.

    The value column is ``"<value> <unit>"`` with surrounding whitespace
    stripped, so a metric without a unit shows only its value.
    """
    return [
        {
            "指标": str(metric.get("name", "")),
            "数值": f"{metric.get('value', '')} {metric.get('unit', '')}".strip(),
        }
        # ``or []`` also covers a result whose "metrics" key holds None.
        for metric in (test_result.get("metrics") or [])
        if metric.get("type") == "scalar"
    ]


def create_infer_summary_table(test_result: dict) -> pd.DataFrame:
    """Create summary table for inference tests.

    Rows are emitted in this order: accelerator information (only when it
    can be located in ``environment``), run configuration, then every
    scalar metric.

    Args:
        test_result: Parsed test result with optional ``environment``,
            ``config`` and ``metrics`` entries. Entries that are absent or
            ``None`` are treated as empty.

    Returns:
        A DataFrame with the two string columns ``指标`` and ``数值``.
    """
    rows = []

    acc = _first_accelerator(test_result)
    if acc is not None:
        rows += [
            {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))},
            {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))},
            {"指标": "显存/卡", "数值": f"{acc.get('memory_gb_per_card', '?')} GB"},
            {"指标": "CUDA", "数值": str(acc.get("cuda", "Unknown"))},
            {"指标": "平台", "数值": str(acc.get("type", "nvidia"))},
        ]

    cfg = test_result.get("config") or {}
    # Resolve the nested section once; it may be missing or explicitly None.
    infer_args = cfg.get("infer_args") or {}
    rows += [
        {"指标": "框架", "数值": str(cfg.get("framework", "unknown"))},
        {"指标": "模型", "数值": str(cfg.get("model", ""))},
        {"指标": "batch", "数值": str(infer_args.get("static_batch_size", "unknown"))},
        {
            "指标": "prompt_tok",
            "数值": str(infer_args.get("prompt_token_num", "unknown")),
        },
        {
            "指标": "output_tok",
            "数值": str(infer_args.get("output_token_num", "unknown")),
        },
        {"指标": "warmup", "数值": str(cfg.get("warmup_iterations", "unknown"))},
        {"指标": "measured", "数值": str(cfg.get("measured_iterations", "unknown"))},
    ]

    rows += _scalar_metric_rows(test_result)
    return pd.DataFrame(rows)


def create_ops_summary_table(test_result: dict) -> pd.DataFrame:
    """Create summary table for operator tests.

    Rows are emitted in this order: test case name, operator name (taken
    from ``config["operator"]``, falling back to ``config["op_name"]``),
    accelerator model and count (only when they can be located in
    ``environment``), every scalar metric, and finally the selected
    configuration keys that are present.

    Args:
        test_result: Parsed test result with optional ``testcase``,
            ``environment``, ``config`` and ``metrics`` entries. Entries
            that are absent or ``None`` are treated as empty.

    Returns:
        A DataFrame with the two string columns ``指标`` and ``数值``.
    """
    cfg = test_result.get("config") or {}

    rows = [
        {"指标": "testcase", "数值": str(test_result.get("testcase", ""))},
        {
            "指标": "算子",
            "数值": str(cfg.get("operator", cfg.get("op_name", "Unknown"))),
        },
    ]

    acc = _first_accelerator(test_result)
    if acc is not None:
        rows += [
            {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))},
            {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))},
        ]

    rows += _scalar_metric_rows(test_result)

    # Only configuration keys that were actually provided are listed.
    config_keys = (
        "dtype",
        "shape",
        "batch_size",
        "warmup_iterations",
        "measured_iterations",
    )
    rows += [{"指标": key, "数值": str(cfg.get(key))} for key in config_keys if key in cfg]

    return pd.DataFrame(rows)
b22cc5d..19335ee 100644 --- a/dashboard/utils/training_plots.py +++ b/dashboard/utils/visualizations/training.py @@ -6,7 +6,7 @@ import plotly.express as px from utils.training_utils import get_metric_dataframe, apply_smoothing -from utils.visualizations import create_gauge_chart +from .base import create_gauge_chart def render_performance_curves(selected_runs, smoothing, y_log):