Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions images/flashbox-l1.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Include=shared/mkosi.conf
Include=modules/flashbox/common/mkosi.conf
Include=modules/flashbox/flashbox-l1/mkosi.conf
Include=modules/flashbox/observability/mkosi.conf

[Config]
Profiles=azure,gcp
Expand Down
1 change: 1 addition & 0 deletions images/flashbox-l2.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Include=shared/mkosi.conf
Include=modules/flashbox/common/mkosi.conf
Include=modules/flashbox/flashbox-l2/mkosi.conf
Include=modules/flashbox/observability/mkosi.conf

[Config]
Profiles=gcp
Expand Down
4 changes: 4 additions & 0 deletions modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ drop_dst_ip() {
#
# `source` is not supported in dash
###########################################################################

# Pull in the observability settings (metrics endpoint IP) when the
# observability module is part of the image; without its env file this is a no-op.
if [ -f /etc/flashbox/observability.env ]; then
    . /etc/flashbox/observability.env
fi

. /etc/bob/firewall-config

###########################################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ accept_dst_port $CHAIN_ALWAYS_OUT udp $CL_P2P_PORT "CL P2P (UDP)"
accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP"
accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS"

# Observability metrics endpoint (loaded from /etc/flashbox/observability.env).
# The outbound rule is added only when METRICS_ENDPOINT is non-empty. The `:-`
# expansion substitutes an empty string for an unset variable, so the test
# itself never fails on an undefined name.
if [ -n "${METRICS_ENDPOINT:-}" ]; then
accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)"
fi

# Titan builder bundle endpoints (always on)
# Security note: This is a side channel.
# While the operator will not be able to see the content of the packets,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy"
accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP"
accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS"

# Observability metrics endpoint (loaded from /etc/flashbox/observability.env).
# The outbound rule is added only when METRICS_ENDPOINT is non-empty. The `:-`
# expansion substitutes an empty string for an unset variable, so the test
# itself never fails on an undefined name.
if [ -n "${METRICS_ENDPOINT:-}" ]; then
accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)"
fi

###########################################################################
# (3) MAINTENANCE_IN: Inbound rules for Maintenance Mode
###########################################################################
Expand Down
12 changes: 12 additions & 0 deletions modules/flashbox/observability/mkosi.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Build script for the observability module: builds gomplate, the template
# engine used for the Prometheus config.
set -euxo pipefail

source scripts/make_git_package.sh

# NOTE(review): the meaning of each positional argument is inferred from the
# call shape only; make_git_package is defined in scripts/make_git_package.sh —
# confirm there.
gomplate_version="v4.3.3"
gomplate_repo="https://github.com/hairyhenderson/gomplate"
gomplate_build_cmd='go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate'
gomplate_artifact="build/gomplate:/usr/bin/gomplate"

make_git_package "gomplate" "$gomplate_version" "$gomplate_repo" "$gomplate_build_cmd" "$gomplate_artifact"
15 changes: 15 additions & 0 deletions modules/flashbox/observability/mkosi.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Observability module: installs Prometheus with the node and process
# exporters, and runs the module's build and post-installation scripts.

[Build]
# NOTE(review): presumably needed so the build script can fetch the gomplate
# sources — confirm against scripts/make_git_package.sh.
WithNetwork=true

[Content]
ExtraTrees=modules/flashbox/observability/mkosi.extra
PostInstallationScripts=modules/flashbox/observability/mkosi.postinst
BuildScripts=modules/flashbox/observability/mkosi.build

# Multi-value settings: continuation lines must stay indented.
Packages=prometheus
         prometheus-node-exporter
         prometheus-process-exporter

BuildPackages=build-essential
              git
              golang
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Process groups tracked by prometheus-process-exporter.
process_names:
  # Searcher container: matched through its conmon command line; child
  # processes are included because the exporter runs with --children.
  - name: "searcher-container"
    cmdline:
      - 'conmon.*searcher-container'
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Prometheus configuration template, rendered by gomplate into
# /etc/prometheus/prometheus.yml before Prometheus starts. The "config"
# datasource is /etc/flashbox/observability-config.json.
#
# NOTE(review): gomplate fails on missing map keys by default
# (--missing-key=error), so the JSON presumably always carries the
# remote_write_flashbots_* keys tested below — confirm against
# fetch-observability-config.sh.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Recording rules for aggregated metrics
rule_files:
  - /etc/prometheus/recording_rules.yml

# Scrape configurations
scrape_configs:
  # Node exporter on localhost
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
    metric_relabel_configs:
      # Applied at ingestion: only these node_* metric families are stored
      # locally; everything else from the node exporter is dropped.
      - source_labels: [__name__]
        regex: 'node_(cpu|memory|disk|filesystem|network|vmstat)_.*'
        action: keep

  # Process exporter for container monitoring
  - job_name: 'process'
    static_configs:
      - targets: ['localhost:9256']

{{- $config := (datasource "config") }}
{{- if $config.remote_write_flashbots_url }}

# Remote write configuration (dynamically configured)
# Templated values go through strings.Quote so that URLs or credentials
# containing YAML-significant characters (": ", " #", leading "*", "{", ...)
# are always rendered as plain double-quoted strings.
remote_write:
  # Flashbots endpoint
  - url: {{ $config.remote_write_flashbots_url | strings.Quote }}
    write_relabel_configs:
      # Only send flashbox: prefixed metrics
      - source_labels: [__name__]
        regex: 'flashbox:.*'
        action: keep
    {{- if $config.remote_write_flashbots_auth }}
    basic_auth:
      username: {{ $config.remote_write_flashbots_username | strings.Quote }}
      password: {{ $config.remote_write_flashbots_password | strings.Quote }}
    {{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
groups:
  # local:* series are intermediate values. They never leave the TEE because
  # remote_write only forwards flashbox:* series.
  - name: local_container_metrics
    interval: 30s
    rules:
      # CPU usage of the searcher-container process group, in percent of one core.
      - record: local:container_cpu_percent
        expr: sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m])) * 100

  # flashbox:* series are the ones picked up by remote_write.
  - name: flashbox_health
    interval: 30s
    rules:
      # NOTE(review): group_left(cgroup) copies a `cgroup` label from the
      # right-hand series — confirm the process exporter exposes that label.
      - record: flashbox:container_alive
        expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}

      # Spike guard: 1 only if the 15m average is below 80% AND the 10m
      # maximum ending 5m ago was below 70%.
      - record: flashbox:container_average_cpu_is_under_80_percent
        expr: >
          (avg_over_time(local:container_cpu_percent[15m]) < bool 80)
          * (max_over_time(local:container_cpu_percent[10m] offset 5m) < bool 70)

      # Raw OOM-kill counter as reported by the node exporter's vmstat collector.
      - record: flashbox:container_oom_kills_count
        expr: node_vmstat_oom_kill

      # 1 when more than 10% of the root filesystem is available.
      - record: flashbox:disk_free_space_is_over_10_percent
        expr: >
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) > bool 0.1

      # 1 when /persistent has more than 128 * 1024^3 bytes available.
      - record: flashbox:disk_free_space_is_over_128_gb
        expr: >
          (node_filesystem_avail_bytes{mountpoint="/persistent"}) > bool (128 * 1024 * 1024 * 1024)

      # 1 when any traffic was received or sent on a matching device in the last 5m.
      - record: flashbox:network_is_up
        expr: >
          (sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m]))
          + sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])))
          > bool 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Runs /usr/bin/fetch-observability-config.sh once, after the network is online.
[Unit]
Description=Fetch observability configuration
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
# Keep the unit active after the script exits, so it is not re-run each time
# another unit pulls it in.
RemainAfterExit=yes
ExecStart=/usr/bin/fetch-observability-config.sh
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=minimal.target
Comment thread
pablin-10 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Node exporter bound to loopback, with an explicit collector allow/deny list.
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
# Flags are grouped: listener, enabled collectors (with their options), then
# explicitly disabled collectors.
# NOTE(review): the loadavg collector is enabled, but the Prometheus scrape
# config keeps only node_(cpu|memory|disk|filesystem|network|vmstat)_* series,
# so its output looks to be dropped at ingestion — confirm whether it is needed.
ExecStart=/usr/bin/prometheus-node-exporter \
    --web.listen-address=127.0.0.1:9100 \
    --collector.cpu \
    --collector.meminfo \
    --collector.diskstats \
    --collector.filesystem \
    --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/) \
    --collector.netdev \
    --collector.loadavg \
    --collector.vmstat \
    --no-collector.arp \
    --no-collector.bcache \
    --no-collector.bonding \
    --no-collector.conntrack \
    --no-collector.cpufreq \
    --no-collector.edac \
    --no-collector.entropy \
    --no-collector.filefd \
    --no-collector.hwmon \
    --no-collector.infiniband \
    --no-collector.ipvs \
    --no-collector.mdadm \
    --no-collector.netclass \
    --no-collector.netstat \
    --no-collector.nfs \
    --no-collector.nfsd \
    --no-collector.pressure \
    --no-collector.rapl \
    --no-collector.schedstat \
    --no-collector.sockstat \
    --no-collector.softnet \
    --no-collector.stat \
    --no-collector.textfile \
    --no-collector.thermal_zone \
    --no-collector.time \
    --no-collector.timex \
    --no-collector.udp_queues \
    --no-collector.uname \
    --no-collector.xfs \
    --no-collector.zfs \
    --no-collector.systemd

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Process exporter bound to loopback; ordered after searcher-container.service
# (ordering only — no Wants=/Requires= on that unit).
[Unit]
Description=Prometheus Process Exporter
Documentation=https://github.com/ncabatoff/process-exporter
Wants=network-online.target
After=network-online.target searcher-container.service

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
ExecStart=/usr/bin/prometheus-process-exporter \
    --web.listen-address=127.0.0.1:9256 \
    --config.path=/etc/prometheus/process-exporter.yml \
    --children

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Prometheus server bound to loopback. Its config is rendered from a gomplate
# template on every start, using the JSON written to
# /etc/flashbox/observability-config.json.
[Unit]
Description=Prometheus Monitoring System
Documentation=https://prometheus.io/docs/introduction/overview/
Wants=network-online.target
Requires=fetch-observability-config.service
After=network-online.target fetch-observability-config.service

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
# The "+" prefix runs the render step with full privileges rather than as
# User=prometheus.
# NOTE(review): the rendered file can contain the remote-write password —
# confirm the mode/ownership gomplate gives /etc/prometheus/prometheus.yml.
ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json
ExecStart=/usr/bin/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus/ \
    --storage.tsdb.retention.time=24h \
    --web.console.templates=/usr/share/prometheus/consoles \
    --web.console.libraries=/usr/share/prometheus/console_libraries \
    --web.listen-address=127.0.0.1:9090
# SIGHUP reloads the already-rendered config; it does not re-run the template step.
ExecReload=/bin/kill -HUP $MAINPID

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# NOTE(review): presumably a drop-in for another unit (the target file name is
# not visible here). Pulls in fetch-observability-config.service and orders
# this unit after it.
[Unit]
Wants=fetch-observability-config.service
After=fetch-observability-config.service
Loading