Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## v0.15.0 (TBD)

- [BREAKING] Changed `GetBlockByNumber` to accept a `BlockRequest` (with optional `include_proof` flag) and returns a response containing the block and an optional block proof ([#1864](https://github.com/0xMiden/node/pull/1864)).
- Network monitor now auto-regenerates accounts after persistent increment failures instead of staying unhealthy indefinitely ([#1942](https://github.com/0xMiden/node/pull/1942)).
- Extended `ValidatorStatus` proto response with `chain_tip`, `validated_transactions_count`, and `signed_blocks_count`; added Validator card to the network monitor dashboard ([#1900](https://github.com/0xMiden/node/pull/1900)).
- Updated the RocksDB SMT backend to use budgeted deserialization for bytes read from disk, ported from `0xMiden/crypto` PR [#846](https://github.com/0xMiden/crypto/pull/846) ([#1923](https://github.com/0xMiden/node/pull/1923)).
- [BREAKING] Renamed `GetNoteError` endpoint to `GetNetworkNoteStatus` and extended it to return the full lifecycle status of a network note (`Pending`, `Processed`, `Discarded`, `Committed`) instead of only error information. Consumed notes are now retained in the database after block commit instead of being deleted ([#1892](https://github.com/0xMiden/node/pull/1892)).
Expand Down
137 changes: 127 additions & 10 deletions bin/network-monitor/src/counter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use std::time::{Duration, Instant};

use anyhow::{Context, Result};
use miden_node_proto::clients::RpcClient;
Expand Down Expand Up @@ -43,6 +43,12 @@ use tracing::{error, info, instrument, warn};
/// Number of consecutive increment failures before re-syncing the wallet account from the RPC.
const RESYNC_FAILURE_THRESHOLD: usize = 3;

/// Number of consecutive increment failures before regenerating accounts from scratch.
const REGENERATE_FAILURE_THRESHOLD: usize = 10;

/// Minimum time between account regeneration attempts.
const REGENERATE_COOLDOWN: Duration = Duration::from_secs(3600);

use crate::config::MonitorConfig;
use crate::deploy::counter::COUNTER_SLOT_NAME;
use crate::deploy::{MonitorDataStore, create_genesis_aware_rpc_client, get_counter_library};
Expand Down Expand Up @@ -391,16 +397,17 @@ pub async fn run_increment_task(
let (
mut details,
mut wallet_account,
counter_account,
block_header,
mut counter_account,
mut block_header,
mut data_store,
increment_script,
secret_key,
mut increment_script,
mut secret_key,
) = setup_increment_task(config.clone(), &mut rpc_client).await?;

let mut rng = ChaCha20Rng::from_os_rng();
let mut interval = tokio::time::interval(config.counter_increment_interval);
let mut consecutive_failures: usize = 0;
let mut last_regeneration: Option<Instant> = None;

loop {
interval.tick().await;
Expand Down Expand Up @@ -444,16 +451,41 @@ pub async fn run_increment_task(
consecutive_failures += 1;
last_error = Some(handle_increment_failure(&mut details, &e));

if consecutive_failures >= RESYNC_FAILURE_THRESHOLD {
if try_resync_wallet_account(
if consecutive_failures >= RESYNC_FAILURE_THRESHOLD
&& try_resync_wallet_account(
&mut rpc_client,
&mut wallet_account,
&mut data_store,
)
.await
.is_ok()
{
consecutive_failures = 0;
{
consecutive_failures = 0;
}

// If re-sync keeps failing, regenerate accounts from scratch (rate-limited).
let cooldown_elapsed =
last_regeneration.is_none_or(|t| t.elapsed() >= REGENERATE_COOLDOWN);
if consecutive_failures >= REGENERATE_FAILURE_THRESHOLD && cooldown_elapsed {
warn!(
consecutive_failures,
"re-sync ineffective, regenerating accounts from scratch"
);
last_regeneration = Some(Instant::now());
Comment thread
JereSalo marked this conversation as resolved.
match try_regenerate_accounts(&config, &mut rpc_client).await {
Ok(new_state) => {
(
details,
wallet_account,
counter_account,
block_header,
data_store,
increment_script,
secret_key,
) = new_state;
consecutive_failures = 0;
},
Err(regen_err) => error!("account regeneration failed: {regen_err:?}"),
}
}
},
Expand Down Expand Up @@ -531,6 +563,46 @@ async fn try_resync_wallet_account(
Ok(())
}

/// Regenerate accounts from scratch when re-sync is ineffective.
///
/// Creates fresh wallet and counter accounts, deploys them to the network, and re-initializes
/// the increment task state. This is a last resort after [`REGENERATE_FAILURE_THRESHOLD`]
/// consecutive failures.
#[instrument(
parent = None,
target = COMPONENT,
name = "network_monitor.counter.try_regenerate_accounts",
skip_all,
level = "warn",
err,
)]
async fn try_regenerate_accounts(
config: &MonitorConfig,
rpc_client: &mut RpcClient,
) -> Result<(
IncrementDetails,
Account,
Account,
BlockHeader,
MonitorDataStore,
NoteScript,
SecretKey,
)> {
crate::deploy::force_recreate_accounts(
&config.wallet_filepath,
&config.counter_filepath,
&config.rpc_url,
)
.await
.context("failed to regenerate accounts")?;

// Re-initialize the full task state from the newly-created account files.
let state = setup_increment_task(config.clone(), rpc_client).await?;

info!("account regeneration completed, increment task re-initialized");
Ok(state)
}

/// Handle the failure path when creating/submitting the network note fails.
fn handle_increment_failure(details: &mut IncrementDetails, error: &anyhow::Error) -> String {
error!("Failed to create and submit network note: {:?}", error);
Expand Down Expand Up @@ -593,7 +665,7 @@ pub async fn run_counter_tracking_task(
create_genesis_aware_rpc_client(&config.rpc_url, config.request_timeout).await?;

// Load counter account to get the account ID
let counter_account = match load_counter_account(&config.counter_filepath) {
let mut counter_account = match load_counter_account(&config.counter_filepath) {
Ok(account) => account,
Err(e) => {
error!("Failed to load counter account: {:?}", e);
Expand All @@ -615,6 +687,17 @@ pub async fn run_counter_tracking_task(
loop {
poll_interval.tick().await;

// The increment task may regenerate accounts when doesn't fixes the card, reload from
// the account file so tracking follows the new account.
reload_counter_account_if_changed(
&config,
&mut counter_account,
&mut rpc_client,
&expected_counter_value,
&mut details,
)
.await;

let last_error = poll_counter_once(
&mut rpc_client,
&counter_account,
Expand All @@ -629,6 +712,40 @@ pub async fn run_counter_tracking_task(
}
}

/// Reload the counter account from disk and re-initialize tracking state if its ID changed.
///
/// The increment task regenerates accounts on persistent failure and rewrites the account
/// file. Without this the tracking task would keep polling the stale account ID forever.
async fn reload_counter_account_if_changed(
config: &MonitorConfig,
counter_account: &mut Account,
rpc_client: &mut RpcClient,
expected_counter_value: &Arc<AtomicU64>,
details: &mut CounterTrackingDetails,
) {
let reloaded = match load_counter_account(&config.counter_filepath) {
Ok(account) => account,
Err(e) => {
warn!(err = ?e, "failed to reload counter account file");
return;
},
};

if reloaded.id() == counter_account.id() {
return;
}

info!(
old.id = %counter_account.id(),
new.id = %reloaded.id(),
"counter account file changed, resetting tracking state",
);
*counter_account = reloaded;
*details = CounterTrackingDetails::default();
initialize_counter_tracking_state(rpc_client, counter_account, expected_counter_value, details)
.await;
}

/// Initialize tracking state by fetching the current counter value from the node.
///
/// Populates `expected_counter_value` and seeds `details` with the latest observed
Expand Down
22 changes: 22 additions & 0 deletions bin/network-monitor/src/deploy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,28 @@ pub async fn ensure_accounts_exist(
save_counter_account(&counter_account, counter_filepath)
}

/// Unconditionally creates fresh wallet and counter accounts, deploys the counter, and saves both
/// to disk. Unlike [`ensure_accounts_exist`], this always replaces existing account files.
///
/// Used by the increment task when accounts are fundamentally outdated (e.g., after a network
/// reset) and re-syncing from the RPC is not sufficient.
pub async fn force_recreate_accounts(
wallet_filepath: &Path,
counter_filepath: &Path,
rpc_url: &Url,
) -> Result<()> {
tracing::warn!("Regenerating monitor accounts (force recreate)");

let (wallet_account, secret_key) = create_wallet_account()?;
let counter_account = create_counter_account(wallet_account.id())?;

deploy_counter_account(&counter_account, rpc_url).await?;
tracing::info!("Successfully recreated and deployed accounts");

save_wallet_account(&wallet_account, &secret_key, wallet_filepath)?;
save_counter_account(&counter_account, counter_filepath)
}

/// Deploy counter account to the network.
///
/// This function creates a counter program account,
Expand Down
Loading