From dd4e82ed7a597126ac503bf6615445c1ab1eab17 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Fri, 29 May 2026 17:22:02 +0700 Subject: [PATCH 01/21] feat(consensus): implement consensus/qbft --- Cargo.lock | 29 +- Cargo.toml | 2 + crates/cli/Cargo.toml | 1 + crates/cli/src/commands/create_cluster.rs | 2 +- crates/cli/src/commands/create_dkg.rs | 2 +- crates/cli/src/commands/version.rs | 4 +- crates/consensus/Cargo.toml | 35 + .../consensus => consensus/src}/instance.rs | 2 + .../consensus/mod.rs => consensus/src/lib.rs} | 0 .../consensus => consensus/src}/protocols.rs | 0 crates/consensus/src/qbft/admission.rs | 460 +++++ crates/consensus/src/qbft/component.rs | 680 ++++++ crates/consensus/src/qbft/definition.rs | 743 +++++++ crates/consensus/src/qbft/interop_test.rs | 114 + crates/consensus/src/qbft/mod.rs | 26 + .../consensus => consensus/src}/qbft/msg.rs | 18 +- crates/consensus/src/qbft/p2p.rs | 1830 +++++++++++++++++ crates/consensus/src/qbft/qbft_run_test.rs | 174 ++ crates/consensus/src/qbft/runner.rs | 726 +++++++ .../src}/qbft/sniffer.rs | 11 +- .../src}/qbft/transport.rs | 17 +- .../src/consensus => consensus/src}/timer.rs | 16 +- crates/core/Cargo.toml | 3 - crates/core/src/consensus/qbft/mod.rs | 7 - crates/core/src/lib.rs | 3 - 25 files changed, 4866 insertions(+), 39 deletions(-) create mode 100644 crates/consensus/Cargo.toml rename crates/{core/src/consensus => consensus/src}/instance.rs (99%) rename crates/{core/src/consensus/mod.rs => consensus/src/lib.rs} (100%) rename crates/{core/src/consensus => consensus/src}/protocols.rs (100%) create mode 100644 crates/consensus/src/qbft/admission.rs create mode 100644 crates/consensus/src/qbft/component.rs create mode 100644 crates/consensus/src/qbft/definition.rs create mode 100644 crates/consensus/src/qbft/interop_test.rs create mode 100644 crates/consensus/src/qbft/mod.rs rename crates/{core/src/consensus => consensus/src}/qbft/msg.rs (96%) create mode 100644 crates/consensus/src/qbft/p2p.rs create mode 
100644 crates/consensus/src/qbft/qbft_run_test.rs create mode 100644 crates/consensus/src/qbft/runner.rs rename crates/{core/src/consensus => consensus/src}/qbft/sniffer.rs (89%) rename crates/{core/src/consensus => consensus/src}/qbft/transport.rs (98%) rename crates/{core/src/consensus => consensus/src}/timer.rs (97%) delete mode 100644 crates/core/src/consensus/qbft/mod.rs diff --git a/Cargo.lock b/Cargo.lock index ecab19d1..406c1f73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5521,6 +5521,7 @@ dependencies = [ "libp2p", "pluto-app", "pluto-cluster", + "pluto-consensus", "pluto-core", "pluto-crypto", "pluto-dkg", @@ -5583,6 +5584,32 @@ dependencies = [ "wiremock", ] +[[package]] +name = "pluto-consensus" +version = "1.7.1" +dependencies = [ + "cancellation", + "chrono", + "crossbeam", + "either", + "futures", + "hex", + "k256", + "libp2p", + "pluto-core", + "pluto-featureset", + "pluto-k1util", + "pluto-p2p", + "pluto-ssz", + "prost 0.14.3", + "prost-types 0.14.3", + "test-case", + "thiserror 2.0.18", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "pluto-core" version = "1.7.1" @@ -5603,7 +5630,6 @@ dependencies = [ "futures", "hex", "k256", - "libp2p", "pluto-build-proto", "pluto-cluster", "pluto-crypto", @@ -5611,7 +5637,6 @@ dependencies = [ "pluto-eth2util", "pluto-featureset", "pluto-k1util", - "pluto-p2p", "pluto-ssz", "pluto-testutil", "pluto-tracing", diff --git a/Cargo.toml b/Cargo.toml index ac569187..1b4258d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/build-proto", "crates/cli", "crates/cluster", + "crates/consensus", "crates/core", "crates/crypto", "crates/dkg", @@ -115,6 +116,7 @@ pluto-parsigex = { path = "crates/parsigex" } pluto-build-proto = { path = "crates/build-proto" } pluto-cli = { path = "crates/cli" } pluto-cluster = { path = "crates/cluster" } +pluto-consensus = { path = "crates/consensus" } pluto-core = { path = "crates/core" } pluto-crypto = { path = "crates/crypto" } pluto-dkg = { 
path = "crates/dkg" } diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 959f6163..d87cf740 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -20,6 +20,7 @@ tokio.workspace = true pluto-app.workspace = true pluto-eth1wrap.workspace = true pluto-cluster.workspace = true +pluto-consensus.workspace = true pluto-dkg.workspace = true pluto-crypto.workspace = true pluto-relay-server.workspace = true diff --git a/crates/cli/src/commands/create_cluster.rs b/crates/cli/src/commands/create_cluster.rs index 3babc14c..e0b1f118 100644 --- a/crates/cli/src/commands/create_cluster.rs +++ b/crates/cli/src/commands/create_cluster.rs @@ -23,7 +23,7 @@ use pluto_cluster::{ operator::Operator, registration::{BuilderRegistration, Registration}, }; -use pluto_core::consensus::protocols; +use pluto_consensus::protocols; use pluto_crypto::{ blst_impl::BlstImpl, tbls::Tbls, diff --git a/crates/cli/src/commands/create_dkg.rs b/crates/cli/src/commands/create_dkg.rs index 8b9359e7..cec2dec4 100644 --- a/crates/cli/src/commands/create_dkg.rs +++ b/crates/cli/src/commands/create_dkg.rs @@ -11,7 +11,7 @@ use pluto_cluster::{ definition::{Creator, Definition}, operator::Operator, }; -use pluto_core::consensus::protocols::is_supported_protocol_name; +use pluto_consensus::protocols::is_supported_protocol_name; use pluto_eth2util::{ deposit::{eths_to_gweis, verify_deposit_amounts}, enr::Record, diff --git a/crates/cli/src/commands/version.rs b/crates/cli/src/commands/version.rs index 2ff1bb74..57382838 100644 --- a/crates/cli/src/commands/version.rs +++ b/crates/cli/src/commands/version.rs @@ -40,7 +40,7 @@ fn run_with_writer(args: VersionArgs, writer: &mut W) -> Result<()> { } writeln!(writer, "Consensus protocols:")?; - for protocol in pluto_core::consensus::protocols::protocols() { + for protocol in pluto_consensus::protocols::protocols() { writeln!(writer, "\t{}", protocol)?; } @@ -132,7 +132,7 @@ mod tests { ); // Check that the first protocol is listed - let 
protocols = pluto_core::consensus::protocols::protocols(); + let protocols = pluto_consensus::protocols::protocols(); assert!(!protocols.is_empty(), "Should have at least one protocol"); let first_protocol = protocols[0].to_string(); assert!( diff --git a/crates/consensus/Cargo.toml b/crates/consensus/Cargo.toml new file mode 100644 index 00000000..a36203b0 --- /dev/null +++ b/crates/consensus/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "pluto-consensus" +version.workspace = true +edition.workspace = true +repository.workspace = true +license.workspace = true +publish.workspace = true + +[dependencies] +cancellation.workspace = true +chrono.workspace = true +crossbeam.workspace = true +futures.workspace = true +hex.workspace = true +either.workspace = true +k256.workspace = true +libp2p.workspace = true +pluto-core.workspace = true +pluto-featureset.workspace = true +pluto-k1util.workspace = true +pluto-p2p.workspace = true +pluto-ssz.workspace = true +prost.workspace = true +prost-types.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-util.workspace = true +tracing.workspace = true + +[dev-dependencies] +test-case.workspace = true +tokio = { workspace = true, features = ["test-util"] } + +[lints] +workspace = true diff --git a/crates/core/src/consensus/instance.rs b/crates/consensus/src/instance.rs similarity index 99% rename from crates/core/src/consensus/instance.rs rename to crates/consensus/src/instance.rs index 6c26546f..138efdce 100644 --- a/crates/core/src/consensus/instance.rs +++ b/crates/consensus/src/instance.rs @@ -203,11 +203,13 @@ impl InstanceIo { } impl Default for InstanceIo { + /// Creates a fresh instance I/O state. fn default() -> Self { Self::new() } } +/// Takes exclusive ownership of a single-consumer receiver slot. 
fn take_receiver( receiver: &Mutex>>, channel: &'static str, diff --git a/crates/core/src/consensus/mod.rs b/crates/consensus/src/lib.rs similarity index 100% rename from crates/core/src/consensus/mod.rs rename to crates/consensus/src/lib.rs diff --git a/crates/core/src/consensus/protocols.rs b/crates/consensus/src/protocols.rs similarity index 100% rename from crates/core/src/consensus/protocols.rs rename to crates/consensus/src/protocols.rs diff --git a/crates/consensus/src/qbft/admission.rs b/crates/consensus/src/qbft/admission.rs new file mode 100644 index 00000000..98957448 --- /dev/null +++ b/crates/consensus/src/qbft/admission.rs @@ -0,0 +1,460 @@ +//! QBFT inbound message admission. + +use prost::{Message, Name}; +use prost_types::Any; + +use pluto_core::corepb::v1::{core as pbcore, priority as pbpriority}; + +use super::{ + component::DecodedValue, + msg::{self, ValueMap}, +}; + +/// Admission result. +pub type Result = std::result::Result; + +/// Admission errors. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Outer consensus message was absent or wrong. + #[error("invalid consensus message")] + InvalidConsensusMessage, + + /// Inner message type was invalid. + #[error("invalid consensus message type")] + InvalidConsensusMessageType, + + /// Inner duty type was invalid. + #[error("invalid consensus message duty type")] + InvalidConsensusMessageDutyType, + + /// Inner round was invalid. + #[error("invalid consensus message round")] + InvalidConsensusMessageRound, + + /// Inner prepared round was invalid. + #[error("invalid consensus message prepared round")] + InvalidConsensusMessagePreparedRound, + + /// Message peer index was not in the peer map. + #[error("invalid peer index")] + InvalidPeerIndex, + + /// Signature verification failed before comparison. + #[error("verify consensus message signature: {0}")] + VerifyConsensusMessageSignature(#[source] msg::Error), + + /// Signature recovered to a different peer key. 
+ #[error("invalid consensus message signature")] + InvalidConsensusMessageSignature, + + /// Duty gate rejected the message. + #[error("invalid duty")] + InvalidDuty, + + /// Justification failed validation. + #[error("invalid justification: {0}")] + InvalidJustification(#[source] Box), + + /// Justification duty differed from the outer message duty. + #[error("qbft justification duty differs from message duty")] + JustificationDutyDiffers, + + /// Inbound Any could not be decoded. + #[error("unmarshal any")] + UnmarshalAny, + + /// Message wrapper rejected the value map. + #[error("{0}")] + Msg(#[from] msg::Error), + + /// Duty deadline rejected the message. + #[error("duty expired")] + DutyExpired, + + /// Receive buffer could not accept the message. + #[error("timeout enqueuing receive buffer")] + TimeoutEnqueuingReceiveBuffer, + + /// Context was cancelled after expensive verification. + #[error("receive cancelled during verification")] + ReceiveCancelledDuringVerification, +} + +/// Canonicalizes inbound `Any` values into the hash map used by QBFT messages. +pub(crate) fn values_by_hash(values: &[Any]) -> Result { + let mut out = ValueMap::new(); + + for value in values { + let decoded = decode_supported_any(value)?; + let hash = match decoded { + DecodedValue::UnsignedDataSet(inner) => msg::hash_proto(&inner)?, + DecodedValue::PriorityResult(inner) => msg::hash_proto(&inner)?, + }; + out.insert(hash, value.clone()); + } + + Ok(out) +} + +/// Decodes the protobuf `Any` payload types accepted by this consensus layer. 
+pub(crate) fn decode_supported_any(value: &Any) -> Result { + if value.type_url == pbcore::UnsignedDataSet::type_url() { + let decoded = pbcore::UnsignedDataSet::decode(value.value.as_slice()) + .map_err(|_| Error::UnmarshalAny)?; + return Ok(DecodedValue::UnsignedDataSet(decoded)); + } + + if value.type_url == pbpriority::PriorityResult::type_url() { + let decoded = pbpriority::PriorityResult::decode(value.value.as_slice()) + .map_err(|_| Error::UnmarshalAny)?; + return Ok(DecodedValue::PriorityResult(decoded)); + } + + Err(Error::UnmarshalAny) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use prost::bytes::Bytes; + use prost_types::Any; + use test_case::test_case; + use tokio_util::sync::CancellationToken; + + use super::*; + use crate::qbft::{ + Consensus, + component::tests::{config_base, consensus, duty, peers, secret_key}, + }; + use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore}, + qbft::{self, SomeMsg}, + types::DutyType, + }; + + #[tokio::test] + async fn handle_rejects_invalid_outer_message() { + let err = consensus(0, true) + .handle(&CancellationToken::new(), None) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[tokio::test] + async fn handle_rejects_missing_inner_message() { + let err = consensus(0, true) + .handle( + &CancellationToken::new(), + Some(pbconsensus::QbftConsensusMsg::default()), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.r#type = 99, "invalid consensus message type" ; "invalid_message_type")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.duty.as_mut().unwrap().r#type = 99, "invalid consensus message duty type" ; "invalid_duty_type")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.round = 0, "invalid consensus message round" ; "invalid_round")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.prepared_round = -1, "invalid consensus 
message prepared round" ; "invalid_prepared_round")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.peer_idx = 9, "invalid peer index" ; "invalid_peer_idx")] + #[tokio::test] + async fn verify_msg_rejects_invalid_fields(mutate: fn(&mut pbconsensus::QbftMsg), want: &str) { + let consensus = consensus(0, true); + let mut msg = signed_msg(0); + mutate(&mut msg); + if want != "invalid consensus message signature" { + msg.signature.clear(); + msg = sign_for_peer(msg, 0); + mutate(&mut msg); + } + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), want); + } + + #[tokio::test] + async fn verify_msg_rejects_missing_duty() { + let consensus = consensus(0, true); + let mut msg = signed_msg(0); + msg.duty = None; + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[tokio::test] + async fn verify_msg_rejects_empty_signature() { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.signature.clear(); + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!( + err.to_string(), + "verify consensus message signature: empty signature" + ); + } + + #[tokio::test] + async fn verify_msg_rejects_malformed_signature() { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.signature = vec![0x42; 64].into(); + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert!( + err.to_string() + .starts_with("verify consensus message signature: recover pubkey") + ); + } + + #[tokio::test] + async fn verify_msg_rejects_wrong_signature() { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.signature = msg::sign_msg(&msg, &secret_key(1)).unwrap().signature; + msg.peer_idx = 1; + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message signature"); + } + + #[tokio::test] + async fn verify_msg_accepts_valid_signature() { + let consensus = 
consensus(0, true); + + consensus.verify_msg(&signed_msg(0)).unwrap(); + } + + #[tokio::test] + async fn handle_rejects_duty_gate_false() { + let err = consensus(0, false) + .handle( + &CancellationToken::new(), + Some(consensus_msg(signed_msg(0))), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid duty"); + } + + #[tokio::test] + async fn handle_rejects_invalid_justification() { + let mut invalid = signed_msg(0); + invalid.round = 0; + let outer = pbconsensus::QbftConsensusMsg { + msg: Some(signed_msg(0)), + justification: vec![invalid], + values: vec![], + }; + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(outer)) + .await + .unwrap_err(); + + assert!(err.to_string().starts_with("invalid justification")); + } + + #[tokio::test] + async fn handle_rejects_justification_duty_mismatch() { + let mut justification = unsigned_msg(0); + justification.duty = Some(pbcore::Duty { + slot: 43, + r#type: i32::try_from(&DutyType::Attester).unwrap(), + }); + let justification = sign_for_peer(justification, 0); + let outer = pbconsensus::QbftConsensusMsg { + msg: Some(signed_msg(0)), + justification: vec![justification], + values: vec![], + }; + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(outer)) + .await + .unwrap_err(); + + assert_eq!( + err.to_string(), + "qbft justification duty differs from message duty" + ); + } + + #[test] + fn values_by_hash_rejects_invalid_type_url() { + let err = values_by_hash(&[Any { + type_url: "type.googleapis.com/unknown.Type".to_string(), + value: vec![], + }]) + .unwrap_err(); + + assert_eq!(err.to_string(), "unmarshal any"); + } + + #[test] + fn values_by_hash_rejects_malformed_any_value() { + let err = values_by_hash(&[Any { + type_url: pbcore::UnsignedDataSet::type_url(), + value: b"not-protobuf".to_vec(), + }]) + .unwrap_err(); + + assert_eq!(err.to_string(), "unmarshal any"); + } + + #[test] + fn values_by_hash_hashes_decoded_inner_message() { + let any = 
unsigned_any("a", b"first"); + let values = values_by_hash(std::slice::from_ref(&any)).unwrap(); + let decoded = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let hash = msg::hash_proto(&decoded).unwrap(); + + assert_eq!(values.get(&hash), Some(&any)); + } + + #[tokio::test] + async fn handle_rejects_missing_value_hash() { + let mut msg = unsigned_msg(0); + msg.value_hash = [9u8; 32].to_vec().into(); + let msg = sign_for_peer(msg, 0); + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "value hash not found in values"); + } + + #[tokio::test] + async fn handle_enqueues_valid_message() { + let consensus = consensus(0, true); + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(0); + msg.value_hash = value_hash.to_vec().into(); + let msg = sign_for_peer(msg, 0); + let duty = duty(); + let inst = consensus.get_instance_io(duty.clone()); + + consensus + .handle( + &CancellationToken::new(), + Some(pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![any], + }), + ) + .await + .unwrap(); + + let mut recv_rx = inst.take_recv_rx().unwrap(); + let received = recv_rx.try_recv().unwrap(); + assert_eq!(received.value(), value_hash); + } + + #[tokio::test] + async fn handle_rejects_deadliner_false_as_duty_expired() { + let consensus = Consensus::new(super::super::component::Config { + peers: peers(), + local_peer_idx: 0, + ..config_base(true) + }) + .unwrap(); + + let err = consensus + .handle( + &CancellationToken::new(), + Some(consensus_msg(signed_msg(0))), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "duty expired"); + } + + #[tokio::test] + async fn handle_rejects_cancellation_after_verification() { + let ct = CancellationToken::new(); + ct.cancel(); + + 
let err = consensus(0, true) + .handle(&ct, Some(consensus_msg(signed_msg(0)))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "receive cancelled during verification"); + } + + #[tokio::test] + async fn handle_rejects_full_receive_buffer() { + let consensus = consensus(0, true); + let inst = consensus.get_instance_io(duty()); + for _ in 0..crate::instance::RECV_BUFFER_SIZE { + inst.recv_tx.try_send(wrapped_msg()).unwrap(); + } + + let err = consensus + .handle( + &CancellationToken::new(), + Some(consensus_msg(signed_msg(0))), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "timeout enqueuing receive buffer"); + } + + fn consensus_msg(msg: pbconsensus::QbftMsg) -> pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![], + } + } + + fn unsigned_msg(peer_idx: i64) -> pbconsensus::QbftMsg { + pbconsensus::QbftMsg { + r#type: i64::from(qbft::MSG_PRE_PREPARE), + duty: Some(pbcore::Duty::try_from(&duty()).unwrap()), + peer_idx, + round: 1, + prepared_round: 0, + ..Default::default() + } + } + + fn signed_msg(peer_idx: i64) -> pbconsensus::QbftMsg { + sign_for_peer(unsigned_msg(peer_idx), peer_idx) + } + + fn sign_for_peer(msg: pbconsensus::QbftMsg, peer_idx: i64) -> pbconsensus::QbftMsg { + let seed = u8::try_from(peer_idx.checked_add(1).unwrap()).unwrap(); + msg::sign_msg(&msg, &secret_key(seed)).unwrap() + } + + fn unsigned_any(key: &str, value: &'static [u8]) -> Any { + Any::from_msg(&pbcore::UnsignedDataSet { + set: [(key.to_string(), Bytes::from_static(value))].into(), + }) + .unwrap() + } + + fn wrapped_msg() -> msg::Msg { + msg::Msg::new(unsigned_msg(0), vec![], Arc::default()).unwrap() + } +} diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs new file mode 100644 index 00000000..4d6928b0 --- /dev/null +++ b/crates/consensus/src/qbft/component.rs @@ -0,0 +1,680 @@ +//! QBFT consensus component state. 
+ +use std::{ + collections::HashMap, + error::Error as StdError, + sync::{Arc, Mutex, PoisonError}, +}; + +use futures::future::BoxFuture; +use k256::{PublicKey, SecretKey}; +use tokio::{ + sync::{mpsc, mpsc::error::TrySendError}, + task::JoinHandle, +}; +use tokio_util::sync::CancellationToken; + +use crate::{ + instance::InstanceIo, + protocols::QBFT_V2_PROTOCOL_ID, + timer::{RoundTimer, RoundTimerFunc}, +}; +use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore, priority as pbpriority}, + deadline::{AddOutcome, DeadlinerHandle}, + qbft, + types::{Duty, DutyType}, +}; + +use super::{admission, msg, runner}; + +/// Result returned by outbound QBFT broadcasting. +pub type BroadcastResult = std::result::Result<(), Box>; + +/// External consensus-message broadcaster seam. +pub type Broadcaster = Arc< + dyn Fn(CancellationToken, pbconsensus::QbftConsensusMsg) -> BoxFuture<'static, BroadcastResult> + + Send + + Sync + + 'static, +>; + +/// Duty admission gate. +pub type DutyGater = Arc bool + Send + Sync + 'static>; + +/// Sink for completed sniffer instances. +pub type SnifferSink = Arc; + +/// Subscriber callback result. +pub type SubscriberResult = std::result::Result<(), Box>; + +type UnsignedSubscriber = + Box SubscriberResult + Send + Sync + 'static>; +type PrioritySubscriber = + Box SubscriberResult + Send + Sync + 'static>; + +/// Peer metadata needed by consensus QBFT. +#[derive(Clone, Debug)] +pub struct Peer { + /// External peer index, used only for labels. + pub index: i64, + /// Human-readable peer name. + pub name: String, + /// Peer secp256k1 public key. + pub public_key: PublicKey, +} + +/// QBFT consensus constructor config. +pub struct Config { + /// Consensus peers in process-index order. + pub peers: Vec, + /// Local zero-based process index. + pub local_peer_idx: i64, + /// Local secp256k1 private key. + pub privkey: SecretKey, + /// Duty deadline scheduler. + pub deadliner: DeadlinerHandle, + /// Duty admission gate. 
+ pub duty_gater: DutyGater, + /// External message broadcaster. + pub broadcaster: Broadcaster, + /// Completed sniffer sink. + pub sniffer: SnifferSink, + /// Enables attestation value comparison. + pub compare_attestations: bool, + /// Round timer factory. + pub timer_func: RoundTimerFunc, +} + +/// Decoded consensus value supported by this component. +#[derive(Clone, Debug, PartialEq)] +pub(crate) enum DecodedValue { + /// Unsigned duty data set. + UnsignedDataSet(pbcore::UnsignedDataSet), + /// Priority protocol result. + PriorityResult(pbpriority::PriorityResult), +} + +pub(crate) enum Subscriber { + Unsigned(UnsignedSubscriber), + Priority(PrioritySubscriber), +} + +/// Shared subscriber registry. +#[derive(Clone, Default)] +pub(crate) struct SubscriberSet(Arc>>); + +impl SubscriberSet { + /// Adds a subscriber callback to the shared registry. + fn push(&self, subscriber: Subscriber) { + self.0 + .lock() + .unwrap_or_else(PoisonError::into_inner) + .push(subscriber); + } + + /// Dispatches a decoded value to subscribers that accept its payload type. + pub(crate) fn dispatch_decoded(&self, duty: &Duty, value: &DecodedValue) { + let subscribers = self.0.lock().unwrap_or_else(PoisonError::into_inner); + + for subscriber in subscribers.iter() { + let result = match (subscriber, value) { + (Subscriber::Unsigned(fn_), DecodedValue::UnsignedDataSet(value)) => { + fn_(duty.clone(), value.clone()) + } + (Subscriber::Priority(fn_), DecodedValue::PriorityResult(value)) => { + fn_(duty.clone(), value.clone()) + } + _ => Ok(()), + }; + + if let Err(err) = result { + tracing::warn!(error = %err, duty = %duty, "QBFT subscriber error"); + } + } + } +} + +/// QBFT consensus component. 
+pub struct Consensus { + peers: Vec, + #[cfg(test)] + peer_labels: Vec, + pubkeys: HashMap, + local_peer_idx: i64, + privkey: SecretKey, + deadliner: DeadlinerHandle, + duty_gater: DutyGater, + broadcaster: Broadcaster, + sniffer: SnifferSink, + timer_func: RoundTimerFunc, + compare_attestations: bool, + subscribers: SubscriberSet, + instances: Mutex>>>, +} + +/// Component result. +pub type Result = std::result::Result; + +/// Component construction errors. +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum Error { + /// Peer order did not fit the wire index type. + #[error("peer index overflow: {index}")] + PeerIndexOverflow { + /// Peer order index. + index: usize, + }, + + /// Local peer index is not present in the peer list. + #[error("invalid local peer index: {peer_idx}")] + InvalidLocalPeerIndex { + /// Local peer index. + peer_idx: i64, + }, +} + +impl Consensus { + /// Creates a new QBFT consensus component. + pub fn new(config: Config) -> Result { + let mut pubkeys = HashMap::with_capacity(config.peers.len()); + #[cfg(test)] + let mut peer_labels = Vec::with_capacity(config.peers.len()); + + for (index, peer) in config.peers.iter().enumerate() { + let peer_idx = i64::try_from(index).map_err(|_| Error::PeerIndexOverflow { index })?; + pubkeys.insert(peer_idx, peer.public_key); + #[cfg(test)] + peer_labels.push(format!("{}:{}", peer.index, peer.name)); + } + + if !pubkeys.contains_key(&config.local_peer_idx) { + return Err(Error::InvalidLocalPeerIndex { + peer_idx: config.local_peer_idx, + }); + } + + Ok(Self { + peers: config.peers, + #[cfg(test)] + peer_labels, + pubkeys, + local_peer_idx: config.local_peer_idx, + privkey: config.privkey, + deadliner: config.deadliner, + duty_gater: config.duty_gater, + broadcaster: config.broadcaster, + sniffer: config.sniffer, + timer_func: config.timer_func, + compare_attestations: config.compare_attestations, + subscribers: SubscriberSet::default(), + instances: Mutex::default(), + }) + } + + /// Returns 
the QBFT v2 protocol ID. + pub fn protocol_id(&self) -> &'static str { + QBFT_V2_PROTOCOL_ID + } + + /// Registers a callback for decided unsigned duty data. + pub fn subscribe(&self, fn_: F) + where + F: Fn(Duty, pbcore::UnsignedDataSet) -> SubscriberResult + Send + Sync + 'static, + { + self.subscribers.push(Subscriber::Unsigned(Box::new(fn_))); + } + + /// Registers a callback for decided priority protocol results. + pub fn subscribe_priority(&self, fn_: F) + where + F: Fn(Duty, pbpriority::PriorityResult) -> SubscriberResult + Send + Sync + 'static, + { + self.subscribers.push(Subscriber::Priority(Box::new(fn_))); + } + + /// Validates, wraps, and queues an inbound QBFT consensus message. + pub async fn handle( + &self, + ct: &CancellationToken, + req: Option, + ) -> admission::Result<()> { + let pb_msg = req.ok_or(admission::Error::InvalidConsensusMessage)?; + let msg = pb_msg + .msg + .as_ref() + .ok_or(admission::Error::InvalidConsensusMessage)?; + + self.verify_msg(msg)?; + let duty = duty_from_msg(msg)?; + + if !self.duty_allowed(&duty) { + return Err(admission::Error::InvalidDuty); + } + + for justification in &pb_msg.justification { + self.verify_msg(justification) + .map_err(|err| admission::Error::InvalidJustification(Box::new(err)))?; + + let just_duty = duty_from_msg(justification) + .map_err(|err| admission::Error::InvalidJustification(Box::new(err)))?; + if just_duty != duty { + return Err(admission::Error::JustificationDutyDiffers); + } + } + + let values = admission::values_by_hash(&pb_msg.values)?; + let wrapped = msg::Msg::new(msg.clone(), pb_msg.justification.clone(), Arc::new(values))?; + + if ct.is_cancelled() { + return Err(admission::Error::ReceiveCancelledDuringVerification); + } + + if self.add_deadline(duty.clone()).await != AddOutcome::Scheduled { + return Err(admission::Error::DutyExpired); + } + + self.get_recv_buffer(duty) + .try_send(wrapped) + .map_err(|err| match err { + TrySendError::Full(_) | TrySendError::Closed(_) => { + 
admission::Error::TimeoutEnqueuingReceiveBuffer + } + }) + } + + /// Verifies fields and signature for one raw QBFT message. + pub(crate) fn verify_msg(&self, msg: &pbconsensus::QbftMsg) -> admission::Result<()> { + if msg.duty.is_none() { + return Err(admission::Error::InvalidConsensusMessage); + } + + if !qbft::MessageType::from_wire(msg.r#type).valid() { + return Err(admission::Error::InvalidConsensusMessageType); + } + + let duty = msg + .duty + .as_ref() + .ok_or(admission::Error::InvalidConsensusMessage)?; + let duty_type = DutyType::try_from(duty.r#type) + .map_err(|_| admission::Error::InvalidConsensusMessageDutyType)?; + if !duty_type.is_valid() { + return Err(admission::Error::InvalidConsensusMessageDutyType); + } + + if msg.round <= 0 { + return Err(admission::Error::InvalidConsensusMessageRound); + } + + if msg.prepared_round < 0 { + return Err(admission::Error::InvalidConsensusMessagePreparedRound); + } + + let pubkey = self + .pubkey(msg.peer_idx) + .ok_or(admission::Error::InvalidPeerIndex)?; + let signature_ok = msg::verify_msg_sig(msg, pubkey) + .map_err(admission::Error::VerifyConsensusMessageSignature)?; + if !signature_ok { + return Err(admission::Error::InvalidConsensusMessageSignature); + } + + Ok(()) + } + + /// Runs the internal expired-duty cleanup loop until cancellation. + pub fn start( + self: Arc, + ct: CancellationToken, + mut expired_rx: mpsc::Receiver, + ) -> JoinHandle<()> { + tokio::spawn(async move { + loop { + tokio::select! { + () = ct.cancelled() => return, + duty = expired_rx.recv() => match duty { + Some(duty) => self.delete_instance_io(&duty), + None => return, + }, + } + } + }) + } + + /// Returns existing instance I/O for `duty`, or creates an empty one. 
+ pub(crate) fn get_instance_io(&self, duty: Duty) -> Arc> { + let mut instances = self + .instances + .lock() + .unwrap_or_else(PoisonError::into_inner); + instances + .entry(duty) + .or_insert_with(|| Arc::new(InstanceIo::new())) + .clone() + } + + /// Returns the inbound message buffer for a duty instance. + pub(crate) fn get_recv_buffer(&self, duty: Duty) -> mpsc::Sender { + self.get_instance_io(duty).recv_tx.clone() + } + + /// Drops cached I/O for a completed or expired duty instance. + pub(crate) fn delete_instance_io(&self, duty: &Duty) { + self.instances + .lock() + .unwrap_or_else(PoisonError::into_inner) + .remove(duty); + } + + /// Returns the local zero-based peer index used by QBFT messages. + pub(crate) fn get_peer_idx(&self) -> i64 { + self.local_peer_idx + } + + /// Returns the public key registered for a QBFT peer index. + pub(crate) fn pubkey(&self, peer_idx: i64) -> Option<&PublicKey> { + self.pubkeys.get(&peer_idx) + } + + /// Returns whether local policy admits consensus for the duty. + pub(crate) fn duty_allowed(&self, duty: &Duty) -> bool { + (self.duty_gater)(duty) + } + + /// Registers the duty with the deadline scheduler. + pub(crate) async fn add_deadline(&self, duty: Duty) -> AddOutcome { + self.deadliner.add(duty).await + } + + /// Returns a clone of the subscriber registry handle. + pub(crate) fn subscribers(&self) -> SubscriberSet { + self.subscribers.clone() + } + + /// Returns the configured QBFT node count. + pub(crate) fn node_count(&self) -> usize { + self.peers.len() + } + + /// Returns the local signing key for outbound QBFT messages. + pub(crate) fn privkey(&self) -> SecretKey { + self.privkey.clone() + } + + /// Returns the outbound broadcaster callback. + pub(crate) fn broadcaster(&self) -> Broadcaster { + Arc::clone(&self.broadcaster) + } + + /// Returns the completed-instance sniffer sink. 
+ pub(crate) fn sniffer(&self) -> SnifferSink { + Arc::clone(&self.sniffer) + } + + /// Returns whether attester values should be compared before commit. + pub(crate) fn compare_attestations(&self) -> bool { + self.compare_attestations + } + + /// Creates a round timer for one duty instance. + pub(crate) fn round_timer(&self, duty: Duty) -> Box { + (self.timer_func)(duty) + } + + /// Proposes unsigned duty data for a consensus instance. + pub async fn propose( + &self, + ct: &CancellationToken, + duty: Duty, + value: pbcore::UnsignedDataSet, + ) -> runner::Result<()> { + runner::propose_unsigned(self, ct, duty, value).await + } + + /// Proposes priority protocol data for a consensus instance. + pub async fn propose_priority( + &self, + ct: &CancellationToken, + duty: Duty, + value: pbpriority::PriorityResult, + ) -> runner::Result<()> { + runner::propose_priority(self, ct, duty, value).await + } + + /// Starts participating in a consensus instance. + pub async fn participate(&self, ct: &CancellationToken, duty: Duty) -> runner::Result<()> { + runner::participate(self, ct, duty).await + } + + #[cfg(test)] + pub(crate) fn pubkeys(&self) -> &HashMap { + &self.pubkeys + } + + #[cfg(test)] + pub(crate) fn peer_labels(&self) -> &[String] { + &self.peer_labels + } +} + +/// Extracts the domain duty from a validated raw QBFT message. 
+fn duty_from_msg(msg: &pbconsensus::QbftMsg) -> admission::Result { + let duty = msg + .duty + .as_ref() + .ok_or(admission::Error::InvalidConsensusMessage)?; + Duty::try_from(duty).map_err(|_| admission::Error::InvalidConsensusMessageDutyType) +} + +#[cfg(test)] +pub(crate) mod tests { + use std::sync::Mutex as StdMutex; + + use tokio_util::sync::CancellationToken; + + use super::*; + use crate::timer::get_round_timer_func; + use pluto_core::{ + deadline::{DeadlineCalculator, DeadlinerTask}, + types::{DutyType, SlotNumber}, + }; + + struct FutureCalculator; + + impl DeadlineCalculator for FutureCalculator { + fn deadline( + &self, + _duty: &Duty, + ) -> pluto_core::deadline::Result>> { + Ok(Some( + chrono::Utc::now() + .checked_add_signed(chrono::Duration::hours(1)) + .expect("one hour deadline fits DateTime"), + )) + } + } + + #[tokio::test] + async fn constructor_builds_pubkey_map_by_peer_order() { + let consensus = consensus(1, true); + + assert_eq!(consensus.pubkeys().len(), 2); + assert_eq!(consensus.pubkey(0), Some(&secret_key(1).public_key())); + assert_eq!(consensus.pubkey(1), Some(&secret_key(2).public_key())); + assert_eq!(consensus.peer_labels(), ["10:node-0", "20:node-1"]); + } + + #[tokio::test] + async fn constructor_rejects_invalid_local_peer_idx() { + let result = Consensus::new(Config { + peers: peers(), + local_peer_idx: 3, + ..config_base(true) + }); + let err = match result { + Ok(_) => panic!("constructor accepted invalid local peer index"), + Err(err) => err, + }; + + assert_eq!(err, Error::InvalidLocalPeerIndex { peer_idx: 3 }); + } + + #[tokio::test] + async fn protocol_id_returns_qbft_v2() { + assert_eq!(consensus(0, true).protocol_id(), QBFT_V2_PROTOCOL_ID); + } + + #[tokio::test] + async fn start_deletes_expired_instance_io_until_cancelled() { + let consensus = Arc::new(consensus(0, true)); + let duty = duty(); + let first = consensus.get_instance_io(duty.clone()); + let cancel = CancellationToken::new(); + let (expired_tx, expired_rx) 
= mpsc::channel(1); + let task = Arc::clone(&consensus).start(cancel.clone(), expired_rx); + + expired_tx.send(duty.clone()).await.unwrap(); + tokio::time::timeout( + std::time::Duration::from_secs(1), + wait_until_recreated(&consensus, &duty, &first), + ) + .await + .expect("expired instance was not deleted"); + + cancel.cancel(); + task.await.unwrap(); + } + + #[tokio::test] + async fn get_instance_io_returns_same_arc_for_same_duty() { + let consensus = consensus(0, true); + let duty = duty(); + + let first = consensus.get_instance_io(duty.clone()); + let second = consensus.get_instance_io(duty); + + assert!(Arc::ptr_eq(&first, &second)); + } + + #[tokio::test] + async fn delete_instance_io_causes_next_get_to_create_new_arc() { + let consensus = consensus(0, true); + let duty = duty(); + let first = consensus.get_instance_io(duty.clone()); + + consensus.delete_instance_io(&duty); + let second = consensus.get_instance_io(duty); + + assert!(!Arc::ptr_eq(&first, &second)); + } + + #[tokio::test] + async fn subscribers_are_invoked_in_registration_order() { + let consensus = consensus(0, true); + let calls = Arc::new(StdMutex::new(Vec::new())); + + { + let calls = Arc::clone(&calls); + consensus.subscribe(move |_, _| { + calls.lock().unwrap().push("unsigned-1"); + Ok(()) + }); + } + { + let calls = Arc::clone(&calls); + consensus.subscribe_priority(move |_, _| { + calls.lock().unwrap().push("priority-ignored"); + Ok(()) + }); + } + { + let calls = Arc::clone(&calls); + consensus.subscribe(move |_, _| { + calls.lock().unwrap().push("unsigned-2"); + Ok(()) + }); + } + + consensus.subscribers().dispatch_decoded( + &duty(), + &DecodedValue::UnsignedDataSet(pbcore::UnsignedDataSet::default()), + ); + + assert_eq!( + calls.lock().unwrap().as_slice(), + ["unsigned-1", "unsigned-2"] + ); + } + + pub(crate) fn consensus(local_peer_idx: i64, duty_allowed: bool) -> Consensus { + Consensus::new(Config { + peers: peers(), + local_peer_idx, + duty_gater: Arc::new(move |_| 
duty_allowed), + ..config_base(false) + }) + .unwrap() + } + + pub(crate) fn config_base(never_expiring: bool) -> Config { + let cancel = CancellationToken::new(); + let (deadliner, _expired_rx) = if never_expiring { + DeadlinerTask::start( + cancel, + "qbft-test", + pluto_core::deadline::NeverExpiringCalculator, + ) + } else { + DeadlinerTask::start(cancel, "qbft-test", FutureCalculator) + }; + + Config { + peers: vec![], + local_peer_idx: 0, + privkey: secret_key(1), + deadliner, + duty_gater: Arc::new(|_| true), + broadcaster: Arc::new(|_, _| Box::pin(async { Ok(()) })), + sniffer: Arc::new(|_| {}), + compare_attestations: false, + timer_func: get_round_timer_func(), + } + } + + pub(crate) fn peers() -> Vec { + vec![ + Peer { + index: 10, + name: "node-0".to_string(), + public_key: secret_key(1).public_key(), + }, + Peer { + index: 20, + name: "node-1".to_string(), + public_key: secret_key(2).public_key(), + }, + ] + } + + pub(crate) fn duty() -> Duty { + Duty::new(SlotNumber::new(42), DutyType::Attester) + } + + pub(crate) fn secret_key(seed: u8) -> SecretKey { + SecretKey::from_slice(&[seed; 32]).unwrap() + } + + async fn wait_until_recreated( + consensus: &Consensus, + duty: &Duty, + old: &Arc>, + ) { + loop { + if !Arc::ptr_eq(&consensus.get_instance_io(duty.clone()), old) { + return; + } + tokio::task::yield_now().await; + } + } +} diff --git a/crates/consensus/src/qbft/definition.rs b/crates/consensus/src/qbft/definition.rs new file mode 100644 index 00000000..56985079 --- /dev/null +++ b/crates/consensus/src/qbft/definition.rs @@ -0,0 +1,743 @@ +//! QBFT definition callbacks. 
+ +use std::{sync::Arc, time}; + +use crate::{instance::RECV_BUFFER_SIZE, timer::RoundTimer}; +use crossbeam::channel as mpmc; +use pluto_core::{ + qbft::{self, QbftLogger}, + types::{Duty, DutyType}, +}; +use tokio::runtime::Handle; +use tokio_util::sync::CancellationToken; + +use super::{ + admission, + component::SubscriberSet, + msg::{self, ConsensusQbftTypes}, +}; + +/// Callback invoked with the decided commit quorum. +pub(crate) type DecideCallback = + Arc>) + Send + Sync + 'static>; + +/// Definition constructor config. +pub(crate) struct DefinitionConfig { + /// Number of QBFT participants. + pub(crate) nodes: usize, + /// Subscriber registry notified after decode. + pub(crate) subscribers: SubscriberSet, + /// Round timer for this consensus instance. + pub(crate) round_timer: Box, + /// Internal callback invoked when the core decides. + pub(crate) decide_callback: DecideCallback, + /// Whether attester proposal comparison is enabled. + pub(crate) compare_attestations: bool, + /// Runtime used to host timer futures for the blocking QBFT core. + pub(crate) runtime: Handle, +} + +/// Returns a QBFT core definition for one consensus instance. 
+pub(crate) fn new_definition(config: DefinitionConfig) -> qbft::Definition { + let nodes = i64::try_from(config.nodes).expect("node count fits i64"); + let quorum = quorum(nodes); + let round_timer: Arc = Arc::from(config.round_timer); + let compare_attestations = config.compare_attestations; + let subscribers = config.subscribers; + let decide_callback = config.decide_callback; + + qbft::Definition { + is_leader: Box::new(move |request| { + leader(request.instance, request.round, nodes) == request.process + }), + new_timer: Box::new({ + let runtime = config.runtime; + move |round| new_timer(Arc::clone(&round_timer), runtime.clone(), round) + }), + compare: Arc::new(move |request| compare(compare_attestations, request)), + decide: Box::new(move |request| { + decide(request, Arc::clone(&decide_callback), subscribers.clone()); + }), + logger: QbftLogger { + upon_rule: Box::new(|log| { + tracing::debug!( + rule = %log.upon_rule, + round = log.round, + "QBFT upon rule triggered" + ); + }), + round_change: Box::new(move |log| { + let leader = usize::try_from(leader(log.instance, log.round, nodes)) + .expect("leader index fits usize"); + let steps = group_round_messages(log.msgs, config.nodes, log.round, leader); + let pre_prepare = fmt_step_peers(step_by_type(&steps, qbft::MSG_PRE_PREPARE)); + let prepare = fmt_step_peers(step_by_type(&steps, qbft::MSG_PREPARE)); + let commit = fmt_step_peers(step_by_type(&steps, qbft::MSG_COMMIT)); + let round_change = fmt_step_peers(step_by_type(&steps, qbft::MSG_ROUND_CHANGE)); + + if log.upon_rule == qbft::UPON_ROUND_TIMEOUT { + tracing::debug!( + rule = %log.upon_rule, + round = log.round, + new_round = log.new_round, + pre_prepare, + prepare, + commit, + round_change, + timeout_reason = %timeout_reason(&steps, log.round, quorum), + "QBFT round changed" + ); + } else { + tracing::debug!( + rule = %log.upon_rule, + round = log.round, + new_round = log.new_round, + pre_prepare, + prepare, + commit, + round_change, + "QBFT round 
changed" + ); + } + }), + unjust: Box::new(|log| { + tracing::warn!( + type = %log.msg.type_(), + peer = log.msg.source(), + "Unjustified consensus message from peer" + ); + }), + }, + nodes, + fifo_limit: i64::try_from(RECV_BUFFER_SIZE).expect("receive buffer size fits i64"), + } +} + +/// Handles a QBFT core decision by decoding the decided value and notifying +/// listeners. +fn decide( + request: qbft::DecideRequest<'_, ConsensusQbftTypes>, + decide_callback: DecideCallback, + subscribers: SubscriberSet, +) { + let Some(qcommit_msg) = request.qcommit.first() else { + tracing::error!("Invalid message type"); + return; + }; + + let Some(msg) = qcommit_msg.as_any().downcast_ref::() else { + tracing::error!("Invalid message type"); + return; + }; + + let Some(any_value) = msg.values().get(request.value) else { + tracing::error!("Invalid value hash"); + return; + }; + + let decoded = match admission::decode_supported_any(any_value) { + Ok(decoded) => decoded, + Err(err) => { + tracing::error!(error = %err, "Invalid any value"); + return; + } + }; + + decide_callback(request.qcommit.clone()); + subscribers.dispatch_decoded(request.instance, &decoded); +} + +/// Compares proposal values before commit when attester comparison is enabled. +fn compare(compare_attestations: bool, request: qbft::CompareRequest<'_, ConsensusQbftTypes>) { + if !compare_attestations { + let _ = request.return_err.send(Ok(())); + return; + } + + if request.qcommit.instance().duty_type != DutyType::Attester { + let _ = request.return_err.send(Ok(())); + return; + } + + tracing::warn!("QBFT attester compare deferred: unsigned data domain decoding is unavailable"); + let _ = request.return_err.send(Err(qbft::QbftError::CompareError)); +} + +/// Adapts an async round timer future into the blocking QBFT core timer type. 
+fn new_timer(round_timer: Arc, runtime: Handle, round: i64) -> qbft::Timer { + let (timer_tx, timer_rx) = mpmc::bounded(1); + let Ok(timer) = round_timer.timer(round) else { + tracing::warn!(round, "QBFT round timer construction failed"); + return qbft::Timer { + receive: timer_rx, + stop: Box::new(|| {}), + }; + }; + + let ct = CancellationToken::new(); + let task_ct = ct.clone(); + runtime.spawn(async move { + tokio::select! { + () = task_ct.cancelled() => {} + _ = timer => { + let _ = timer_tx.send(time::Instant::now()); + } + } + }); + + qbft::Timer { + receive: timer_rx, + stop: Box::new(move || ct.cancel()), + } +} + +/// Returns the QBFT quorum threshold for `nodes`. +fn quorum(nodes: i64) -> usize { + let quorum = nodes + .checked_mul(2) + .and_then(|nodes| nodes.checked_add(2)) + .and_then(|nodes| nodes.checked_div(3)) + .expect("node count permits quorum calculation"); + usize::try_from(quorum).expect("quorum fits usize") +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct RoundStep { + type_: qbft::MessageType, + present: Vec, + missing: Vec, + peers: usize, +} + +/// Groups received round messages by protocol step for timeout diagnostics. +fn group_round_messages( + msgs: &[qbft::Msg], + peers: usize, + round: i64, + leader: usize, +) -> Vec { + [ + qbft::MSG_PRE_PREPARE, + qbft::MSG_PREPARE, + qbft::MSG_COMMIT, + qbft::MSG_ROUND_CHANGE, + ] + .into_iter() + .map(|type_| { + let (present, missing) = check_peers(msgs, peers, round, leader, type_); + RoundStep { + type_, + present, + missing, + peers, + } + }) + .collect() +} + +/// Returns present and missing peers for a round step. 
+fn check_peers( + msgs: &[qbft::Msg], + peers: usize, + round: i64, + leader: usize, + type_: qbft::MessageType, +) -> (Vec, Vec) { + let mut present = vec![]; + let mut missing = vec![]; + + for peer in 0..peers { + let peer_idx = i64::try_from(peer).expect("peer index fits i64"); + let included = msgs + .iter() + .any(|msg| msg.type_() == type_ && msg.source() == peer_idx); + + if included { + present.push(peer); + continue; + } + + if type_ == qbft::MSG_PRE_PREPARE && peer != leader { + continue; + } + + if type_ == qbft::MSG_ROUND_CHANGE && round == 1 { + continue; + } + + missing.push(peer); + } + + (present, missing) +} + +/// Returns the most specific timeout reason visible from round message state. +fn timeout_reason(steps: &[RoundStep], round: i64, quorum: usize) -> String { + if round > 1 { + let step = step_by_type(steps, qbft::MSG_ROUND_CHANGE); + if step.present.len() < quorum { + return format!( + "insufficient round-changes, missing peers={}", + fmt_peer_list(&step.missing) + ); + } + } + + let step = step_by_type(steps, qbft::MSG_PRE_PREPARE); + if step.present.is_empty() { + return format!( + "no pre-prepare, missing leader={}", + fmt_peer_list(&step.missing) + ); + } + + let step = step_by_type(steps, qbft::MSG_PREPARE); + if step.present.len() < quorum { + return format!( + "insufficient prepares, missing peers={}", + fmt_peer_list(&step.missing) + ); + } + + let step = step_by_type(steps, qbft::MSG_COMMIT); + if step.present.len() < quorum { + return format!( + "insufficient commits, missing peers={}", + fmt_peer_list(&step.missing) + ); + } + + "unknown reason".to_string() +} + +/// Finds the diagnostic record for a message type. +fn step_by_type(steps: &[RoundStep], type_: qbft::MessageType) -> &RoundStep { + steps + .iter() + .find(|step| step.type_ == type_) + .expect("round step type exists") +} + +/// Formats a round step as a compact peer bitmap for logs. 
+fn fmt_step_peers(step: &RoundStep) -> String { + let mut out = vec!["_"; step.peers]; + + for peer in &step.present { + out[*peer] = "*"; + } + + for peer in &step.missing { + out[*peer] = "?"; + } + + out.join("") +} + +/// Formats peer indices for timeout reason strings. +fn fmt_peer_list(peers: &[usize]) -> String { + format!( + "[{}]", + peers + .iter() + .map(usize::to_string) + .collect::>() + .join(" ") + ) +} + +/// Returns the deterministic leader index for a duty and round. +pub(crate) fn leader(duty: &Duty, round: i64, nodes: i64) -> i64 { + debug_assert!(nodes > 0); + + let duty_type = match i32::try_from(&duty.duty_type) { + Ok(value) => value, + Err(_) => i32::try_from(&DutyType::Unknown).expect("unknown duty type maps to i32"), + }; + + let total = i128::from(duty.slot.inner()) + .checked_add(i128::from(duty_type)) + .and_then(|value| value.checked_add(i128::from(round))) + .expect("slot, duty type, and round fit i128"); + let nodes = i128::from(nodes); + + i64::try_from(total.rem_euclid(nodes)).expect("leader index fits i64") +} + +#[cfg(test)] +mod tests { + use std::{ + collections::HashMap, + sync::{ + Arc, Mutex, + atomic::{AtomicBool, Ordering}, + }, + }; + + use prost::{Message, Name}; + use prost_types::Any; + use test_case::test_case; + + use super::*; + use crate::qbft::{component, msg}; + use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore}, + types::{Duty, DutyType, SlotNumber}, + }; + + #[test_case(0, DutyType::Attester, 1, 4, 3 ; "attester_round_1")] + #[test_case(42, DutyType::Attester, 1, 4, 1 ; "slot_42_attester")] + #[test_case(42, DutyType::Proposer, 3, 4, 2 ; "slot_42_proposer_round_3")] + #[test_case(10, DutyType::SyncContribution, 2, 7, 3 ; "sync_contribution")] + fn leader_matches_go_formula( + slot: u64, + duty_type: DutyType, + round: i64, + nodes: i64, + want: i64, + ) { + let duty = Duty::new(SlotNumber::new(slot), duty_type); + + assert_eq!(leader(&duty, round, nodes), want); + } + + #[test] + fn 
group_round_messages_marks_present_and_missing_peers() { + let msgs = vec![ + test_msg(qbft::MSG_PRE_PREPARE, 1, 2), + test_msg(qbft::MSG_PREPARE, 0, 2), + test_msg(qbft::MSG_PREPARE, 2, 2), + test_msg(qbft::MSG_COMMIT, 3, 2), + test_msg(qbft::MSG_ROUND_CHANGE, 0, 2), + ]; + + let steps = group_round_messages(&msgs, 4, 2, 1); + + assert_eq!( + steps, + vec![ + RoundStep { + type_: qbft::MSG_PRE_PREPARE, + present: vec![1], + missing: vec![], + peers: 4, + }, + RoundStep { + type_: qbft::MSG_PREPARE, + present: vec![0, 2], + missing: vec![1, 3], + peers: 4, + }, + RoundStep { + type_: qbft::MSG_COMMIT, + present: vec![3], + missing: vec![0, 1, 2], + peers: 4, + }, + RoundStep { + type_: qbft::MSG_ROUND_CHANGE, + present: vec![0], + missing: vec![1, 2, 3], + peers: 4, + }, + ] + ); + } + + #[test] + fn group_round_messages_ignores_round_change_missing_peers_in_round_one() { + let steps = group_round_messages(&[], 4, 1, 1); + + let round_change = step_by_type(&steps, qbft::MSG_ROUND_CHANGE); + + assert!(round_change.present.is_empty()); + assert!(round_change.missing.is_empty()); + } + + #[test_case( + vec![ + step(qbft::MSG_ROUND_CHANGE, vec![0, 1], vec![2, 3]), + step(qbft::MSG_PRE_PREPARE, vec![1], vec![]), + step(qbft::MSG_PREPARE, vec![0, 1, 2], vec![3]), + step(qbft::MSG_COMMIT, vec![0, 1, 2], vec![3]), + ], + 2, + 3, + "insufficient round-changes, missing peers=[2 3]" ; + "insufficient_round_changes" + )] + #[test_case( + vec![ + step(qbft::MSG_PRE_PREPARE, vec![], vec![1]), + step(qbft::MSG_PREPARE, vec![0, 1, 2], vec![3]), + step(qbft::MSG_COMMIT, vec![0, 1, 2], vec![3]), + step(qbft::MSG_ROUND_CHANGE, vec![], vec![]), + ], + 1, + 3, + "no pre-prepare, missing leader=[1]" ; + "no_preprepare" + )] + #[test_case( + vec![ + step(qbft::MSG_PRE_PREPARE, vec![1], vec![]), + step(qbft::MSG_PREPARE, vec![0, 1], vec![2, 3]), + step(qbft::MSG_COMMIT, vec![0, 1, 2], vec![3]), + step(qbft::MSG_ROUND_CHANGE, vec![], vec![]), + ], + 1, + 3, + "insufficient prepares, 
missing peers=[2 3]" ; + "insufficient_prepares" + )] + #[test_case( + vec![ + step(qbft::MSG_PRE_PREPARE, vec![1], vec![]), + step(qbft::MSG_PREPARE, vec![0, 1, 2], vec![3]), + step(qbft::MSG_COMMIT, vec![0, 1], vec![2, 3]), + step(qbft::MSG_ROUND_CHANGE, vec![], vec![]), + ], + 1, + 3, + "insufficient commits, missing peers=[2 3]" ; + "insufficient_commits" + )] + #[test_case( + vec![ + step(qbft::MSG_PRE_PREPARE, vec![1], vec![]), + step(qbft::MSG_PREPARE, vec![0, 1, 2], vec![3]), + step(qbft::MSG_COMMIT, vec![0, 1, 2], vec![3]), + step(qbft::MSG_ROUND_CHANGE, vec![], vec![]), + ], + 1, + 3, + "unknown reason" ; + "unknown" + )] + fn timeout_reason_matches_go_order( + steps: Vec, + round: i64, + quorum: usize, + want: &str, + ) { + assert_eq!(timeout_reason(&steps, round, quorum), want); + } + + #[test] + fn fmt_step_peers_renders_present_missing_and_absent_markers() { + let step = RoundStep { + type_: qbft::MSG_PREPARE, + present: vec![0, 2], + missing: vec![3], + peers: 5, + }; + + assert_eq!(fmt_step_peers(&step), "*_*?_"); + } + + #[tokio::test] + async fn new_definition_decide_dispatches_decoded_value_and_callback() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + let observed = Arc::new(Mutex::new(Vec::new())); + let callback_called = Arc::new(AtomicBool::new(false)); + + { + let observed = Arc::clone(&observed); + consensus.subscribe(move |duty, value| { + observed.lock().unwrap().push((duty, value)); + Ok(()) + }); + } + + let def = new_definition(DefinitionConfig { + nodes: consensus.node_count(), + subscribers: consensus.subscribers(), + round_timer: consensus.round_timer(duty.clone()), + decide_callback: { + let callback_called = Arc::clone(&callback_called); + Arc::new(move |qcommit| { + assert_eq!(qcommit.len(), 1); + callback_called.store(true, Ordering::Relaxed); + }) + }, + compare_attestations: false, + runtime: tokio::runtime::Handle::current(), + }); + let value = unsigned_value(); + let hash = 
msg::hash_proto(&value).unwrap(); + let qcommit = vec![commit_msg(duty.clone(), hash, any_unsigned(&value))]; + let cts = cancellation::CancellationTokenSource::new(); + let ct = cts.token().clone(); + + (def.decide)(qbft::DecideRequest { + ct: &ct, + instance: &duty, + value: &hash, + qcommit: &qcommit, + }); + + assert!(callback_called.load(Ordering::Relaxed)); + assert_eq!(observed.lock().unwrap().as_slice(), [(duty, value)]); + } + + #[test_case(false, DutyType::Attester, Ok(()) ; "disabled_attester")] + #[test_case(true, DutyType::Proposer, Ok(()) ; "enabled_non_attester")] + fn compare_accepts_disabled_or_non_attester( + compare_attestations: bool, + duty_type: DutyType, + want: Result<(), qbft::QbftError>, + ) { + let result = run_compare(compare_attestations, duty_type); + + assert!(matches!((result, want), (Ok(()), Ok(())))); + } + + #[test] + fn compare_defers_attester_source_target_matching() { + let result = run_compare(true, DutyType::Attester); + + assert!(matches!(result, Err(qbft::QbftError::CompareError))); + } + + #[tokio::test] + async fn new_definition_leader_callback_uses_go_formula() { + let consensus = component::tests::consensus(0, true); + let duty = Duty::new(SlotNumber::new(42), DutyType::Proposer); + let def = new_definition(DefinitionConfig { + nodes: 4, + subscribers: consensus.subscribers(), + round_timer: consensus.round_timer(duty.clone()), + decide_callback: Arc::new(|_| {}), + compare_attestations: false, + runtime: tokio::runtime::Handle::current(), + }); + + assert!((def.is_leader)(qbft::LeaderRequest { + instance: &duty, + round: 3, + process: 2, + })); + assert!(!(def.is_leader)(qbft::LeaderRequest { + instance: &duty, + round: 3, + process: 1, + })); + } + + fn step(type_: qbft::MessageType, present: Vec, missing: Vec) -> RoundStep { + RoundStep { + type_, + present, + missing, + peers: 4, + } + } + + fn test_msg( + type_: qbft::MessageType, + peer_idx: i64, + round: i64, + ) -> qbft::Msg { + Arc::new( + msg::Msg::new( + 
pbconsensus::QbftMsg { + r#type: i64::from(type_), + duty: Some(pbcore::Duty { + slot: 1, + r#type: i32::try_from(&DutyType::Attester).unwrap(), + }), + peer_idx, + round, + ..Default::default() + }, + vec![], + Arc::default(), + ) + .unwrap(), + ) + } + + fn run_compare( + compare_attestations: bool, + duty_type: DutyType, + ) -> std::result::Result<(), qbft::QbftError> { + let cts = cancellation::CancellationTokenSource::new(); + let ct = cts.token().clone(); + let qcommit = test_msg_with_duty(qbft::MSG_COMMIT, 0, 1, duty_type); + let (_input_tx, input_rx) = mpmc::bounded(1); + let (return_err_tx, return_err_rx) = mpmc::bounded(1); + let (return_value_tx, _return_value_rx) = mpmc::bounded(1); + let input_value = Any::default(); + + compare( + compare_attestations, + qbft::CompareRequest { + ct: &ct, + qcommit: &qcommit, + input_value_source_ch: &input_rx, + input_value_source: &input_value, + return_err: &return_err_tx, + return_value: &return_value_tx, + }, + ); + + return_err_rx.recv().unwrap() + } + + fn commit_msg(duty: Duty, hash: [u8; 32], value: Any) -> qbft::Msg { + let values = Arc::new(HashMap::from([(hash, value)])); + Arc::new( + msg::Msg::new( + pbconsensus::QbftMsg { + r#type: i64::from(qbft::MSG_COMMIT), + duty: Some(pbcore::Duty::try_from(&duty).unwrap()), + peer_idx: 0, + round: 1, + value_hash: hash.to_vec().into(), + ..Default::default() + }, + vec![], + values, + ) + .unwrap(), + ) + } + + fn test_msg_with_duty( + type_: qbft::MessageType, + peer_idx: i64, + round: i64, + duty_type: DutyType, + ) -> qbft::Msg { + Arc::new( + msg::Msg::new( + pbconsensus::QbftMsg { + r#type: i64::from(type_), + duty: Some(pbcore::Duty { + slot: 1, + r#type: i32::try_from(&duty_type).unwrap(), + }), + peer_idx, + round, + ..Default::default() + }, + vec![], + Arc::default(), + ) + .unwrap(), + ) + } + + fn unsigned_value() -> pbcore::UnsignedDataSet { + pbcore::UnsignedDataSet::default() + } + + fn any_unsigned(value: &pbcore::UnsignedDataSet) -> Any { + let mut 
buf = Vec::new(); + value.encode(&mut buf).unwrap(); + Any { + type_url: pbcore::UnsignedDataSet::type_url(), + value: buf, + } + } +} diff --git a/crates/consensus/src/qbft/interop_test.rs b/crates/consensus/src/qbft/interop_test.rs new file mode 100644 index 00000000..d84a38ec --- /dev/null +++ b/crates/consensus/src/qbft/interop_test.rs @@ -0,0 +1,114 @@ +use futures::io::Cursor; +use prost::{Message, bytes::Bytes}; +use prost_types::Any; +use tokio_util::sync::CancellationToken; + +use crate::qbft::{component::tests, msg}; +use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore}, + qbft::SomeMsg, +}; + +const REFERENCE_VALUE_HASH: &str = + "0a0c0a0430783939120401020304000000000000000000000000000000000000"; +const REFERENCE_SIGNATURE: &str = "4cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe924200"; +const REFERENCE_PAYLOAD: &str = "0a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; +const REFERENCE_FRAME: &str = "b7010a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; + +#[tokio::test] +async fn reference_framed_message_decodes() { + let mut cursor = Cursor::new(hex::decode(REFERENCE_FRAME).expect("valid fixture hex")); + + let decoded = + pluto_p2p::proto::read_protobuf_with_max_size::( + &mut cursor, + pluto_p2p::proto::MAX_MESSAGE_SIZE, + ) + 
.await + .expect("reference frame should decode"); + + assert_eq!(decoded, reference_consensus_msg()); +} + +#[tokio::test] +async fn reference_signed_message_is_admitted() { + let consensus = tests::consensus(0, true); + let mut recv_rx = consensus + .get_instance_io(tests::duty()) + .take_recv_rx() + .expect("recv receiver should be available"); + + consensus + .handle(&CancellationToken::new(), Some(reference_consensus_msg())) + .await + .expect("reference message should be admitted"); + + let received = recv_rx.recv().await.expect("admitted message"); + assert_eq!(received.source(), 0); + assert_eq!(hex::encode(received.value()), REFERENCE_VALUE_HASH); + assert_eq!( + received.value_source().expect("value source should exist"), + reference_any_value() + ); +} + +#[tokio::test] +async fn rust_rebuilds_reference_message_and_frame() { + let rebuilt = build_reference_consensus_msg(); + let mut frame = Cursor::new(Vec::new()); + + pluto_p2p::proto::write_protobuf(&mut frame, &rebuilt) + .await + .expect("frame write should succeed"); + + assert_eq!(rebuilt, reference_consensus_msg()); + assert_eq!(hex::encode(rebuilt.encode_to_vec()), REFERENCE_PAYLOAD); + assert_eq!(hex::encode(frame.into_inner()), REFERENCE_FRAME); +} + +fn build_reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { + let value = reference_value(); + let value_hash = msg::hash_proto(&value).expect("value should hash"); + let signed = msg::sign_msg( + &pbconsensus::QbftMsg { + r#type: i64::from(pluto_core::qbft::MSG_PREPARE), + duty: Some(pbcore::Duty { + slot: 42, + r#type: 2, + }), + peer_idx: 0, + round: 1, + value_hash: value_hash.to_vec().into(), + ..Default::default() + }, + &tests::secret_key(1), + ) + .expect("message should sign"); + + assert_eq!(hex::encode(&signed.signature), REFERENCE_SIGNATURE); + + pbconsensus::QbftConsensusMsg { + msg: Some(signed), + justification: vec![], + values: vec![Any::from_msg(&value).expect("value should pack")], + } +} + +fn 
reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg::decode( + hex::decode(REFERENCE_PAYLOAD) + .expect("valid fixture hex") + .as_slice(), + ) + .expect("reference payload should decode") +} + +fn reference_value() -> pbcore::UnsignedDataSet { + let mut set = std::collections::BTreeMap::new(); + set.insert("0x99".to_string(), Bytes::from_static(&[1, 2, 3, 4])); + pbcore::UnsignedDataSet { set } +} + +fn reference_any_value() -> Any { + Any::from_msg(&reference_value()).expect("value should pack") +} diff --git a/crates/consensus/src/qbft/mod.rs b/crates/consensus/src/qbft/mod.rs new file mode 100644 index 00000000..bd9475f6 --- /dev/null +++ b/crates/consensus/src/qbft/mod.rs @@ -0,0 +1,26 @@ +//! QBFT consensus wrapper. + +mod admission; +mod component; +pub(crate) mod definition; +pub(crate) mod runner; + +pub use component::{ + BroadcastResult, Broadcaster, Config, Consensus, DutyGater, Error, Peer, SnifferSink, + SubscriberResult, +}; +pub use runner::{Error as RunnerError, Result as RunnerResult}; + +/// QBFT protobuf message wrapper. +pub mod msg; + +/// Concrete libp2p adapter for QBFT consensus messages. +pub mod p2p; + +pub(crate) mod sniffer; +pub(crate) mod transport; + +#[cfg(test)] +mod interop_test; +#[cfg(test)] +mod qbft_run_test; diff --git a/crates/core/src/consensus/qbft/msg.rs b/crates/consensus/src/qbft/msg.rs similarity index 96% rename from crates/core/src/consensus/qbft/msg.rs rename to crates/consensus/src/qbft/msg.rs index 8085e234..04cccfe5 100644 --- a/crates/core/src/consensus/qbft/msg.rs +++ b/crates/consensus/src/qbft/msg.rs @@ -1,7 +1,7 @@ //! QBFT protobuf message adapter. //! //! This module bridges the domain-specific consensus protobuf messages with -//! the generic [`crate::qbft`] state machine. +//! the generic [`pluto_core::qbft`] state machine. //! //! [`QbftMsg`](pbconsensus::QbftMsg) carries only consensus metadata and value //! hashes. 
The concrete proposal values are transported beside it in @@ -34,7 +34,7 @@ use k256::{PublicKey, SecretKey}; use pluto_ssz::{HashWalker, Hasher, HasherError}; use prost_types::Any; -use crate::{ +use pluto_core::{ corepb::v1::{consensus as pbconsensus, core as pbcore}, qbft::{self, MessageType, SomeMsg}, types::{Duty, DutyType, SlotNumber}, @@ -208,26 +208,32 @@ impl Msg { } impl SomeMsg for Msg { + /// Returns the QBFT message type preserved from the wire value. fn type_(&self) -> MessageType { MessageType::from_wire(self.msg.r#type) } + /// Returns the duty instance this message belongs to. fn instance(&self) -> Duty { duty_from_proto(self.msg.duty.as_ref()) } + /// Returns the sender's zero-based peer index. fn source(&self) -> i64 { self.msg.peer_idx } + /// Returns the QBFT round carried by the message. fn round(&self) -> i64 { self.msg.round } + /// Returns the cached proposal value hash. fn value(&self) -> [u8; 32] { self.value_hash } + /// Returns the original value payload for core compare callbacks. fn value_source(&self) -> std::result::Result { self.values .get(&self.value_hash) @@ -235,18 +241,22 @@ impl SomeMsg for Msg { .ok_or(qbft::QbftError::ValueNotFound) } + /// Returns the prepared round carried by a round-change message. fn prepared_round(&self) -> i64 { self.msg.prepared_round } + /// Returns the cached prepared value hash. fn prepared_value(&self) -> [u8; 32] { self.prepared_value_hash } + /// Returns wrapped justification messages for core validation. fn justification(&self) -> Vec> { self.justification.clone() } + /// Exposes the concrete wrapper for transport downcasts. fn as_any(&self) -> &dyn any::Any { self } @@ -322,6 +332,7 @@ pub(crate) fn verify_msg_sig(msg: &pbconsensus::QbftMsg, pubkey: &PublicKey) -> Ok(recovered == *pubkey) } +/// Converts a protobuf bytes field into a non-zero 32-byte hash. 
fn to_hash32(value: &[u8]) -> Option<[u8; 32]> { let value: [u8; 32] = value.try_into().ok()?; if value == [0u8; 32] { @@ -331,6 +342,7 @@ fn to_hash32(value: &[u8]) -> Option<[u8; 32]> { Some(value) } +/// Converts an optional protobuf duty into the domain duty type. fn duty_from_proto(duty: Option<&pbcore::Duty>) -> Duty { let Some(duty) = duty else { return Duty::new(SlotNumber::new(0), DutyType::Unknown); @@ -347,7 +359,7 @@ fn duty_from_proto(duty: Option<&pbcore::Duty>) -> Duty { #[cfg(test)] mod tests { use super::*; - use crate::qbft::{MSG_PRE_PREPARE, MSG_PREPARE}; + use pluto_core::qbft::{MSG_PRE_PREPARE, MSG_PREPARE}; use prost::bytes::Bytes; use prost_types::Timestamp; use test_case::test_case; diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs new file mode 100644 index 00000000..edefb123 --- /dev/null +++ b/crates/consensus/src/qbft/p2p.rs @@ -0,0 +1,1830 @@ +//! libp2p adapter for QBFT consensus messages. + +use std::{ + collections::{HashMap, VecDeque}, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, + task::{Context, Poll}, + time::Duration, +}; + +use either::Either; +use futures::{AsyncRead, AsyncWrite, AsyncWriteExt, FutureExt, StreamExt}; +use libp2p::{ + Multiaddr, PeerId, + core::upgrade::ReadyUpgrade, + swarm::{ + ConnectionDenied, ConnectionHandler, ConnectionHandlerEvent, ConnectionId, DialError, + FromSwarm, NetworkBehaviour, NotifyHandler, Stream, StreamProtocol, StreamUpgradeError, + SubstreamProtocol, THandler, THandlerInEvent, THandlerOutEvent, ToSwarm, + dial_opts::{DialOpts, PeerCondition}, + dummy, + handler::{ + ConnectionEvent, DialUpgradeError, FullyNegotiatedInbound, FullyNegotiatedOutbound, + }, + }, +}; +use tokio::{ + sync::mpsc, + time::{error::Elapsed, timeout}, +}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, warn}; + +use crate::{protocols::QBFT_V2_PROTOCOL_ID, qbft::BroadcastResult}; +use pluto_core::corepb::v1::consensus as pbconsensus; +use 
pluto_p2p::p2p_context::P2PContext; + +use super::Consensus; + +/// Charon-compatible inbound receive timeout. +pub const RECEIVE_TIMEOUT: Duration = Duration::from_secs(5); +/// Charon-compatible outbound send timeout. +pub const SEND_TIMEOUT: Duration = Duration::from_secs(7); + +/// Returns the QBFT libp2p stream protocol. +pub fn protocol_id() -> StreamProtocol { + StreamProtocol::new(QBFT_V2_PROTOCOL_ID) +} + +/// QBFT libp2p adapter configuration. +#[derive(Clone)] +pub struct Config { + /// Consensus component that admits inbound QBFT messages. + pub consensus: Arc, + /// Shared runtime P2P state for connection checks. + pub p2p_context: P2PContext, + /// Cluster peer IDs in consensus peer order. + pub peers: Vec, + /// Local libp2p peer ID. + pub local_peer_id: PeerId, + /// Cancellation token for inbound admission. + pub cancellation: CancellationToken, +} + +/// QBFT adapter construction errors. +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum Error { + /// Local peer ID is absent from the configured cluster peer list. + #[error("local qbft peer missing: {peer_id}")] + LocalPeerMissing { + /// Missing local peer ID. + peer_id: PeerId, + }, + + /// Behaviour command channel is closed. + #[error("qbft p2p behaviour is no longer running")] + BehaviourClosed, +} + +/// Event emitted by the QBFT libp2p adapter. +#[derive(Debug)] +pub enum Event { + /// A broadcast command was queued for network delivery. + BroadcastQueued { + /// Broadcast request identifier. + request_id: u64, + /// Number of non-self target peers. + target_count: usize, + }, + /// A QBFT message was admitted from an inbound stream. + Received { + /// Remote peer. + peer: PeerId, + /// Connection that carried the stream. + connection: ConnectionId, + }, + /// Inbound stream read or admission failed. + InboundError { + /// Remote peer. + peer: PeerId, + /// Connection that carried the stream. + connection: ConnectionId, + /// Failure reason. 
+ error: String, + }, + /// Outbound stream write completed. + Sent { + /// Broadcast request identifier. + request_id: u64, + /// Target peer. + peer: PeerId, + }, + /// Outbound stream write or dial failed. + SendError { + /// Broadcast request identifier. + request_id: u64, + /// Target peer. + peer: PeerId, + /// Failure reason. + error: String, + }, +} + +/// User-facing handle for QBFT outbound broadcasts. +#[derive(Clone, Debug)] +pub struct Handle { + cmd_tx: mpsc::UnboundedSender, + next_request_id: Arc, +} + +impl Handle { + /// Enqueues a QBFT message for async broadcast to every non-self peer. + pub async fn broadcast( + &self, + _ct: CancellationToken, + msg: pbconsensus::QbftConsensusMsg, + ) -> BroadcastResult { + let request_id = self.next_request_id.fetch_add(1, Ordering::Relaxed); + self.cmd_tx + .send(BroadcastCommand { request_id, msg }) + .map_err(|_| Box::new(Error::BehaviourClosed) as _) + } + + /// Returns a consensus broadcaster callback backed by this handle. + pub fn broadcaster(&self) -> super::Broadcaster { + let handle = self.clone(); + Arc::new(move |ct, msg| { + let handle = handle.clone(); + Box::pin(async move { handle.broadcast(ct, msg).await }) + }) + } +} + +#[derive(Debug)] +struct BroadcastCommand { + request_id: u64, + msg: pbconsensus::QbftConsensusMsg, +} + +#[doc(hidden)] +#[derive(Debug)] +pub enum ToHandler { + Send { + request_id: u64, + msg: pbconsensus::QbftConsensusMsg, + }, +} + +#[doc(hidden)] +#[derive(Debug)] +pub enum FromHandler { + Received, + InboundError(String), + Sent { request_id: u64 }, + SendError { request_id: u64, error: String }, +} + +type ActiveFuture = futures::future::BoxFuture<'static, Option>; + +/// Connection handler for the QBFT stream protocol. 
+pub struct Handler { + consensus: Arc, + cancellation: CancellationToken, + pending_open: VecDeque<(u64, pbconsensus::QbftConsensusMsg)>, + active_futures: futures::stream::FuturesUnordered, +} + +impl Handler { + /// Creates a stream handler bound to the consensus component. + fn new(consensus: Arc, cancellation: CancellationToken) -> Self { + Self { + consensus, + cancellation, + pending_open: VecDeque::new(), + active_futures: futures::stream::FuturesUnordered::new(), + } + } + + /// Reads an inbound stream and forwards the decoded message to admission. + fn handle_fully_negotiated_inbound(&mut self, mut stream: Stream) { + stream.ignore_for_keep_alive(); + let consensus = Arc::clone(&self.consensus); + let cancellation = self.cancellation.clone(); + self.active_futures.push( + async move { + Some( + match read_and_handle_inbound( + &mut stream, + consensus, + cancellation, + RECEIVE_TIMEOUT, + ) + .await + { + Ok(()) => FromHandler::Received, + Err(error) => FromHandler::InboundError(error), + }, + ) + } + .boxed(), + ); + } + + /// Writes one outbound consensus message to a negotiated stream. + fn handle_fully_negotiated_outbound( + &mut self, + mut stream: Stream, + request_id: u64, + msg: pbconsensus::QbftConsensusMsg, + ) { + stream.ignore_for_keep_alive(); + self.active_futures.push( + async move { + Some( + match write_outbound(&mut stream, request_id, &msg, SEND_TIMEOUT).await { + Ok(()) => FromHandler::Sent { request_id }, + Err(error) => FromHandler::SendError { request_id, error }, + }, + ) + } + .boxed(), + ); + } + + /// Converts outbound stream upgrade failure into a behaviour event. 
+ fn handle_dial_upgrade_error(&mut self, request_id: u64, error: StreamUpgradeError) + where + E: std::error::Error + Send + Sync + 'static, + { + let error = match error { + StreamUpgradeError::NegotiationFailed => "protocol negotiation failed".to_string(), + StreamUpgradeError::Timeout => "operation timed out".to_string(), + StreamUpgradeError::Io(error) => error.to_string(), + StreamUpgradeError::Apply(error) => error.to_string(), + }; + self.active_futures + .push(async move { Some(FromHandler::SendError { request_id, error }) }.boxed()); + } +} + +impl ConnectionHandler for Handler { + type FromBehaviour = ToHandler; + type InboundOpenInfo = (); + type InboundProtocol = ReadyUpgrade; + type OutboundOpenInfo = (u64, pbconsensus::QbftConsensusMsg); + type OutboundProtocol = ReadyUpgrade; + type ToBehaviour = FromHandler; + + /// Advertises the single QBFT stream protocol. + fn listen_protocol(&self) -> SubstreamProtocol { + SubstreamProtocol::new(ReadyUpgrade::new(protocol_id()), ()) + } + + /// Queues a behaviour send request until libp2p opens a stream. + fn on_behaviour_event(&mut self, event: Self::FromBehaviour) { + match event { + ToHandler::Send { request_id, msg } => self.pending_open.push_back((request_id, msg)), + } + } + + /// Drives pending stream opens and completed read/write futures. + fn poll( + &mut self, + cx: &mut Context<'_>, + ) -> Poll< + ConnectionHandlerEvent, + > { + if let Some(open_info) = self.pending_open.pop_front() { + return Poll::Ready(ConnectionHandlerEvent::OutboundSubstreamRequest { + protocol: SubstreamProtocol::new(ReadyUpgrade::new(protocol_id()), open_info), + }); + } + + while let Poll::Ready(Some(event)) = self.active_futures.poll_next_unpin(cx) { + if let Some(event) = event { + return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(event)); + } + } + + Poll::Pending + } + + /// Routes negotiated streams and stream-open errors to handler helpers. 
+ fn on_connection_event( + &mut self, + event: ConnectionEvent< + Self::InboundProtocol, + Self::OutboundProtocol, + Self::InboundOpenInfo, + Self::OutboundOpenInfo, + >, + ) { + match event { + ConnectionEvent::FullyNegotiatedInbound(FullyNegotiatedInbound { + protocol, .. + }) => self.handle_fully_negotiated_inbound(protocol), + ConnectionEvent::FullyNegotiatedOutbound(FullyNegotiatedOutbound { + protocol, + info: (request_id, msg), + .. + }) => self.handle_fully_negotiated_outbound(protocol, request_id, msg), + ConnectionEvent::DialUpgradeError(DialUpgradeError { + info: (request_id, _), + error, + }) => self.handle_dial_upgrade_error(request_id, error), + _ => {} + } + } +} + +/// Reads one inbound protobuf frame and passes it to consensus admission. +async fn read_and_handle_inbound( + stream: &mut S, + consensus: Arc, + cancellation: CancellationToken, + receive_timeout: Duration, +) -> Result<(), String> +where + S: AsyncRead + AsyncWrite + Unpin, +{ + let result = timeout(receive_timeout, async { + let msg = + pluto_p2p::proto::read_protobuf_with_max_size::( + stream, + pluto_p2p::proto::MAX_MESSAGE_SIZE, + ) + .await + .map_err(|error| error.to_string())?; + + consensus + .handle(&cancellation, Some(msg)) + .await + .map_err(|error| error.to_string()) + }) + .await; + + close_stream(stream).await; + + match result { + Ok(result) => result, + Err(error) => Err(timeout_error(error)), + } +} + +/// Writes one outbound protobuf frame and closes the stream. 
+async fn write_outbound( + stream: &mut S, + request_id: u64, + msg: &pbconsensus::QbftConsensusMsg, + send_timeout: Duration, +) -> Result<(), String> +where + S: AsyncWrite + Unpin, +{ + let result = timeout(send_timeout, async { + pluto_p2p::proto::write_protobuf(stream, msg) + .await + .map_err(|error| error.to_string())?; + match stream.close().await { + Ok(()) => Ok(()), + Err(error) if is_ignorable_close_error(&error) => Ok(()), + Err(error) => Err(error.to_string()), + } + }) + .await; + + match result { + Ok(Ok(())) => Ok(()), + Ok(Err(error)) => Err(error.to_string()), + Err(error) => Err(format!("request {request_id}: {}", timeout_error(error))), + } +} + +/// Returns true for stream-close errors caused by already-cancelled streams. +fn is_ignorable_close_error(error: &std::io::Error) -> bool { + error + .to_string() + .contains("close called for canceled stream") +} + +/// Best-effort closes a stream after inbound reads. +async fn close_stream(stream: &mut S) +where + S: AsyncWrite + Unpin, +{ + if let Err(error) = stream.close().await { + debug!(%error, "failed to close qbft p2p stream"); + } +} + +/// Formats libp2p timeout errors consistently. +fn timeout_error(_error: Elapsed) -> String { + "operation timed out".to_string() +} + +#[derive(Debug)] +struct PendingSend { + request_id: u64, + msg: pbconsensus::QbftConsensusMsg, +} + +/// libp2p behaviour for QBFT consensus messages. +pub struct Behaviour { + config: Config, + cmd_rx: mpsc::UnboundedReceiver, + pending_events: VecDeque>, + pending_by_peer: HashMap>, +} + +impl Behaviour { + /// Creates a behaviour and its outbound broadcast handle. 
+ pub fn new(config: Config) -> Result<(Self, Handle), Error> { + if !config.peers.contains(&config.local_peer_id) { + return Err(Error::LocalPeerMissing { + peer_id: config.local_peer_id, + }); + } + + let (cmd_tx, cmd_rx) = mpsc::unbounded_channel(); + let handle = Handle { + cmd_tx, + next_request_id: Arc::new(AtomicU64::new(0)), + }; + + Ok(( + Self { + config, + cmd_rx, + pending_events: VecDeque::new(), + pending_by_peer: HashMap::new(), + }, + handle, + )) + } + + /// Returns a real QBFT handler only for configured cluster peers. + fn connection_handler_for_peer(&self, peer_id: PeerId) -> THandler { + if self.config.peers.contains(&peer_id) { + Either::Left(Handler::new( + Arc::clone(&self.config.consensus), + self.config.cancellation.clone(), + )) + } else { + Either::Right(dummy::ConnectionHandler) + } + } + + /// Returns whether the peer store has any live connection for the peer. + fn is_connected(&self, peer_id: &PeerId) -> bool { + !self + .config + .p2p_context + .peer_store_lock() + .connections_to_peer(peer_id) + .is_empty() + } + + /// Drains outbound broadcast commands queued through the public handle. + fn drain_commands(&mut self, cx: &mut Context<'_>) { + while let Poll::Ready(Some(command)) = self.cmd_rx.poll_recv(cx) { + self.handle_broadcast(command); + } + } + + /// Fans a broadcast command out to every non-self peer. + fn handle_broadcast(&mut self, command: BroadcastCommand) { + let mut target_count = 0usize; + for peer_id in self.config.peers.clone() { + if peer_id == self.config.local_peer_id { + continue; + } + + target_count = target_count.saturating_add(1); + self.enqueue_send( + peer_id, + PendingSend { + request_id: command.request_id, + msg: command.msg.clone(), + }, + ); + } + + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::BroadcastQueued { + request_id: command.request_id, + target_count, + })); + } + + /// Sends immediately to connected peers or queues a dial first. 
+ fn enqueue_send(&mut self, peer_id: PeerId, pending: PendingSend) { + if self.is_connected(&peer_id) { + self.pending_events.push_back(ToSwarm::NotifyHandler { + peer_id, + handler: NotifyHandler::Any, + event: ToHandler::Send { + request_id: pending.request_id, + msg: pending.msg, + }, + }); + return; + } + + self.pending_by_peer + .entry(peer_id) + .or_default() + .push_back(pending); + self.pending_events.push_back(ToSwarm::Dial { + opts: DialOpts::peer_id(peer_id) + .condition(PeerCondition::DisconnectedAndNotDialing) + .build(), + }); + } + + /// Emits all queued sends for a peer after connection establishment. + fn flush_pending_for_peer(&mut self, peer_id: PeerId) { + let Some(mut pending) = self.pending_by_peer.remove(&peer_id) else { + return; + }; + + while let Some(pending) = pending.pop_front() { + self.pending_events.push_back(ToSwarm::NotifyHandler { + peer_id, + handler: NotifyHandler::Any, + event: ToHandler::Send { + request_id: pending.request_id, + msg: pending.msg, + }, + }); + } + } + + /// Converts queued sends for an unreachable peer into send errors. + fn fail_pending_for_peer(&mut self, peer_id: PeerId, error: String) { + let Some(pending) = self.pending_by_peer.remove(&peer_id) else { + return; + }; + + for pending in pending { + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::SendError { + request_id: pending.request_id, + peer: peer_id, + error: error.clone(), + })); + } + } + + /// Handles dial failures without dropping sends that libp2p is still + /// dialing. 
+ fn on_dial_failure(&mut self, peer_id: PeerId, error: &DialError) { + if self.is_connected(&peer_id) { + self.flush_pending_for_peer(peer_id); + return; + } + + if matches!(error, DialError::DialPeerConditionFalse(_)) { + return; + } + + self.fail_pending_for_peer(peer_id, error.to_string()); + } +} + +impl NetworkBehaviour for Behaviour { + type ConnectionHandler = Either; + type ToSwarm = Event; + + /// Creates the per-connection handler for accepted inbound connections. + fn handle_established_inbound_connection( + &mut self, + _connection_id: ConnectionId, + peer: PeerId, + _local_addr: &Multiaddr, + _remote_addr: &Multiaddr, + ) -> Result, ConnectionDenied> { + Ok(self.connection_handler_for_peer(peer)) + } + + /// Supplies queued peer-store addresses for outbound dials. + fn handle_pending_outbound_connection( + &mut self, + _connection_id: ConnectionId, + maybe_peer: Option, + _addresses: &[Multiaddr], + _effective_role: libp2p::core::Endpoint, + ) -> Result, ConnectionDenied> { + let Some(peer_id) = maybe_peer else { + return Ok(vec![]); + }; + + Ok(self + .config + .p2p_context + .peer_store_lock() + .peer_addresses(&peer_id) + .cloned() + .unwrap_or_default()) + } + + /// Creates the per-connection handler for established outbound connections. + fn handle_established_outbound_connection( + &mut self, + _connection_id: ConnectionId, + peer: PeerId, + _addr: &Multiaddr, + _role_override: libp2p::core::Endpoint, + _port_use: libp2p::core::transport::PortUse, + ) -> Result, ConnectionDenied> { + Ok(self.connection_handler_for_peer(peer)) + } + + /// Flushes or fails pending sends based on swarm connection events. 
+ fn on_swarm_event(&mut self, event: FromSwarm) { + match event { + FromSwarm::ConnectionEstablished(event) => { + self.flush_pending_for_peer(event.peer_id); + } + FromSwarm::DialFailure(event) => { + if let Some(peer_id) = event.peer_id { + self.on_dial_failure(peer_id, event.error); + } + } + _ => {} + } + } + + /// Converts handler read/write outcomes into behaviour events. + fn on_connection_handler_event( + &mut self, + peer_id: PeerId, + connection_id: ConnectionId, + event: THandlerOutEvent, + ) { + let event = match event { + Either::Left(event) => event, + Either::Right(unreachable) => match unreachable {}, + }; + + match event { + FromHandler::Received => { + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::Received { + peer: peer_id, + connection: connection_id, + })); + } + FromHandler::InboundError(error) => { + warn!(%peer_id, %error, "dropping invalid qbft p2p message"); + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::InboundError { + peer: peer_id, + connection: connection_id, + error, + })); + } + FromHandler::Sent { request_id } => { + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::Sent { + request_id, + peer: peer_id, + })); + } + FromHandler::SendError { request_id, error } => { + warn!(%peer_id, %error, "failed to send qbft p2p message"); + self.pending_events + .push_back(ToSwarm::GenerateEvent(Event::SendError { + request_id, + peer: peer_id, + error, + })); + } + } + } + + /// Polls command input first, then emits one pending swarm action. 
+ fn poll( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + self.drain_commands(cx); + + if let Some(event) = self.pending_events.pop_front() { + return Poll::Ready(event.map_in(Either::Left)); + } + + Poll::Pending + } +} + +#[cfg(test)] +mod tests { + use std::{ + collections::HashSet, + error::Error as StdError, + fs, + path::{Path, PathBuf}, + process::Stdio, + task::{Context, Poll}, + time::{SystemTime, UNIX_EPOCH}, + }; + + use futures::{StreamExt as _, io::Cursor, task::noop_waker}; + use k256::SecretKey; + use libp2p::{ + Multiaddr, PeerId, + identity::Keypair, + multiaddr::Protocol, + swarm::{ + ConnectionId, DialError, DialFailure, NetworkBehaviour, SwarmEvent, ToSwarm, + dial_opts::PeerCondition, + }, + }; + use prost::bytes::Bytes; + use tokio::{ + io::{AsyncBufReadExt, BufReader, Lines}, + process::{Child, ChildStdout, Command}, + sync::{mpsc, oneshot}, + }; + + use crate::{ + protocols::QBFT_V2_PROTOCOL_ID, + qbft::{ + component::{ + Peer, + tests::{config_base, consensus, duty, secret_key}, + }, + msg, + }, + }; + use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore}, + qbft::{self, SomeMsg}, + }; + use pluto_p2p::{ + behaviours::pluto::PlutoBehaviourEvent, + config::P2PConfig, + p2p::{Node, NodeType}, + p2p_context::{P2PContext, Peer as StoredPeer}, + }; + + use super::*; + + const TEST_TIMEOUT: Duration = Duration::from_secs(10); + const GO_INTEROP_TIMEOUT: Duration = Duration::from_secs(60); + + type TestResult = Result>; + + #[test] + fn protocol_id_matches_qbft_v2() { + assert_eq!(protocol_id().to_string(), QBFT_V2_PROTOCOL_ID); + } + + #[tokio::test] + async fn inbound_handler_decodes_and_calls_consensus_handle() -> TestResult<()> { + let consensus = Arc::new(consensus(0, true)); + let duty = duty(); + let mut recv_rx = consensus.get_instance_io(duty.clone()).take_recv_rx()?; + let msg = signed_consensus_msg(&duty, 1)?; + let mut stream = Cursor::new(Vec::new()); + pluto_p2p::proto::write_protobuf(&mut stream, 
&msg).await?; + stream.set_position(0); + + read_and_handle_inbound( + &mut stream, + Arc::clone(&consensus), + CancellationToken::new(), + RECEIVE_TIMEOUT, + ) + .await + .map_err(std::io::Error::other)?; + + let received = tokio::time::timeout(TEST_TIMEOUT, recv_rx.recv()) + .await? + .ok_or_else(|| std::io::Error::other("receive buffer closed"))?; + assert_eq!(received.msg().peer_idx, 1); + Ok(()) + } + + #[tokio::test] + async fn outbound_broadcast_skips_self_and_targets_non_self_peers() -> TestResult<()> { + let keys = test_keys()?; + let peer_ids = peer_ids(&keys)?; + let local_peer_id = peer_ids[1]; + let p2p_context = connected_context(&peer_ids)?; + let (mut behaviour, handle) = Behaviour::new(Config { + consensus: Arc::new(consensus(1, true)), + p2p_context, + peers: peer_ids.clone(), + local_peer_id, + cancellation: CancellationToken::new(), + })?; + + handle + .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 1)?) + .await?; + + let events = drain_behaviour_events(&mut behaviour); + let targets = events + .iter() + .filter_map(|event| match event { + ToSwarm::NotifyHandler { + peer_id, + event: Either::Left(ToHandler::Send { .. }), + .. + } => Some(*peer_id), + _ => None, + }) + .collect::>(); + let queued = events.iter().find_map(|event| match event { + ToSwarm::GenerateEvent(Event::BroadcastQueued { target_count, .. 
}) => { + Some(*target_count) + } + _ => None, + }); + + assert_eq!(queued, Some(2)); + assert_eq!(targets.len(), 2); + assert!(targets.contains(&peer_ids[0])); + assert!(targets.contains(&peer_ids[2])); + assert!(!targets.contains(&local_peer_id)); + Ok(()) + } + + #[tokio::test] + async fn dial_peer_condition_false_preserves_pending_send() -> TestResult<()> { + let keys = test_keys()?; + let peer_ids = peer_ids(&keys)?[..2].to_vec(); + let local_peer_id = peer_ids[0]; + let target = peer_ids[1]; + let (mut behaviour, handle) = Behaviour::new(Config { + consensus: Arc::new(consensus(0, true)), + p2p_context: P2PContext::new(peer_ids.iter().copied()), + peers: peer_ids, + local_peer_id, + cancellation: CancellationToken::new(), + })?; + handle + .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) + .await?; + let _ = drain_behaviour_events(&mut behaviour); + + let error = DialError::DialPeerConditionFalse(PeerCondition::DisconnectedAndNotDialing); + behaviour.on_swarm_event(FromSwarm::DialFailure(DialFailure { + peer_id: Some(target), + error: &error, + connection_id: ConnectionId::new_unchecked(1), + })); + let events = drain_behaviour_events(&mut behaviour); + + assert!(behaviour.pending_by_peer.contains_key(&target)); + assert!(!events.iter().any(|event| { + matches!( + event, + ToSwarm::GenerateEvent(Event::SendError { peer, .. }) if *peer == target + ) + })); + Ok(()) + } + + #[tokio::test] + async fn terminal_dial_failure_reports_pending_send_error() -> TestResult<()> { + let keys = test_keys()?; + let peer_ids = peer_ids(&keys)?[..2].to_vec(); + let local_peer_id = peer_ids[0]; + let target = peer_ids[1]; + let (mut behaviour, handle) = Behaviour::new(Config { + consensus: Arc::new(consensus(0, true)), + p2p_context: P2PContext::new(peer_ids.iter().copied()), + peers: peer_ids, + local_peer_id, + cancellation: CancellationToken::new(), + })?; + handle + .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) 
+ .await?; + let _ = drain_behaviour_events(&mut behaviour); + + let error = DialError::NoAddresses; + behaviour.on_swarm_event(FromSwarm::DialFailure(DialFailure { + peer_id: Some(target), + error: &error, + connection_id: ConnectionId::new_unchecked(1), + })); + let events = drain_behaviour_events(&mut behaviour); + + assert!(!behaviour.pending_by_peer.contains_key(&target)); + assert!(events.iter().any(|event| { + matches!( + event, + ToSwarm::GenerateEvent(Event::SendError { peer, .. }) if *peer == target + ) + })); + Ok(()) + } + + #[tokio::test] + async fn framing_round_trips_qbft_consensus_msg() -> TestResult<()> { + let msg = signed_consensus_msg(&duty(), 1)?; + let mut stream = Cursor::new(Vec::new()); + + pluto_p2p::proto::write_protobuf(&mut stream, &msg).await?; + stream.set_position(0); + let decoded = pluto_p2p::proto::read_protobuf_with_max_size::< + pbconsensus::QbftConsensusMsg, + _, + >(&mut stream, pluto_p2p::proto::MAX_MESSAGE_SIZE) + .await?; + + assert_eq!(decoded, msg); + Ok(()) + } + + #[tokio::test] + async fn real_libp2p_loopback_uses_stream_framing() -> TestResult<()> { + let keys = test_keys()?; + let peer_ids = peer_ids(&keys)?; + let mut nodes = build_nodes(keys, peer_ids.clone())?; + let mut node0_recv = nodes + .get_mut(0) + .and_then(|node| node.recv_rx.take()) + .ok_or_else(|| std::io::Error::other("missing node 0 receiver"))?; + let mut node1_recv = nodes + .get_mut(1) + .and_then(|node| node.recv_rx.take()) + .ok_or_else(|| std::io::Error::other("missing node 1 receiver"))?; + let handle = nodes + .first() + .map(|node| node.handle.clone()) + .ok_or_else(|| std::io::Error::other("missing node 0 handle"))?; + + let (listen_tx, mut listen_rx) = mpsc::unbounded_channel(); + let (conn_tx, mut conn_rx) = mpsc::unbounded_channel(); + let (event_tx, mut event_rx) = mpsc::unbounded_channel(); + let (task_err_tx, mut task_err_rx) = mpsc::unbounded_channel(); + let running = spawn_nodes(nodes, listen_tx, conn_tx, event_tx, task_err_tx)?; + 
let addrs = wait_for_listen_addrs(&mut listen_rx, &mut task_err_rx).await?; + dial_forward_pairs(&running, &addrs)?; + wait_for_connections(&mut conn_rx, &peer_ids).await?; + + let network_msg = signed_consensus_msg(&duty(), 0)?; + handle + .broadcast(CancellationToken::new(), network_msg.clone()) + .await?; + + wait_for_event(&mut event_rx, 1, |event| { + matches!(event, Event::Received { .. }) + }) + .await?; + let received = tokio::time::timeout(TEST_TIMEOUT, node1_recv.recv()) + .await? + .ok_or_else(|| std::io::Error::other("node 1 receive buffer closed"))?; + + assert_eq!( + received.msg(), + network_msg.msg.as_ref().ok_or_else(|| { + std::io::Error::other("test message missing inner qbft message") + })? + ); + assert!(matches!( + node0_recv.try_recv(), + Err(tokio::sync::mpsc::error::TryRecvError::Empty) + )); + + stop_nodes(running).await?; + Ok(()) + } + + #[tokio::test] + #[ignore = "requires local Charon source, Go toolchain, and local TCP sockets"] + async fn mixed_charon_pluto_libp2p_interop() -> TestResult<()> { + let keys = test_keys_n(4)?; + let peer_ids = peer_ids(&keys)?; + let mut nodes = build_pluto_nodes(keys[..2].to_vec(), peer_ids.clone())?; + let mut node0_recv = nodes + .get_mut(0) + .and_then(|node| node.recv_rx.take()) + .ok_or_else(|| std::io::Error::other("missing node 0 receiver"))?; + let mut node1_recv = nodes + .get_mut(1) + .and_then(|node| node.recv_rx.take()) + .ok_or_else(|| std::io::Error::other("missing node 1 receiver"))?; + let handle0 = nodes + .first() + .map(|node| node.handle.clone()) + .ok_or_else(|| std::io::Error::other("missing node 0 handle"))?; + let handle1 = nodes + .get(1) + .map(|node| node.handle.clone()) + .ok_or_else(|| std::io::Error::other("missing node 1 handle"))?; + + let (listen_tx, mut listen_rx) = mpsc::unbounded_channel(); + let (conn_tx, mut conn_rx) = mpsc::unbounded_channel(); + let (event_tx, mut event_rx) = mpsc::unbounded_channel(); + let (task_err_tx, mut task_err_rx) = 
mpsc::unbounded_channel(); + let running = spawn_nodes(nodes, listen_tx, conn_tx, event_tx, task_err_tx)?; + let rust_addrs = wait_for_listen_addrs(&mut listen_rx, &mut task_err_rx).await?; + + let harness_dir = write_go_interop_harness()?; + let mut child = spawn_go_interop(&harness_dir, &rust_addrs)?; + let stdout = child + .stdout + .take() + .ok_or_else(|| std::io::Error::other("missing go harness stdout"))?; + let mut go_lines = BufReader::new(stdout).lines(); + + let result = + async { + let go_addrs = wait_for_go_ready(&mut go_lines).await?; + dial_go_peers(&running, &go_addrs)?; + wait_for_specific_connections(&mut conn_rx, &[0, 1], &peer_ids[2..4]).await?; + + wait_for_sources(&mut node0_recv, &mut event_rx, 0, &[2, 3]).await?; + wait_for_sources(&mut node1_recv, &mut event_rx, 1, &[2, 3]).await?; + + handle0 + .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) + .await?; + handle1 + .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 1)?) + .await?; + + wait_for_event(&mut event_rx, 0, |event| { + matches!(event, Event::Sent { peer, .. } if peer_ids[2..4].contains(peer)) + }) + .await?; + wait_for_event(&mut event_rx, 1, |event| { + matches!(event, Event::Sent { peer, .. 
} if peer_ids[2..4].contains(peer)) + }) + .await?; + wait_for_go_done(&mut go_lines).await + } + .await; + + drop(go_lines); + let status = finish_go_interop(&mut child, result.is_err()).await; + let cleanup = fs::remove_dir_all(&harness_dir); + stop_nodes(running).await?; + result?; + status?; + cleanup?; + Ok(()) + } + + struct LocalNode { + node: Node, + handle: Handle, + recv_rx: Option>, + } + + struct RunningNode { + dial_tx: mpsc::UnboundedSender>, + stop_tx: oneshot::Sender<()>, + join: tokio::task::JoinHandle>, + } + + fn build_nodes(keys: Vec, peer_ids: Vec) -> TestResult> { + build_pluto_nodes(keys.into_iter().take(2).collect(), peer_ids) + } + + fn build_pluto_nodes( + keys: Vec, + peer_ids: Vec, + ) -> TestResult> { + let mut nodes = Vec::with_capacity(2); + for (index, key) in keys.into_iter().enumerate() { + let p2p_context = P2PContext::new(peer_ids.iter().copied()); + let consensus = Arc::new(consensus_for_cluster(index, peer_ids.len(), true)?); + let mut recv_rx = Some(consensus.get_instance_io(duty()).take_recv_rx()?); + let (behaviour, handle) = Behaviour::new(Config { + consensus, + p2p_context: p2p_context.clone(), + peers: peer_ids.clone(), + local_peer_id: peer_ids[index], + cancellation: CancellationToken::new(), + })?; + let node = Node::new_server( + P2PConfig::default(), + key, + NodeType::TCP, + false, + p2p_context, + None, + move |builder, _keypair| builder.with_inner(behaviour), + )?; + + nodes.push(LocalNode { + node, + handle, + recv_rx: recv_rx.take(), + }); + } + + Ok(nodes) + } + + fn consensus_for_cluster( + local_peer_idx: usize, + peer_count: usize, + duty_allowed: bool, + ) -> TestResult { + let mut config = config_base(false); + config.peers = (0..peer_count) + .map(|index| { + let seed = u8::try_from( + index + .checked_add(1) + .ok_or_else(|| std::io::Error::other("peer index overflow"))?, + )?; + Ok(Peer { + index: i64::try_from(index)?, + name: format!("node-{index}"), + public_key: test_secret_key(seed)?.public_key(), 
+ }) + }) + .collect::>>()?; + config.local_peer_idx = i64::try_from(local_peer_idx)?; + let seed = u8::try_from( + local_peer_idx + .checked_add(1) + .ok_or_else(|| std::io::Error::other("local peer index overflow"))?, + )?; + config.privkey = test_secret_key(seed)?; + config.duty_gater = Arc::new(move |_| duty_allowed); + + Consensus::new(config).map_err(|error| Box::new(error) as _) + } + + fn spawn_nodes( + nodes: Vec, + listen_tx: mpsc::UnboundedSender<(usize, Multiaddr)>, + conn_tx: mpsc::UnboundedSender<(usize, PeerId)>, + event_tx: mpsc::UnboundedSender<(usize, Event)>, + task_err_tx: mpsc::UnboundedSender<(usize, String)>, + ) -> TestResult> { + let mut running = Vec::with_capacity(nodes.len()); + + for (index, local) in nodes.into_iter().enumerate() { + let mut node = local.node; + let listen_tx = listen_tx.clone(); + let conn_tx = conn_tx.clone(); + let event_tx = event_tx.clone(); + let task_err_tx = task_err_tx.clone(); + let (dial_tx, mut dial_rx) = mpsc::unbounded_channel::>(); + let (stop_tx, mut stop_rx) = oneshot::channel(); + + let join = tokio::spawn(async move { + let result: TestResult<()> = async { + node.listen_on("/ip4/127.0.0.1/tcp/0".parse()?)?; + + loop { + tokio::select! { + _ = &mut stop_rx => break, + Some(targets) = dial_rx.recv() => { + for target in targets { + node.dial(target)?; + } + } + event = node.select_next_some() => { + match event { + SwarmEvent::NewListenAddr { address, .. } => { + let _ = listen_tx.send((index, address)); + } + SwarmEvent::ConnectionEstablished { peer_id, .. 
} => { + let _ = conn_tx.send((index, peer_id)); + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(event)) => { + let _ = event_tx.send((index, event)); + } + _ => {} + } + } + } + } + + Ok(()) + } + .await; + + if let Err(error) = &result { + let _ = task_err_tx.send((index, format!("{error:?}"))); + } + + result + }); + + running.push(RunningNode { + dial_tx, + stop_tx, + join, + }); + } + + Ok(running) + } + + async fn wait_for_listen_addrs( + listen_rx: &mut mpsc::UnboundedReceiver<(usize, Multiaddr)>, + task_err_rx: &mut mpsc::UnboundedReceiver<(usize, String)>, + ) -> TestResult> { + tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + let mut addrs = vec![None, None]; + while addrs.iter().any(Option::is_none) { + tokio::select! { + result = listen_rx.recv() => { + let (index, addr) = result + .ok_or_else(|| std::io::Error::other("listen channel closed"))?; + if index < addrs.len() && addrs[index].is_none() { + addrs[index] = Some(addr); + } + } + result = task_err_rx.recv() => { + let (index, error) = result + .ok_or_else(|| std::io::Error::other("node task error channel closed"))?; + return Err(Box::new(std::io::Error::other(format!( + "node {index} exited before listen: {error}" + ))) as Box); + } + } + } + + addrs + .into_iter() + .map(|addr| { + addr.ok_or_else(|| { + Box::new(std::io::Error::other("missing listen address")) + as Box + }) + }) + .collect() + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for listen addresses"))? 
+ } + + fn dial_forward_pairs(running: &[RunningNode], addrs: &[Multiaddr]) -> TestResult<()> { + for (index, node) in running.iter().enumerate() { + let targets = addrs + .iter() + .enumerate() + .filter(|(other, _)| *other > index) + .map(|(_, addr)| addr.clone()) + .collect::>(); + node.dial_tx.send(targets)?; + } + + Ok(()) + } + + async fn wait_for_connections( + conn_rx: &mut mpsc::UnboundedReceiver<(usize, PeerId)>, + peer_ids: &[PeerId], + ) -> TestResult<()> { + tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + let mut seen = [HashSet::new(), HashSet::new()]; + while seen.iter().any(|peers| peers.is_empty()) { + let (index, peer_id) = conn_rx + .recv() + .await + .ok_or_else(|| std::io::Error::other("connection channel closed"))?; + if index < seen.len() && peer_ids.contains(&peer_id) { + seen[index].insert(peer_id); + } + } + + Ok(()) + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for loopback connections"))? + } + + async fn wait_for_specific_connections( + conn_rx: &mut mpsc::UnboundedReceiver<(usize, PeerId)>, + node_indices: &[usize], + expected_peers: &[PeerId], + ) -> TestResult<()> { + tokio::time::timeout(TEST_TIMEOUT, async { + let mut seen = vec![HashSet::new(); node_indices.len()]; + while seen.iter().any(|peers| peers.len() < expected_peers.len()) { + let (index, peer_id) = conn_rx + .recv() + .await + .ok_or_else(|| std::io::Error::other("connection channel closed"))?; + if let Some(position) = node_indices.iter().position(|node| *node == index) + && expected_peers.contains(&peer_id) + { + seen[position].insert(peer_id); + } + } + + Ok(()) + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for Go peer connections"))? 
+ } + + async fn wait_for_sources( + recv_rx: &mut mpsc::Receiver, + event_rx: &mut mpsc::UnboundedReceiver<(usize, Event)>, + node_index: usize, + expected_sources: &[i64], + ) -> TestResult<()> { + tokio::time::timeout(TEST_TIMEOUT, async { + let mut seen = HashSet::new(); + while seen.len() < expected_sources.len() { + tokio::select! { + msg = recv_rx.recv() => { + let msg = msg.ok_or_else(|| std::io::Error::other("receive buffer closed"))?; + if expected_sources.contains(&msg.source()) { + seen.insert(msg.source()); + } + } + event = event_rx.recv() => { + let (index, event) = event.ok_or_else(|| std::io::Error::other("event channel closed"))?; + if index == node_index + && let Event::InboundError { error, .. } = event + { + return Err(Box::new(std::io::Error::other(error)) + as Box); + } + } + } + } + + Ok(()) + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for Charon inbound messages"))? + } + + async fn wait_for_event( + event_rx: &mut mpsc::UnboundedReceiver<(usize, Event)>, + node_index: usize, + predicate: impl Fn(&Event) -> bool, + ) -> TestResult<()> { + tokio::time::timeout(TEST_TIMEOUT, async { + loop { + let (index, event) = event_rx + .recv() + .await + .ok_or_else(|| std::io::Error::other("event channel closed"))?; + if index == node_index && predicate(&event) { + return Ok(()); + } + } + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for QBFT p2p event"))? 
+ } + + async fn stop_nodes(running: Vec) -> TestResult<()> { + for node in running { + let _ = node.stop_tx.send(()); + node.join.await??; + } + + Ok(()) + } + + fn drain_behaviour_events( + behaviour: &mut Behaviour, + ) -> Vec>> { + let waker = noop_waker(); + let mut cx = Context::from_waker(&waker); + let mut events = Vec::new(); + + while let Poll::Ready(event) = NetworkBehaviour::poll(behaviour, &mut cx) { + events.push(event); + } + + events + } + + fn connected_context(peer_ids: &[PeerId]) -> TestResult { + let context = P2PContext::new(peer_ids.iter().copied()); + for (index, peer_id) in peer_ids.iter().copied().enumerate() { + let connection_index = index + .checked_add(1) + .ok_or_else(|| std::io::Error::other("connection index overflow"))?; + context.peer_store_write_lock().add_peer(StoredPeer { + id: peer_id, + connection_id: ConnectionId::new_unchecked(connection_index), + remote_addr: Multiaddr::empty() + .with(Protocol::Memory(u64::try_from(connection_index)?)), + }); + } + + Ok(context) + } + + fn test_keys() -> TestResult> { + test_keys_n(3) + } + + fn test_keys_n(count: u8) -> TestResult> { + let mut keys = Vec::with_capacity(usize::from(count)); + for seed in 1..=count { + keys.push(match seed { + 1 => secret_key(1), + 2 => secret_key(2), + _ => test_secret_key(seed)?, + }); + } + + Ok(keys) + } + + fn test_secret_key(seed: u8) -> TestResult { + SecretKey::from_slice(&[seed; 32]).map_err(|error| Box::new(error) as _) + } + + fn peer_ids(keys: &[SecretKey]) -> TestResult> { + keys.iter().map(peer_id).collect() + } + + fn peer_id(key: &SecretKey) -> TestResult { + let mut der = key.to_sec1_der()?; + Ok(Keypair::secp256k1_from_der(&mut der)?.public().to_peer_id()) + } + + fn signed_consensus_msg( + duty: &pluto_core::types::Duty, + peer_idx: i64, + ) -> TestResult { + let key = match peer_idx { + 0 => secret_key(1), + 1 => secret_key(2), + _ => test_secret_key(u8::try_from( + peer_idx + .checked_add(1) + .ok_or_else(|| std::io::Error::other("peer 
index overflow"))?, + )?)?, + }; + let msg = pbconsensus::QbftMsg { + r#type: i64::from(qbft::MSG_PREPARE), + duty: Some(pbcore::Duty::try_from(duty)?), + peer_idx, + round: 1, + value_hash: Bytes::new(), + prepared_value_hash: Bytes::new(), + ..Default::default() + }; + + Ok(pbconsensus::QbftConsensusMsg { + msg: Some(msg::sign_msg(&msg, &key)?), + justification: Vec::new(), + values: Vec::new(), + }) + } + + type GoLines = Lines>; + + fn dial_go_peers(running: &[RunningNode], go_addrs: &[Multiaddr]) -> TestResult<()> { + for node in running { + node.dial_tx.send(go_addrs.to_vec())?; + } + + Ok(()) + } + + async fn wait_for_go_ready(lines: &mut GoLines) -> TestResult> { + let line = read_go_line(lines, "READY ").await?; + line.strip_prefix("READY ") + .ok_or_else(|| std::io::Error::other("missing go ready prefix"))? + .split_whitespace() + .map(|addr| addr.parse().map_err(|error| Box::new(error) as _)) + .collect() + } + + async fn wait_for_go_done(lines: &mut GoLines) -> TestResult<()> { + tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + loop { + let line = lines + .next_line() + .await? + .ok_or_else(|| std::io::Error::other("go harness stdout closed"))?; + if line == "DONE" { + return Ok(()); + } + } + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for Go DONE"))? + } + + async fn read_go_line(lines: &mut GoLines, prefix: &str) -> TestResult { + tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + loop { + let line = lines + .next_line() + .await? + .ok_or_else(|| std::io::Error::other("go harness stdout closed"))?; + if line.starts_with(prefix) { + return Ok(line); + } + } + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for Go harness output"))? 
+ } + + fn write_go_interop_harness() -> TestResult { + let charon_repo = charon_repo_path(); + if !charon_repo.join("go.mod").exists() { + return Err(Box::new(std::io::Error::other(format!( + "missing Charon repo at {}; set CHARON_REPO", + charon_repo.display() + )))); + } + + let mut dir = std::env::temp_dir(); + let timestamp = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + dir.push(format!( + "pluto-qbft-interop-{}-{timestamp}", + std::process::id() + )); + fs::create_dir(&dir)?; + fs::write(dir.join("main.go"), GO_INTEROP_HARNESS)?; + + Ok(dir) + } + + fn charon_repo_path() -> PathBuf { + std::env::var("CHARON_REPO") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from("/Users/quangle/Documents/nethermind/obol/charon")) + } + + fn spawn_go_interop(harness_dir: &Path, rust_addrs: &[Multiaddr]) -> TestResult { + if rust_addrs.len() != 2 { + return Err(Box::new(std::io::Error::other("expected two rust addrs"))); + } + + Ok(Command::new("go") + .arg("run") + .arg(harness_dir.join("main.go")) + .arg(rust_addrs[0].to_string()) + .arg(rust_addrs[1].to_string()) + .current_dir(charon_repo_path()) + .env("GOWORK", "off") + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn()?) 
+ } + + async fn finish_go_interop(child: &mut Child, kill: bool) -> TestResult<()> { + if kill { + let _ = child.kill().await; + } + + let status = tokio::time::timeout(GO_INTEROP_TIMEOUT, child.wait()).await??; + if !status.success() { + return Err(Box::new(std::io::Error::other(format!( + "go harness exited with {status}" + )))); + } + + Ok(()) + } + + const GO_INTEROP_HARNESS: &str = r#" +package main + +import ( + "bytes" + "context" + "encoding/hex" + "fmt" + "os" + "time" + + k1 "github.com/decred/dcrd/dcrec/secp256k1/v4" + ssz "github.com/ferranbt/fastssz" + "github.com/libp2p/go-libp2p" + libp2pcrypto "github.com/libp2p/go-libp2p/core/crypto" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/peerstore" + "github.com/libp2p/go-libp2p/p2p/security/noise" + "github.com/multiformats/go-multiaddr" + "github.com/obolnetwork/charon/app/k1util" + "github.com/obolnetwork/charon/core" + "github.com/obolnetwork/charon/core/consensus/protocols" + pbv1 "github.com/obolnetwork/charon/core/corepb/v1" + coreqbft "github.com/obolnetwork/charon/core/qbft" + "github.com/obolnetwork/charon/p2p" + "google.golang.org/protobuf/proto" +) + +type received struct { + node int + from int64 +} + +func main() { + if len(os.Args) != 3 { + panic("usage: go run . 
") + } + + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + + keys := make([]*k1.PrivateKey, 4) + peerIDs := make([]peer.ID, 4) + pubkeys := make(map[int64]*k1.PublicKey, 4) + for i := range keys { + keyBytes := bytes.Repeat([]byte{byte(i + 1)}, 32) + keys[i] = k1.PrivKeyFromBytes(keyBytes) + priv := (*libp2pcrypto.Secp256k1PrivateKey)(keys[i]) + id, err := peer.IDFromPrivateKey(priv) + if err != nil { + panic(err) + } + peerIDs[i] = id + pubkeys[int64(i)] = keys[i].PubKey() + } + + rustAddrs := make([]multiaddr.Multiaddr, 2) + for i, arg := range os.Args[1:] { + addr, err := multiaddr.NewMultiaddr(arg) + if err != nil { + panic(err) + } + rustAddrs[i] = addr + } + + recvCh := make(chan received, 16) + hosts := make([]host.Host, 2) + for i := range hosts { + peerIdx := i + 2 + priv := (*libp2pcrypto.Secp256k1PrivateKey)(keys[peerIdx]) + h, err := libp2p.New( + libp2p.Identity(priv), + libp2p.Security(noise.ID, noise.New), + libp2p.ListenAddrStrings("/ip4/127.0.0.1/tcp/0"), + ) + if err != nil { + panic(err) + } + defer h.Close() + hosts[i] = h + + node := peerIdx + p2p.RegisterHandler("qbft-interop", h, protocols.QBFTv2ProtocolID, + func() proto.Message { return new(pbv1.QBFTConsensusMsg) }, + func(_ context.Context, _ peer.ID, req proto.Message) (proto.Message, bool, error) { + msg, ok := req.(*pbv1.QBFTConsensusMsg) + if !ok { + return nil, false, fmt.Errorf("unexpected request %T", req) + } + if err := verifyMsg(msg.GetMsg(), pubkeys); err != nil { + return nil, false, err + } + recvCh <- received{node: node, from: msg.GetMsg().GetPeerIdx()} + return nil, false, nil + }) + } + + goAddrs := make([]string, 2) + for i, h := range hosts { + if len(h.Addrs()) == 0 { + panic("go host has no listen address") + } + peerPart, err := multiaddr.NewMultiaddr("/p2p/" + h.ID().String()) + if err != nil { + panic(err) + } + goAddrs[i] = h.Addrs()[0].Encapsulate(peerPart).String() + } + fmt.Printf("READY %s %s\n", goAddrs[0], 
goAddrs[1]) + + for _, h := range hosts { + for i := range rustAddrs { + h.Peerstore().AddAddrs(peerIDs[i], []multiaddr.Multiaddr{rustAddrs[i]}, peerstore.PermanentAddrTTL) + } + } + + for i, h := range hosts { + peerIdx := int64(i + 2) + for target := 0; target < 2; target++ { + if err := p2p.Send(ctx, h, protocols.QBFTv2ProtocolID, peerIDs[target], signedConsensusMsg(peerIdx, keys[peerIdx])); err != nil { + panic(err) + } + } + } + fmt.Println("SENT") + + seen := map[int]map[int64]bool{ + 2: {}, + 3: {}, + } + for { + if seen[2][0] && seen[2][1] && seen[3][0] && seen[3][1] { + fmt.Println("DONE") + return + } + + select { + case <-ctx.Done(): + panic(ctx.Err()) + case recv := <-recvCh: + if recv.node == 2 || recv.node == 3 { + seen[recv.node][recv.from] = true + fmt.Printf("RECEIVED %d %d\n", recv.node, recv.from) + } + } + } +} + +func signedConsensusMsg(peerIdx int64, key *k1.PrivateKey) *pbv1.QBFTConsensusMsg { + msg := &pbv1.QBFTMsg{ + Type: int64(coreqbft.MsgPrepare), + Duty: &pbv1.Duty{Slot: 42, Type: int32(core.DutyAttester)}, + PeerIdx: peerIdx, + Round: 1, + ValueHash: nil, + PreparedValueHash: nil, + } + signed, err := signMsg(msg, key) + if err != nil { + panic(err) + } + return &pbv1.QBFTConsensusMsg{Msg: signed} +} + +func signMsg(msg *pbv1.QBFTMsg, privkey *k1.PrivateKey) (*pbv1.QBFTMsg, error) { + clone := proto.Clone(msg).(*pbv1.QBFTMsg) + clone.Signature = nil + + hash, err := hashProto(clone) + if err != nil { + return nil, err + } + + clone.Signature, err = k1util.Sign(privkey, hash[:]) + if err != nil { + return nil, err + } + + return clone, nil +} + +func verifyMsg(msg *pbv1.QBFTMsg, pubkeys map[int64]*k1.PublicKey) error { + if msg == nil || msg.GetDuty() == nil { + return fmt.Errorf("invalid consensus message") + } + if typ := coreqbft.MsgType(msg.GetType()); !typ.Valid() { + return fmt.Errorf("invalid consensus message type: %d", typ) + } + if typ := core.DutyType(msg.GetDuty().GetType()); !typ.Valid() { + return fmt.Errorf("invalid 
consensus message duty type: %d", typ) + } + if msg.GetRound() <= 0 { + return fmt.Errorf("invalid consensus message round: %d", msg.GetRound()) + } + if msg.GetPreparedRound() < 0 { + return fmt.Errorf("invalid consensus message prepared round") + } + + pubkey, ok := pubkeys[msg.GetPeerIdx()] + if !ok { + return fmt.Errorf("invalid peer index: %d", msg.GetPeerIdx()) + } + ok, err := verifyMsgSig(msg, pubkey) + if err != nil { + return err + } + if !ok { + return fmt.Errorf("invalid consensus message signature") + } + return nil +} + +func verifyMsgSig(msg *pbv1.QBFTMsg, pubkey *k1.PublicKey) (bool, error) { + clone := proto.Clone(msg).(*pbv1.QBFTMsg) + signature := clone.GetSignature() + if len(signature) == 0 { + return false, fmt.Errorf("empty signature") + } + clone.Signature = nil + + hash, err := hashProto(clone) + if err != nil { + return false, err + } + recovered, err := k1util.Recover(hash[:], signature) + if err != nil { + return false, err + } + return hex.EncodeToString(recovered.SerializeCompressed()) == hex.EncodeToString(pubkey.SerializeCompressed()), nil +} + +func hashProto(msg proto.Message) ([32]byte, error) { + hh := ssz.DefaultHasherPool.Get() + defer ssz.DefaultHasherPool.Put(hh) + + index := hh.Index() + b, err := proto.MarshalOptions{Deterministic: true}.Marshal(msg) + if err != nil { + return [32]byte{}, err + } + + hh.PutBytes(b) + hh.Merkleize(index) + return hh.HashRoot() +} +"#; +} diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs new file mode 100644 index 00000000..1cfd970c --- /dev/null +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -0,0 +1,174 @@ +use std::{ + collections::BTreeMap, + error::Error as StdError, + sync::{Arc, Mutex}, + time::Duration, +}; + +use pluto_core::{ + corepb::v1::core as pbcore, + types::{Duty, DutyType, SlotNumber}, +}; +use prost::bytes::Bytes; +use test_case::test_case; +use tokio::sync::mpsc; +use tokio_util::sync::CancellationToken; + +use 
super::{ + Peer, + component::{self, Config, Consensus}, +}; + +#[test_case(2, 3 ; "two_of_three")] +#[test_case(3, 4 ; "three_of_four")] +#[test_case(4, 4 ; "four_of_four")] +#[test_case(4, 6 ; "four_of_six")] +#[tokio::test] +async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { + assert!(threshold <= cluster_nodes); + let (sniffed_tx, mut sniffed_rx) = mpsc::unbounded_channel(); + let active_nodes = in_memory_network(threshold, sniffed_tx); + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); + let ct = CancellationToken::new(); + let start_ct = CancellationToken::new(); + let mut expired_txs = Vec::with_capacity(active_nodes.len()); + let mut start_tasks = Vec::with_capacity(active_nodes.len()); + + for (node_idx, node) in active_nodes.iter().enumerate() { + let decided_tx = decided_tx.clone(); + node.subscribe(move |duty, value| { + let _ = decided_tx.send((node_idx, duty, value)); + Ok(()) + }); + + let (expired_tx, expired_rx) = mpsc::channel(1); + expired_txs.push(expired_tx); + start_tasks.push(Arc::clone(node).start(start_ct.clone(), expired_rx)); + } + drop(decided_tx); + + let mut tasks = Vec::with_capacity(active_nodes.len()); + for (node_idx, node) in active_nodes.iter().enumerate() { + let node = Arc::clone(node); + let duty = duty.clone(); + let value = unsigned_value(node_idx); + let ct = ct.clone(); + tasks.push(tokio::spawn( + async move { node.propose(&ct, duty, value).await }, + )); + } + + let mut decided = Vec::with_capacity(active_nodes.len()); + for _ in 0..active_nodes.len() { + decided.push(recv_one(&mut decided_rx).await); + } + + for task in tasks { + task.await.unwrap().unwrap(); + } + + decided.sort_by_key(|(node_idx, ..)| *node_idx); + assert_eq!(decided.len(), threshold); + let (_, _, expected_value) = decided.first().expect("at least one decided value").clone(); + for (node_idx, decided_duty, decided_value) in decided { + assert_eq!(decided_duty, 
duty, "node {node_idx} decided wrong duty"); + assert_eq!( + decided_value, expected_value, + "node {node_idx} decided different value" + ); + } + + ct.cancel(); + start_ct.cancel(); + drop(expired_txs); + for task in start_tasks { + task.await.unwrap(); + } + + let mut sniffed = Vec::with_capacity(threshold); + for _ in 0..threshold { + sniffed.push(recv_one(&mut sniffed_rx).await); + } + sniffed.sort_by_key(|(node_idx, _)| *node_idx); + for (node_idx, msg_count) in sniffed { + assert_ne!(msg_count, 0, "node {node_idx} sniffer was empty"); + } +} + +async fn recv_one(rx: &mut mpsc::UnboundedReceiver) -> T { + tokio::time::timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("receiver timed out") + .expect("receiver closed") +} + +fn unsigned_value(seed: usize) -> pbcore::UnsignedDataSet { + let mut set = BTreeMap::new(); + set.insert( + format!("validator-{seed}"), + Bytes::from(format!("unsigned-{seed}")), + ); + pbcore::UnsignedDataSet { set } +} + +fn in_memory_network( + count: usize, + sniffed_tx: mpsc::UnboundedSender<(usize, usize)>, +) -> Vec> { + let peers = (0..count) + .map(|index| Peer { + index: i64::try_from(index).expect("test peer index fits i64"), + name: format!("node-{index}"), + public_key: component::tests::secret_key( + u8::try_from(index.checked_add(1).expect("test peer index increments")) + .expect("test peer index fits u8"), + ) + .public_key(), + }) + .collect::>(); + let nodes = Arc::new(Mutex::new(Vec::>::new())); + + for index in 0..count { + let network = Arc::clone(&nodes); + let broadcaster: component::Broadcaster = Arc::new(move |ct, msg| { + let network = Arc::clone(&network); + Box::pin(async move { + let peer_idx = msg.msg.as_ref().map_or(-1, |msg| msg.peer_idx); + let peers = network.lock().unwrap().clone(); + for (index, consensus) in peers.into_iter().enumerate() { + if i64::try_from(index).expect("test peer index fits i64") == peer_idx { + continue; + } + if let Err(err) = consensus.handle(&ct, 
Some(msg.clone())).await { + return Err(Box::new(err) as Box); + } + } + Ok(()) + }) + }); + let consensus = Arc::new( + Consensus::new(Config { + peers: peers.clone(), + local_peer_idx: i64::try_from(index).expect("test peer index fits i64"), + privkey: component::tests::secret_key( + u8::try_from(index.checked_add(1).expect("test peer index increments")) + .expect("test peer index fits u8"), + ), + broadcaster, + compare_attestations: false, + sniffer: { + let sniffed_tx = sniffed_tx.clone(); + Arc::new(move |instance| { + let _ = sniffed_tx.send((index, instance.msgs.len())); + }) + }, + ..component::tests::config_base(false) + }) + .unwrap(), + ); + nodes.lock().unwrap().push(consensus); + } + + nodes.lock().unwrap().clone() +} diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs new file mode 100644 index 00000000..0bced4c3 --- /dev/null +++ b/crates/consensus/src/qbft/runner.rs @@ -0,0 +1,726 @@ +//! QBFT consensus runner bridge. + +use std::sync::{ + Arc, Mutex, PoisonError, + atomic::{AtomicBool, Ordering}, +}; + +use cancellation::CancellationTokenSource; +use crossbeam::channel as mpmc; +use prost::{Message, Name}; +use prost_types::Any; +use tokio::{sync::mpsc, task::JoinError, time::Instant}; +use tokio_util::sync::CancellationToken; + +use crate::instance::{self, InstanceIo, RunnerError, RunnerResult}; +use pluto_core::{ + corepb::v1::{core as pbcore, priority as pbpriority}, + deadline::AddOutcome, + qbft, + types::{Duty, DutyType}, +}; + +use super::{ + component::Consensus, + definition::{self, DecideCallback, DefinitionConfig}, + msg::{self, ConsensusQbftTypes}, + sniffer::Sniffer, + transport, +}; + +/// Runner result. +pub type Result = std::result::Result; + +/// Runner errors. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Duplicate proposal. + #[error("propose consensus: {0}")] + ProposeConsensus(#[source] instance::Error), + + /// Duplicate participation. 
+ #[error("participate consensus: {0}")] + ParticipateConsensus(#[source] instance::Error), + + /// Input channel was full. + #[error("input channel full")] + InputChannelFull, + + /// Receiver ownership failed. + #[error("{0}")] + Instance(#[from] instance::Error), + + /// Value hashing failed. + #[error("{0}")] + Msg(#[from] msg::Error), + + /// Value packing failed. + #[error("pack proto: {0}")] + PackProto(#[source] prost::EncodeError), + + /// Blocking runner task failed. + #[error("runner join: {0}")] + Join(#[source] JoinError), + + /// Generic QBFT core returned a non-cancellation error. + #[error("core qbft: {0}")] + Core(#[source] qbft::QbftError), + + /// Transport failed while broadcasting or receiving. + #[error("transport: {0}")] + Transport(String), + + /// Running consensus instance finished without a decision. + #[error("consensus timeout")] + ConsensusTimeout, + + /// Running instance result channel closed before completion. + #[error("runner result channel closed")] + RunnerResultChannelClosed, + + /// Running instance completed with an error. + #[error("runner result: {0}")] + RunnerResult(String), +} + +#[derive(Debug, thiserror::Error)] +#[error("{0}")] +struct RunnerResultError(String); + +/// Proposes an unsigned duty data set into a QBFT instance. +pub(crate) async fn propose_unsigned( + consensus: &Consensus, + ct: &CancellationToken, + duty: Duty, + value: pbcore::UnsignedDataSet, +) -> Result<()> { + propose(consensus, ct, duty, value).await +} + +/// Proposes a priority protocol result into a QBFT instance. +pub(crate) async fn propose_priority( + consensus: &Consensus, + ct: &CancellationToken, + duty: Duty, + value: pbpriority::PriorityResult, +) -> Result<()> { + propose(consensus, ct, duty, value).await +} + +/// Hashes and packs the local value, then starts or joins the duty runner. 
+async fn propose( + consensus: &Consensus, + ct: &CancellationToken, + duty: Duty, + value: M, +) -> Result<()> +where + M: Message + Name + Clone + Send + Sync + 'static, +{ + let hash = msg::hash_proto(&value)?; + let any = Any::from_msg(&value).map_err(Error::PackProto)?; + let inst = consensus.get_instance_io(duty.clone()); + + inst.mark_proposed().map_err(Error::ProposeConsensus)?; + try_send_input(&inst.value_tx, any.clone())?; + try_send_input(&inst.hash_tx, hash)?; + if consensus.compare_attestations() { + try_send_input(&inst.verify_tx, any)?; + } + + if !inst.maybe_start() { + return wait_instance_result(&inst).await; + } + + run_instance(consensus, ct, duty, inst).await +} + +/// Starts participating in a duty without a local proposal value. +pub(crate) async fn participate( + consensus: &Consensus, + ct: &CancellationToken, + duty: Duty, +) -> Result<()> { + if matches!( + duty.duty_type, + DutyType::Aggregator | DutyType::SyncContribution + ) { + return Ok(()); + } + + if !consensus_participate_enabled() { + return Ok(()); + } + + let inst = consensus.get_instance_io(duty.clone()); + inst.mark_participated() + .map_err(Error::ParticipateConsensus)?; + + if !inst.maybe_start() { + return Ok(()); + } + + run_instance(consensus, ct, duty, inst).await +} + +/// Runs one consensus instance and publishes its completion result. +pub(crate) async fn run_instance( + consensus: &Consensus, + parent_ct: &CancellationToken, + duty: Duty, + inst: Arc>, +) -> Result<()> { + let result = run_instance_inner(consensus, parent_ct, duty.clone(), Arc::clone(&inst)).await; + let runner_result = to_runner_result(&result); + let _ = inst.err_tx.send(runner_result).await; + consensus.delete_instance_io(&duty); + + result +} + +/// Wires async component state into the generic blocking QBFT core. 
+async fn run_instance_inner( + consensus: &Consensus, + parent_ct: &CancellationToken, + duty: Duty, + inst: Arc>, +) -> Result<()> { + if consensus.add_deadline(duty.clone()).await != AddOutcome::Scheduled { + tracing::warn!(duty = %duty, "Skipping consensus for expired duty"); + return Ok(()); + } + + let outer_rx = inst.take_recv_rx()?; + let hash_rx = inst.take_hash_rx()?; + let value_rx = inst.take_value_rx()?; + let verify_rx = inst.take_verify_rx()?; + + let instance_ct = CancellationToken::new(); + let core_cts = Arc::new(CancellationTokenSource::new()); + let core_ct = core_cts.token().clone(); + let decided = Arc::new(AtomicBool::new(false)); + let transport_error = Arc::new(Mutex::new(None::)); + let runtime = tokio::runtime::Handle::current(); + + let (inner_recv_tx, inner_recv_rx) = mpsc::channel(instance::RECV_BUFFER_SIZE); + let (core_recv_tx, core_recv_rx) = mpmc::unbounded(); + let (core_hash_tx, core_hash_rx) = mpmc::unbounded(); + let (core_verify_tx, core_verify_rx) = mpmc::unbounded(); + + let nodes = consensus.node_count(); + let peer_idx = consensus.get_peer_idx(); + let transport = Arc::new(transport::Transport::new( + transport_broadcaster(consensus.broadcaster()), + consensus.privkey(), + value_rx, + inner_recv_tx, + Sniffer::new(i64::try_from(nodes).expect("node count fits i64"), peer_idx), + )); + + let mut tasks = vec![ + tokio::spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + inner_recv_rx, + core_recv_tx, + )), + tokio::spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + hash_rx, + core_hash_tx, + )), + tokio::spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + verify_rx, + core_verify_tx, + )), + ]; + + { + let transport = Arc::clone(&transport); + let instance_ct = instance_ct.clone(); + let transport_error = Arc::clone(&transport_error); + tasks.push(tokio::spawn(async move { + if let Err(err) = transport.process_receives(instance_ct, outer_rx).await { + *transport_error + .lock() + 
.unwrap_or_else(PoisonError::into_inner) = Some(err.to_string()); + } + })); + } + + { + let parent_ct = parent_ct.clone(); + let instance_ct = instance_ct.clone(); + let core_cts = Arc::clone(&core_cts); + tasks.push(tokio::spawn(async move { + tokio::select! { + () = parent_ct.cancelled() => { + instance_ct.cancel(); + core_cts.cancel(); + } + () = instance_ct.cancelled() => core_cts.cancel(), + } + })); + } + + let decide_callback: DecideCallback = { + let decided = Arc::clone(&decided); + let decided_at_tx = inst.decided_at_tx.clone(); + let instance_ct = instance_ct.clone(); + let core_cts = Arc::clone(&core_cts); + Arc::new(move |_qcommit| { + decided.store(true, Ordering::Relaxed); + let _ = decided_at_tx.try_send(Instant::now()); + instance_ct.cancel(); + core_cts.cancel(); + }) + }; + + let def = definition::new_definition(DefinitionConfig { + nodes, + subscribers: consensus.subscribers(), + round_timer: consensus.round_timer(duty.clone()), + decide_callback, + compare_attestations: consensus.compare_attestations(), + runtime: runtime.clone(), + }); + + let core_transport: qbft::Transport = qbft::Transport { + broadcast: Box::new({ + let transport = Arc::clone(&transport); + let runtime = runtime.clone(); + let instance_ct = instance_ct.clone(); + let transport_error = Arc::clone(&transport_error); + move |request| { + let justification = request.justification.cloned().unwrap_or_default(); + let result = runtime.block_on(transport.broadcast(transport::BroadcastRequest { + ct: instance_ct.clone(), + type_: request.type_, + duty: request.instance.clone(), + peer_idx: request.source, + round: request.round, + value_hash: *request.value, + prepared_round: request.prepared_round, + prepared_value_hash: *request.prepared_value, + justification, + })); + + match result { + Ok(()) => Ok(()), + Err(err) => { + *transport_error + .lock() + .unwrap_or_else(PoisonError::into_inner) = Some(err.to_string()); + Err(qbft::QbftError::ContextCanceled) + } + } + } + }), + 
receive: core_recv_rx, + }; + + let duty_for_run = duty.clone(); + let core_result = tokio::task::spawn_blocking(move || { + qbft::run( + &core_ct, + &def, + &core_transport, + &duty_for_run, + peer_idx, + core_hash_rx, + core_verify_rx, + ) + }) + .await + .map_err(Error::Join)?; + + instance_ct.cancel(); + for task in tasks { + let _ = task.await; + } + + let sniffer = consensus.sniffer(); + sniffer(transport.sniffer_instance()); + + if let Some(err) = transport_error + .lock() + .unwrap_or_else(PoisonError::into_inner) + .take() + { + return Err(Error::Transport(err)); + } + + match core_result { + Ok(()) => Ok(()), + Err(qbft::QbftError::ContextCanceled) if decided.load(Ordering::Relaxed) => Ok(()), + Err(qbft::QbftError::ContextCanceled) => Err(Error::ConsensusTimeout), + Err(err) => Err(Error::Core(err)), + } +} + +/// Sends a one-shot local input into an instance channel without waiting. +fn try_send_input(tx: &mpsc::Sender, value: T) -> Result<()> { + tx.try_send(value).map_err(|_| Error::InputChannelFull) +} + +/// Waits for an already-running instance to finish. +async fn wait_instance_result(inst: &InstanceIo) -> Result<()> { + let mut err_rx = inst.take_err_rx()?; + match err_rx.recv().await { + Some(Ok(())) => Ok(()), + Some(Err(err)) => Err(Error::RunnerResult(err.to_string())), + None => Err(Error::RunnerResultChannelClosed), + } +} + +/// Bridges Tokio channels into the crossbeam channels expected by core QBFT. +async fn bridge_mpsc_to_crossbeam( + ct: CancellationToken, + mut rx: mpsc::Receiver, + tx: mpmc::Sender, +) where + T: Send + 'static, +{ + loop { + let value = tokio::select! { + () = ct.cancelled() => return, + value = rx.recv() => match value { + Some(value) => value, + None => return, + }, + }; + + if tx.send(value).is_err() { + return; + } + } +} + +/// Converts the component broadcaster into the transport broadcaster type. 
+fn transport_broadcaster(broadcaster: super::component::Broadcaster) -> transport::Broadcaster { + Box::new(move |ct, msg| { + let broadcaster = Arc::clone(&broadcaster); + Box::pin(async move { + broadcaster(ct, msg) + .await + .map_err(|err| transport::Error::Broadcast(err.to_string())) + }) + }) +} + +/// Converts a runner result into the channel payload shared with joiners. +fn to_runner_result(result: &Result<()>) -> RunnerResult { + match result { + Ok(()) => Ok(()), + Err(err) => Err(Box::new(RunnerResultError(err.to_string())) as RunnerError), + } +} + +/// Returns whether passive consensus participation is enabled. +fn consensus_participate_enabled() -> bool { + pluto_featureset::GLOBAL_STATE + .read() + .expect("global feature set lock poisoned") + .enabled(pluto_featureset::Feature::ConsensusParticipate) +} + +#[cfg(test)] +mod tests { + use std::{ + mem, + sync::{Arc, Mutex, MutexGuard}, + time::Duration, + }; + + use pluto_featureset::{Config as FeatureConfig, Feature, FeatureSet, GLOBAL_STATE, Status}; + use prost::bytes::Bytes; + use prost_types::Any; + use tokio::sync::mpsc; + + use super::*; + use crate::qbft::component::{self, Config}; + use pluto_core::{corepb::v1::core as pbcore, types::SlotNumber}; + + static FEATURESET_TEST_LOCK: Mutex<()> = Mutex::new(()); + + #[tokio::test] + async fn propose_when_instance_already_running_fills_value_hash_and_verify_channels() { + let consensus = Arc::new( + Consensus::new(Config { + compare_attestations: true, + peers: component::tests::peers(), + ..component::tests::config_base(false) + }) + .unwrap(), + ); + let duty = component::tests::duty(); + let value = unsigned_value(0); + let want_hash = msg::hash_proto(&value).unwrap(); + let want_any = Any::from_msg(&value).unwrap(); + let inst = consensus.get_instance_io(duty.clone()); + assert!(inst.maybe_start()); + let mut value_rx = inst.take_value_rx().unwrap(); + let mut hash_rx = inst.take_hash_rx().unwrap(); + let mut verify_rx = 
inst.take_verify_rx().unwrap(); + + let task = { + let consensus = Arc::clone(&consensus); + let duty = duty.clone(); + let value = value.clone(); + tokio::spawn(async move { + let ct = CancellationToken::new(); + consensus.propose(&ct, duty, value).await + }) + }; + + assert_eq!(recv_one(&mut value_rx).await, want_any); + assert_eq!(recv_one(&mut hash_rx).await, want_hash); + assert_eq!(recv_one(&mut verify_rx).await, want_any); + inst.err_tx.send(Ok(())).await.unwrap(); + task.await.unwrap().unwrap(); + } + + #[tokio::test] + async fn propose_rejects_duplicate_entrypoint() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + inst.mark_proposed().unwrap(); + + let err = consensus + .propose(&CancellationToken::new(), duty, unsigned_value(0)) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "propose consensus: already proposed"); + } + + #[tokio::test] + async fn propose_surfaces_full_input_channel() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + inst.value_tx.try_send(Any::default()).unwrap(); + + let err = consensus + .propose(&CancellationToken::new(), duty, unsigned_value(0)) + .await + .unwrap_err(); + + assert!(matches!(err, Error::InputChannelFull)); + } + + #[tokio::test] + async fn participate_skips_aggregator_and_sync_contribution() { + let consensus = component::tests::consensus(0, true); + let aggregator = Duty::new(SlotNumber::new(1), DutyType::Aggregator); + let sync_contribution = Duty::new(SlotNumber::new(1), DutyType::SyncContribution); + + participate(&consensus, &CancellationToken::new(), aggregator.clone()) + .await + .unwrap(); + participate( + &consensus, + &CancellationToken::new(), + sync_contribution.clone(), + ) + .await + .unwrap(); + + assert!( + consensus + .get_instance_io(aggregator) + .mark_participated() + .is_ok() + ); + 
assert!( + consensus + .get_instance_io(sync_contribution) + .mark_participated() + .is_ok() + ); + } + + #[tokio::test] + async fn participate_skips_when_feature_disabled() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + + let result = with_featureset( + FeatureConfig { + disabled: vec![Feature::ConsensusParticipate], + ..FeatureConfig::default() + }, + || { + futures::executor::block_on(participate( + &consensus, + &CancellationToken::new(), + duty.clone(), + )) + }, + ); + + result.unwrap(); + assert!(consensus.get_instance_io(duty).mark_participated().is_ok()); + } + + #[tokio::test] + async fn participate_rejects_duplicate_entrypoint() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + inst.mark_participated().unwrap(); + + let err = participate(&consensus, &CancellationToken::new(), duty) + .await + .unwrap_err(); + + assert_eq!( + err.to_string(), + "participate consensus: already participated" + ); + } + + #[tokio::test] + async fn run_instance_sends_ok_result_when_deadline_is_not_scheduled() { + let consensus = Consensus::new(Config { + peers: component::tests::peers(), + ..component::tests::config_base(true) + }) + .unwrap(); + let duty = component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + let mut err_rx = inst.take_err_rx().unwrap(); + + run_instance(&consensus, &CancellationToken::new(), duty, inst) + .await + .unwrap(); + + recv_one(&mut err_rx).await.unwrap(); + } + + #[tokio::test] + async fn run_instance_cancels_and_emits_sniffer_on_teardown() { + let sniffed = Arc::new(Mutex::new(Vec::new())); + let consensus = Consensus::new(Config { + peers: component::tests::peers(), + sniffer: { + let sniffed = Arc::clone(&sniffed); + Arc::new(move |instance| sniffed.lock().unwrap().push(instance)) + }, + ..component::tests::config_base(false) + }) + .unwrap(); + let duty = 
component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + let mut err_rx = inst.take_err_rx().unwrap(); + let ct = CancellationToken::new(); + ct.cancel(); + + let err = run_instance(&consensus, &ct, duty, inst).await.unwrap_err(); + + assert!(matches!(err, Error::ConsensusTimeout)); + let runner_err = recv_one(&mut err_rx).await.unwrap_err(); + assert_eq!(runner_err.to_string(), "consensus timeout"); + assert_eq!(sniffed.lock().unwrap().len(), 1); + } + + #[tokio::test] + async fn run_instance_parent_cancel_cancels_broadcast_token() { + let (broadcast_started_tx, mut broadcast_started_rx) = mpsc::channel(1); + let (broadcast_cancelled_tx, mut broadcast_cancelled_rx) = mpsc::channel(1); + let consensus = Consensus::new(Config { + peers: component::tests::peers(), + broadcaster: Arc::new(move |ct, _| { + let broadcast_started_tx = broadcast_started_tx.clone(); + let broadcast_cancelled_tx = broadcast_cancelled_tx.clone(); + Box::pin(async move { + let _ = broadcast_started_tx.send(()).await; + ct.cancelled().await; + let _ = broadcast_cancelled_tx.send(()).await; + Ok(()) + }) + }), + ..component::tests::config_base(false) + }) + .unwrap(); + let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); + let ct = CancellationToken::new(); + let task_ct = ct.clone(); + + let task = + tokio::spawn(async move { consensus.propose(&task_ct, duty, unsigned_value(0)).await }); + + recv_one(&mut broadcast_started_rx).await; + ct.cancel(); + recv_one(&mut broadcast_cancelled_rx).await; + let err = tokio::time::timeout(Duration::from_secs(1), task) + .await + .expect("run instance timed out") + .expect("task panicked") + .unwrap_err(); + assert!( + matches!(err, Error::ConsensusTimeout), + "unexpected error: {err}" + ); + } + + async fn recv_one(rx: &mut mpsc::Receiver) -> T { + tokio::time::timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("receiver timed out") + .expect("receiver closed") + } + + fn unsigned_value(seed: usize) -> 
pbcore::UnsignedDataSet { + let mut set = std::collections::BTreeMap::new(); + set.insert( + format!("validator-{seed}"), + Bytes::from(format!("unsigned-{seed}")), + ); + pbcore::UnsignedDataSet { set } + } + + fn with_featureset(config: FeatureConfig, test: impl FnOnce() -> T) -> T { + let _guard = FeatureSetGuard::new(config); + test() + } + + struct FeatureSetGuard { + previous: Option, + _lock: MutexGuard<'static, ()>, + } + + impl FeatureSetGuard { + fn new(config: FeatureConfig) -> Self { + let lock = FEATURESET_TEST_LOCK + .lock() + .expect("featureset test lock poisoned"); + let replacement = FeatureSet::from_config(FeatureConfig { + min_status: Status::Stable, + ..config + }) + .expect("test featureset is valid"); + let mut global = GLOBAL_STATE + .write() + .expect("global feature set lock poisoned"); + let previous = mem::replace(&mut *global, replacement); + drop(global); + + Self { + previous: Some(previous), + _lock: lock, + } + } + } + + impl Drop for FeatureSetGuard { + fn drop(&mut self) { + if let Some(previous) = self.previous.take() { + *GLOBAL_STATE + .write() + .expect("global feature set lock poisoned") = previous; + } + } + } +} diff --git a/crates/core/src/consensus/qbft/sniffer.rs b/crates/consensus/src/qbft/sniffer.rs similarity index 89% rename from crates/core/src/consensus/qbft/sniffer.rs rename to crates/consensus/src/qbft/sniffer.rs index d3b6f13e..0fb7b85a 100644 --- a/crates/core/src/consensus/qbft/sniffer.rs +++ b/crates/consensus/src/qbft/sniffer.rs @@ -1,8 +1,5 @@ //! QBFT consensus message sniffer. -// TODO: Remove once the consensus component exports sniffer lifecycle hooks. 
-#![allow(dead_code)] - use std::{ sync::{Mutex, PoisonError}, time::SystemTime, @@ -10,9 +7,9 @@ use std::{ use prost_types::Timestamp; -use crate::{ - consensus::protocols::QBFT_V2_PROTOCOL_ID, - corepb::v1::consensus::{QbftConsensusMsg, SniffedConsensusInstance, SniffedConsensusMsg}, +use crate::protocols::QBFT_V2_PROTOCOL_ID; +use pluto_core::corepb::v1::consensus::{ + QbftConsensusMsg, SniffedConsensusInstance, SniffedConsensusMsg, }; /// Buffers consensus messages for the debug API. @@ -65,7 +62,7 @@ impl Sniffer { #[cfg(test)] mod tests { use super::*; - use crate::corepb::v1::consensus::QbftMsg; + use pluto_core::corepb::v1::consensus::QbftMsg; #[test] fn sniffer_add_records_messages() { diff --git a/crates/core/src/consensus/qbft/transport.rs b/crates/consensus/src/qbft/transport.rs similarity index 98% rename from crates/core/src/consensus/qbft/transport.rs rename to crates/consensus/src/qbft/transport.rs index 4f40a1ed..40a9dfc3 100644 --- a/crates/core/src/consensus/qbft/transport.rs +++ b/crates/consensus/src/qbft/transport.rs @@ -1,8 +1,5 @@ //! QBFT consensus transport adapter. -// TODO: Remove once the consensus runner wires this transport. -#![allow(dead_code)] - use std::sync::{self, Mutex, PoisonError}; use futures::future::BoxFuture; @@ -11,7 +8,7 @@ use prost_types::Any; use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; -use crate::{ +use pluto_core::{ corepb::v1::{consensus as pbconsensus, core as pbcore}, qbft::{self, SomeMsg}, types::{Duty, DutyTypeError}, @@ -69,6 +66,10 @@ pub(crate) enum Error { #[error("receive buffer closed")] ReceiveBufferClosed, + /// External broadcaster failed. + #[error("broadcast: {0}")] + Broadcast(String), + /// Consensus message wrapping/signing failed. #[error("{0}")] Msg(#[from] msg::Error), @@ -267,6 +268,10 @@ struct CreateMsgRequest<'a> { } /// Creates a signed consensus QBFT message wrapper. 
+/// +/// This is the final boundary before the generic core message becomes a wire +/// message: it maps the domain duty, signs the raw protobuf, and preserves raw +/// justification protobufs for transport. fn create_msg(request: CreateMsgRequest<'_>) -> Result { let CreateMsgRequest { type_, @@ -312,8 +317,8 @@ fn create_msg(request: CreateMsgRequest<'_>) -> Result { #[cfg(test)] mod tests { use super::*; - use crate::{ - consensus::qbft::{msg::hash_proto, sniffer::Sniffer}, + use crate::qbft::{msg::hash_proto, sniffer::Sniffer}; + use pluto_core::{ corepb::v1::consensus::QbftMsg, qbft::SomeMsg, types::{DutyType, SlotNumber}, diff --git a/crates/core/src/consensus/timer.rs b/crates/consensus/src/timer.rs similarity index 97% rename from crates/core/src/consensus/timer.rs rename to crates/consensus/src/timer.rs index 3baea46d..dc74e7f5 100644 --- a/crates/core/src/consensus/timer.rs +++ b/crates/consensus/src/timer.rs @@ -33,7 +33,7 @@ use std::{ use pluto_featureset::{Feature, GLOBAL_STATE}; use tokio::time::{Instant, sleep_until}; -use crate::types::{Duty, DutyType}; +use pluto_core::types::{Duty, DutyType}; /// Increasing timer round-1 base timeout. pub const INC_ROUND_START: Duration = Duration::from_millis(750); @@ -291,6 +291,7 @@ pub fn get_round_timer_func() -> RoundTimerFunc { } } +/// Returns whether a consensus timer feature is enabled globally. fn feature_enabled(feature: Feature) -> bool { GLOBAL_STATE .read() @@ -298,10 +299,12 @@ fn feature_enabled(feature: Feature) -> bool { .enabled(feature) } +/// Returns true for duties that use the proposer-specific timer path. fn is_proposer(duty: &Duty) -> bool { matches!(&duty.duty_type, DutyType::Proposer) } +/// Returns proposer round-one override duration when enabled. 
fn proposal_timeout_duration(duty: Option<&Duty>, round: i64) -> Option { if round == 1 && duty.is_some_and(is_proposer) && feature_enabled(Feature::ProposalTimeout) { Some(PROPOSAL_TIMEOUT) @@ -310,7 +313,7 @@ fn proposal_timeout_duration(duty: Option<&Duty>, round: i64) -> Option Result { ensure_non_negative_round(round)?; @@ -323,7 +326,7 @@ fn increasing_round_timeout(round: i64) -> Result { .ok_or(Error::DurationOverflow { round }) } -// Returns `LINEAR_ROUND_INC * round` duration for a round. +/// Returns `LINEAR_ROUND_INC * round`. fn linear_round_timeout(round: i64) -> Result { ensure_non_negative_round(round)?; @@ -333,6 +336,7 @@ fn linear_round_timeout(round: i64) -> Result { .ok_or(Error::DurationOverflow { round }) } +/// Returns the reduced timeout used after linear round one. fn linear_subsequent_round_timeout(round: i64) -> Result { ensure_non_negative_round(round)?; @@ -353,6 +357,7 @@ fn linear_subsequent_round_timeout(round: i64) -> Result { Ok(Duration::from_millis(timeout_millis)) } +/// Rejects negative consensus rounds before duration arithmetic. fn ensure_non_negative_round(round: i64) -> Result<()> { if round < 0 { return Err(Error::InvalidRound { round }); @@ -361,12 +366,14 @@ fn ensure_non_negative_round(round: i64) -> Result<()> { Ok(()) } +/// Returns a timeout future scheduled relative to current Tokio time. fn timeout_from_now(timeout: Duration, round: i64) -> Result { let deadline = checked_deadline(Instant::now(), timeout, round)?; Ok(timeout_for_deadline(deadline)) } +/// Returns a future that resolves at an absolute Tokio deadline. fn timeout_for_deadline(deadline: Instant) -> RoundTimerFuture { Box::pin(async move { sleep_until(deadline).await; @@ -374,6 +381,7 @@ fn timeout_for_deadline(deadline: Instant) -> RoundTimerFuture { }) } +/// Adds a timeout to an absolute start time with overflow reporting. 
fn checked_deadline(start: Instant, timeout: Duration, round: i64) -> Result { start .checked_add(timeout) @@ -393,7 +401,7 @@ mod tests { use tokio::{task::JoinHandle, time::advance}; use super::*; - use crate::types::SlotNumber; + use pluto_core::types::SlotNumber; // Feature state is process-global. static FEATURESET_TEST_LOCK: StdMutex<()> = StdMutex::new(()); diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 60a2130c..e281c251 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -17,7 +17,6 @@ dyn-eq.workspace = true futures.workspace = true hex.workspace = true k256.workspace = true -libp2p.workspace = true vise.workspace = true pluto-crypto.workspace = true pluto-eth2api.workspace = true @@ -43,7 +42,6 @@ anyhow.workspace = true alloy.workspace = true clap.workspace = true rand.workspace = true -libp2p.workspace = true prost.workspace = true prost-types.workspace = true hex.workspace = true @@ -51,7 +49,6 @@ chrono.workspace = true test-case.workspace = true pluto-eth2util.workspace = true pluto-cluster.workspace = true -pluto-p2p.workspace = true pluto-testutil.workspace = true pluto-tracing.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/crates/core/src/consensus/qbft/mod.rs b/crates/core/src/consensus/qbft/mod.rs deleted file mode 100644 index 723197f8..00000000 --- a/crates/core/src/consensus/qbft/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! QBFT consensus wrapper. - -/// QBFT protobuf message wrapper. -pub mod msg; - -pub(crate) mod sniffer; -pub(crate) mod transport; diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 58b765bf..bf5a9295 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -11,9 +11,6 @@ pub mod types; /// Signed data wrappers and helpers. pub mod signeddata; -/// Consensus-related functionality. -pub mod consensus; - /// Protobuf definitions. 
pub mod corepb; From 712354d899fc28205a12d95c2b4f72b05843fc9f Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 12:06:55 +0700 Subject: [PATCH 02/21] fix: wait for qbft receive buffer capacity --- crates/consensus/src/qbft/admission.rs | 48 ++++++++++++++++++++++---- crates/consensus/src/qbft/component.rs | 19 +++++----- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/crates/consensus/src/qbft/admission.rs b/crates/consensus/src/qbft/admission.rs index 98957448..bd5d4761 100644 --- a/crates/consensus/src/qbft/admission.rs +++ b/crates/consensus/src/qbft/admission.rs @@ -401,19 +401,55 @@ mod tests { } #[tokio::test] - async fn handle_rejects_full_receive_buffer() { + async fn handle_waits_for_receive_buffer_capacity() { let consensus = consensus(0, true); let inst = consensus.get_instance_io(duty()); + let mut recv_rx = inst.take_recv_rx().unwrap(); for _ in 0..crate::instance::RECV_BUFFER_SIZE { inst.recv_tx.try_send(wrapped_msg()).unwrap(); } - let err = consensus - .handle( - &CancellationToken::new(), - Some(consensus_msg(signed_msg(0))), - ) + let ct = CancellationToken::new(); + let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + tokio::pin!(handle); + + tokio::select! 
{ + result = &mut handle => panic!( + "handle completed while receive buffer was full: {result:?}" + ), + () = tokio::task::yield_now() => {} + } + + recv_rx.recv().await.unwrap(); + tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) + .await + .unwrap() + .unwrap(); + } + + #[tokio::test] + async fn handle_rejects_full_receive_buffer_after_cancellation() { + let consensus = consensus(0, true); + let inst = consensus.get_instance_io(duty()); + let _recv_rx = inst.take_recv_rx().unwrap(); + for _ in 0..crate::instance::RECV_BUFFER_SIZE { + inst.recv_tx.try_send(wrapped_msg()).unwrap(); + } + + let ct = CancellationToken::new(); + let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + tokio::pin!(handle); + + tokio::select! { + result = &mut handle => panic!( + "handle completed while receive buffer was full: {result:?}" + ), + () = tokio::task::yield_now() => {} + } + ct.cancel(); + let err = tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) .await + .unwrap() .unwrap_err(); assert_eq!(err.to_string(), "timeout enqueuing receive buffer"); diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 4d6928b0..19576182 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -8,10 +8,7 @@ use std::{ use futures::future::BoxFuture; use k256::{PublicKey, SecretKey}; -use tokio::{ - sync::{mpsc, mpsc::error::TrySendError}, - task::JoinHandle, -}; +use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; use crate::{ @@ -274,13 +271,13 @@ impl Consensus { return Err(admission::Error::DutyExpired); } - self.get_recv_buffer(duty) - .try_send(wrapped) - .map_err(|err| match err { - TrySendError::Full(_) | TrySendError::Closed(_) => { - admission::Error::TimeoutEnqueuingReceiveBuffer - } - }) + let recv_tx = self.get_recv_buffer(duty); + tokio::select! 
{ + result = recv_tx.send(wrapped) => { + result.map_err(|_| admission::Error::TimeoutEnqueuingReceiveBuffer) + } + () = ct.cancelled() => Err(admission::Error::TimeoutEnqueuingReceiveBuffer), + } } /// Verifies fields and signature for one raw QBFT message. From 1cf0c84a0307179ddd3b3d6b6a82f06038f28c4b Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 13:10:54 +0700 Subject: [PATCH 03/21] fix: implement qbft attester compare --- Cargo.lock | 2 + crates/consensus/Cargo.toml | 2 + crates/consensus/src/qbft/definition.rs | 369 ++++++++++++++++++++++-- crates/core/src/dutydb/memory.rs | 277 +++++++++++++++++- crates/core/src/dutydb/mod.rs | 2 +- crates/core/src/parsigex_codec.rs | 8 + 6 files changed, 641 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 406c1f73..20248431 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5592,11 +5592,13 @@ dependencies = [ "chrono", "crossbeam", "either", + "ethereum_ssz", "futures", "hex", "k256", "libp2p", "pluto-core", + "pluto-eth2api", "pluto-featureset", "pluto-k1util", "pluto-p2p", diff --git a/crates/consensus/Cargo.toml b/crates/consensus/Cargo.toml index a36203b0..803bcbd6 100644 --- a/crates/consensus/Cargo.toml +++ b/crates/consensus/Cargo.toml @@ -28,6 +28,8 @@ tokio-util.workspace = true tracing.workspace = true [dev-dependencies] +pluto-eth2api.workspace = true +ssz.workspace = true test-case.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/crates/consensus/src/qbft/definition.rs b/crates/consensus/src/qbft/definition.rs index 56985079..e97ace6f 100644 --- a/crates/consensus/src/qbft/definition.rs +++ b/crates/consensus/src/qbft/definition.rs @@ -5,18 +5,23 @@ use std::{sync::Arc, time}; use crate::{instance::RECV_BUFFER_SIZE, timer::RoundTimer}; use crossbeam::channel as mpmc; use pluto_core::{ + dutydb::{UnsignedDataSet, UnsignedDutyData, unsigned_data_set_from_proto}, qbft::{self, QbftLogger}, - types::{Duty, DutyType}, + signeddata::AttestationData 
as CoreAttestationData, + types::{Duty, DutyType, PubKey}, }; +use prost_types::Any; use tokio::runtime::Handle; use tokio_util::sync::CancellationToken; use super::{ admission, - component::SubscriberSet, + component::{DecodedValue, SubscriberSet}, msg::{self, ConsensusQbftTypes}, }; +const LOCAL_COMPARE_VALUE_POLL_INTERVAL: time::Duration = time::Duration::from_millis(10); + /// Callback invoked with the decided commit quorum. pub(crate) type DecideCallback = Arc>) + Send + Sync + 'static>; @@ -149,18 +154,132 @@ fn decide( /// Compares proposal values before commit when attester comparison is enabled. fn compare(compare_attestations: bool, request: qbft::CompareRequest<'_, ConsensusQbftTypes>) { - if !compare_attestations { + if !compare_attestations || request.qcommit.instance().duty_type != DutyType::Attester { let _ = request.return_err.send(Ok(())); return; } - if request.qcommit.instance().duty_type != DutyType::Attester { - let _ = request.return_err.send(Ok(())); - return; + let result = compare_attester(&request).map_err(|err| { + tracing::warn!(error = %err, "QBFT attester compare failed"); + qbft::QbftError::CompareError + }); + let _ = request.return_err.send(result); +} + +/// Compares the leader's attestation source/target with the local value. 
+fn compare_attester( + request: &qbft::CompareRequest<'_, ConsensusQbftTypes>, +) -> std::result::Result<(), AttesterCompareError> { + let leader_any = request + .qcommit + .value_source() + .map_err(AttesterCompareError::ValueSource)?; + let leader = decode_attester_set(&leader_any)?; + let local_any = local_compare_value(request)?; + let local = decode_attester_set(&local_any)?; + + for (pubkey, leader_data) in &leader { + let leader_data = attestation_data(leader_data)?; + let Some(local_data) = local.get(pubkey) else { + tracing::warn!(pubkey = %pubkey, "No local attestation found, skipping"); + continue; + }; + let local_data = attestation_data(local_data)?; + + if leader_data.data.source.epoch != local_data.data.source.epoch { + return Err(attestation_mismatch(pubkey, "source epoch")); + } + if leader_data.data.source.root != local_data.data.source.root { + return Err(attestation_mismatch(pubkey, "source root")); + } + if leader_data.data.target.epoch != local_data.data.target.epoch { + return Err(attestation_mismatch(pubkey, "target epoch")); + } + if leader_data.data.target.root != local_data.data.target.root { + return Err(attestation_mismatch(pubkey, "target root")); + } + } + + Ok(()) +} + +/// Returns the cached local compare value or waits for the runner-provided one. +fn local_compare_value( + request: &qbft::CompareRequest<'_, ConsensusQbftTypes>, +) -> std::result::Result { + // The generic QBFT core uses `T::Compare::default()` as the "not cached" + // sentinel. For this adapter that is `Any::default()`. + if request.input_value_source != &Any::default() { + return Ok(request.input_value_source.clone()); + } + + // Poll in short intervals so compare-scoped cancellation is still observed + // while waiting on the blocking local-value channel. 
+ loop { + if request.ct.is_canceled() { + return Err(AttesterCompareError::TimeoutWaitingLocalValue); + } + + match request + .input_value_source_ch + .recv_timeout(LOCAL_COMPARE_VALUE_POLL_INTERVAL) + { + Ok(value) => { + let _ = request.return_value.send(value.clone()); + return Ok(value); + } + Err(mpmc::RecvTimeoutError::Timeout) => {} + Err(mpmc::RecvTimeoutError::Disconnected) => { + return Err(AttesterCompareError::LocalValueChannelClosed); + } + } + } +} + +fn decode_attester_set(any: &Any) -> std::result::Result { + match admission::decode_supported_any(any).map_err(AttesterCompareError::DecodeAny)? { + DecodedValue::UnsignedDataSet(value) => { + unsigned_data_set_from_proto(&DutyType::Attester, &value) + .map_err(AttesterCompareError::DecodeUnsignedDataSet) + } + DecodedValue::PriorityResult(_) => Err(AttesterCompareError::UnexpectedValueType), + } +} + +fn attestation_data( + data: &UnsignedDutyData, +) -> std::result::Result<&CoreAttestationData, AttesterCompareError> { + match data { + UnsignedDutyData::Attestation(data) => Ok(data), + _ => Err(AttesterCompareError::UnexpectedUnsignedDataType), } +} + +fn attestation_mismatch(pubkey: &PubKey, field: &'static str) -> AttesterCompareError { + AttesterCompareError::AttestationMismatch { + pubkey: pubkey.to_string(), + field, + } +} - tracing::warn!("QBFT attester compare deferred: unsigned data domain decoding is unavailable"); - let _ = request.return_err.send(Err(qbft::QbftError::CompareError)); +#[derive(Debug, thiserror::Error)] +enum AttesterCompareError { + #[error("msg has no value source: {0}")] + ValueSource(#[source] qbft::QbftError), + #[error("decode any: {0}")] + DecodeAny(#[source] admission::Error), + #[error("unexpected compare value type")] + UnexpectedValueType, + #[error("timeout on waiting for local value")] + TimeoutWaitingLocalValue, + #[error("local value channel closed")] + LocalValueChannelClosed, + #[error("decode unsigned data set: {0}")] + DecodeUnsignedDataSet(#[source] 
pluto_core::ParSigExCodecError), + #[error("unexpected unsigned data type")] + UnexpectedUnsignedDataType, + #[error("leader attestation {field} differs from local {field}; public_key={pubkey}")] + AttestationMismatch { pubkey: String, field: &'static str }, } /// Adapts an async round timer future into the blocking QBFT core timer type. @@ -373,8 +492,10 @@ mod tests { }, }; - use prost::{Message, Name}; + use pluto_eth2api::spec::phase0; + use prost::{Message, bytes::Bytes}; use prost_types::Any; + use ssz::Encode; use test_case::test_case; use super::*; @@ -384,6 +505,9 @@ mod tests { types::{Duty, DutyType, SlotNumber}, }; + const ATTESTATION_DATA_SSZ_OFFSET: usize = 8; + const ATTESTER_DUTY_SSZ_SIZE: usize = 96; + #[test_case(0, DutyType::Attester, 1, 4, 3 ; "attester_round_1")] #[test_case(42, DutyType::Attester, 1, 4, 1 ; "slot_42_attester")] #[test_case(42, DutyType::Proposer, 3, 4, 2 ; "slot_42_proposer_round_3")] @@ -593,12 +717,121 @@ mod tests { } #[test] - fn compare_defers_attester_source_target_matching() { - let result = run_compare(true, DutyType::Attester); + fn compare_attester_accepts_matching_source_target() { + let leader = unsigned_attestation_set(&pubkey(1), attestation_data()); + let local = leader.clone(); + + let result = run_compare_attester(leader, Some(any_unsigned(&local)), Any::default()); + + assert!(matches!(result, Ok(()))); + } + + #[test_case( + |data: &mut phase0::AttestationData| data.source.epoch = 2 ; + "source_epoch" + )] + #[test_case( + |data: &mut phase0::AttestationData| data.source.root = [3; 32] ; + "source_root" + )] + #[test_case( + |data: &mut phase0::AttestationData| data.target.epoch = 4 ; + "target_epoch" + )] + #[test_case( + |data: &mut phase0::AttestationData| data.target.root = [5; 32] ; + "target_root" + )] + fn compare_attester_rejects_source_target_mismatch(mutate: fn(&mut phase0::AttestationData)) { + let pubkey = pubkey(1); + let leader = unsigned_attestation_set(&pubkey, attestation_data()); + let mut 
local_data = attestation_data(); + mutate(&mut local_data); + let local = unsigned_attestation_set(&pubkey, local_data); + + let result = run_compare_attester(leader, Some(any_unsigned(&local)), Any::default()); assert!(matches!(result, Err(qbft::QbftError::CompareError))); } + #[test] + fn compare_attester_skips_missing_local_attestation() { + let leader = unsigned_attestation_set(&pubkey(1), attestation_data()); + let local = unsigned_attestation_set(&pubkey(2), changed_attestation_data()); + + let result = run_compare_attester(leader, Some(any_unsigned(&local)), Any::default()); + + assert!(matches!(result, Ok(()))); + } + + #[test] + fn compare_attester_waits_for_local_value_and_returns_cache() { + let leader = unsigned_attestation_set(&pubkey(1), attestation_data()); + let local = any_unsigned(&leader); + let cts = cancellation::CancellationTokenSource::new(); + let ct = cts.token().clone(); + let qcommit = qcommit_for_value(component::tests::duty(), any_unsigned(&leader)); + let (input_tx, input_rx) = mpmc::bounded(1); + input_tx.send(local.clone()).unwrap(); + let (return_err_tx, return_err_rx) = mpmc::bounded(1); + let (return_value_tx, return_value_rx) = mpmc::bounded(1); + let input_value = Any::default(); + + compare( + true, + qbft::CompareRequest { + ct: &ct, + qcommit: &qcommit, + input_value_source_ch: &input_rx, + input_value_source: &input_value, + return_err: &return_err_tx, + return_value: &return_value_tx, + }, + ); + + assert!(matches!(return_err_rx.recv().unwrap(), Ok(()))); + assert_eq!(return_value_rx.recv().unwrap(), local); + } + + #[test] + fn compare_attester_uses_cached_local_value() { + let leader = unsigned_attestation_set(&pubkey(1), attestation_data()); + let local = any_unsigned(&leader); + let result = run_compare_attester(leader, None, local); + + assert!(matches!(result, Ok(()))); + } + + #[test] + fn compare_attester_returns_error_when_cancelled_waiting_for_local_value() { + let leader = unsigned_attestation_set(&pubkey(1), 
attestation_data()); + let cts = cancellation::CancellationTokenSource::new(); + cts.cancel(); + let ct = cts.token().clone(); + let qcommit = qcommit_for_value(component::tests::duty(), any_unsigned(&leader)); + let (_input_tx, input_rx) = mpmc::bounded(1); + let (return_err_tx, return_err_rx) = mpmc::bounded(1); + let (return_value_tx, _return_value_rx) = mpmc::bounded(1); + let input_value = Any::default(); + + compare( + true, + qbft::CompareRequest { + ct: &ct, + qcommit: &qcommit, + input_value_source_ch: &input_rx, + input_value_source: &input_value, + return_err: &return_err_tx, + return_value: &return_value_tx, + }, + ); + + assert!(matches!( + return_err_rx.recv().unwrap(), + Err(qbft::QbftError::CompareError) + )); + } + #[tokio::test] async fn new_definition_leader_callback_uses_go_formula() { let consensus = component::tests::consensus(0, true); @@ -684,7 +917,47 @@ mod tests { return_err_rx.recv().unwrap() } + fn run_compare_attester( + leader: pbcore::UnsignedDataSet, + local_from_channel: Option, + cached_local: Any, + ) -> std::result::Result<(), qbft::QbftError> { + let cts = cancellation::CancellationTokenSource::new(); + let ct = cts.token().clone(); + let qcommit = qcommit_for_value(component::tests::duty(), any_unsigned(&leader)); + let (input_tx, input_rx) = mpmc::bounded(1); + if let Some(local) = local_from_channel { + input_tx.send(local).unwrap(); + } + let (return_err_tx, return_err_rx) = mpmc::bounded(1); + let (return_value_tx, _return_value_rx) = mpmc::bounded(1); + + compare( + true, + qbft::CompareRequest { + ct: &ct, + qcommit: &qcommit, + input_value_source_ch: &input_rx, + input_value_source: &cached_local, + return_err: &return_err_tx, + return_value: &return_value_tx, + }, + ); + + return_err_rx.recv().unwrap() + } + fn commit_msg(duty: Duty, hash: [u8; 32], value: Any) -> qbft::Msg { + qcommit_for_hash(duty, hash, value) + } + + fn qcommit_for_value(duty: Duty, value: Any) -> qbft::Msg { + let decoded = 
pbcore::UnsignedDataSet::decode(value.value.as_slice()).unwrap(); + let hash = msg::hash_proto(&decoded).unwrap(); + qcommit_for_hash(duty, hash, value) + } + + fn qcommit_for_hash(duty: Duty, hash: [u8; 32], value: Any) -> qbft::Msg { let values = Arc::new(HashMap::from([(hash, value)])); Arc::new( msg::Msg::new( @@ -732,12 +1005,74 @@ mod tests { pbcore::UnsignedDataSet::default() } - fn any_unsigned(value: &pbcore::UnsignedDataSet) -> Any { - let mut buf = Vec::new(); - value.encode(&mut buf).unwrap(); - Any { - type_url: pbcore::UnsignedDataSet::type_url(), - value: buf, + fn unsigned_attestation_set( + pubkey: &str, + data: phase0::AttestationData, + ) -> pbcore::UnsignedDataSet { + pbcore::UnsignedDataSet { + set: [(pubkey.to_string(), attestation_bytes(&data))].into(), } } + + fn attestation_bytes(data: &phase0::AttestationData) -> Bytes { + let data = data.as_ssz_bytes(); + let duty_offset = ATTESTATION_DATA_SSZ_OFFSET + .checked_add(data.len()) + .expect("test attestation data offset fits usize"); + let capacity = duty_offset + .checked_add(ATTESTER_DUTY_SSZ_SIZE) + .expect("test attestation data length fits usize"); + let mut out = Vec::with_capacity(capacity); + out.extend_from_slice( + &u32::try_from(ATTESTATION_DATA_SSZ_OFFSET) + .expect("test attestation data offset fits u32") + .to_le_bytes(), + ); + out.extend_from_slice( + &u32::try_from(duty_offset) + .expect("test attestation duty offset fits u32") + .to_le_bytes(), + ); + out.extend_from_slice(&data); + out.extend_from_slice(&[0; ATTESTER_DUTY_SSZ_SIZE]); + Bytes::from(out) + } + + fn attestation_data() -> phase0::AttestationData { + phase0::AttestationData { + slot: 1, + index: 2, + beacon_block_root: [3; 32], + source: phase0::Checkpoint { + epoch: 4, + root: [5; 32], + }, + target: phase0::Checkpoint { + epoch: 6, + root: [7; 32], + }, + } + } + + fn changed_attestation_data() -> phase0::AttestationData { + phase0::AttestationData { + source: phase0::Checkpoint { + epoch: 8, + root: [9; 32], 
+ }, + target: phase0::Checkpoint { + epoch: 10, + root: [11; 32], + }, + ..attestation_data() + } + } + + fn any_unsigned(value: &pbcore::UnsignedDataSet) -> Any { + Any::from_msg(value).unwrap() + } + + fn pubkey(seed: u8) -> String { + format!("0x{}", hex::encode([seed; 48])) + } } diff --git a/crates/core/src/dutydb/memory.rs b/crates/core/src/dutydb/memory.rs index 01a68b86..75df01fe 100644 --- a/crates/core/src/dutydb/memory.rs +++ b/crates/core/src/dutydb/memory.rs @@ -8,19 +8,28 @@ use pluto_eth2api::{ spec::{altair, phase0}, versioned, }; +use pluto_ssz::decode::{decode_u32, decode_u64}; +use serde::{Deserialize, Deserializer, de}; +use ssz::Decode; use tokio::sync::{Notify, RwLock, mpsc}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use tree_hash::TreeHash; use crate::{ + ParSigExCodecError, + corepb::v1::core as pbcore, deadline::{AddOutcome, DeadlinerHandle}, signeddata::{ - AttestationData, SyncContribution, VersionedAggregatedAttestation, VersionedProposal, + AttestationData, AttesterDuty, SyncContribution, VersionedAggregatedAttestation, + VersionedProposal, }, types::{Duty, DutyType, PubKey}, }; +const ATTESTATION_DATA_SSZ_OFFSET: usize = 8; +const ATTESTER_DUTY_SSZ_SIZE: usize = 96; + /// Error type for DutyDB operations. #[derive(Debug, thiserror::Error)] pub enum Error { @@ -146,6 +155,167 @@ pub enum UnsignedDutyData { /// `core.UnsignedDataSet`. pub type UnsignedDataSet = HashMap; +/// Converts an unsigned-data-set protobuf into domain unsigned duty data. +/// Currently decodes attester data; other duty types return unsupported. 
+pub fn unsigned_data_set_from_proto( + duty_type: &DutyType, + set: &pbcore::UnsignedDataSet, +) -> std::result::Result { + if set.set.is_empty() { + return Err(ParSigExCodecError::InvalidUnsignedDataSetFields); + } + + let mut out = UnsignedDataSet::with_capacity(set.set.len()); + for (pubkey, data) in &set.set { + let pubkey = PubKey::try_from(pubkey.as_str()) + .map_err(|_| ParSigExCodecError::InvalidPubKey(pubkey.clone()))?; + out.insert(pubkey, unsigned_duty_data_from_proto(duty_type, data)?); + } + + Ok(out) +} + +fn unsigned_duty_data_from_proto( + duty_type: &DutyType, + data: &[u8], +) -> std::result::Result { + match duty_type { + DutyType::Attester => decode_attestation_data(data).map(UnsignedDutyData::Attestation), + _ => Err(ParSigExCodecError::UnsupportedDutyType), + } +} + +fn decode_attestation_data( + data: &[u8], +) -> std::result::Result { + if let Ok(data) = decode_attestation_data_ssz(data) { + return Ok(data); + } + + if data.iter().find(|b| !b.is_ascii_whitespace()).copied() == Some(b'{') { + let decoded: AttestationDataJson = + serde_json::from_slice(data).map_err(ParSigExCodecError::from)?; + return Ok(AttestationData { + data: decoded.attestation_data, + duty: decoded.attestation_duty.into(), + }); + } + + Err(ParSigExCodecError::UnsignedData( + "unmarshal attestation data".to_string(), + )) +} + +fn decode_attestation_data_ssz( + data: &[u8], +) -> std::result::Result { + if data.len() < ATTESTATION_DATA_SSZ_OFFSET { + return Err(ParSigExCodecError::UnsignedData( + "attestation data too short".to_string(), + )); + } + + let data_offset = usize::try_from( + decode_u32(&data[..4]).map_err(|err| ParSigExCodecError::UnsignedData(err.to_string()))?, + ) + .map_err(|err| ParSigExCodecError::UnsignedData(err.to_string()))?; + let duty_offset = usize::try_from( + decode_u32(&data[4..ATTESTATION_DATA_SSZ_OFFSET]) + .map_err(|err| ParSigExCodecError::UnsignedData(err.to_string()))?, + ) + .map_err(|err| 
ParSigExCodecError::UnsignedData(err.to_string()))?; + + if data_offset != ATTESTATION_DATA_SSZ_OFFSET + || duty_offset < data_offset + || duty_offset > data.len() + || data.len().saturating_sub(duty_offset) < ATTESTER_DUTY_SSZ_SIZE + { + return Err(ParSigExCodecError::UnsignedData( + "attestation data offset".to_string(), + )); + } + + let attestation_data = phase0::AttestationData::from_ssz_bytes(&data[data_offset..duty_offset]) + .map_err(|err| ParSigExCodecError::UnsignedData(format!("{err:?}")))?; + let duty = decode_attester_duty_ssz(&data[duty_offset..])?; + + Ok(AttestationData { + data: attestation_data, + duty, + }) +} + +fn decode_attester_duty_ssz(data: &[u8]) -> std::result::Result { + if data.len() < ATTESTER_DUTY_SSZ_SIZE { + return Err(ParSigExCodecError::UnsignedData( + "attester duty too short".to_string(), + )); + } + + let field = |start, end| { + decode_u64(&data[start..end]) + .map_err(|err| ParSigExCodecError::UnsignedData(err.to_string())) + }; + + Ok(AttesterDuty { + slot: field(48, 56)?, + validator_index: field(56, 64)?, + committee_index: field(64, 72)?, + committee_length: field(72, 80)?, + committees_at_slot: field(80, 88)?, + validator_committee_index: field(88, 96)?, + }) +} + +#[derive(Deserialize)] +struct AttestationDataJson { + attestation_data: phase0::AttestationData, + attestation_duty: AttesterDutyJson, +} + +#[derive(Deserialize)] +struct AttesterDutyJson { + #[serde(deserialize_with = "deserialize_u64")] + slot: u64, + #[serde(deserialize_with = "deserialize_u64")] + validator_index: u64, + #[serde(deserialize_with = "deserialize_u64")] + committee_index: u64, + #[serde(deserialize_with = "deserialize_u64")] + committee_length: u64, + #[serde(deserialize_with = "deserialize_u64")] + committees_at_slot: u64, + #[serde(deserialize_with = "deserialize_u64")] + validator_committee_index: u64, +} + +impl From for AttesterDuty { + fn from(value: AttesterDutyJson) -> Self { + Self { + slot: value.slot, + validator_index: 
value.validator_index, + committee_index: value.committee_index, + committee_length: value.committee_length, + committees_at_slot: value.committees_at_slot, + validator_committee_index: value.validator_committee_index, + } + } +} + +fn deserialize_u64<'de, D>(deserializer: D) -> std::result::Result +where + D: Deserializer<'de>, +{ + let value = serde_json::Value::deserialize(deserializer)?; + match value { + serde_json::Value::Number(number) => number + .as_u64() + .ok_or_else(|| de::Error::custom("invalid u64 number")), + serde_json::Value::String(string) => string.parse().map_err(de::Error::custom), + _ => Err(de::Error::custom("expected u64 string or number")), + } +} + /// Lookup key for attestation data: (slot, committee index). #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] struct AttKey { @@ -615,6 +785,8 @@ mod tests { use std::sync::Arc; use chrono::{DateTime, Utc}; + use prost::bytes::Bytes; + use ssz::Encode; use tokio::sync::mpsc::{Receiver, channel}; use tokio_util::sync::CancellationToken; @@ -687,6 +859,69 @@ mod tests { } } + fn unsigned_attestation_proto( + pubkey: PubKey, + data: &AttestationData, + ) -> pbcore::UnsignedDataSet { + pbcore::UnsignedDataSet { + set: [(pubkey.to_string(), attestation_proto_bytes(data))].into(), + } + } + + fn attestation_proto_bytes(data: &AttestationData) -> Bytes { + let attestation = data.data.as_ssz_bytes(); + let duty_offset = ATTESTATION_DATA_SSZ_OFFSET + .checked_add(attestation.len()) + .expect("test attestation offset fits usize"); + let capacity = duty_offset + .checked_add(ATTESTER_DUTY_SSZ_SIZE) + .expect("test attestation proto length fits usize"); + let mut out = Vec::with_capacity(capacity); + out.extend_from_slice( + &u32::try_from(ATTESTATION_DATA_SSZ_OFFSET) + .expect("test attestation offset fits u32") + .to_le_bytes(), + ); + out.extend_from_slice( + &u32::try_from(duty_offset) + .expect("test duty offset fits u32") + .to_le_bytes(), + ); + out.extend_from_slice(&attestation); + 
out.extend_from_slice(&[0; 48]); + out.extend_from_slice(&data.duty.slot.to_le_bytes()); + out.extend_from_slice(&data.duty.validator_index.to_le_bytes()); + out.extend_from_slice(&data.duty.committee_index.to_le_bytes()); + out.extend_from_slice(&data.duty.committee_length.to_le_bytes()); + out.extend_from_slice(&data.duty.committees_at_slot.to_le_bytes()); + out.extend_from_slice(&data.duty.validator_committee_index.to_le_bytes()); + Bytes::from(out) + } + + fn unsigned_attestation_json_proto( + pubkey: PubKey, + data: &AttestationData, + ) -> pbcore::UnsignedDataSet { + let value = serde_json::json!({ + "attestation_data": data.data, + "attestation_duty": { + "slot": data.duty.slot.to_string(), + "validator_index": data.duty.validator_index.to_string(), + "committee_index": data.duty.committee_index.to_string(), + "committee_length": data.duty.committee_length.to_string(), + "committees_at_slot": data.duty.committees_at_slot.to_string(), + "validator_committee_index": data.duty.validator_committee_index.to_string(), + }, + }); + pbcore::UnsignedDataSet { + set: [( + pubkey.to_string(), + Bytes::from(serde_json::to_vec(&value).unwrap()), + )] + .into(), + } + } + fn phase0_proposal(slot: u64, proposer_index: u64) -> VersionedProposal { use pluto_eth2api::spec::phase0 as p0; @@ -827,6 +1062,46 @@ mod tests { ); } + #[test] + fn unsigned_data_set_from_proto_decodes_attester_ssz() { + let pubkey = random_core_pub_key(); + let data = att_data(123, 4, 5); + let proto = unsigned_attestation_proto(pubkey, &data); + + let decoded = unsigned_data_set_from_proto(&DutyType::Attester, &proto).unwrap(); + + match decoded.get(&pubkey).unwrap() { + UnsignedDutyData::Attestation(decoded) => assert_eq!(decoded, &data), + other => panic!("unexpected unsigned data: {other:?}"), + } + } + + #[test] + fn unsigned_data_set_from_proto_decodes_attester_json() { + let pubkey = random_core_pub_key(); + let data = att_data(123, 4, 5); + let proto = unsigned_attestation_json_proto(pubkey, 
&data); + + let decoded = unsigned_data_set_from_proto(&DutyType::Attester, &proto).unwrap(); + + match decoded.get(&pubkey).unwrap() { + UnsignedDutyData::Attestation(decoded) => assert_eq!(decoded, &data), + other => panic!("unexpected unsigned data: {other:?}"), + } + } + + #[test] + fn unsigned_data_set_from_proto_rejects_empty_set() { + let err = + unsigned_data_set_from_proto(&DutyType::Attester, &pbcore::UnsignedDataSet::default()) + .unwrap_err(); + + assert!(matches!( + err, + ParSigExCodecError::InvalidUnsignedDataSetFields + )); + } + /// `FarFutureCalculator` schedules every duty, so it can't exercise the /// `AddOutcome::NoDeadline` arm in `store()`. Back the DB with /// `NeverExpiringCalculator` (always `Ok(None)`) so that types without a diff --git a/crates/core/src/dutydb/mod.rs b/crates/core/src/dutydb/mod.rs index c96e9ddf..37c501e8 100644 --- a/crates/core/src/dutydb/mod.rs +++ b/crates/core/src/dutydb/mod.rs @@ -2,4 +2,4 @@ pub mod memory; -pub use memory::{Error, MemDB, UnsignedDataSet, UnsignedDutyData}; +pub use memory::{Error, MemDB, UnsignedDataSet, UnsignedDutyData, unsigned_data_set_from_proto}; diff --git a/crates/core/src/parsigex_codec.rs b/crates/core/src/parsigex_codec.rs index b19c770d..b8e7c8ec 100644 --- a/crates/core/src/parsigex_codec.rs +++ b/crates/core/src/parsigex_codec.rs @@ -31,6 +31,10 @@ pub enum ParSigExCodecError { #[error("invalid partial signed data set proto fields")] InvalidParSignedDataSetFields, + /// Invalid unsigned data set proto. + #[error("invalid unsigned data set fields")] + InvalidUnsignedDataSetFields, + /// Invalid partial signed proto. #[error("invalid partial signed proto")] InvalidParSignedProto, @@ -67,6 +71,10 @@ pub enum ParSigExCodecError { #[error("signed data: {0}")] SignedData(String), + /// Unsigned data construction error. + #[error("unsigned data: {0}")] + UnsignedData(String), + /// Failed to extract the signature from signed data. 
#[error("invalid signature: {0}")] InvalidSignature(String), From 233e80e22218ff5660b96f50d03d58d2baab21a5 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 13:38:34 +0700 Subject: [PATCH 04/21] fix: retain qbft instance until deadline --- crates/consensus/src/instance.rs | 5 ++ crates/consensus/src/qbft/admission.rs | 29 +++++++++++ crates/consensus/src/qbft/component.rs | 18 ++++--- crates/consensus/src/qbft/runner.rs | 67 +++++++++++++++++++++++--- 4 files changed, 104 insertions(+), 15 deletions(-) diff --git a/crates/consensus/src/instance.rs b/crates/consensus/src/instance.rs index 138efdce..6538ea3a 100644 --- a/crates/consensus/src/instance.rs +++ b/crates/consensus/src/instance.rs @@ -171,6 +171,11 @@ impl InstanceIo { .is_ok() } + /// Returns true once this instance's runner has been started. + pub fn has_started(&self) -> bool { + self.running.load(Ordering::Relaxed) + } + /// Transfers receive-buffer ownership to the runner. pub fn take_recv_rx(&self) -> Result> { take_receiver(&self.recv_rx, "recv") diff --git a/crates/consensus/src/qbft/admission.rs b/crates/consensus/src/qbft/admission.rs index bd5d4761..d1f79c57 100644 --- a/crates/consensus/src/qbft/admission.rs +++ b/crates/consensus/src/qbft/admission.rs @@ -455,6 +455,35 @@ mod tests { assert_eq!(err.to_string(), "timeout enqueuing receive buffer"); } + #[tokio::test] + async fn handle_drops_late_message_after_started_receiver_closed() { + let consensus = consensus(0, true); + let duty = duty(); + let inst = consensus.get_instance_io(duty.clone()); + assert!(inst.maybe_start()); + drop(inst.take_recv_rx().unwrap()); + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(0); + msg.value_hash = value_hash.to_vec().into(); + let msg = sign_for_peer(msg, 0); + + consensus + .handle( + &CancellationToken::new(), + 
Some(pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![any], + }), + ) + .await + .unwrap(); + + assert!(Arc::ptr_eq(&inst, &consensus.get_instance_io(duty))); + } + fn consensus_msg(msg: pbconsensus::QbftMsg) -> pbconsensus::QbftConsensusMsg { pbconsensus::QbftConsensusMsg { msg: Some(msg), diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 19576182..187b92d7 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -271,10 +271,17 @@ impl Consensus { return Err(admission::Error::DutyExpired); } - let recv_tx = self.get_recv_buffer(duty); + let inst = self.get_instance_io(duty); tokio::select! { - result = recv_tx.send(wrapped) => { - result.map_err(|_| admission::Error::TimeoutEnqueuingReceiveBuffer) + result = inst.recv_tx.send(wrapped) => { + match result { + Ok(()) => Ok(()), + // A completed instance is retained until the duty deadline + // expires. Its receive task is gone, but late messages + // should not abort the sender's broadcast. + Err(_) if inst.has_started() => Ok(()), + Err(_) => Err(admission::Error::TimeoutEnqueuingReceiveBuffer), + } } () = ct.cancelled() => Err(admission::Error::TimeoutEnqueuingReceiveBuffer), } @@ -351,11 +358,6 @@ impl Consensus { .clone() } - /// Returns the inbound message buffer for a duty instance. - pub(crate) fn get_recv_buffer(&self, duty: Duty) -> mpsc::Sender { - self.get_instance_io(duty).recv_tx.clone() - } - /// Drops cached I/O for a completed or expired duty instance. 
pub(crate) fn delete_instance_io(&self, duty: &Duty) { self.instances diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs index 0bced4c3..d0ac1327 100644 --- a/crates/consensus/src/qbft/runner.rs +++ b/crates/consensus/src/qbft/runner.rs @@ -122,10 +122,17 @@ where let inst = consensus.get_instance_io(duty.clone()); inst.mark_proposed().map_err(Error::ProposeConsensus)?; - try_send_input(&inst.value_tx, any.clone())?; - try_send_input(&inst.hash_tx, hash)?; - if consensus.compare_attestations() { - try_send_input(&inst.verify_tx, any)?; + let value_closed = try_send_input(&inst.value_tx, any.clone())?.is_closed(); + let hash_closed = try_send_input(&inst.hash_tx, hash)?.is_closed(); + let verify_closed = + consensus.compare_attestations() && try_send_input(&inst.verify_tx, any)?.is_closed(); + let input_closed = value_closed || hash_closed || verify_closed; + + if input_closed { + if inst.has_started() { + return wait_instance_result(&inst).await; + } + return Err(Error::InputChannelFull); } if !inst.maybe_start() { @@ -173,7 +180,6 @@ pub(crate) async fn run_instance( let result = run_instance_inner(consensus, parent_ct, duty.clone(), Arc::clone(&inst)).await; let runner_result = to_runner_result(&result); let _ = inst.err_tx.send(runner_result).await; - consensus.delete_instance_io(&duty); result } @@ -358,9 +364,25 @@ async fn run_instance_inner( } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InputSend { + Sent, + Closed, +} + +impl InputSend { + fn is_closed(self) -> bool { + self == Self::Closed + } +} + /// Sends a one-shot local input into an instance channel without waiting. 
-fn try_send_input(tx: &mpsc::Sender, value: T) -> Result<()> { - tx.try_send(value).map_err(|_| Error::InputChannelFull) +fn try_send_input(tx: &mpsc::Sender, value: T) -> Result { + match tx.try_send(value) { + Ok(()) => Ok(InputSend::Sent), + Err(mpsc::error::TrySendError::Full(_)) => Err(Error::InputChannelFull), + Err(mpsc::error::TrySendError::Closed(_)) => Ok(InputSend::Closed), + } } /// Waits for an already-running instance to finish. @@ -625,6 +647,37 @@ mod tests { assert_eq!(sniffed.lock().unwrap().len(), 1); } + #[tokio::test] + async fn completed_participation_keeps_instance_for_late_propose() { + let consensus = component::tests::consensus(0, true); + let duty = component::tests::duty(); + let inst = consensus.get_instance_io(duty.clone()); + inst.mark_participated().unwrap(); + assert!(inst.maybe_start()); + let ct = CancellationToken::new(); + ct.cancel(); + + let err = run_instance(&consensus, &ct, duty.clone(), Arc::clone(&inst)) + .await + .unwrap_err(); + + assert!(matches!(err, Error::ConsensusTimeout)); + let retained = consensus.get_instance_io(duty.clone()); + assert!(Arc::ptr_eq(&inst, &retained)); + assert!(retained.has_started()); + + let err = consensus + .propose(&CancellationToken::new(), duty.clone(), unsigned_value(0)) + .await + .unwrap_err(); + + assert!(matches!( + err, + Error::RunnerResult(ref message) if message == "consensus timeout" + )); + assert!(Arc::ptr_eq(&retained, &consensus.get_instance_io(duty))); + } + #[tokio::test] async fn run_instance_parent_cancel_cancels_broadcast_token() { let (broadcast_started_tx, mut broadcast_started_rx) = mpsc::channel(1); From a74e6b675474717be66e68c8bfaef0b954dae39a Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 13:45:58 +0700 Subject: [PATCH 05/21] fix: update docs --- crates/consensus/src/qbft/msg.rs | 3 --- crates/consensus/src/qbft/p2p.rs | 6 +++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/consensus/src/qbft/msg.rs 
b/crates/consensus/src/qbft/msg.rs index 04cccfe5..6ecb7a9d 100644 --- a/crates/consensus/src/qbft/msg.rs +++ b/crates/consensus/src/qbft/msg.rs @@ -25,9 +25,6 @@ //! message types, while invalid duty wire values project to //! [`DutyType::Unknown`]. -// TODO: Remove once component/transport wiring uses the crate-visible helpers. -#![allow(dead_code)] - use std::{any, collections::HashMap, fmt, sync}; use k256::{PublicKey, SecretKey}; diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index edefb123..767b74e3 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -132,6 +132,9 @@ pub struct Handle { impl Handle { /// Enqueues a QBFT message for async broadcast to every non-self peer. + /// + /// The token is accepted for the shared broadcaster shape. After enqueue, + /// network fanout is best-effort and is not cancelled by this token. pub async fn broadcast( &self, _ct: CancellationToken, @@ -487,7 +490,8 @@ impl Behaviour { /// Fans a broadcast command out to every non-self peer. 
fn handle_broadcast(&mut self, command: BroadcastCommand) { let mut target_count = 0usize; - for peer_id in self.config.peers.clone() { + for peer_idx in 0..self.config.peers.len() { + let peer_id = self.config.peers[peer_idx]; if peer_id == self.config.local_peer_id { continue; } From eadb1077cba8a4699fa51b52f91fb37fba4419d9 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 14:09:03 +0700 Subject: [PATCH 06/21] fix: avoid qbft attester compare polling --- crates/consensus/src/qbft/definition.rs | 71 +++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/crates/consensus/src/qbft/definition.rs b/crates/consensus/src/qbft/definition.rs index e97ace6f..974e3adf 100644 --- a/crates/consensus/src/qbft/definition.rs +++ b/crates/consensus/src/qbft/definition.rs @@ -20,8 +20,6 @@ use super::{ msg::{self, ConsensusQbftTypes}, }; -const LOCAL_COMPARE_VALUE_POLL_INTERVAL: time::Duration = time::Duration::from_millis(10); - /// Callback invoked with the decided commit quorum. pub(crate) type DecideCallback = Arc>) + Send + Sync + 'static>; @@ -213,27 +211,23 @@ fn local_compare_value( return Ok(request.input_value_source.clone()); } - // Poll in short intervals so compare-scoped cancellation is still observed - // while waiting on the blocking local-value channel. - loop { - if request.ct.is_canceled() { - return Err(AttesterCompareError::TimeoutWaitingLocalValue); - } + let (cancel_tx, cancel_rx) = mpmc::bounded(1); - match request - .input_value_source_ch - .recv_timeout(LOCAL_COMPARE_VALUE_POLL_INTERVAL) - { - Ok(value) => { - let _ = request.return_value.send(value.clone()); - return Ok(value); - } - Err(mpmc::RecvTimeoutError::Timeout) => {} - Err(mpmc::RecvTimeoutError::Disconnected) => { - return Err(AttesterCompareError::LocalValueChannelClosed); + request.ct.run( + move || { + let _ = cancel_tx.try_send(()); + }, + || { + mpmc::select! 
{ + recv(request.input_value_source_ch) -> msg => { + let value = msg.map_err(|_| AttesterCompareError::LocalValueChannelClosed)?; + let _ = request.return_value.send(value.clone()); + Ok(value) + }, + recv(cancel_rx) -> _ => Err(AttesterCompareError::TimeoutWaitingLocalValue), } - } - } + }, + ) } fn decode_attester_set(any: &Any) -> std::result::Result { @@ -832,6 +826,41 @@ mod tests { )); } + #[test] + fn compare_attester_wakes_when_cancelled_while_waiting_for_local_value() { + let leader = unsigned_attestation_set(&pubkey(1), attestation_data()); + let cts = cancellation::CancellationTokenSource::new(); + let ct = cts.token().clone(); + let qcommit = qcommit_for_value(component::tests::duty(), any_unsigned(&leader)); + let (_input_tx, input_rx) = mpmc::bounded(1); + let (return_err_tx, return_err_rx) = mpmc::bounded(1); + let (return_value_tx, _return_value_rx) = mpmc::bounded(1); + let input_value = Any::default(); + + std::thread::scope(|scope| { + scope.spawn(|| { + compare( + true, + qbft::CompareRequest { + ct: &ct, + qcommit: &qcommit, + input_value_source_ch: &input_rx, + input_value_source: &input_value, + return_err: &return_err_tx, + return_value: &return_value_tx, + }, + ); + }); + + cts.cancel(); + }); + + assert!(matches!( + return_err_rx.recv().unwrap(), + Err(qbft::QbftError::CompareError) + )); + } + #[tokio::test] async fn new_definition_leader_callback_uses_go_formula() { let consensus = component::tests::consensus(0, true); From bcf679812b435514a58b4c5990a3ac49fcce6806 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 14:43:14 +0700 Subject: [PATCH 07/21] fix: refactor code --- crates/consensus/src/instance.rs | 2 - crates/consensus/src/qbft/admission.rs | 525 ----------------- crates/consensus/src/qbft/component.rs | 653 ++++++++++++++++++++-- crates/consensus/src/qbft/definition.rs | 9 +- crates/consensus/src/qbft/interop_test.rs | 114 ---- crates/consensus/src/qbft/mod.rs | 5 +- crates/consensus/src/qbft/p2p.rs | 588 
+++---------------- 7 files changed, 677 insertions(+), 1219 deletions(-) delete mode 100644 crates/consensus/src/qbft/admission.rs delete mode 100644 crates/consensus/src/qbft/interop_test.rs diff --git a/crates/consensus/src/instance.rs b/crates/consensus/src/instance.rs index 6538ea3a..e638a580 100644 --- a/crates/consensus/src/instance.rs +++ b/crates/consensus/src/instance.rs @@ -72,8 +72,6 @@ pub type RunnerResult = std::result::Result<(), RunnerError>; /// Sender fields are crate-visible so component code can enqueue directly. /// Receiver fields stay private because each receiver must move exactly once to /// the task that owns that stream. -// TODO: Remove once the instance runner wires these senders. -#[allow(dead_code)] #[derive(Debug)] pub struct InstanceIo { // Lifecycle flags are duplicate/start guards only. They do not publish or diff --git a/crates/consensus/src/qbft/admission.rs b/crates/consensus/src/qbft/admission.rs deleted file mode 100644 index d1f79c57..00000000 --- a/crates/consensus/src/qbft/admission.rs +++ /dev/null @@ -1,525 +0,0 @@ -//! QBFT inbound message admission. - -use prost::{Message, Name}; -use prost_types::Any; - -use pluto_core::corepb::v1::{core as pbcore, priority as pbpriority}; - -use super::{ - component::DecodedValue, - msg::{self, ValueMap}, -}; - -/// Admission result. -pub type Result = std::result::Result; - -/// Admission errors. -#[derive(Debug, thiserror::Error)] -pub enum Error { - /// Outer consensus message was absent or wrong. - #[error("invalid consensus message")] - InvalidConsensusMessage, - - /// Inner message type was invalid. - #[error("invalid consensus message type")] - InvalidConsensusMessageType, - - /// Inner duty type was invalid. - #[error("invalid consensus message duty type")] - InvalidConsensusMessageDutyType, - - /// Inner round was invalid. - #[error("invalid consensus message round")] - InvalidConsensusMessageRound, - - /// Inner prepared round was invalid. 
- #[error("invalid consensus message prepared round")] - InvalidConsensusMessagePreparedRound, - - /// Message peer index was not in the peer map. - #[error("invalid peer index")] - InvalidPeerIndex, - - /// Signature verification failed before comparison. - #[error("verify consensus message signature: {0}")] - VerifyConsensusMessageSignature(#[source] msg::Error), - - /// Signature recovered to a different peer key. - #[error("invalid consensus message signature")] - InvalidConsensusMessageSignature, - - /// Duty gate rejected the message. - #[error("invalid duty")] - InvalidDuty, - - /// Justification failed validation. - #[error("invalid justification: {0}")] - InvalidJustification(#[source] Box), - - /// Justification duty differed from the outer message duty. - #[error("qbft justification duty differs from message duty")] - JustificationDutyDiffers, - - /// Inbound Any could not be decoded. - #[error("unmarshal any")] - UnmarshalAny, - - /// Message wrapper rejected the value map. - #[error("{0}")] - Msg(#[from] msg::Error), - - /// Duty deadline rejected the message. - #[error("duty expired")] - DutyExpired, - - /// Receive buffer could not accept the message. - #[error("timeout enqueuing receive buffer")] - TimeoutEnqueuingReceiveBuffer, - - /// Context was cancelled after expensive verification. - #[error("receive cancelled during verification")] - ReceiveCancelledDuringVerification, -} - -/// Canonicalizes inbound `Any` values into the hash map used by QBFT messages. -pub(crate) fn values_by_hash(values: &[Any]) -> Result { - let mut out = ValueMap::new(); - - for value in values { - let decoded = decode_supported_any(value)?; - let hash = match decoded { - DecodedValue::UnsignedDataSet(inner) => msg::hash_proto(&inner)?, - DecodedValue::PriorityResult(inner) => msg::hash_proto(&inner)?, - }; - out.insert(hash, value.clone()); - } - - Ok(out) -} - -/// Decodes the protobuf `Any` payload types accepted by this consensus layer. 
-pub(crate) fn decode_supported_any(value: &Any) -> Result { - if value.type_url == pbcore::UnsignedDataSet::type_url() { - let decoded = pbcore::UnsignedDataSet::decode(value.value.as_slice()) - .map_err(|_| Error::UnmarshalAny)?; - return Ok(DecodedValue::UnsignedDataSet(decoded)); - } - - if value.type_url == pbpriority::PriorityResult::type_url() { - let decoded = pbpriority::PriorityResult::decode(value.value.as_slice()) - .map_err(|_| Error::UnmarshalAny)?; - return Ok(DecodedValue::PriorityResult(decoded)); - } - - Err(Error::UnmarshalAny) -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use prost::bytes::Bytes; - use prost_types::Any; - use test_case::test_case; - use tokio_util::sync::CancellationToken; - - use super::*; - use crate::qbft::{ - Consensus, - component::tests::{config_base, consensus, duty, peers, secret_key}, - }; - use pluto_core::{ - corepb::v1::{consensus as pbconsensus, core as pbcore}, - qbft::{self, SomeMsg}, - types::DutyType, - }; - - #[tokio::test] - async fn handle_rejects_invalid_outer_message() { - let err = consensus(0, true) - .handle(&CancellationToken::new(), None) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "invalid consensus message"); - } - - #[tokio::test] - async fn handle_rejects_missing_inner_message() { - let err = consensus(0, true) - .handle( - &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg::default()), - ) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "invalid consensus message"); - } - - #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.r#type = 99, "invalid consensus message type" ; "invalid_message_type")] - #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.duty.as_mut().unwrap().r#type = 99, "invalid consensus message duty type" ; "invalid_duty_type")] - #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.round = 0, "invalid consensus message round" ; "invalid_round")] - #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.prepared_round = -1, "invalid consensus 
message prepared round" ; "invalid_prepared_round")] - #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.peer_idx = 9, "invalid peer index" ; "invalid_peer_idx")] - #[tokio::test] - async fn verify_msg_rejects_invalid_fields(mutate: fn(&mut pbconsensus::QbftMsg), want: &str) { - let consensus = consensus(0, true); - let mut msg = signed_msg(0); - mutate(&mut msg); - if want != "invalid consensus message signature" { - msg.signature.clear(); - msg = sign_for_peer(msg, 0); - mutate(&mut msg); - } - - let err = consensus.verify_msg(&msg).unwrap_err(); - - assert_eq!(err.to_string(), want); - } - - #[tokio::test] - async fn verify_msg_rejects_missing_duty() { - let consensus = consensus(0, true); - let mut msg = signed_msg(0); - msg.duty = None; - - let err = consensus.verify_msg(&msg).unwrap_err(); - - assert_eq!(err.to_string(), "invalid consensus message"); - } - - #[tokio::test] - async fn verify_msg_rejects_empty_signature() { - let consensus = consensus(0, true); - let mut msg = unsigned_msg(0); - msg.signature.clear(); - - let err = consensus.verify_msg(&msg).unwrap_err(); - - assert_eq!( - err.to_string(), - "verify consensus message signature: empty signature" - ); - } - - #[tokio::test] - async fn verify_msg_rejects_malformed_signature() { - let consensus = consensus(0, true); - let mut msg = unsigned_msg(0); - msg.signature = vec![0x42; 64].into(); - - let err = consensus.verify_msg(&msg).unwrap_err(); - - assert!( - err.to_string() - .starts_with("verify consensus message signature: recover pubkey") - ); - } - - #[tokio::test] - async fn verify_msg_rejects_wrong_signature() { - let consensus = consensus(0, true); - let mut msg = unsigned_msg(0); - msg.signature = msg::sign_msg(&msg, &secret_key(1)).unwrap().signature; - msg.peer_idx = 1; - - let err = consensus.verify_msg(&msg).unwrap_err(); - - assert_eq!(err.to_string(), "invalid consensus message signature"); - } - - #[tokio::test] - async fn verify_msg_accepts_valid_signature() { - let consensus = 
consensus(0, true); - - consensus.verify_msg(&signed_msg(0)).unwrap(); - } - - #[tokio::test] - async fn handle_rejects_duty_gate_false() { - let err = consensus(0, false) - .handle( - &CancellationToken::new(), - Some(consensus_msg(signed_msg(0))), - ) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "invalid duty"); - } - - #[tokio::test] - async fn handle_rejects_invalid_justification() { - let mut invalid = signed_msg(0); - invalid.round = 0; - let outer = pbconsensus::QbftConsensusMsg { - msg: Some(signed_msg(0)), - justification: vec![invalid], - values: vec![], - }; - - let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(outer)) - .await - .unwrap_err(); - - assert!(err.to_string().starts_with("invalid justification")); - } - - #[tokio::test] - async fn handle_rejects_justification_duty_mismatch() { - let mut justification = unsigned_msg(0); - justification.duty = Some(pbcore::Duty { - slot: 43, - r#type: i32::try_from(&DutyType::Attester).unwrap(), - }); - let justification = sign_for_peer(justification, 0); - let outer = pbconsensus::QbftConsensusMsg { - msg: Some(signed_msg(0)), - justification: vec![justification], - values: vec![], - }; - - let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(outer)) - .await - .unwrap_err(); - - assert_eq!( - err.to_string(), - "qbft justification duty differs from message duty" - ); - } - - #[test] - fn values_by_hash_rejects_invalid_type_url() { - let err = values_by_hash(&[Any { - type_url: "type.googleapis.com/unknown.Type".to_string(), - value: vec![], - }]) - .unwrap_err(); - - assert_eq!(err.to_string(), "unmarshal any"); - } - - #[test] - fn values_by_hash_rejects_malformed_any_value() { - let err = values_by_hash(&[Any { - type_url: pbcore::UnsignedDataSet::type_url(), - value: b"not-protobuf".to_vec(), - }]) - .unwrap_err(); - - assert_eq!(err.to_string(), "unmarshal any"); - } - - #[test] - fn values_by_hash_hashes_decoded_inner_message() { - let any = 
unsigned_any("a", b"first"); - let values = values_by_hash(std::slice::from_ref(&any)).unwrap(); - let decoded = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); - let hash = msg::hash_proto(&decoded).unwrap(); - - assert_eq!(values.get(&hash), Some(&any)); - } - - #[tokio::test] - async fn handle_rejects_missing_value_hash() { - let mut msg = unsigned_msg(0); - msg.value_hash = [9u8; 32].to_vec().into(); - let msg = sign_for_peer(msg, 0); - - let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "value hash not found in values"); - } - - #[tokio::test] - async fn handle_enqueues_valid_message() { - let consensus = consensus(0, true); - let any = unsigned_any("a", b"first"); - let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); - let value_hash = msg::hash_proto(&value).unwrap(); - let mut msg = unsigned_msg(0); - msg.value_hash = value_hash.to_vec().into(); - let msg = sign_for_peer(msg, 0); - let duty = duty(); - let inst = consensus.get_instance_io(duty.clone()); - - consensus - .handle( - &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg { - msg: Some(msg), - justification: vec![], - values: vec![any], - }), - ) - .await - .unwrap(); - - let mut recv_rx = inst.take_recv_rx().unwrap(); - let received = recv_rx.try_recv().unwrap(); - assert_eq!(received.value(), value_hash); - } - - #[tokio::test] - async fn handle_rejects_deadliner_false_as_duty_expired() { - let consensus = Consensus::new(super::super::component::Config { - peers: peers(), - local_peer_idx: 0, - ..config_base(true) - }) - .unwrap(); - - let err = consensus - .handle( - &CancellationToken::new(), - Some(consensus_msg(signed_msg(0))), - ) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "duty expired"); - } - - #[tokio::test] - async fn handle_rejects_cancellation_after_verification() { - let ct = CancellationToken::new(); - ct.cancel(); - - 
let err = consensus(0, true) - .handle(&ct, Some(consensus_msg(signed_msg(0)))) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "receive cancelled during verification"); - } - - #[tokio::test] - async fn handle_waits_for_receive_buffer_capacity() { - let consensus = consensus(0, true); - let inst = consensus.get_instance_io(duty()); - let mut recv_rx = inst.take_recv_rx().unwrap(); - for _ in 0..crate::instance::RECV_BUFFER_SIZE { - inst.recv_tx.try_send(wrapped_msg()).unwrap(); - } - - let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); - tokio::pin!(handle); - - tokio::select! { - result = &mut handle => panic!( - "handle completed while receive buffer was full: {result:?}" - ), - () = tokio::task::yield_now() => {} - } - - recv_rx.recv().await.unwrap(); - tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) - .await - .unwrap() - .unwrap(); - } - - #[tokio::test] - async fn handle_rejects_full_receive_buffer_after_cancellation() { - let consensus = consensus(0, true); - let inst = consensus.get_instance_io(duty()); - let _recv_rx = inst.take_recv_rx().unwrap(); - for _ in 0..crate::instance::RECV_BUFFER_SIZE { - inst.recv_tx.try_send(wrapped_msg()).unwrap(); - } - - let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); - tokio::pin!(handle); - - tokio::select! 
{ - result = &mut handle => panic!( - "handle completed while receive buffer was full: {result:?}" - ), - () = tokio::task::yield_now() => {} - } - ct.cancel(); - let err = tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) - .await - .unwrap() - .unwrap_err(); - - assert_eq!(err.to_string(), "timeout enqueuing receive buffer"); - } - - #[tokio::test] - async fn handle_drops_late_message_after_started_receiver_closed() { - let consensus = consensus(0, true); - let duty = duty(); - let inst = consensus.get_instance_io(duty.clone()); - assert!(inst.maybe_start()); - drop(inst.take_recv_rx().unwrap()); - let any = unsigned_any("a", b"first"); - let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); - let value_hash = msg::hash_proto(&value).unwrap(); - let mut msg = unsigned_msg(0); - msg.value_hash = value_hash.to_vec().into(); - let msg = sign_for_peer(msg, 0); - - consensus - .handle( - &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg { - msg: Some(msg), - justification: vec![], - values: vec![any], - }), - ) - .await - .unwrap(); - - assert!(Arc::ptr_eq(&inst, &consensus.get_instance_io(duty))); - } - - fn consensus_msg(msg: pbconsensus::QbftMsg) -> pbconsensus::QbftConsensusMsg { - pbconsensus::QbftConsensusMsg { - msg: Some(msg), - justification: vec![], - values: vec![], - } - } - - fn unsigned_msg(peer_idx: i64) -> pbconsensus::QbftMsg { - pbconsensus::QbftMsg { - r#type: i64::from(qbft::MSG_PRE_PREPARE), - duty: Some(pbcore::Duty::try_from(&duty()).unwrap()), - peer_idx, - round: 1, - prepared_round: 0, - ..Default::default() - } - } - - fn signed_msg(peer_idx: i64) -> pbconsensus::QbftMsg { - sign_for_peer(unsigned_msg(peer_idx), peer_idx) - } - - fn sign_for_peer(msg: pbconsensus::QbftMsg, peer_idx: i64) -> pbconsensus::QbftMsg { - let seed = u8::try_from(peer_idx.checked_add(1).unwrap()).unwrap(); - msg::sign_msg(&msg, &secret_key(seed)).unwrap() - } - - fn unsigned_any(key: &str, value: &'static [u8]) 
-> Any { - Any::from_msg(&pbcore::UnsignedDataSet { - set: [(key.to_string(), Bytes::from_static(value))].into(), - }) - .unwrap() - } - - fn wrapped_msg() -> msg::Msg { - msg::Msg::new(unsigned_msg(0), vec![], Arc::default()).unwrap() - } -} diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 187b92d7..84f19d5b 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -8,6 +8,8 @@ use std::{ use futures::future::BoxFuture; use k256::{PublicKey, SecretKey}; +use prost::{Message, Name}; +use prost_types::Any; use tokio::{sync::mpsc, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -23,7 +25,10 @@ use pluto_core::{ types::{Duty, DutyType}, }; -use super::{admission, msg, runner}; +use super::{ + msg::{self, ValueMap}, + runner, +}; /// Result returned by outbound QBFT broadcasting. pub type BroadcastResult = std::result::Result<(), Box>; @@ -92,6 +97,124 @@ pub(crate) enum DecodedValue { PriorityResult(pbpriority::PriorityResult), } +/// Component result. +pub type Result = std::result::Result; + +/// Component construction and inbound admission errors. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Peer order did not fit the wire index type. + #[error("peer index overflow: {index}")] + PeerIndexOverflow { + /// Peer order index. + index: usize, + }, + + /// Local peer index is not present in the peer list. + #[error("invalid local peer index: {peer_idx}")] + InvalidLocalPeerIndex { + /// Local peer index. + peer_idx: i64, + }, + + /// Outer consensus message was absent or wrong. + #[error("invalid consensus message")] + InvalidConsensusMessage, + + /// Inner message type was invalid. + #[error("invalid consensus message type")] + InvalidConsensusMessageType, + + /// Inner duty type was invalid. + #[error("invalid consensus message duty type")] + InvalidConsensusMessageDutyType, + + /// Inner round was invalid. 
+ #[error("invalid consensus message round")] + InvalidConsensusMessageRound, + + /// Inner prepared round was invalid. + #[error("invalid consensus message prepared round")] + InvalidConsensusMessagePreparedRound, + + /// Message peer index was not in the peer map. + #[error("invalid peer index")] + InvalidPeerIndex, + + /// Signature verification failed before comparison. + #[error("verify consensus message signature: {0}")] + VerifyConsensusMessageSignature(#[source] msg::Error), + + /// Signature recovered to a different peer key. + #[error("invalid consensus message signature")] + InvalidConsensusMessageSignature, + + /// Duty gate rejected the message. + #[error("invalid duty")] + InvalidDuty, + + /// Justification failed validation. + #[error("invalid justification: {0}")] + InvalidJustification(#[source] Box), + + /// Justification duty differed from the outer message duty. + #[error("qbft justification duty differs from message duty")] + JustificationDutyDiffers, + + /// Inbound Any could not be decoded. + #[error("unmarshal any")] + UnmarshalAny, + + /// Message wrapper rejected the value map. + #[error("{0}")] + Msg(#[from] msg::Error), + + /// Duty deadline rejected the message. + #[error("duty expired")] + DutyExpired, + + /// Receive buffer could not accept the message. + #[error("timeout enqueuing receive buffer")] + TimeoutEnqueuingReceiveBuffer, + + /// Context was cancelled after expensive verification. + #[error("receive cancelled during verification")] + ReceiveCancelledDuringVerification, +} + +/// Canonicalizes inbound `Any` values into the hash map used by QBFT messages. 
+pub(crate) fn values_by_hash(values: &[Any]) -> Result { + let mut out = ValueMap::new(); + + for value in values { + let decoded = decode_supported_any(value)?; + let hash = match decoded { + DecodedValue::UnsignedDataSet(inner) => msg::hash_proto(&inner)?, + DecodedValue::PriorityResult(inner) => msg::hash_proto(&inner)?, + }; + out.insert(hash, value.clone()); + } + + Ok(out) +} + +/// Decodes the protobuf `Any` payload types accepted by this consensus layer. +pub(crate) fn decode_supported_any(value: &Any) -> Result { + if value.type_url == pbcore::UnsignedDataSet::type_url() { + let decoded = pbcore::UnsignedDataSet::decode(value.value.as_slice()) + .map_err(|_| Error::UnmarshalAny)?; + return Ok(DecodedValue::UnsignedDataSet(decoded)); + } + + if value.type_url == pbpriority::PriorityResult::type_url() { + let decoded = pbpriority::PriorityResult::decode(value.value.as_slice()) + .map_err(|_| Error::UnmarshalAny)?; + return Ok(DecodedValue::PriorityResult(decoded)); + } + + Err(Error::UnmarshalAny) +} + pub(crate) enum Subscriber { Unsigned(UnsignedSubscriber), Priority(PrioritySubscriber), @@ -150,27 +273,6 @@ pub struct Consensus { instances: Mutex>>>, } -/// Component result. -pub type Result = std::result::Result; - -/// Component construction errors. -#[derive(Debug, thiserror::Error, PartialEq, Eq)] -pub enum Error { - /// Peer order did not fit the wire index type. - #[error("peer index overflow: {index}")] - PeerIndexOverflow { - /// Peer order index. - index: usize, - }, - - /// Local peer index is not present in the peer list. - #[error("invalid local peer index: {peer_idx}")] - InvalidLocalPeerIndex { - /// Local peer index. - peer_idx: i64, - }, -} - impl Consensus { /// Creates a new QBFT consensus component. 
pub fn new(config: Config) -> Result { @@ -235,40 +337,37 @@ impl Consensus { &self, ct: &CancellationToken, req: Option, - ) -> admission::Result<()> { - let pb_msg = req.ok_or(admission::Error::InvalidConsensusMessage)?; - let msg = pb_msg - .msg - .as_ref() - .ok_or(admission::Error::InvalidConsensusMessage)?; + ) -> Result<()> { + let pb_msg = req.ok_or(Error::InvalidConsensusMessage)?; + let msg = pb_msg.msg.as_ref().ok_or(Error::InvalidConsensusMessage)?; self.verify_msg(msg)?; let duty = duty_from_msg(msg)?; if !self.duty_allowed(&duty) { - return Err(admission::Error::InvalidDuty); + return Err(Error::InvalidDuty); } for justification in &pb_msg.justification { self.verify_msg(justification) - .map_err(|err| admission::Error::InvalidJustification(Box::new(err)))?; + .map_err(|err| Error::InvalidJustification(Box::new(err)))?; let just_duty = duty_from_msg(justification) - .map_err(|err| admission::Error::InvalidJustification(Box::new(err)))?; + .map_err(|err| Error::InvalidJustification(Box::new(err)))?; if just_duty != duty { - return Err(admission::Error::JustificationDutyDiffers); + return Err(Error::JustificationDutyDiffers); } } - let values = admission::values_by_hash(&pb_msg.values)?; + let values = values_by_hash(&pb_msg.values)?; let wrapped = msg::Msg::new(msg.clone(), pb_msg.justification.clone(), Arc::new(values))?; if ct.is_cancelled() { - return Err(admission::Error::ReceiveCancelledDuringVerification); + return Err(Error::ReceiveCancelledDuringVerification); } if self.add_deadline(duty.clone()).await != AddOutcome::Scheduled { - return Err(admission::Error::DutyExpired); + return Err(Error::DutyExpired); } let inst = self.get_instance_io(duty); @@ -280,48 +379,43 @@ impl Consensus { // expires. Its receive task is gone, but late messages // should not abort the sender's broadcast. 
Err(_) if inst.has_started() => Ok(()), - Err(_) => Err(admission::Error::TimeoutEnqueuingReceiveBuffer), + Err(_) => Err(Error::TimeoutEnqueuingReceiveBuffer), } } - () = ct.cancelled() => Err(admission::Error::TimeoutEnqueuingReceiveBuffer), + () = ct.cancelled() => Err(Error::TimeoutEnqueuingReceiveBuffer), } } /// Verifies fields and signature for one raw QBFT message. - pub(crate) fn verify_msg(&self, msg: &pbconsensus::QbftMsg) -> admission::Result<()> { + pub(crate) fn verify_msg(&self, msg: &pbconsensus::QbftMsg) -> Result<()> { if msg.duty.is_none() { - return Err(admission::Error::InvalidConsensusMessage); + return Err(Error::InvalidConsensusMessage); } if !qbft::MessageType::from_wire(msg.r#type).valid() { - return Err(admission::Error::InvalidConsensusMessageType); + return Err(Error::InvalidConsensusMessageType); } - let duty = msg - .duty - .as_ref() - .ok_or(admission::Error::InvalidConsensusMessage)?; - let duty_type = DutyType::try_from(duty.r#type) - .map_err(|_| admission::Error::InvalidConsensusMessageDutyType)?; + let duty = msg.duty.as_ref().ok_or(Error::InvalidConsensusMessage)?; + let duty_type = + DutyType::try_from(duty.r#type).map_err(|_| Error::InvalidConsensusMessageDutyType)?; if !duty_type.is_valid() { - return Err(admission::Error::InvalidConsensusMessageDutyType); + return Err(Error::InvalidConsensusMessageDutyType); } if msg.round <= 0 { - return Err(admission::Error::InvalidConsensusMessageRound); + return Err(Error::InvalidConsensusMessageRound); } if msg.prepared_round < 0 { - return Err(admission::Error::InvalidConsensusMessagePreparedRound); + return Err(Error::InvalidConsensusMessagePreparedRound); } - let pubkey = self - .pubkey(msg.peer_idx) - .ok_or(admission::Error::InvalidPeerIndex)?; - let signature_ok = msg::verify_msg_sig(msg, pubkey) - .map_err(admission::Error::VerifyConsensusMessageSignature)?; + let pubkey = self.pubkey(msg.peer_idx).ok_or(Error::InvalidPeerIndex)?; + let signature_ok = + msg::verify_msg_sig(msg, 
pubkey).map_err(Error::VerifyConsensusMessageSignature)?; if !signature_ok { - return Err(admission::Error::InvalidConsensusMessageSignature); + return Err(Error::InvalidConsensusMessageSignature); } Ok(()) @@ -458,27 +552,32 @@ impl Consensus { } /// Extracts the domain duty from a validated raw QBFT message. -fn duty_from_msg(msg: &pbconsensus::QbftMsg) -> admission::Result { - let duty = msg - .duty - .as_ref() - .ok_or(admission::Error::InvalidConsensusMessage)?; - Duty::try_from(duty).map_err(|_| admission::Error::InvalidConsensusMessageDutyType) +fn duty_from_msg(msg: &pbconsensus::QbftMsg) -> Result { + let duty = msg.duty.as_ref().ok_or(Error::InvalidConsensusMessage)?; + Duty::try_from(duty).map_err(|_| Error::InvalidConsensusMessageDutyType) } #[cfg(test)] pub(crate) mod tests { use std::sync::Mutex as StdMutex; + use prost::{Message, bytes::Bytes}; + use prost_types::Any; + use test_case::test_case; use tokio_util::sync::CancellationToken; use super::*; use crate::timer::get_round_timer_func; use pluto_core::{ deadline::{DeadlineCalculator, DeadlinerTask}, + qbft::SomeMsg, types::{DutyType, SlotNumber}, }; + const REFERENCE_VALUE_HASH: &str = + "0a0c0a0430783939120401020304000000000000000000000000000000000000"; + const REFERENCE_PAYLOAD: &str = "0a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; + struct FutureCalculator; impl DeadlineCalculator for FutureCalculator { @@ -516,7 +615,7 @@ pub(crate) mod tests { Err(err) => err, }; - assert_eq!(err, Error::InvalidLocalPeerIndex { peer_idx: 3 }); + assert!(matches!(err, Error::InvalidLocalPeerIndex { peer_idx: 3 })); } #[tokio::test] @@ -606,6 +705,436 @@ pub(crate) mod tests { ); } + 
#[tokio::test] + async fn handle_rejects_invalid_outer_message() { + let err = consensus(0, true) + .handle(&CancellationToken::new(), None) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[tokio::test] + async fn handle_rejects_missing_inner_message() { + let err = consensus(0, true) + .handle( + &CancellationToken::new(), + Some(pbconsensus::QbftConsensusMsg::default()), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.r#type = 99, "invalid consensus message type" ; "invalid_message_type")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.duty.as_mut().unwrap().r#type = 99, "invalid consensus message duty type" ; "invalid_duty_type")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.round = 0, "invalid consensus message round" ; "invalid_round")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.prepared_round = -1, "invalid consensus message prepared round" ; "invalid_prepared_round")] + #[test_case(|msg: &mut pbconsensus::QbftMsg| msg.peer_idx = 9, "invalid peer index" ; "invalid_peer_idx")] + #[tokio::test] + async fn verify_msg_rejects_invalid_fields(mutate: fn(&mut pbconsensus::QbftMsg), want: &str) { + let consensus = consensus(0, true); + let mut msg = signed_msg(0); + mutate(&mut msg); + if want != "invalid consensus message signature" { + msg.signature.clear(); + msg = sign_for_peer(msg, 0); + mutate(&mut msg); + } + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), want); + } + + #[tokio::test] + async fn verify_msg_rejects_missing_duty() { + let consensus = consensus(0, true); + let mut msg = signed_msg(0); + msg.duty = None; + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message"); + } + + #[tokio::test] + async fn verify_msg_rejects_empty_signature() { + let consensus = consensus(0, true); + let mut msg = 
unsigned_msg(0); + msg.signature.clear(); + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!( + err.to_string(), + "verify consensus message signature: empty signature" + ); + } + + #[tokio::test] + async fn verify_msg_rejects_malformed_signature() { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.signature = vec![0x42; 64].into(); + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert!( + err.to_string() + .starts_with("verify consensus message signature: recover pubkey") + ); + } + + #[tokio::test] + async fn verify_msg_rejects_wrong_signature() { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.signature = msg::sign_msg(&msg, &secret_key(1)).unwrap().signature; + msg.peer_idx = 1; + + let err = consensus.verify_msg(&msg).unwrap_err(); + + assert_eq!(err.to_string(), "invalid consensus message signature"); + } + + #[tokio::test] + async fn verify_msg_accepts_valid_signature() { + let consensus = consensus(0, true); + + consensus.verify_msg(&signed_msg(0)).unwrap(); + } + + #[tokio::test] + async fn handle_rejects_duty_gate_false() { + let err = consensus(0, false) + .handle( + &CancellationToken::new(), + Some(consensus_msg(signed_msg(0))), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid duty"); + } + + #[tokio::test] + async fn handle_rejects_invalid_justification() { + let mut invalid = signed_msg(0); + invalid.round = 0; + let outer = pbconsensus::QbftConsensusMsg { + msg: Some(signed_msg(0)), + justification: vec![invalid], + values: vec![], + }; + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(outer)) + .await + .unwrap_err(); + + assert!(err.to_string().starts_with("invalid justification")); + } + + #[tokio::test] + async fn handle_rejects_justification_duty_mismatch() { + let mut justification = unsigned_msg(0); + justification.duty = Some(pbcore::Duty { + slot: 43, + r#type: i32::try_from(&DutyType::Attester).unwrap(), + 
}); + let justification = sign_for_peer(justification, 0); + let outer = pbconsensus::QbftConsensusMsg { + msg: Some(signed_msg(0)), + justification: vec![justification], + values: vec![], + }; + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(outer)) + .await + .unwrap_err(); + + assert_eq!( + err.to_string(), + "qbft justification duty differs from message duty" + ); + } + + #[test] + fn values_by_hash_rejects_invalid_type_url() { + let err = values_by_hash(&[Any { + type_url: "type.googleapis.com/unknown.Type".to_string(), + value: vec![], + }]) + .unwrap_err(); + + assert_eq!(err.to_string(), "unmarshal any"); + } + + #[test] + fn values_by_hash_rejects_malformed_any_value() { + let err = values_by_hash(&[Any { + type_url: pbcore::UnsignedDataSet::type_url(), + value: b"not-protobuf".to_vec(), + }]) + .unwrap_err(); + + assert_eq!(err.to_string(), "unmarshal any"); + } + + #[test] + fn values_by_hash_hashes_decoded_inner_message() { + let any = unsigned_any("a", b"first"); + let values = values_by_hash(std::slice::from_ref(&any)).unwrap(); + let decoded = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let hash = msg::hash_proto(&decoded).unwrap(); + + assert_eq!(values.get(&hash), Some(&any)); + } + + #[tokio::test] + async fn handle_rejects_missing_value_hash() { + let mut msg = unsigned_msg(0); + msg.value_hash = [9u8; 32].to_vec().into(); + let msg = sign_for_peer(msg, 0); + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "value hash not found in values"); + } + + #[tokio::test] + async fn handle_enqueues_valid_message() { + let consensus = consensus(0, true); + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(0); + msg.value_hash = value_hash.to_vec().into(); + let 
msg = sign_for_peer(msg, 0); + let duty = duty(); + let inst = consensus.get_instance_io(duty.clone()); + + consensus + .handle( + &CancellationToken::new(), + Some(pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![any], + }), + ) + .await + .unwrap(); + + let mut recv_rx = inst.take_recv_rx().unwrap(); + let received = recv_rx.try_recv().unwrap(); + assert_eq!(received.value(), value_hash); + } + + #[tokio::test] + async fn handle_rejects_deadliner_false_as_duty_expired() { + let consensus = Consensus::new(Config { + peers: peers(), + local_peer_idx: 0, + ..config_base(true) + }) + .unwrap(); + + let err = consensus + .handle( + &CancellationToken::new(), + Some(consensus_msg(signed_msg(0))), + ) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "duty expired"); + } + + #[tokio::test] + async fn handle_rejects_cancellation_after_verification() { + let ct = CancellationToken::new(); + ct.cancel(); + + let err = consensus(0, true) + .handle(&ct, Some(consensus_msg(signed_msg(0)))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "receive cancelled during verification"); + } + + #[tokio::test] + async fn handle_waits_for_receive_buffer_capacity() { + let consensus = consensus(0, true); + let inst = consensus.get_instance_io(duty()); + let mut recv_rx = inst.take_recv_rx().unwrap(); + for _ in 0..crate::instance::RECV_BUFFER_SIZE { + inst.recv_tx.try_send(wrapped_msg()).unwrap(); + } + + let ct = CancellationToken::new(); + let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + tokio::pin!(handle); + + tokio::select! 
{ + result = &mut handle => panic!( + "handle completed while receive buffer was full: {result:?}" + ), + () = tokio::task::yield_now() => {} + } + + recv_rx.recv().await.unwrap(); + tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) + .await + .unwrap() + .unwrap(); + } + + #[tokio::test] + async fn handle_rejects_full_receive_buffer_after_cancellation() { + let consensus = consensus(0, true); + let inst = consensus.get_instance_io(duty()); + let _recv_rx = inst.take_recv_rx().unwrap(); + for _ in 0..crate::instance::RECV_BUFFER_SIZE { + inst.recv_tx.try_send(wrapped_msg()).unwrap(); + } + + let ct = CancellationToken::new(); + let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + tokio::pin!(handle); + + tokio::select! { + result = &mut handle => panic!( + "handle completed while receive buffer was full: {result:?}" + ), + () = tokio::task::yield_now() => {} + } + ct.cancel(); + let err = tokio::time::timeout(std::time::Duration::from_secs(1), &mut handle) + .await + .unwrap() + .unwrap_err(); + + assert_eq!(err.to_string(), "timeout enqueuing receive buffer"); + } + + #[tokio::test] + async fn handle_drops_late_message_after_started_receiver_closed() { + let consensus = consensus(0, true); + let duty = duty(); + let inst = consensus.get_instance_io(duty.clone()); + assert!(inst.maybe_start()); + drop(inst.take_recv_rx().unwrap()); + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(0); + msg.value_hash = value_hash.to_vec().into(); + let msg = sign_for_peer(msg, 0); + + consensus + .handle( + &CancellationToken::new(), + Some(pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![any], + }), + ) + .await + .unwrap(); + + assert!(Arc::ptr_eq(&inst, &consensus.get_instance_io(duty))); + } + + #[tokio::test] + async fn 
reference_signed_message_is_admitted() { + let consensus = consensus(0, true); + let mut recv_rx = consensus + .get_instance_io(duty()) + .take_recv_rx() + .expect("recv receiver should be available"); + + consensus + .handle(&CancellationToken::new(), Some(reference_consensus_msg())) + .await + .expect("reference message should be admitted"); + + let received = recv_rx.recv().await.expect("admitted message"); + assert_eq!(received.source(), 0); + assert_eq!(hex::encode(received.value()), REFERENCE_VALUE_HASH); + assert_eq!( + received.value_source().expect("value source should exist"), + reference_any_value() + ); + } + + fn consensus_msg(msg: pbconsensus::QbftMsg) -> pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg { + msg: Some(msg), + justification: vec![], + values: vec![], + } + } + + fn unsigned_msg(peer_idx: i64) -> pbconsensus::QbftMsg { + pbconsensus::QbftMsg { + r#type: i64::from(qbft::MSG_PRE_PREPARE), + duty: Some(pbcore::Duty::try_from(&duty()).unwrap()), + peer_idx, + round: 1, + prepared_round: 0, + ..Default::default() + } + } + + fn signed_msg(peer_idx: i64) -> pbconsensus::QbftMsg { + sign_for_peer(unsigned_msg(peer_idx), peer_idx) + } + + fn sign_for_peer(msg: pbconsensus::QbftMsg, peer_idx: i64) -> pbconsensus::QbftMsg { + let seed = u8::try_from(peer_idx.checked_add(1).unwrap()).unwrap(); + msg::sign_msg(&msg, &secret_key(seed)).unwrap() + } + + fn unsigned_any(key: &str, value: &'static [u8]) -> Any { + Any::from_msg(&pbcore::UnsignedDataSet { + set: [(key.to_string(), Bytes::from_static(value))].into(), + }) + .unwrap() + } + + fn reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg::decode( + hex::decode(REFERENCE_PAYLOAD) + .expect("valid fixture hex") + .as_slice(), + ) + .expect("reference payload should decode") + } + + fn reference_value() -> pbcore::UnsignedDataSet { + let mut set = std::collections::BTreeMap::new(); + set.insert("0x99".to_string(), Bytes::from_static(&[1, 2, 3, 
4])); + pbcore::UnsignedDataSet { set } + } + + fn reference_any_value() -> Any { + Any::from_msg(&reference_value()).expect("value should pack") + } + + fn wrapped_msg() -> msg::Msg { + msg::Msg::new(unsigned_msg(0), vec![], Arc::default()).unwrap() + } + pub(crate) fn consensus(local_peer_idx: i64, duty_allowed: bool) -> Consensus { Consensus::new(Config { peers: peers(), diff --git a/crates/consensus/src/qbft/definition.rs b/crates/consensus/src/qbft/definition.rs index 974e3adf..a53f1226 100644 --- a/crates/consensus/src/qbft/definition.rs +++ b/crates/consensus/src/qbft/definition.rs @@ -15,8 +15,7 @@ use tokio::runtime::Handle; use tokio_util::sync::CancellationToken; use super::{ - admission, - component::{DecodedValue, SubscriberSet}, + component::{DecodedValue, Error as ComponentError, SubscriberSet, decode_supported_any}, msg::{self, ConsensusQbftTypes}, }; @@ -138,7 +137,7 @@ fn decide( return; }; - let decoded = match admission::decode_supported_any(any_value) { + let decoded = match decode_supported_any(any_value) { Ok(decoded) => decoded, Err(err) => { tracing::error!(error = %err, "Invalid any value"); @@ -231,7 +230,7 @@ fn local_compare_value( } fn decode_attester_set(any: &Any) -> std::result::Result { - match admission::decode_supported_any(any).map_err(AttesterCompareError::DecodeAny)? { + match decode_supported_any(any).map_err(AttesterCompareError::DecodeAny)? 
{ DecodedValue::UnsignedDataSet(value) => { unsigned_data_set_from_proto(&DutyType::Attester, &value) .map_err(AttesterCompareError::DecodeUnsignedDataSet) @@ -261,7 +260,7 @@ enum AttesterCompareError { #[error("msg has no value source: {0}")] ValueSource(#[source] qbft::QbftError), #[error("decode any: {0}")] - DecodeAny(#[source] admission::Error), + DecodeAny(#[source] ComponentError), #[error("unexpected compare value type")] UnexpectedValueType, #[error("timeout on waiting for local value")] diff --git a/crates/consensus/src/qbft/interop_test.rs b/crates/consensus/src/qbft/interop_test.rs deleted file mode 100644 index d84a38ec..00000000 --- a/crates/consensus/src/qbft/interop_test.rs +++ /dev/null @@ -1,114 +0,0 @@ -use futures::io::Cursor; -use prost::{Message, bytes::Bytes}; -use prost_types::Any; -use tokio_util::sync::CancellationToken; - -use crate::qbft::{component::tests, msg}; -use pluto_core::{ - corepb::v1::{consensus as pbconsensus, core as pbcore}, - qbft::SomeMsg, -}; - -const REFERENCE_VALUE_HASH: &str = - "0a0c0a0430783939120401020304000000000000000000000000000000000000"; -const REFERENCE_SIGNATURE: &str = "4cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe924200"; -const REFERENCE_PAYLOAD: &str = "0a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; -const REFERENCE_FRAME: &str = 
"b7010a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; - -#[tokio::test] -async fn reference_framed_message_decodes() { - let mut cursor = Cursor::new(hex::decode(REFERENCE_FRAME).expect("valid fixture hex")); - - let decoded = - pluto_p2p::proto::read_protobuf_with_max_size::( - &mut cursor, - pluto_p2p::proto::MAX_MESSAGE_SIZE, - ) - .await - .expect("reference frame should decode"); - - assert_eq!(decoded, reference_consensus_msg()); -} - -#[tokio::test] -async fn reference_signed_message_is_admitted() { - let consensus = tests::consensus(0, true); - let mut recv_rx = consensus - .get_instance_io(tests::duty()) - .take_recv_rx() - .expect("recv receiver should be available"); - - consensus - .handle(&CancellationToken::new(), Some(reference_consensus_msg())) - .await - .expect("reference message should be admitted"); - - let received = recv_rx.recv().await.expect("admitted message"); - assert_eq!(received.source(), 0); - assert_eq!(hex::encode(received.value()), REFERENCE_VALUE_HASH); - assert_eq!( - received.value_source().expect("value source should exist"), - reference_any_value() - ); -} - -#[tokio::test] -async fn rust_rebuilds_reference_message_and_frame() { - let rebuilt = build_reference_consensus_msg(); - let mut frame = Cursor::new(Vec::new()); - - pluto_p2p::proto::write_protobuf(&mut frame, &rebuilt) - .await - .expect("frame write should succeed"); - - assert_eq!(rebuilt, reference_consensus_msg()); - assert_eq!(hex::encode(rebuilt.encode_to_vec()), REFERENCE_PAYLOAD); - assert_eq!(hex::encode(frame.into_inner()), REFERENCE_FRAME); -} - -fn build_reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { - let value = 
reference_value(); - let value_hash = msg::hash_proto(&value).expect("value should hash"); - let signed = msg::sign_msg( - &pbconsensus::QbftMsg { - r#type: i64::from(pluto_core::qbft::MSG_PREPARE), - duty: Some(pbcore::Duty { - slot: 42, - r#type: 2, - }), - peer_idx: 0, - round: 1, - value_hash: value_hash.to_vec().into(), - ..Default::default() - }, - &tests::secret_key(1), - ) - .expect("message should sign"); - - assert_eq!(hex::encode(&signed.signature), REFERENCE_SIGNATURE); - - pbconsensus::QbftConsensusMsg { - msg: Some(signed), - justification: vec![], - values: vec![Any::from_msg(&value).expect("value should pack")], - } -} - -fn reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { - pbconsensus::QbftConsensusMsg::decode( - hex::decode(REFERENCE_PAYLOAD) - .expect("valid fixture hex") - .as_slice(), - ) - .expect("reference payload should decode") -} - -fn reference_value() -> pbcore::UnsignedDataSet { - let mut set = std::collections::BTreeMap::new(); - set.insert("0x99".to_string(), Bytes::from_static(&[1, 2, 3, 4])); - pbcore::UnsignedDataSet { set } -} - -fn reference_any_value() -> Any { - Any::from_msg(&reference_value()).expect("value should pack") -} diff --git a/crates/consensus/src/qbft/mod.rs b/crates/consensus/src/qbft/mod.rs index bd9475f6..926ccdce 100644 --- a/crates/consensus/src/qbft/mod.rs +++ b/crates/consensus/src/qbft/mod.rs @@ -1,12 +1,11 @@ //! QBFT consensus wrapper. 
-mod admission; mod component; pub(crate) mod definition; pub(crate) mod runner; pub use component::{ - BroadcastResult, Broadcaster, Config, Consensus, DutyGater, Error, Peer, SnifferSink, + BroadcastResult, Broadcaster, Config, Consensus, DutyGater, Error, Peer, Result, SnifferSink, SubscriberResult, }; pub use runner::{Error as RunnerError, Result as RunnerResult}; @@ -20,7 +19,5 @@ pub mod p2p; pub(crate) mod sniffer; pub(crate) mod transport; -#[cfg(test)] -mod interop_test; #[cfg(test)] mod qbft_run_test; diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index 767b74e3..d052ea8c 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -719,11 +719,7 @@ mod tests { use std::{ collections::HashSet, error::Error as StdError, - fs, - path::{Path, PathBuf}, - process::Stdio, task::{Context, Poll}, - time::{SystemTime, UNIX_EPOCH}, }; use futures::{StreamExt as _, io::Cursor, task::noop_waker}; @@ -737,12 +733,9 @@ mod tests { dial_opts::PeerCondition, }, }; - use prost::bytes::Bytes; - use tokio::{ - io::{AsyncBufReadExt, BufReader, Lines}, - process::{Child, ChildStdout, Command}, - sync::{mpsc, oneshot}, - }; + use prost::{Message, bytes::Bytes}; + use prost_types::Any; + use tokio::sync::{mpsc, oneshot}; use crate::{ protocols::QBFT_V2_PROTOCOL_ID, @@ -756,7 +749,7 @@ mod tests { }; use pluto_core::{ corepb::v1::{consensus as pbconsensus, core as pbcore}, - qbft::{self, SomeMsg}, + qbft, }; use pluto_p2p::{ behaviours::pluto::PlutoBehaviourEvent, @@ -768,7 +761,10 @@ mod tests { use super::*; const TEST_TIMEOUT: Duration = Duration::from_secs(10); - const GO_INTEROP_TIMEOUT: Duration = Duration::from_secs(60); + const LIBP2P_SETUP_TIMEOUT: Duration = Duration::from_secs(60); + const REFERENCE_SIGNATURE: &str = "4cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe924200"; + const REFERENCE_PAYLOAD: &str = 
"0a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; + const REFERENCE_FRAME: &str = "b7010a6f08021204082a1002200142414cf90756a4241bce7b71e18c6fb9cf91dc96abc6ef1739218974d96e75faf0a15921d47997210232cf064b5e401c6de800fb1f654fcadca0e293dea335fe9242005a200a0c0a04307839391204010203040000000000000000000000000000000000001a440a32747970652e676f6f676c65617069732e636f6d2f636f72652e636f726570622e76312e556e7369676e656444617461536574120e0a0c0a0430783939120401020304"; type TestResult = Result>; @@ -777,6 +773,34 @@ mod tests { assert_eq!(protocol_id().to_string(), QBFT_V2_PROTOCOL_ID); } + #[tokio::test] + async fn reference_framed_message_decodes() { + let mut cursor = Cursor::new(hex::decode(REFERENCE_FRAME).expect("valid fixture hex")); + + let decoded = pluto_p2p::proto::read_protobuf_with_max_size::< + pbconsensus::QbftConsensusMsg, + _, + >(&mut cursor, pluto_p2p::proto::MAX_MESSAGE_SIZE) + .await + .expect("reference frame should decode"); + + assert_eq!(decoded, reference_consensus_msg()); + } + + #[tokio::test] + async fn rust_rebuilds_reference_message_and_frame() { + let rebuilt = build_reference_consensus_msg(); + let mut frame = Cursor::new(Vec::new()); + + pluto_p2p::proto::write_protobuf(&mut frame, &rebuilt) + .await + .expect("frame write should succeed"); + + assert_eq!(rebuilt, reference_consensus_msg()); + assert_eq!(hex::encode(rebuilt.encode_to_vec()), REFERENCE_PAYLOAD); + assert_eq!(hex::encode(frame.into_inner()), REFERENCE_FRAME); + } + #[tokio::test] async fn inbound_handler_decodes_and_calls_consensus_handle() -> TestResult<()> { let consensus = Arc::new(consensus(0, true)); @@ -992,82 +1016,6 @@ mod tests { Ok(()) } - #[tokio::test] - 
#[ignore = "requires local Charon source, Go toolchain, and local TCP sockets"] - async fn mixed_charon_pluto_libp2p_interop() -> TestResult<()> { - let keys = test_keys_n(4)?; - let peer_ids = peer_ids(&keys)?; - let mut nodes = build_pluto_nodes(keys[..2].to_vec(), peer_ids.clone())?; - let mut node0_recv = nodes - .get_mut(0) - .and_then(|node| node.recv_rx.take()) - .ok_or_else(|| std::io::Error::other("missing node 0 receiver"))?; - let mut node1_recv = nodes - .get_mut(1) - .and_then(|node| node.recv_rx.take()) - .ok_or_else(|| std::io::Error::other("missing node 1 receiver"))?; - let handle0 = nodes - .first() - .map(|node| node.handle.clone()) - .ok_or_else(|| std::io::Error::other("missing node 0 handle"))?; - let handle1 = nodes - .get(1) - .map(|node| node.handle.clone()) - .ok_or_else(|| std::io::Error::other("missing node 1 handle"))?; - - let (listen_tx, mut listen_rx) = mpsc::unbounded_channel(); - let (conn_tx, mut conn_rx) = mpsc::unbounded_channel(); - let (event_tx, mut event_rx) = mpsc::unbounded_channel(); - let (task_err_tx, mut task_err_rx) = mpsc::unbounded_channel(); - let running = spawn_nodes(nodes, listen_tx, conn_tx, event_tx, task_err_tx)?; - let rust_addrs = wait_for_listen_addrs(&mut listen_rx, &mut task_err_rx).await?; - - let harness_dir = write_go_interop_harness()?; - let mut child = spawn_go_interop(&harness_dir, &rust_addrs)?; - let stdout = child - .stdout - .take() - .ok_or_else(|| std::io::Error::other("missing go harness stdout"))?; - let mut go_lines = BufReader::new(stdout).lines(); - - let result = - async { - let go_addrs = wait_for_go_ready(&mut go_lines).await?; - dial_go_peers(&running, &go_addrs)?; - wait_for_specific_connections(&mut conn_rx, &[0, 1], &peer_ids[2..4]).await?; - - wait_for_sources(&mut node0_recv, &mut event_rx, 0, &[2, 3]).await?; - wait_for_sources(&mut node1_recv, &mut event_rx, 1, &[2, 3]).await?; - - handle0 - .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) 
- .await?; - handle1 - .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 1)?) - .await?; - - wait_for_event(&mut event_rx, 0, |event| { - matches!(event, Event::Sent { peer, .. } if peer_ids[2..4].contains(peer)) - }) - .await?; - wait_for_event(&mut event_rx, 1, |event| { - matches!(event, Event::Sent { peer, .. } if peer_ids[2..4].contains(peer)) - }) - .await?; - wait_for_go_done(&mut go_lines).await - } - .await; - - drop(go_lines); - let status = finish_go_interop(&mut child, result.is_err()).await; - let cleanup = fs::remove_dir_all(&harness_dir); - stop_nodes(running).await?; - result?; - status?; - cleanup?; - Ok(()) - } - struct LocalNode { node: Node, handle: Handle, @@ -1224,7 +1172,7 @@ mod tests { listen_rx: &mut mpsc::UnboundedReceiver<(usize, Multiaddr)>, task_err_rx: &mut mpsc::UnboundedReceiver<(usize, String)>, ) -> TestResult> { - tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + tokio::time::timeout(LIBP2P_SETUP_TIMEOUT, async { let mut addrs = vec![None, None]; while addrs.iter().any(Option::is_none) { tokio::select! { @@ -1277,7 +1225,7 @@ mod tests { conn_rx: &mut mpsc::UnboundedReceiver<(usize, PeerId)>, peer_ids: &[PeerId], ) -> TestResult<()> { - tokio::time::timeout(GO_INTEROP_TIMEOUT, async { + tokio::time::timeout(LIBP2P_SETUP_TIMEOUT, async { let mut seen = [HashSet::new(), HashSet::new()]; while seen.iter().any(|peers| peers.is_empty()) { let (index, peer_id) = conn_rx @@ -1295,65 +1243,6 @@ mod tests { .map_err(|_| std::io::Error::other("timeout waiting for loopback connections"))? 
} - async fn wait_for_specific_connections( - conn_rx: &mut mpsc::UnboundedReceiver<(usize, PeerId)>, - node_indices: &[usize], - expected_peers: &[PeerId], - ) -> TestResult<()> { - tokio::time::timeout(TEST_TIMEOUT, async { - let mut seen = vec![HashSet::new(); node_indices.len()]; - while seen.iter().any(|peers| peers.len() < expected_peers.len()) { - let (index, peer_id) = conn_rx - .recv() - .await - .ok_or_else(|| std::io::Error::other("connection channel closed"))?; - if let Some(position) = node_indices.iter().position(|node| *node == index) - && expected_peers.contains(&peer_id) - { - seen[position].insert(peer_id); - } - } - - Ok(()) - }) - .await - .map_err(|_| std::io::Error::other("timeout waiting for Go peer connections"))? - } - - async fn wait_for_sources( - recv_rx: &mut mpsc::Receiver, - event_rx: &mut mpsc::UnboundedReceiver<(usize, Event)>, - node_index: usize, - expected_sources: &[i64], - ) -> TestResult<()> { - tokio::time::timeout(TEST_TIMEOUT, async { - let mut seen = HashSet::new(); - while seen.len() < expected_sources.len() { - tokio::select! { - msg = recv_rx.recv() => { - let msg = msg.ok_or_else(|| std::io::Error::other("receive buffer closed"))?; - if expected_sources.contains(&msg.source()) { - seen.insert(msg.source()); - } - } - event = event_rx.recv() => { - let (index, event) = event.ok_or_else(|| std::io::Error::other("event channel closed"))?; - if index == node_index - && let Event::InboundError { error, .. } = event - { - return Err(Box::new(std::io::Error::other(error)) - as Box); - } - } - } - } - - Ok(()) - }) - .await - .map_err(|_| std::io::Error::other("timeout waiting for Charon inbound messages"))? 
- } - async fn wait_for_event( event_rx: &mut mpsc::UnboundedReceiver<(usize, Event)>, node_index: usize, @@ -1444,6 +1333,49 @@ mod tests { Ok(Keypair::secp256k1_from_der(&mut der)?.public().to_peer_id()) } + fn build_reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { + let value = reference_value(); + let value_hash = msg::hash_proto(&value).expect("value should hash"); + let signed = msg::sign_msg( + &pbconsensus::QbftMsg { + r#type: i64::from(pluto_core::qbft::MSG_PREPARE), + duty: Some(pbcore::Duty { + slot: 42, + r#type: 2, + }), + peer_idx: 0, + round: 1, + value_hash: value_hash.to_vec().into(), + ..Default::default() + }, + &secret_key(1), + ) + .expect("message should sign"); + + assert_eq!(hex::encode(&signed.signature), REFERENCE_SIGNATURE); + + pbconsensus::QbftConsensusMsg { + msg: Some(signed), + justification: vec![], + values: vec![Any::from_msg(&value).expect("value should pack")], + } + } + + fn reference_consensus_msg() -> pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg::decode( + hex::decode(REFERENCE_PAYLOAD) + .expect("valid fixture hex") + .as_slice(), + ) + .expect("reference payload should decode") + } + + fn reference_value() -> pbcore::UnsignedDataSet { + let mut set = std::collections::BTreeMap::new(); + set.insert("0x99".to_string(), Bytes::from_static(&[1, 2, 3, 4])); + pbcore::UnsignedDataSet { set } + } + fn signed_consensus_msg( duty: &pluto_core::types::Duty, peer_idx: i64, @@ -1473,362 +1405,4 @@ mod tests { values: Vec::new(), }) } - - type GoLines = Lines>; - - fn dial_go_peers(running: &[RunningNode], go_addrs: &[Multiaddr]) -> TestResult<()> { - for node in running { - node.dial_tx.send(go_addrs.to_vec())?; - } - - Ok(()) - } - - async fn wait_for_go_ready(lines: &mut GoLines) -> TestResult> { - let line = read_go_line(lines, "READY ").await?; - line.strip_prefix("READY ") - .ok_or_else(|| std::io::Error::other("missing go ready prefix"))? 
- .split_whitespace() - .map(|addr| addr.parse().map_err(|error| Box::new(error) as _)) - .collect() - } - - async fn wait_for_go_done(lines: &mut GoLines) -> TestResult<()> { - tokio::time::timeout(GO_INTEROP_TIMEOUT, async { - loop { - let line = lines - .next_line() - .await? - .ok_or_else(|| std::io::Error::other("go harness stdout closed"))?; - if line == "DONE" { - return Ok(()); - } - } - }) - .await - .map_err(|_| std::io::Error::other("timeout waiting for Go DONE"))? - } - - async fn read_go_line(lines: &mut GoLines, prefix: &str) -> TestResult { - tokio::time::timeout(GO_INTEROP_TIMEOUT, async { - loop { - let line = lines - .next_line() - .await? - .ok_or_else(|| std::io::Error::other("go harness stdout closed"))?; - if line.starts_with(prefix) { - return Ok(line); - } - } - }) - .await - .map_err(|_| std::io::Error::other("timeout waiting for Go harness output"))? - } - - fn write_go_interop_harness() -> TestResult { - let charon_repo = charon_repo_path(); - if !charon_repo.join("go.mod").exists() { - return Err(Box::new(std::io::Error::other(format!( - "missing Charon repo at {}; set CHARON_REPO", - charon_repo.display() - )))); - } - - let mut dir = std::env::temp_dir(); - let timestamp = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); - dir.push(format!( - "pluto-qbft-interop-{}-{timestamp}", - std::process::id() - )); - fs::create_dir(&dir)?; - fs::write(dir.join("main.go"), GO_INTEROP_HARNESS)?; - - Ok(dir) - } - - fn charon_repo_path() -> PathBuf { - std::env::var("CHARON_REPO") - .map(PathBuf::from) - .unwrap_or_else(|_| PathBuf::from("/Users/quangle/Documents/nethermind/obol/charon")) - } - - fn spawn_go_interop(harness_dir: &Path, rust_addrs: &[Multiaddr]) -> TestResult { - if rust_addrs.len() != 2 { - return Err(Box::new(std::io::Error::other("expected two rust addrs"))); - } - - Ok(Command::new("go") - .arg("run") - .arg(harness_dir.join("main.go")) - .arg(rust_addrs[0].to_string()) - .arg(rust_addrs[1].to_string()) - 
.current_dir(charon_repo_path()) - .env("GOWORK", "off") - .stdout(Stdio::piped()) - .stderr(Stdio::inherit()) - .spawn()?) - } - - async fn finish_go_interop(child: &mut Child, kill: bool) -> TestResult<()> { - if kill { - let _ = child.kill().await; - } - - let status = tokio::time::timeout(GO_INTEROP_TIMEOUT, child.wait()).await??; - if !status.success() { - return Err(Box::new(std::io::Error::other(format!( - "go harness exited with {status}" - )))); - } - - Ok(()) - } - - const GO_INTEROP_HARNESS: &str = r#" -package main - -import ( - "bytes" - "context" - "encoding/hex" - "fmt" - "os" - "time" - - k1 "github.com/decred/dcrd/dcrec/secp256k1/v4" - ssz "github.com/ferranbt/fastssz" - "github.com/libp2p/go-libp2p" - libp2pcrypto "github.com/libp2p/go-libp2p/core/crypto" - "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/libp2p/go-libp2p/core/peerstore" - "github.com/libp2p/go-libp2p/p2p/security/noise" - "github.com/multiformats/go-multiaddr" - "github.com/obolnetwork/charon/app/k1util" - "github.com/obolnetwork/charon/core" - "github.com/obolnetwork/charon/core/consensus/protocols" - pbv1 "github.com/obolnetwork/charon/core/corepb/v1" - coreqbft "github.com/obolnetwork/charon/core/qbft" - "github.com/obolnetwork/charon/p2p" - "google.golang.org/protobuf/proto" -) - -type received struct { - node int - from int64 -} - -func main() { - if len(os.Args) != 3 { - panic("usage: go run . 
") - } - - ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) - defer cancel() - - keys := make([]*k1.PrivateKey, 4) - peerIDs := make([]peer.ID, 4) - pubkeys := make(map[int64]*k1.PublicKey, 4) - for i := range keys { - keyBytes := bytes.Repeat([]byte{byte(i + 1)}, 32) - keys[i] = k1.PrivKeyFromBytes(keyBytes) - priv := (*libp2pcrypto.Secp256k1PrivateKey)(keys[i]) - id, err := peer.IDFromPrivateKey(priv) - if err != nil { - panic(err) - } - peerIDs[i] = id - pubkeys[int64(i)] = keys[i].PubKey() - } - - rustAddrs := make([]multiaddr.Multiaddr, 2) - for i, arg := range os.Args[1:] { - addr, err := multiaddr.NewMultiaddr(arg) - if err != nil { - panic(err) - } - rustAddrs[i] = addr - } - - recvCh := make(chan received, 16) - hosts := make([]host.Host, 2) - for i := range hosts { - peerIdx := i + 2 - priv := (*libp2pcrypto.Secp256k1PrivateKey)(keys[peerIdx]) - h, err := libp2p.New( - libp2p.Identity(priv), - libp2p.Security(noise.ID, noise.New), - libp2p.ListenAddrStrings("/ip4/127.0.0.1/tcp/0"), - ) - if err != nil { - panic(err) - } - defer h.Close() - hosts[i] = h - - node := peerIdx - p2p.RegisterHandler("qbft-interop", h, protocols.QBFTv2ProtocolID, - func() proto.Message { return new(pbv1.QBFTConsensusMsg) }, - func(_ context.Context, _ peer.ID, req proto.Message) (proto.Message, bool, error) { - msg, ok := req.(*pbv1.QBFTConsensusMsg) - if !ok { - return nil, false, fmt.Errorf("unexpected request %T", req) - } - if err := verifyMsg(msg.GetMsg(), pubkeys); err != nil { - return nil, false, err - } - recvCh <- received{node: node, from: msg.GetMsg().GetPeerIdx()} - return nil, false, nil - }) - } - - goAddrs := make([]string, 2) - for i, h := range hosts { - if len(h.Addrs()) == 0 { - panic("go host has no listen address") - } - peerPart, err := multiaddr.NewMultiaddr("/p2p/" + h.ID().String()) - if err != nil { - panic(err) - } - goAddrs[i] = h.Addrs()[0].Encapsulate(peerPart).String() - } - fmt.Printf("READY %s %s\n", goAddrs[0], 
goAddrs[1]) - - for _, h := range hosts { - for i := range rustAddrs { - h.Peerstore().AddAddrs(peerIDs[i], []multiaddr.Multiaddr{rustAddrs[i]}, peerstore.PermanentAddrTTL) - } - } - - for i, h := range hosts { - peerIdx := int64(i + 2) - for target := 0; target < 2; target++ { - if err := p2p.Send(ctx, h, protocols.QBFTv2ProtocolID, peerIDs[target], signedConsensusMsg(peerIdx, keys[peerIdx])); err != nil { - panic(err) - } - } - } - fmt.Println("SENT") - - seen := map[int]map[int64]bool{ - 2: {}, - 3: {}, - } - for { - if seen[2][0] && seen[2][1] && seen[3][0] && seen[3][1] { - fmt.Println("DONE") - return - } - - select { - case <-ctx.Done(): - panic(ctx.Err()) - case recv := <-recvCh: - if recv.node == 2 || recv.node == 3 { - seen[recv.node][recv.from] = true - fmt.Printf("RECEIVED %d %d\n", recv.node, recv.from) - } - } - } -} - -func signedConsensusMsg(peerIdx int64, key *k1.PrivateKey) *pbv1.QBFTConsensusMsg { - msg := &pbv1.QBFTMsg{ - Type: int64(coreqbft.MsgPrepare), - Duty: &pbv1.Duty{Slot: 42, Type: int32(core.DutyAttester)}, - PeerIdx: peerIdx, - Round: 1, - ValueHash: nil, - PreparedValueHash: nil, - } - signed, err := signMsg(msg, key) - if err != nil { - panic(err) - } - return &pbv1.QBFTConsensusMsg{Msg: signed} -} - -func signMsg(msg *pbv1.QBFTMsg, privkey *k1.PrivateKey) (*pbv1.QBFTMsg, error) { - clone := proto.Clone(msg).(*pbv1.QBFTMsg) - clone.Signature = nil - - hash, err := hashProto(clone) - if err != nil { - return nil, err - } - - clone.Signature, err = k1util.Sign(privkey, hash[:]) - if err != nil { - return nil, err - } - - return clone, nil -} - -func verifyMsg(msg *pbv1.QBFTMsg, pubkeys map[int64]*k1.PublicKey) error { - if msg == nil || msg.GetDuty() == nil { - return fmt.Errorf("invalid consensus message") - } - if typ := coreqbft.MsgType(msg.GetType()); !typ.Valid() { - return fmt.Errorf("invalid consensus message type: %d", typ) - } - if typ := core.DutyType(msg.GetDuty().GetType()); !typ.Valid() { - return fmt.Errorf("invalid 
consensus message duty type: %d", typ) - } - if msg.GetRound() <= 0 { - return fmt.Errorf("invalid consensus message round: %d", msg.GetRound()) - } - if msg.GetPreparedRound() < 0 { - return fmt.Errorf("invalid consensus message prepared round") - } - - pubkey, ok := pubkeys[msg.GetPeerIdx()] - if !ok { - return fmt.Errorf("invalid peer index: %d", msg.GetPeerIdx()) - } - ok, err := verifyMsgSig(msg, pubkey) - if err != nil { - return err - } - if !ok { - return fmt.Errorf("invalid consensus message signature") - } - return nil -} - -func verifyMsgSig(msg *pbv1.QBFTMsg, pubkey *k1.PublicKey) (bool, error) { - clone := proto.Clone(msg).(*pbv1.QBFTMsg) - signature := clone.GetSignature() - if len(signature) == 0 { - return false, fmt.Errorf("empty signature") - } - clone.Signature = nil - - hash, err := hashProto(clone) - if err != nil { - return false, err - } - recovered, err := k1util.Recover(hash[:], signature) - if err != nil { - return false, err - } - return hex.EncodeToString(recovered.SerializeCompressed()) == hex.EncodeToString(pubkey.SerializeCompressed()), nil -} - -func hashProto(msg proto.Message) ([32]byte, error) { - hh := ssz.DefaultHasherPool.Get() - defer ssz.DefaultHasherPool.Put(hh) - - index := hh.Index() - b, err := proto.MarshalOptions{Deterministic: true}.Marshal(msg) - if err != nil { - return [32]byte{}, err - } - - hh.PutBytes(b) - hh.Merkleize(index) - return hh.HashRoot() -} -"#; } From 51182aaed9b91cc68399c3efc93898e0769212f3 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 14:46:36 +0700 Subject: [PATCH 08/21] fix: machete --- Cargo.lock | 4 ---- crates/core/Cargo.toml | 4 ---- 2 files changed, 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20248431..29dfbd77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5629,16 +5629,12 @@ dependencies = [ "dyn-clone", "dyn-eq", "ethereum_ssz", - "futures", "hex", - "k256", "pluto-build-proto", "pluto-cluster", "pluto-crypto", "pluto-eth2api", "pluto-eth2util", -
"pluto-featureset", - "pluto-k1util", "pluto-ssz", "pluto-testutil", "pluto-tracing", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index e281c251..3b313278 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -14,13 +14,10 @@ chrono.workspace = true crossbeam.workspace = true dyn-clone.workspace = true dyn-eq.workspace = true -futures.workspace = true hex.workspace = true -k256.workspace = true vise.workspace = true pluto-crypto.workspace = true pluto-eth2api.workspace = true -pluto-k1util.workspace = true prost.workspace = true prost-types.workspace = true regex.workspace = true @@ -32,7 +29,6 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true pluto-eth2util.workspace = true -pluto-featureset.workspace = true pluto-ssz.workspace = true ssz.workspace = true tree_hash.workspace = true From d5585f990032b68e91c0292ff2f26f161cb72005 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 1 Jun 2026 17:15:13 +0700 Subject: [PATCH 09/21] feat: add example to run qbft --- Cargo.lock | 5 + crates/consensus/Cargo.toml | 5 + crates/consensus/examples/qbft.rs | 832 +++++++++++++++++++++++++ crates/consensus/src/qbft/transport.rs | 7 +- 4 files changed, 848 insertions(+), 1 deletion(-) create mode 100644 crates/consensus/examples/qbft.rs diff --git a/Cargo.lock b/Cargo.lock index 29dfbd77..cf9ad2b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5588,8 +5588,10 @@ dependencies = [ name = "pluto-consensus" version = "1.7.1" dependencies = [ + "anyhow", "cancellation", "chrono", + "clap", "crossbeam", "either", "ethereum_ssz", @@ -5597,14 +5599,17 @@ dependencies = [ "hex", "k256", "libp2p", + "pluto-cluster", "pluto-core", "pluto-eth2api", + "pluto-eth2util", "pluto-featureset", "pluto-k1util", "pluto-p2p", "pluto-ssz", "prost 0.14.3", "prost-types 0.14.3", + "serde_json", "test-case", "thiserror 2.0.18", "tokio", diff --git a/crates/consensus/Cargo.toml b/crates/consensus/Cargo.toml index 803bcbd6..a6eb61a2 100644 --- 
a/crates/consensus/Cargo.toml +++ b/crates/consensus/Cargo.toml @@ -28,7 +28,12 @@ tokio-util.workspace = true tracing.workspace = true [dev-dependencies] +anyhow.workspace = true +clap.workspace = true +pluto-cluster.workspace = true pluto-eth2api.workspace = true +pluto-eth2util.workspace = true +serde_json.workspace = true ssz.workspace = true test-case.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/crates/consensus/examples/qbft.rs b/crates/consensus/examples/qbft.rs new file mode 100644 index 00000000..7ecb03d6 --- /dev/null +++ b/crates/consensus/examples/qbft.rs @@ -0,0 +1,832 @@ +//! QBFT libp2p example. +//! +//! This example runs one QBFT node per terminal over the real Pluto libp2p +//! stack and the concrete `consensus::qbft::p2p` adapter. By default it runs +//! five sequential synthetic attester duties starting at `--slot` and prints +//! `-------------` after each local decision. +//! +//! Create a cluster first: +//! +//! ```text +//! cargo run -p pluto-cli -- create cluster \ +//! --cluster-dir /tmp/pluto-qbft-demo \ +//! --name qbft-demo \ +//! --network holesky \ +//! --nodes 4 \ +//! --threshold 3 \ +//! --num-validators 1 \ +//! --insecure-keys \ +//! --fee-recipient-addresses 0x000000000000000000000000000000000000dead \ +//! --withdrawal-addresses 0x000000000000000000000000000000000000dead +//! ``` +//! +//! Then run one command per terminal: +//! +//! ```text +//! cargo run -p pluto-consensus --example qbft -- \ +//! --relays https://pluto-relay-0.ovh.dev-nethermind.xyz,https://pluto-relay-1.ovh.dev-nethermind.xyz \ +//! --data-dir /tmp/pluto-qbft-demo/node0 +//! cargo run -p pluto-consensus --example qbft -- \ +//! --relays https://pluto-relay-0.ovh.dev-nethermind.xyz,https://pluto-relay-1.ovh.dev-nethermind.xyz \ +//! --data-dir /tmp/pluto-qbft-demo/node1 +//! cargo run -p pluto-consensus --example qbft -- \ +//! 
--relays https://pluto-relay-0.ovh.dev-nethermind.xyz,https://pluto-relay-1.ovh.dev-nethermind.xyz \ +//! --data-dir /tmp/pluto-qbft-demo/node2 +//! cargo run -p pluto-consensus --example qbft -- \ +//! --relays https://pluto-relay-0.ovh.dev-nethermind.xyz,https://pluto-relay-1.ovh.dev-nethermind.xyz \ +//! --data-dir /tmp/pluto-qbft-demo/node3 +//! ``` +//! +//! # Flow +//! +//! 1. **Load fixture** (`load_fixture`): reads the node's +//! `charon-enr-private-key` and `cluster-lock.json` from `--data-dir`, +//! derives the cluster peer IDs, locates this node's index, and builds the +//! consensus peer set (secp256k1 public keys from each operator ENR). +//! 2. **Wire consensus** (`build_consensus`): constructs a `qbft::Consensus` +//! with an attester-only duty gater, an `IncreasingRoundTimer`, a +//! `DemoDeadline`, and a broadcaster bound to the QBFT libp2p +//! `qbft::p2p::Handle` through a shared `OnceLock` (the handle only exists +//! after the behaviour is built, so the broadcaster reads it lazily). +//! Decided values are forwarded to a channel via `Consensus::subscribe`, and +//! the expired-duty cleanup loop is spawned. +//! 3. **Build the libp2p node**: an `ExampleBehaviour` combining the relay +//! client, `RelayManager`, mDNS, and the `qbft::p2p::Behaviour`, gated to +//! the configured relays and cluster peers. +//! 4. **Connect**: cluster peers are reached over relays and/or mDNS; the event +//! loop tracks established cluster connections and waits until +//! `--start-after-peers` (default: all other peers) are connected. +//! 5. **Run duties sequentially**: for each of `--duties` synthetic attester +//! duties starting at `--slot`, the round-1 leader calls +//! `Consensus::propose` with a synthetic value while every other node calls +//! `Consensus::participate`. QBFT runs over the p2p adapter; on local +//! decision the subscriber prints the decided value followed by +//! `-------------`, then the next duty starts. +//! 6. 
**Shut down**: after the last duty decides, the swarm is kept alive for +//! `COMPLETION_DRAIN` so slower peers can still receive the final messages, +//! then the process exits. `ctrl-c`, parent cancellation, or the +//! `--timeout-secs` start deadline also stop the loop. + +use std::{ + collections::{BTreeMap, HashSet}, + convert::Infallible, + error::Error as StdError, + path::PathBuf, + sync::{Arc, OnceLock}, + time::Duration, +}; + +use anyhow::{Context as _, Result, anyhow, bail}; +use chrono::{TimeDelta, Utc}; +use clap::Parser; +use futures::StreamExt as _; +use libp2p::{ + PeerId, mdns, + relay::{self}, + swarm::{NetworkBehaviour, SwarmEvent}, +}; +use pluto_cluster::lock::Lock; +use pluto_consensus::{ + qbft, + timer::{IncreasingRoundTimer, RoundTimer}, +}; +use pluto_core::{ + corepb::v1::core as pbcore, + deadline::{DeadlineCalculator, DeadlinerTask}, + types::{Duty, DutyType, SlotNumber}, +}; +use pluto_eth2util::enr::Record; +use pluto_p2p::{ + behaviours::pluto::PlutoBehaviourEvent, + bootnode, + config::P2PConfig, + gater, k1, + p2p::{Node, NodeType}, + p2p_context::P2PContext, + peer::peer_id_from_key, + relay::{RelayManager, RelayManagerEvent}, +}; +use prost::bytes::Bytes; +use tokio::fs; +use tokio_util::sync::CancellationToken; + +const COMPLETION_DRAIN: Duration = Duration::from_secs(2); + +#[derive(NetworkBehaviour)] +#[behaviour(to_swarm = "ExampleBehaviourEvent")] +struct ExampleBehaviour { + relay: relay::client::Behaviour, + relay_manager: RelayManager, + mdns: mdns::tokio::Behaviour, + qbft: qbft::p2p::Behaviour, +} + +#[derive(Debug)] +enum ExampleBehaviourEvent { + Relay(relay::client::Event), + RelayManager(RelayManagerEvent), + Mdns(mdns::Event), + Qbft(qbft::p2p::Event), +} + +impl From for ExampleBehaviourEvent { + fn from(event: relay::client::Event) -> Self { + Self::Relay(event) + } +} + +impl From for ExampleBehaviourEvent { + fn from(event: RelayManagerEvent) -> Self { + Self::RelayManager(event) + } +} + +impl From for 
ExampleBehaviourEvent { + fn from(event: mdns::Event) -> Self { + Self::Mdns(event) + } +} + +impl From for ExampleBehaviourEvent { + fn from(event: qbft::p2p::Event) -> Self { + Self::Qbft(event) + } +} + +impl From for ExampleBehaviourEvent { + fn from(value: Infallible) -> Self { + match value {} + } +} + +#[derive(Debug, Parser)] +#[command(name = "qbft-example")] +#[command(about = "Run one relay/local-discovery QBFT demo node")] +struct Args { + /// Directory holding `charon-enr-private-key` and `cluster-lock.json`. + #[arg(long)] + data_dir: PathBuf, + + /// Relay URLs or relay multiaddrs. + #[arg(long, value_delimiter = ',')] + relays: Vec, + + /// TCP listen addresses. + #[arg(long, value_delimiter = ',', default_value = "0.0.0.0:0")] + tcp_addrs: Vec, + + /// UDP listen addresses used for QUIC. + #[arg(long, value_delimiter = ',', default_value = "0.0.0.0:0")] + udp_addrs: Vec, + + /// Whether to filter private addresses from advertisements. + #[arg(long, default_value_t = false)] + filter_private_addrs: bool, + + /// External IP address to advertise. + #[arg(long)] + external_ip: Option, + + /// External hostname to advertise. + #[arg(long)] + external_host: Option, + + /// Whether to disable socket reuse-port. + #[arg(long, default_value_t = false)] + disable_reuse_port: bool, + + /// Duty slot used by the synthetic attester value. + #[arg(long, default_value_t = 1)] + slot: u64, + + /// Number of sequential synthetic duties to run. + #[arg(long, default_value_t = 5)] + duties: u64, + + /// Connected cluster peers required before starting QBFT. + #[arg(long)] + start_after_peers: Option, + + /// Maximum time to wait for connections and decision. + #[arg(long, default_value_t = 60)] + timeout_secs: u64, + + /// Print discovery, relay, send, receive, and connection-error details. 
+ #[arg(long, default_value_t = false)] + verbose_p2p: bool, +} + +#[derive(Debug)] +struct Decision { + duty: Duty, + value: pbcore::UnsignedDataSet, +} + +struct DutyRun { + duties: Vec, + index: usize, + started: bool, + decided: bool, + task: Option>>, +} + +impl DutyRun { + fn new(duties: Vec) -> Self { + Self { + duties, + index: 0, + started: false, + decided: false, + task: None, + } + } + + fn current(&self) -> Option<&Duty> { + self.duties.get(self.index) + } + + fn is_complete(&self) -> bool { + self.index == self.duties.len() + } + + fn try_start( + &mut self, + component: &Arc, + fixture: &Fixture, + connected_peer_count: usize, + start_after: usize, + cancel: CancellationToken, + ) { + if self.started || self.is_complete() || connected_peer_count < start_after { + return; + } + + let duty = self + .current() + .expect("incomplete duty run has duty") + .clone(); + let leader_node = node_number(leader_index(&duty, fixture.peer_ids.len())); + let local_node = node_number(fixture.local_index); + println!( + "node={local_node} starting duty {}/{} duty={} leader=node-{leader_node}", + self.index.checked_add(1).expect("duty index increments"), + self.duties.len(), + duty + ); + + self.started = true; + self.task = Some(start_consensus_for_node( + Arc::clone(component), + fixture, + duty, + cancel, + )); + } + + fn mark_decided(&mut self, duty: &Duty) -> bool { + if self.current() != Some(duty) { + return false; + } + + self.decided = true; + true + } + + fn clear_task(&mut self) { + self.task = None; + } + + fn advance_if_ready(&mut self) -> bool { + if !self.decided || self.task.is_some() { + return false; + } + + self.index = self.index.checked_add(1).expect("duty index increments"); + self.started = false; + self.decided = false; + true + } +} + +struct DemoDeadline { + timeout: Duration, +} + +impl DeadlineCalculator for DemoDeadline { + fn deadline( + &self, + _duty: &Duty, + ) -> pluto_core::deadline::Result>> { + let delta = 
TimeDelta::from_std(self.timeout) + .map_err(|_| pluto_core::deadline::DeadlineError::DurationConversion)?; + Ok(Some(Utc::now().checked_add_signed(delta).ok_or( + pluto_core::deadline::DeadlineError::DateTimeCalculation, + )?)) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + let timeout = Duration::from_secs(args.timeout_secs); + let duties = build_duties(args.slot, args.duties)?; + let first_duty = duties.first().expect("duty count is non-zero"); + let fixture = load_fixture(&args).await?; + let local_node = node_number(fixture.local_index); + let leader = leader_index(first_duty, fixture.peer_ids.len()); + let leader_node = node_number(leader); + + let cancel = CancellationToken::new(); + let relays = bootnode::new_relays( + cancel.child_token(), + &args.relays, + &hex::encode(&fixture.lock_hash), + ) + .await + .context("resolve relays")?; + let relay_peer_ids = relays + .iter() + .filter_map(|relay| relay.peer().map(|peer| peer.id)) + .collect::>(); + let conn_gater = gater::ConnGater::new( + gater::Config::closed() + .with_relays(relays.clone()) + .with_peer_ids(fixture.peer_ids.clone()), + ); + + let (decision_tx, mut decision_rx) = tokio::sync::mpsc::unbounded_channel(); + let consensus = build_consensus(&fixture, timeout, cancel.child_token(), decision_tx)?; + let (qbft_behaviour, handle) = qbft::p2p::Behaviour::new(qbft::p2p::Config { + consensus: Arc::clone(&consensus.component), + p2p_context: consensus.p2p_context.clone(), + peers: fixture.peer_ids.clone(), + local_peer_id: fixture.peer_ids[fixture.local_index], + cancellation: cancel.child_token(), + })?; + consensus + .handle_slot + .set(handle) + .map_err(|_| anyhow!("qbft handle already initialized"))?; + + let p2p_config = P2PConfig { + relays: vec![], + external_ip: args.external_ip.clone(), + external_host: args.external_host.clone(), + tcp_addrs: args.tcp_addrs.clone(), + udp_addrs: args.udp_addrs.clone(), + disable_reuse_port: args.disable_reuse_port, + 
}; + let mut node: Node = Node::new( + p2p_config, + fixture.key.clone(), + NodeType::QUIC, + args.filter_private_addrs, + consensus.p2p_context.clone(), + |builder, keypair, relay_client| { + let local_peer_id = keypair.public().to_peer_id(); + let p2p_context = builder.p2p_context(); + let mdns = mdns::tokio::Behaviour::new(mdns::Config::default(), local_peer_id) + .expect("mDNS should initialize"); + builder.with_gater(conn_gater).with_inner(ExampleBehaviour { + relay: relay_client, + relay_manager: RelayManager::new(relays.clone(), p2p_context), + mdns, + qbft: qbft_behaviour, + }) + }, + )?; + + println!( + "qbft example started node={local_node} peer_id={} duties={} first_duty={} first_leader=node-{leader_node}", + node.local_peer_id(), + duties.len(), + first_duty + ); + println!("cluster peers={}", peer_list(&fixture.peer_ids)); + + let start_after = args + .start_after_peers + .unwrap_or_else(|| fixture.peer_ids.len().saturating_sub(1)); + let mut connected_cluster_peers = HashSet::::new(); + let mut duty_run = DutyRun::new(duties); + let mut completion_drain = None; + + loop { + tokio::select! { + _ = tokio::signal::ctrl_c() => { + println!("node={local_node} ctrl-c received"); + break; + } + _ = cancel.cancelled() => break, + _ = async { + match &mut completion_drain { + Some(sleep) => sleep.await, + None => std::future::pending().await, + } + } => { + break; + } + result = async { + match &mut duty_run.task { + Some(task) => task.await.context("consensus task join")?, + None => std::future::pending::>().await, + } + }, if duty_run.task.is_some() => { + result?; + duty_run.clear_task(); + if duty_run.advance_if_ready() && duty_run.is_complete() { + println!("node={local_node} all duties decided"); + // Keep libp2p alive briefly so slower peers can receive + // final duty messages before this demo process exits. 
+ completion_drain = Some(Box::pin(tokio::time::sleep(COMPLETION_DRAIN))); + } + duty_run.try_start( + &consensus.component, + &fixture, + connected_cluster_peers.len(), + start_after, + cancel.child_token(), + ); + } + Some(decision) = decision_rx.recv() => { + if duty_run.mark_decided(&decision.duty) { + println!( + "node={local_node} decided duty={} entries={}", + decision.duty, + format_value(&decision.value) + ); + println!("-------------"); + if duty_run.advance_if_ready() && duty_run.is_complete() { + println!("node={local_node} all duties decided"); + // Keep libp2p alive briefly so slower peers can receive + // final duty messages before this demo process exits. + completion_drain = Some(Box::pin(tokio::time::sleep(COMPLETION_DRAIN))); + } + duty_run.try_start( + &consensus.component, + &fixture, + connected_cluster_peers.len(), + start_after, + cancel.child_token(), + ); + } else if args.verbose_p2p { + println!("node={local_node} ignoring out-of-order decision duty={}", decision.duty); + } + } + event = node.select_next_some() => { + handle_swarm_event( + event, + &fixture, + &mut node, + &relay_peer_ids, + &mut connected_cluster_peers, + args.verbose_p2p, + )?; + duty_run.try_start( + &consensus.component, + &fixture, + connected_cluster_peers.len(), + start_after, + cancel.child_token(), + ); + } + _ = tokio::time::sleep(timeout), if !duty_run.started && !duty_run.is_complete() => { + bail!("timeout waiting for enough peers to start QBFT"); + } + } + } + + cancel.cancel(); + if let Some(task) = duty_run.task { + tokio::time::timeout(timeout, task) + .await + .context("timeout waiting for consensus task to stop")? 
+ .context("consensus task join")??; + } + consensus.lifecycle_task.await?; + println!("node={local_node} qbft example stopped"); + + Ok(()) +} + +struct Fixture { + key: k256::SecretKey, + peer_ids: Vec, + local_index: usize, + consensus_peers: Vec, + lock_hash: Vec, +} + +async fn load_fixture(args: &Args) -> Result { + let key = k1::load_priv_key(&args.data_dir) + .with_context(|| format!("load private key from {}", args.data_dir.display()))?; + let local_peer_id = + peer_id_from_key(key.public_key()).context("derive local peer ID from private key")?; + let lock_path = args.data_dir.join("cluster-lock.json"); + let lock_str = fs::read_to_string(&lock_path) + .await + .with_context(|| format!("read {}", lock_path.display()))?; + let lock: Lock = serde_json::from_str(&lock_str) + .with_context(|| format!("parse {}", lock_path.display()))?; + let peer_ids = lock.peer_ids().context("derive peer IDs from lock")?; + let Some(local_index) = peer_ids + .iter() + .position(|peer_id| *peer_id == local_peer_id) + else { + bail!("local peer ID {local_peer_id} not found in cluster lock"); + }; + let consensus_peers = lock + .operators + .iter() + .enumerate() + .map(|(index, operator)| { + let record = Record::try_from(operator.enr.as_str()).context("parse operator ENR")?; + let public_key = record + .public_key + .context("operator ENR missing public key")?; + Ok(qbft::Peer { + index: i64::try_from(index)?, + name: format!("node-{}", node_number(index)), + public_key, + }) + }) + .collect::>>()?; + + Ok(Fixture { + key, + peer_ids, + local_index, + consensus_peers, + lock_hash: lock.lock_hash, + }) +} + +struct ConsensusRuntime { + component: Arc, + p2p_context: P2PContext, + handle_slot: Arc>, + lifecycle_task: tokio::task::JoinHandle<()>, +} + +fn build_consensus( + fixture: &Fixture, + timeout: Duration, + cancel: CancellationToken, + decision_tx: tokio::sync::mpsc::UnboundedSender, +) -> Result { + let handle_slot = Arc::new(OnceLock::::new()); + let broadcaster_slot = 
Arc::clone(&handle_slot); + let broadcaster: qbft::Broadcaster = Arc::new(move |ct, msg| { + let broadcaster_slot = Arc::clone(&broadcaster_slot); + Box::pin(async move { + let Some(handle) = broadcaster_slot.get() else { + let err = std::io::Error::other("qbft p2p handle not initialized"); + return Err(Box::new(err) as Box); + }; + handle.broadcast(ct, msg).await + }) + }); + + let (deadliner, expired_rx) = DeadlinerTask::start( + cancel.child_token(), + format!("qbft-example-node-{}", node_number(fixture.local_index)), + DemoDeadline { timeout }, + ); + let local_node = node_number(fixture.local_index); + let component = Arc::new(qbft::Consensus::new(qbft::Config { + peers: fixture.consensus_peers.clone(), + local_peer_idx: i64::try_from(fixture.local_index)?, + privkey: fixture.key.clone(), + deadliner, + duty_gater: Arc::new(|duty| duty.duty_type == DutyType::Attester), + broadcaster, + sniffer: Arc::new(move |instance| { + println!( + "node={local_node} sniffed consensus messages={}", + instance.msgs.len() + ); + }), + compare_attestations: false, + timer_func: Box::new(|duty| { + Box::new(IncreasingRoundTimer::with_duty(duty)) as Box + }), + })?); + component.subscribe(move |decision_duty, value| { + let _ = decision_tx.send(Decision { + duty: decision_duty, + value, + }); + Ok(()) + }); + let lifecycle_task = Arc::clone(&component).start(cancel.child_token(), expired_rx); + + Ok(ConsensusRuntime { + component, + p2p_context: P2PContext::new(fixture.peer_ids.iter().copied()), + handle_slot, + lifecycle_task, + }) +} + +fn handle_swarm_event( + event: SwarmEvent>, + fixture: &Fixture, + node: &mut Node, + relay_peer_ids: &HashSet, + connected_cluster_peers: &mut HashSet, + verbose_p2p: bool, +) -> Result<()> { + let local_node = node_number(fixture.local_index); + match event { + SwarmEvent::NewListenAddr { address, .. } => { + if verbose_p2p { + println!("node={local_node} listen={address}"); + } + } + SwarmEvent::ConnectionEstablished { peer_id, .. 
} => { + if fixture.peer_ids.contains(&peer_id) + && peer_id != fixture.peer_ids[fixture.local_index] + && connected_cluster_peers.insert(peer_id) + { + println!( + "node={local_node} connected cluster_peer={} count={}/{}", + peer_id, + connected_cluster_peers.len(), + fixture.peer_ids.len().saturating_sub(1) + ); + } + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(ExampleBehaviourEvent::Mdns( + mdns::Event::Discovered(peers), + ))) => { + for (peer_id, addr) in peers { + if fixture.peer_ids.contains(&peer_id) + && peer_id != fixture.peer_ids[fixture.local_index] + { + if verbose_p2p { + println!("node={local_node} mdns discovered peer={peer_id} addr={addr}"); + } + node.dial(addr)?; + } + } + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(ExampleBehaviourEvent::RelayManager( + event, + ))) => { + if verbose_p2p { + println!("node={local_node} relay_manager={event:?}"); + } + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(ExampleBehaviourEvent::Relay(event))) => { + if verbose_p2p { + println!("node={local_node} relay={event:?}"); + } + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(ExampleBehaviourEvent::Qbft(event))) => { + log_qbft_event(local_node, event, verbose_p2p); + } + SwarmEvent::OutgoingConnectionError { peer_id, error, .. } => { + if verbose_p2p { + println!("node={local_node} outgoing_error peer={peer_id:?} error={error}"); + } + } + SwarmEvent::IncomingConnectionError { error, .. 
} => { + if verbose_p2p { + println!("node={local_node} incoming_error error={error}"); + } + } + SwarmEvent::Behaviour(PlutoBehaviourEvent::Identify(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::Ping(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::Autonat(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::ConnLogger(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::Gater(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::QuicUpgrade(_)) + | SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner(ExampleBehaviourEvent::Mdns(_))) => {} + _ => { + if verbose_p2p && !relay_peer_ids.is_empty() { + println!("node={local_node} swarm_event={event:?}"); + } + } + } + + Ok(()) +} + +fn start_consensus_for_node( + component: Arc, + fixture: &Fixture, + duty: Duty, + cancel: CancellationToken, +) -> tokio::task::JoinHandle> { + let local_node = node_number(fixture.local_index); + let leader = leader_index(&duty, fixture.peer_ids.len()); + if fixture.local_index == leader { + let slot = duty.slot.inner(); + println!("node={local_node} proposing value"); + tokio::spawn(async move { + component + .propose(&cancel, duty, demo_value(local_node, slot)) + .await + .map_err(|error| anyhow!(error)) + }) + } else { + println!("node={local_node} participating"); + tokio::spawn(async move { + component + .participate(&cancel, duty) + .await + .map_err(|error| anyhow!(error)) + }) + } +} + +fn log_qbft_event(local_node: usize, event: qbft::p2p::Event, verbose_p2p: bool) { + match event { + qbft::p2p::Event::BroadcastQueued { + request_id, + target_count, + } => { + println!( + "node={local_node} qbft broadcast request={request_id} targets={target_count}" + ); + } + qbft::p2p::Event::Received { peer, .. 
} => { + if verbose_p2p { + println!("node={local_node} qbft received peer={peer}"); + } + } + qbft::p2p::Event::Sent { request_id, peer } => { + if verbose_p2p { + println!("node={local_node} qbft sent request={request_id} peer={peer}"); + } + } + qbft::p2p::Event::SendError { + request_id, + peer, + error, + } => { + println!( + "node={local_node} qbft send_error request={request_id} peer={peer} error={error}" + ); + } + qbft::p2p::Event::InboundError { peer, error, .. } => { + println!("node={local_node} qbft inbound_error peer={peer} error={error}"); + } + } +} + +fn leader_index(duty: &Duty, nodes: usize) -> usize { + let nodes = i128::try_from(nodes).expect("node count fits i128"); + let duty_type = i32::try_from(&duty.duty_type).expect("duty type maps to i32"); + let total = i128::from(duty.slot.inner()) + .checked_add(i128::from(duty_type)) + .and_then(|value| value.checked_add(1)) + .expect("slot, duty type, and round fit i128"); + usize::try_from(total.rem_euclid(nodes)).expect("leader index fits usize") +} + +fn build_duties(start_slot: u64, count: u64) -> Result> { + if count == 0 { + bail!("--duties must be greater than zero"); + } + + (0..count) + .map(|offset| { + let slot = start_slot + .checked_add(offset) + .context("slot overflow while building duties")?; + Ok(Duty::new_attester_duty(SlotNumber::new(slot))) + }) + .collect() +} + +fn demo_value(node: usize, slot: u64) -> pbcore::UnsignedDataSet { + let mut set = BTreeMap::new(); + set.insert( + "demo-validator".to_string(), + Bytes::from(format!("qbft-demo-slot-{slot}-node-{node}")), + ); + pbcore::UnsignedDataSet { set } +} + +fn node_number(index: usize) -> usize { + index.checked_add(1).expect("node index increments") +} + +fn peer_list(peers: &[PeerId]) -> String { + peers + .iter() + .enumerate() + .map(|(index, peer_id)| format!("node-{}={peer_id}", node_number(index))) + .collect::>() + .join(",") +} + +fn format_value(value: &pbcore::UnsignedDataSet) -> String { + value + .set + .iter() + 
.map(|(key, value)| format!("{key}={}", String::from_utf8_lossy(value))) + .collect::>() + .join(",") +} diff --git a/crates/consensus/src/qbft/transport.rs b/crates/consensus/src/qbft/transport.rs index 40a9dfc3..2a4c5698 100644 --- a/crates/consensus/src/qbft/transport.rs +++ b/crates/consensus/src/qbft/transport.rs @@ -241,7 +241,12 @@ impl Transport { tokio::select! { () = ct.cancelled() => return Ok(()), result = self.recv_tx.send(inner_msg) => { - result.map_err(|_| Error::ReceiveBufferClosed)?; + if result.is_err() { + if ct.is_cancelled() { + return Ok(()); + } + return Err(Error::ReceiveBufferClosed); + } self.sniffer.add(consensus_msg); } } From 0ac81da671b1b9614e8fb7bcc397290040428b26 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 11:42:05 +0700 Subject: [PATCH 10/21] fix: single p2p_context in example --- crates/consensus/examples/qbft.rs | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/crates/consensus/examples/qbft.rs b/crates/consensus/examples/qbft.rs index 7ecb03d6..1f3af47a 100644 --- a/crates/consensus/examples/qbft.rs +++ b/crates/consensus/examples/qbft.rs @@ -265,8 +265,8 @@ impl DutyRun { .current() .expect("incomplete duty run has duty") .clone(); - let leader_node = node_number(leader_index(&duty, fixture.peer_ids.len())); - let local_node = node_number(fixture.local_index); + let leader_node = leader_index(&duty, fixture.peer_ids.len()); + let local_node = fixture.local_index; println!( "node={local_node} starting duty {}/{} duty={} leader=node-{leader_node}", self.index.checked_add(1).expect("duty index increments"), @@ -332,9 +332,9 @@ async fn main() -> Result<()> { let duties = build_duties(args.slot, args.duties)?; let first_duty = duties.first().expect("duty count is non-zero"); let fixture = load_fixture(&args).await?; - let local_node = node_number(fixture.local_index); + let local_node = fixture.local_index; let leader = leader_index(first_duty, 
fixture.peer_ids.len()); - let leader_node = node_number(leader); + let leader_node = leader; let cancel = CancellationToken::new(); let relays = bootnode::new_relays( @@ -353,12 +353,13 @@ async fn main() -> Result<()> { .with_relays(relays.clone()) .with_peer_ids(fixture.peer_ids.clone()), ); + let p2p_context = P2PContext::new(fixture.peer_ids.iter().copied()); let (decision_tx, mut decision_rx) = tokio::sync::mpsc::unbounded_channel(); let consensus = build_consensus(&fixture, timeout, cancel.child_token(), decision_tx)?; let (qbft_behaviour, handle) = qbft::p2p::Behaviour::new(qbft::p2p::Config { consensus: Arc::clone(&consensus.component), - p2p_context: consensus.p2p_context.clone(), + p2p_context: p2p_context.clone(), peers: fixture.peer_ids.clone(), local_peer_id: fixture.peer_ids[fixture.local_index], cancellation: cancel.child_token(), @@ -381,7 +382,7 @@ async fn main() -> Result<()> { fixture.key.clone(), NodeType::QUIC, args.filter_private_addrs, - consensus.p2p_context.clone(), + p2p_context, |builder, keypair, relay_client| { let local_peer_id = keypair.public().to_peer_id(); let p2p_context = builder.p2p_context(); @@ -546,7 +547,7 @@ async fn load_fixture(args: &Args) -> Result { .context("operator ENR missing public key")?; Ok(qbft::Peer { index: i64::try_from(index)?, - name: format!("node-{}", node_number(index)), + name: format!("node-{index}"), public_key, }) }) @@ -563,7 +564,6 @@ async fn load_fixture(args: &Args) -> Result { struct ConsensusRuntime { component: Arc, - p2p_context: P2PContext, handle_slot: Arc>, lifecycle_task: tokio::task::JoinHandle<()>, } @@ -589,10 +589,10 @@ fn build_consensus( let (deadliner, expired_rx) = DeadlinerTask::start( cancel.child_token(), - format!("qbft-example-node-{}", node_number(fixture.local_index)), + format!("qbft-example-node-{}", fixture.local_index), DemoDeadline { timeout }, ); - let local_node = node_number(fixture.local_index); + let local_node = fixture.local_index; let component = 
Arc::new(qbft::Consensus::new(qbft::Config { peers: fixture.consensus_peers.clone(), local_peer_idx: i64::try_from(fixture.local_index)?, @@ -622,7 +622,6 @@ fn build_consensus( Ok(ConsensusRuntime { component, - p2p_context: P2PContext::new(fixture.peer_ids.iter().copied()), handle_slot, lifecycle_task, }) @@ -636,7 +635,7 @@ fn handle_swarm_event( connected_cluster_peers: &mut HashSet, verbose_p2p: bool, ) -> Result<()> { - let local_node = node_number(fixture.local_index); + let local_node = fixture.local_index; match event { SwarmEvent::NewListenAddr { address, .. } => { if verbose_p2p { @@ -718,7 +717,7 @@ fn start_consensus_for_node( duty: Duty, cancel: CancellationToken, ) -> tokio::task::JoinHandle> { - let local_node = node_number(fixture.local_index); + let local_node = fixture.local_index; let leader = leader_index(&duty, fixture.peer_ids.len()); if fixture.local_index == leader { let slot = duty.slot.inner(); @@ -809,15 +808,11 @@ fn demo_value(node: usize, slot: u64) -> pbcore::UnsignedDataSet { pbcore::UnsignedDataSet { set } } -fn node_number(index: usize) -> usize { - index.checked_add(1).expect("node index increments") -} - fn peer_list(peers: &[PeerId]) -> String { peers .iter() .enumerate() - .map(|(index, peer_id)| format!("node-{}={peer_id}", node_number(index))) + .map(|(index, peer_id)| format!("node-{index}={peer_id}")) .collect::>() .join(",") } From 8fd1b9d5ad9cc80b31c010c785d1fd3eb1d62f68 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 13:22:03 +0700 Subject: [PATCH 11/21] fix: harden qbft receive validation --- crates/consensus/src/qbft/component.rs | 61 +++++++++++++++++ crates/consensus/src/qbft/msg.rs | 64 ++++++++++++++++-- crates/consensus/src/qbft/runner.rs | 91 +++++++++++++++++++++++--- 3 files changed, 200 insertions(+), 16 deletions(-) diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 84f19d5b..2c220c18 100644 --- a/crates/consensus/src/qbft/component.rs +++ 
b/crates/consensus/src/qbft/component.rs @@ -909,6 +909,67 @@ pub(crate) mod tests { assert_eq!(err.to_string(), "value hash not found in values"); } + #[test_case(vec![] ; "empty")] + #[test_case(vec![0; 32] ; "zero")] + #[test_case(vec![1; 31] ; "short")] + #[test_case(vec![1; 33] ; "long")] + #[tokio::test] + async fn handle_rejects_invalid_prepared_round_change_hash(hash: Vec) { + let mut msg = unsigned_msg(0); + msg.r#type = i64::from(qbft::MSG_ROUND_CHANGE); + msg.prepared_round = 1; + msg.prepared_value_hash = hash.into(); + let msg = sign_for_peer(msg, 0); + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid prepared value hash"); + } + + #[tokio::test] + async fn handle_rejects_missing_prepared_round_change_hash() { + let mut msg = unsigned_msg(0); + msg.r#type = i64::from(qbft::MSG_ROUND_CHANGE); + msg.prepared_round = 1; + msg.prepared_value_hash = [2u8; 32].to_vec().into(); + let msg = sign_for_peer(msg, 0); + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "prepared value hash not found in values"); + } + + #[test_case(vec![] ; "empty")] + #[test_case(vec![0; 32] ; "zero")] + #[tokio::test] + async fn handle_accepts_null_unprepared_round_change_hash(hash: Vec) { + let consensus = consensus(0, true); + let mut msg = unsigned_msg(0); + msg.r#type = i64::from(qbft::MSG_ROUND_CHANGE); + msg.value_hash = Bytes::new(); + msg.prepared_round = 0; + msg.prepared_value_hash = hash.into(); + let msg = sign_for_peer(msg, 0); + let inst = consensus.get_instance_io(duty()); + + consensus + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap(); + + let mut recv_rx = inst.take_recv_rx().unwrap(); + let received = recv_rx.try_recv().unwrap(); + assert_eq!(received.type_(), qbft::MSG_ROUND_CHANGE); + 
assert_eq!(received.prepared_round(), 0); + assert_eq!(received.prepared_value(), [0u8; 32]); + } + #[tokio::test] async fn handle_enqueues_valid_message() { let consensus = consensus(0, true); diff --git a/crates/consensus/src/qbft/msg.rs b/crates/consensus/src/qbft/msg.rs index 6ecb7a9d..4b3ebd01 100644 --- a/crates/consensus/src/qbft/msg.rs +++ b/crates/consensus/src/qbft/msg.rs @@ -70,6 +70,11 @@ pub enum Error { #[error("prepared value hash not found in values")] PreparedValueHashNotFound, + /// Prepared value hash was absent, zero, or not exactly 32 bytes when + /// required. + #[error("invalid prepared value hash")] + InvalidPreparedValueHash, + /// Value did not exist in the values map. #[error("value not found")] ValueNotFound, @@ -152,11 +157,7 @@ impl Msg { Some(_) => return Err(Error::ValueHashNotFound), None => [0u8; 32], }; - let prepared_value_hash = match to_hash32(&msg.prepared_value_hash) { - Some(hash) if values.contains_key(&hash) => hash, - Some(_) => return Err(Error::PreparedValueHashNotFound), - None => [0u8; 32], - }; + let prepared_value_hash = prepared_value_hash(&msg, &values)?; let mut justification_impls: Vec> = Vec::with_capacity(justification.len()); @@ -339,6 +340,34 @@ fn to_hash32(value: &[u8]) -> Option<[u8; 32]> { Some(value) } +fn prepared_value_hash(msg: &pbconsensus::QbftMsg, values: &ValueMap) -> Result<[u8; 32]> { + if msg.prepared_value_hash.is_empty() { + return if msg.prepared_round > 0 { + Err(Error::InvalidPreparedValueHash) + } else { + Ok([0u8; 32]) + }; + } + + if msg.prepared_value_hash.len() != 32 { + return Err(Error::InvalidPreparedValueHash); + } + + let Some(hash) = to_hash32(&msg.prepared_value_hash) else { + return if msg.prepared_round > 0 { + Err(Error::InvalidPreparedValueHash) + } else { + Ok([0u8; 32]) + }; + }; + + if values.contains_key(&hash) { + return Ok(hash); + } + + Err(Error::PreparedValueHashNotFound) +} + /// Converts an optional protobuf duty into the domain duty type. 
fn duty_from_proto(duty: Option<&pbcore::Duty>) -> Duty { let Some(duty) = duty else { @@ -523,9 +552,9 @@ mod tests { assert_eq!(msg.value(), [0u8; 32]); } - #[test_case(vec![1; 31] ; "invalid_length")] + #[test_case(vec![] ; "empty")] #[test_case(vec![0; 32] ; "zero_hash")] - fn new_treats_invalid_prepared_value_hash_as_nil(hash: Vec) { + fn new_allows_nil_prepared_value_hash_when_unprepared(hash: Vec) { let msg = Msg::new( pbconsensus::QbftMsg { prepared_value_hash: hash.into(), @@ -539,6 +568,27 @@ mod tests { assert_eq!(msg.prepared_value(), [0u8; 32]); } + #[test_case(0, vec![1; 31] ; "unprepared_short")] + #[test_case(0, vec![1; 33] ; "unprepared_long")] + #[test_case(1, vec![] ; "prepared_empty")] + #[test_case(1, vec![0; 32] ; "prepared_zero")] + #[test_case(1, vec![1; 31] ; "prepared_short")] + #[test_case(1, vec![1; 33] ; "prepared_long")] + fn new_rejects_invalid_prepared_value_hash(prepared_round: i64, hash: Vec) { + let err = Msg::new( + pbconsensus::QbftMsg { + prepared_round, + prepared_value_hash: hash.into(), + ..Default::default() + }, + vec![], + sync::Arc::default(), + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid prepared value hash"); + } + #[test] fn new_errors_on_missing_value_hash() { let err = Msg::new( diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs index d0ac1327..982a904d 100644 --- a/crates/consensus/src/qbft/runner.rs +++ b/crates/consensus/src/qbft/runner.rs @@ -9,7 +9,11 @@ use cancellation::CancellationTokenSource; use crossbeam::channel as mpmc; use prost::{Message, Name}; use prost_types::Any; -use tokio::{sync::mpsc, task::JoinError, time::Instant}; +use tokio::{ + sync::mpsc, + task::JoinError, + time::{Duration, Instant}, +}; use tokio_util::sync::CancellationToken; use crate::instance::{self, InstanceIo, RunnerError, RunnerResult}; @@ -28,6 +32,10 @@ use super::{ transport, }; +// Only used while a bounded core channel is full; keep it low enough to resume +// promptly, 
but not a 1ms spin under sustained backpressure. +const BRIDGE_SEND_RETRY_INTERVAL: Duration = Duration::from_millis(10); + /// Runner result. pub type Result = std::result::Result; @@ -209,9 +217,9 @@ async fn run_instance_inner( let runtime = tokio::runtime::Handle::current(); let (inner_recv_tx, inner_recv_rx) = mpsc::channel(instance::RECV_BUFFER_SIZE); - let (core_recv_tx, core_recv_rx) = mpmc::unbounded(); - let (core_hash_tx, core_hash_rx) = mpmc::unbounded(); - let (core_verify_tx, core_verify_rx) = mpmc::unbounded(); + let (core_recv_tx, core_recv_rx) = mpmc::bounded(instance::RECV_BUFFER_SIZE); + let (core_hash_tx, core_hash_rx) = mpmc::bounded(1); + let (core_verify_tx, core_verify_rx) = mpmc::bounded(1); let nodes = consensus.node_count(); let peer_idx = consensus.get_peer_idx(); @@ -326,9 +334,10 @@ async fn run_instance_inner( }; let duty_for_run = duty.clone(); + let core_ct_for_run = core_ct.clone(); let core_result = tokio::task::spawn_blocking(move || { qbft::run( - &core_ct, + &core_ct_for_run, &def, &core_transport, &duty_for_run, @@ -340,6 +349,8 @@ async fn run_instance_inner( .await .map_err(Error::Join)?; + let canceled_before_teardown = + parent_ct.is_cancelled() || instance_ct.is_cancelled() || core_ct.is_canceled(); instance_ct.cancel(); for task in tasks { let _ = task.await; @@ -360,6 +371,9 @@ async fn run_instance_inner( Ok(()) => Ok(()), Err(qbft::QbftError::ContextCanceled) if decided.load(Ordering::Relaxed) => Ok(()), Err(qbft::QbftError::ContextCanceled) => Err(Error::ConsensusTimeout), + Err(qbft::QbftError::ChannelError(_)) if canceled_before_teardown => { + Err(Error::ConsensusTimeout) + } Err(err) => Err(Error::Core(err)), } } @@ -412,8 +426,20 @@ async fn bridge_mpsc_to_crossbeam( }, }; - if tx.send(value).is_err() { - return; + send_to_crossbeam(&ct, &tx, value).await; + } +} + +async fn send_to_crossbeam(ct: &CancellationToken, tx: &mpmc::Sender, mut value: T) { + loop { + match tx.try_send(value) { + Ok(()) | 
Err(mpmc::TrySendError::Disconnected(_)) => return, + Err(mpmc::TrySendError::Full(returned)) => value = returned, + } + + tokio::select! { + () = ct.cancelled() => return, + () = tokio::time::sleep(BRIDGE_SEND_RETRY_INTERVAL) => {} } } } @@ -641,7 +667,10 @@ mod tests { let err = run_instance(&consensus, &ct, duty, inst).await.unwrap_err(); - assert!(matches!(err, Error::ConsensusTimeout)); + assert!( + matches!(err, Error::ConsensusTimeout), + "unexpected error: {err:?}" + ); let runner_err = recv_one(&mut err_rx).await.unwrap_err(); assert_eq!(runner_err.to_string(), "consensus timeout"); assert_eq!(sniffed.lock().unwrap().len(), 1); @@ -661,7 +690,10 @@ mod tests { .await .unwrap_err(); - assert!(matches!(err, Error::ConsensusTimeout)); + assert!( + matches!(err, Error::ConsensusTimeout), + "unexpected error: {err:?}" + ); let retained = consensus.get_instance_io(duty.clone()); assert!(Arc::ptr_eq(&inst, &retained)); assert!(retained.has_started()); @@ -718,6 +750,47 @@ mod tests { ); } + #[tokio::test] + async fn bridge_stops_draining_when_core_channel_is_full() { + let ct = CancellationToken::new(); + let (async_tx, async_rx) = mpsc::channel(1); + let (core_tx, core_rx) = mpmc::bounded(1); + core_tx.try_send(0).unwrap(); + async_tx.try_send(1).unwrap(); + let task = tokio::spawn(bridge_mpsc_to_crossbeam(ct.clone(), async_rx, core_tx)); + + tokio::time::timeout(Duration::from_secs(1), async { + let mut value = 2; + loop { + match async_tx.try_send(value) { + Ok(()) => return, + Err(mpsc::error::TrySendError::Full(returned)) => { + value = returned; + tokio::task::yield_now().await; + } + Err(mpsc::error::TrySendError::Closed(_)) => { + panic!("async bridge input closed") + } + } + } + }) + .await + .expect("bridge did not take first async item"); + + assert!(matches!( + async_tx.try_send(3), + Err(mpsc::error::TrySendError::Full(3)) + )); + assert_eq!(core_rx.len(), 1); + + ct.cancel(); + drop(core_rx); + tokio::time::timeout(Duration::from_secs(1), task) + 
.await + .expect("bridge task did not stop") + .expect("bridge task panicked"); + } + async fn recv_one(rx: &mut mpsc::Receiver) -> T { tokio::time::timeout(Duration::from_secs(1), rx.recv()) .await From 45c7f0c55ae36ed62c51434103031b11eb1b219e Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 14:11:45 +0700 Subject: [PATCH 12/21] fix: harden qbft value hash admission --- crates/consensus/src/qbft/component.rs | 51 ++++++++++-- crates/consensus/src/qbft/definition.rs | 25 +++++- crates/consensus/src/qbft/msg.rs | 104 +++++++++++++++++++++--- crates/consensus/src/qbft/p2p.rs | 6 +- crates/consensus/src/qbft/transport.rs | 9 +- 5 files changed, 167 insertions(+), 28 deletions(-) diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 2c220c18..5ea0bcf8 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -909,6 +909,24 @@ pub(crate) mod tests { assert_eq!(err.to_string(), "value hash not found in values"); } + #[test_case(vec![] ; "empty")] + #[test_case(vec![0; 32] ; "zero")] + #[test_case(vec![1; 31] ; "short")] + #[test_case(vec![1; 33] ; "long")] + #[tokio::test] + async fn handle_rejects_invalid_value_hash(hash: Vec) { + let mut msg = unsigned_msg(0); + msg.value_hash = hash.into(); + let msg = sign_for_peer(msg, 0); + + let err = consensus(0, true) + .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .await + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid value hash"); + } + #[test_case(vec![] ; "empty")] #[test_case(vec![0; 32] ; "zero")] #[test_case(vec![1; 31] ; "short")] @@ -1009,10 +1027,7 @@ pub(crate) mod tests { .unwrap(); let err = consensus - .handle( - &CancellationToken::new(), - Some(consensus_msg(signed_msg(0))), - ) + .handle(&CancellationToken::new(), Some(valid_consensus_msg(0))) .await .unwrap_err(); @@ -1025,7 +1040,7 @@ pub(crate) mod tests { ct.cancel(); let err = consensus(0, true) - .handle(&ct, 
Some(consensus_msg(signed_msg(0)))) + .handle(&ct, Some(valid_consensus_msg(0))) .await .unwrap_err(); @@ -1042,7 +1057,7 @@ pub(crate) mod tests { } let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + let handle = consensus.handle(&ct, Some(valid_consensus_msg(0))); tokio::pin!(handle); tokio::select! { @@ -1069,7 +1084,7 @@ pub(crate) mod tests { } let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(consensus_msg(signed_msg(0)))); + let handle = consensus.handle(&ct, Some(valid_consensus_msg(0))); tokio::pin!(handle); tokio::select! { @@ -1161,6 +1176,20 @@ pub(crate) mod tests { sign_for_peer(unsigned_msg(peer_idx), peer_idx) } + fn valid_consensus_msg(peer_idx: i64) -> pbconsensus::QbftConsensusMsg { + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(peer_idx); + msg.value_hash = value_hash.to_vec().into(); + + pbconsensus::QbftConsensusMsg { + msg: Some(sign_for_peer(msg, peer_idx)), + justification: vec![], + values: vec![any], + } + } + fn sign_for_peer(msg: pbconsensus::QbftMsg, peer_idx: i64) -> pbconsensus::QbftMsg { let seed = u8::try_from(peer_idx.checked_add(1).unwrap()).unwrap(); msg::sign_msg(&msg, &secret_key(seed)).unwrap() @@ -1193,7 +1222,13 @@ pub(crate) mod tests { } fn wrapped_msg() -> msg::Msg { - msg::Msg::new(unsigned_msg(0), vec![], Arc::default()).unwrap() + let any = unsigned_any("a", b"first"); + let value = pbcore::UnsignedDataSet::decode(any.value.as_slice()).unwrap(); + let value_hash = msg::hash_proto(&value).unwrap(); + let mut msg = unsigned_msg(0); + msg.value_hash = value_hash.to_vec().into(); + + msg::Msg::new(msg, vec![], Arc::new(ValueMap::from([(value_hash, any)]))).unwrap() } pub(crate) fn consensus(local_peer_idx: i64, duty_allowed: bool) -> Consensus { diff --git 
a/crates/consensus/src/qbft/definition.rs b/crates/consensus/src/qbft/definition.rs index a53f1226..e3dc693d 100644 --- a/crates/consensus/src/qbft/definition.rs +++ b/crates/consensus/src/qbft/definition.rs @@ -899,6 +899,7 @@ mod tests { peer_idx: i64, round: i64, ) -> qbft::Msg { + let (value_hash, values) = test_value_parts(type_); Arc::new( msg::Msg::new( pbconsensus::QbftMsg { @@ -909,10 +910,11 @@ mod tests { }), peer_idx, round, + value_hash, ..Default::default() }, vec![], - Arc::default(), + values, ) .unwrap(), ) @@ -1010,6 +1012,7 @@ mod tests { round: i64, duty_type: DutyType, ) -> qbft::Msg { + let (value_hash, values) = test_value_parts(type_); Arc::new( msg::Msg::new( pbconsensus::QbftMsg { @@ -1020,17 +1023,33 @@ mod tests { }), peer_idx, round, + value_hash, ..Default::default() }, vec![], - Arc::default(), + values, ) .unwrap(), ) } + fn test_value_parts(type_: qbft::MessageType) -> (Bytes, Arc>) { + if type_ == qbft::MSG_ROUND_CHANGE || !type_.valid() { + return (Bytes::new(), Arc::default()); + } + + let value = unsigned_value(); + let hash = msg::hash_proto(&value).unwrap(); + ( + hash.to_vec().into(), + Arc::new(HashMap::from([(hash, any_unsigned(&value))])), + ) + } + fn unsigned_value() -> pbcore::UnsignedDataSet { - pbcore::UnsignedDataSet::default() + pbcore::UnsignedDataSet { + set: [("0x1".to_string(), Bytes::from_static(&[1]))].into(), + } } fn unsigned_attestation_set( diff --git a/crates/consensus/src/qbft/msg.rs b/crates/consensus/src/qbft/msg.rs index 4b3ebd01..3fc6ae22 100644 --- a/crates/consensus/src/qbft/msg.rs +++ b/crates/consensus/src/qbft/msg.rs @@ -66,6 +66,10 @@ pub enum Error { #[error("value hash not found in values")] ValueHashNotFound, + /// Value hash was absent, zero, or not exactly 32 bytes when required. + #[error("invalid value hash")] + InvalidValueHash, + /// Prepared value hash did not exist in the values map. 
#[error("prepared value hash not found in values")] PreparedValueHashNotFound, @@ -141,9 +145,13 @@ impl fmt::Debug for Msg { impl Msg { /// Wraps a raw QBFT protobuf message for the generic core. /// - /// Non-zero `value_hash` and `prepared_value_hash` fields must both exist - /// in `values`. Invalid hash encodings, including zero hashes, are - /// treated as the nil value and do not require a map entry. + /// Value-bearing messages must include a non-zero 32-byte `value_hash` + /// present in `values`. This is deliberately stricter than Charon's + /// current wrapper behavior, which collapses absent or malformed hashes to + /// nil; admitting that shape can let core progress on a value that cannot + /// be decoded at decision time. + /// + /// `prepared_value_hash` is optional only while `prepared_round` is zero. /// /// Justifications are raw protobuf messages from the same consensus /// envelope. They are recursively wrapped with the same shared value map. @@ -152,11 +160,7 @@ impl Msg { justification: Vec, values: sync::Arc, ) -> Result { - let value_hash = match to_hash32(&msg.value_hash) { - Some(hash) if values.contains_key(&hash) => hash, - Some(_) => return Err(Error::ValueHashNotFound), - None => [0u8; 32], - }; + let value_hash = value_hash(&msg, &values)?; let prepared_value_hash = prepared_value_hash(&msg, &values)?; let mut justification_impls: Vec> = @@ -340,6 +344,42 @@ fn to_hash32(value: &[u8]) -> Option<[u8; 32]> { Some(value) } +fn value_hash(msg: &pbconsensus::QbftMsg, values: &ValueMap) -> Result<[u8; 32]> { + let required = value_hash_required(MessageType::from_wire(msg.r#type)); + if msg.value_hash.is_empty() { + return if required { + Err(Error::InvalidValueHash) + } else { + Ok([0u8; 32]) + }; + } + + if msg.value_hash.len() != 32 { + return Err(Error::InvalidValueHash); + } + + let Some(hash) = to_hash32(&msg.value_hash) else { + return if required { + Err(Error::InvalidValueHash) + } else { + Ok([0u8; 32]) + }; + }; + + if 
values.contains_key(&hash) { + return Ok(hash); + } + + Err(Error::ValueHashNotFound) +} + +fn value_hash_required(type_: MessageType) -> bool { + type_ == qbft::MSG_PRE_PREPARE + || type_ == qbft::MSG_PREPARE + || type_ == qbft::MSG_COMMIT + || type_ == qbft::MSG_DECIDED +} + fn prepared_value_hash(msg: &pbconsensus::QbftMsg, values: &ValueMap) -> Result<[u8; 32]> { if msg.prepared_value_hash.is_empty() { return if msg.prepared_round > 0 { @@ -385,7 +425,9 @@ fn duty_from_proto(duty: Option<&pbcore::Duty>) -> Duty { #[cfg(test)] mod tests { use super::*; - use pluto_core::qbft::{MSG_PRE_PREPARE, MSG_PREPARE}; + use pluto_core::qbft::{ + MSG_COMMIT, MSG_DECIDED, MSG_PRE_PREPARE, MSG_PREPARE, MSG_ROUND_CHANGE, + }; use prost::bytes::Bytes; use prost_types::Timestamp; use test_case::test_case; @@ -536,11 +578,31 @@ mod tests { assert_eq!(msg.values().len(), 2); } - #[test_case(vec![1; 31] ; "invalid_length")] + #[test_case(MSG_PRE_PREPARE, vec![] ; "pre_prepare_empty")] + #[test_case(MSG_PREPARE, vec![0; 32] ; "prepare_zero")] + #[test_case(MSG_COMMIT, vec![1; 31] ; "commit_short")] + #[test_case(MSG_DECIDED, vec![1; 33] ; "decided_long")] + fn new_rejects_required_invalid_value_hash(type_: MessageType, hash: Vec) { + let err = Msg::new( + pbconsensus::QbftMsg { + r#type: i64::from(type_), + value_hash: hash.into(), + ..Default::default() + }, + vec![], + sync::Arc::default(), + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid value hash"); + } + + #[test_case(vec![] ; "empty")] #[test_case(vec![0; 32] ; "zero_hash")] - fn new_treats_invalid_value_hash_as_nil(hash: Vec) { + fn new_allows_nil_value_hash_for_round_change(hash: Vec) { let msg = Msg::new( pbconsensus::QbftMsg { + r#type: i64::from(MSG_ROUND_CHANGE), value_hash: hash.into(), ..Default::default() }, @@ -552,6 +614,23 @@ mod tests { assert_eq!(msg.value(), [0u8; 32]); } + #[test_case(vec![1; 31] ; "short")] + #[test_case(vec![1; 33] ; "long")] + fn new_rejects_malformed_optional_value_hash(hash: 
Vec) { + let err = Msg::new( + pbconsensus::QbftMsg { + r#type: i64::from(MSG_ROUND_CHANGE), + value_hash: hash.into(), + ..Default::default() + }, + vec![], + sync::Arc::default(), + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "invalid value hash"); + } + #[test_case(vec![] ; "empty")] #[test_case(vec![0; 32] ; "zero_hash")] fn new_allows_nil_prepared_value_hash_when_unprepared(hash: Vec) { @@ -687,7 +766,8 @@ mod tests { ..Default::default() }; let raw_justification = pbconsensus::QbftMsg { - r#type: 2, + r#type: i64::from(MSG_ROUND_CHANGE), + prepared_round: 1, prepared_value_hash: prepared_hash.to_vec().into(), ..Default::default() }; diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index d052ea8c..1e70f922 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -1380,6 +1380,8 @@ mod tests { duty: &pluto_core::types::Duty, peer_idx: i64, ) -> TestResult { + let value = reference_value(); + let value_hash = msg::hash_proto(&value)?; let key = match peer_idx { 0 => secret_key(1), 1 => secret_key(2), @@ -1394,7 +1396,7 @@ mod tests { duty: Some(pbcore::Duty::try_from(duty)?), peer_idx, round: 1, - value_hash: Bytes::new(), + value_hash: value_hash.to_vec().into(), prepared_value_hash: Bytes::new(), ..Default::default() }; @@ -1402,7 +1404,7 @@ mod tests { Ok(pbconsensus::QbftConsensusMsg { msg: Some(msg::sign_msg(&msg, &key)?), justification: Vec::new(), - values: Vec::new(), + values: vec![Any::from_msg(&value)?], }) } } diff --git a/crates/consensus/src/qbft/transport.rs b/crates/consensus/src/qbft/transport.rs index 2a4c5698..15320004 100644 --- a/crates/consensus/src/qbft/transport.rs +++ b/crates/consensus/src/qbft/transport.rs @@ -414,19 +414,21 @@ mod tests { let key = secret_key(); let duty = duty(); let nested = QbftMsg { - r#type: 3, + r#type: i64::from(qbft::MSG_ROUND_CHANGE), round: 9, ..Default::default() }; + let value_hash = value_hash(1); let raw_justification = QbftMsg { - 
r#type: 2, + r#type: i64::from(qbft::MSG_PREPARE), round: 4, + value_hash: value_hash.to_vec().into(), ..Default::default() }; let justification = msg::Msg::new( raw_justification.clone(), vec![nested], - sync::Arc::default(), + sync::Arc::new(value_map(vec![(value_hash, any_timestamp(1))])), ) .unwrap(); let justification: qbft::Msg = sync::Arc::new(justification); @@ -436,6 +438,7 @@ mod tests { request.peer_idx = 2; request.round = 5; request.justification = &justifications; + request.values = value_map(vec![(value_hash, any_timestamp(1))]); let msg = create_msg(request).unwrap(); From 00928b12d7c669b46e2714c938449b95e80d4ff1 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 14:59:54 +0700 Subject: [PATCH 13/21] test: cover qbft attester compare full run --- crates/consensus/src/qbft/qbft_run_test.rs | 171 +++++++++++++++++++-- 1 file changed, 161 insertions(+), 10 deletions(-) diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs index 1cfd970c..d5a7b992 100644 --- a/crates/consensus/src/qbft/qbft_run_test.rs +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -9,15 +9,17 @@ use pluto_core::{ corepb::v1::core as pbcore, types::{Duty, DutyType, SlotNumber}, }; +use pluto_eth2api::spec::phase0; use prost::bytes::Bytes; use test_case::test_case; -use tokio::sync::mpsc; +use tokio::{sync::mpsc, task::JoinSet}; use tokio_util::sync::CancellationToken; use super::{ Peer, component::{self, Config, Consensus}, }; +use crate::timer::{RoundTimer, RoundTimerFunc, RoundTimerFuture, TimerType}; #[test_case(2, 3 ; "two_of_three")] #[test_case(3, 4 ; "three_of_four")] @@ -26,8 +28,77 @@ use super::{ #[tokio::test] async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { assert!(threshold <= cluster_nodes); + run_qbft_consensus(threshold, false, unsigned_value).await; +} + +#[tokio::test] +async fn qbft_consensus_attester_compare_enabled() { + run_qbft_consensus(3, true, |_| attester_value(0)).await; +} + 
+#[tokio::test] +async fn qbft_consensus_attester_compare_mismatch_does_not_decide() { + let threshold = 3; + let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); + let active_nodes = + in_memory_network(threshold, true, Some(Duration::from_millis(20)), sniffed_tx); + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); + let ct = CancellationToken::new(); + let start_ct = CancellationToken::new(); + let mut expired_txs = Vec::with_capacity(active_nodes.len()); + let mut start_tasks = Vec::with_capacity(active_nodes.len()); + + for (node_idx, node) in active_nodes.iter().enumerate() { + let decided_tx = decided_tx.clone(); + node.subscribe(move |duty, value| { + let _ = decided_tx.send((node_idx, duty, value)); + Ok(()) + }); + + let (expired_tx, expired_rx) = mpsc::channel(1); + expired_txs.push(expired_tx); + start_tasks.push(Arc::clone(node).start(start_ct.clone(), expired_rx)); + } + drop(decided_tx); + + let mut tasks = JoinSet::new(); + for (node_idx, node) in active_nodes.iter().enumerate() { + let node = Arc::clone(node); + let duty = duty.clone(); + let value = attester_value(node_idx); + let ct = ct.clone(); + tasks.spawn(async move { node.propose(&ct, duty, value).await }); + } + + tokio::time::timeout(Duration::from_millis(150), decided_rx.recv()) + .await + .expect_err("mismatched attester compare unexpectedly decided"); + + ct.cancel(); + tokio::time::timeout(Duration::from_secs(1), async { + while let Some(result) = tasks.join_next().await { + assert!(result.expect("mismatched compare task panicked").is_err()); + } + }) + .await + .expect("mismatched compare tasks did not stop after cancellation"); + assert!(decided_rx.try_recv().is_err()); + + start_ct.cancel(); + drop(expired_txs); + for task in start_tasks { + task.await.unwrap(); + } +} + +async fn run_qbft_consensus( + threshold: usize, + compare_attestations: bool, + value: fn(usize) -> pbcore::UnsignedDataSet, +) { 
let (sniffed_tx, mut sniffed_rx) = mpsc::unbounded_channel(); - let active_nodes = in_memory_network(threshold, sniffed_tx); + let active_nodes = in_memory_network(threshold, compare_attestations, None, sniffed_tx); let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); let ct = CancellationToken::new(); @@ -48,15 +119,13 @@ async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { } drop(decided_tx); - let mut tasks = Vec::with_capacity(active_nodes.len()); + let mut tasks = JoinSet::new(); for (node_idx, node) in active_nodes.iter().enumerate() { let node = Arc::clone(node); let duty = duty.clone(); - let value = unsigned_value(node_idx); + let value = value(node_idx); let ct = ct.clone(); - tasks.push(tokio::spawn( - async move { node.propose(&ct, duty, value).await }, - )); + tasks.spawn(async move { node.propose(&ct, duty, value).await }); } let mut decided = Vec::with_capacity(active_nodes.len()); @@ -64,8 +133,10 @@ async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { decided.push(recv_one(&mut decided_rx).await); } - for task in tasks { - task.await.unwrap().unwrap(); + while let Some(result) = tasks.join_next().await { + result + .expect("consensus task panicked") + .expect("consensus task failed"); } decided.sort_by_key(|(node_idx, ..)| *node_idx); @@ -112,8 +183,60 @@ fn unsigned_value(seed: usize) -> pbcore::UnsignedDataSet { pbcore::UnsignedDataSet { set } } +fn attester_value(seed: usize) -> pbcore::UnsignedDataSet { + let mut set = BTreeMap::new(); + set.insert(pubkey(1), attestation_json_bytes(&attestation_data(seed))); + pbcore::UnsignedDataSet { set } +} + +fn attestation_json_bytes(data: &phase0::AttestationData) -> Bytes { + let value = serde_json::json!({ + "attestation_data": data, + "attestation_duty": { + "slot": "1", + "validator_index": "1", + "committee_index": "2", + "committee_length": "8", + "committees_at_slot": "1", + 
"validator_committee_index": "1", + }, + }); + Bytes::from(serde_json::to_vec(&value).expect("test attestation json serializes")) +} + +fn attestation_data(seed: usize) -> phase0::AttestationData { + let seed = u8::try_from(seed).expect("test attestation seed fits u8"); + let source_epoch = u64::from(seed) + .checked_add(4) + .expect("test source epoch fits u64"); + let source_root = seed.checked_add(5).expect("test source root byte fits u8"); + let target_epoch = u64::from(seed) + .checked_add(6) + .expect("test target epoch fits u64"); + let target_root = seed.checked_add(7).expect("test target root byte fits u8"); + phase0::AttestationData { + slot: 1, + index: 2, + beacon_block_root: [3; 32], + source: phase0::Checkpoint { + epoch: source_epoch, + root: [source_root; 32], + }, + target: phase0::Checkpoint { + epoch: target_epoch, + root: [target_root; 32], + }, + } +} + +fn pubkey(seed: u8) -> String { + format!("0x{}", hex::encode([seed; 48])) +} + fn in_memory_network( count: usize, + compare_attestations: bool, + round_timeout: Option, sniffed_tx: mpsc::UnboundedSender<(usize, usize)>, ) -> Vec> { let peers = (0..count) @@ -156,7 +279,11 @@ fn in_memory_network( .expect("test peer index fits u8"), ), broadcaster, - compare_attestations: false, + compare_attestations, + timer_func: match round_timeout { + Some(timeout) => short_timer_func(timeout), + None => crate::timer::get_round_timer_func(), + }, sniffer: { let sniffed_tx = sniffed_tx.clone(); Arc::new(move |instance| { @@ -172,3 +299,27 @@ fn in_memory_network( nodes.lock().unwrap().clone() } + +fn short_timer_func(timeout: Duration) -> RoundTimerFunc { + Box::new(move |_| Box::new(ShortRoundTimer { timeout })) +} + +struct ShortRoundTimer { + timeout: Duration, +} + +impl RoundTimer for ShortRoundTimer { + fn timer_type(&self) -> TimerType { + TimerType::Increasing + } + + fn timer(&self, _round: i64) -> crate::timer::Result { + let deadline = tokio::time::Instant::now() + .checked_add(self.timeout) + 
.expect("test timer deadline fits Instant"); + Ok(Box::pin(async move { + tokio::time::sleep_until(deadline).await; + deadline + })) + } +} From 7a2a112f071919a426d920b2326c5b1e334bd988 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 16:08:47 +0700 Subject: [PATCH 14/21] fix: simplify error --- crates/consensus/src/qbft/p2p.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index 1e70f922..825b5338 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -391,7 +391,7 @@ where match result { Ok(Ok(())) => Ok(()), - Ok(Err(error)) => Err(error.to_string()), + Ok(Err(error)) => Err(error), Err(error) => Err(format!("request {request_id}: {}", timeout_error(error))), } } From e577c6a4831d5af5716c8a0fb1da79c6b1e82be2 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 2 Jun 2026 17:07:53 +0700 Subject: [PATCH 15/21] fix: test 4_6 --- crates/consensus/src/instance.rs | 24 +----------- crates/consensus/src/qbft/qbft_run_test.rs | 44 ++++++++++++++++------ crates/consensus/src/qbft/runner.rs | 8 +--- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/crates/consensus/src/instance.rs b/crates/consensus/src/instance.rs index e638a580..f5f3a9f1 100644 --- a/crates/consensus/src/instance.rs +++ b/crates/consensus/src/instance.rs @@ -32,7 +32,7 @@ use std::{ }; use prost_types::Any; -use tokio::{sync::mpsc, time::Instant}; +use tokio::sync::mpsc; /// Receive-buffer channel capacity. pub const RECV_BUFFER_SIZE: usize = 100; @@ -105,10 +105,6 @@ pub struct InstanceIo { /// Publishes the runner completion result. pub(crate) err_tx: mpsc::Sender, err_rx: ReceiverSlot, - - /// Publishes the decision timestamp. 
- pub(crate) decided_at_tx: mpsc::Sender, - decided_at_rx: ReceiverSlot, } impl InstanceIo { @@ -119,7 +115,6 @@ impl InstanceIo { let (value_tx, value_rx) = mpsc::channel(1); let (verify_tx, verify_rx) = mpsc::channel(1); let (err_tx, err_rx) = mpsc::channel(1); - let (decided_at_tx, decided_at_rx) = mpsc::channel(1); Self { participated: AtomicBool::new(false), @@ -135,8 +130,6 @@ impl InstanceIo { verify_rx: Mutex::new(Some(verify_rx)), err_tx, err_rx: Mutex::new(Some(err_rx)), - decided_at_tx, - decided_at_rx: Mutex::new(Some(decided_at_rx)), } } @@ -198,11 +191,6 @@ impl InstanceIo { pub fn take_err_rx(&self) -> Result> { take_receiver(&self.err_rx, "err") } - - /// Transfers decision timestamp ownership to the waiting task. - pub fn take_decided_at_rx(&self) -> Result> { - take_receiver(&self.decided_at_rx, "decided_at") - } } impl Default for InstanceIo { @@ -302,13 +290,6 @@ mod tests { io.err_tx.try_send(Err(Box::new(TestError))), Err(TrySendError::Full(Err(_))) )); - - let decided_at = Instant::now(); - assert!(io.decided_at_tx.try_send(decided_at).is_ok()); - assert!(matches!( - io.decided_at_tx.try_send(decided_at), - Err(TrySendError::Full(_)) - )); } #[test] @@ -338,9 +319,6 @@ mod tests { assert!(io.take_err_rx().is_ok()); assert_receiver_already_taken(io.take_err_rx(), "err"); - - assert!(io.take_decided_at_rx().is_ok()); - assert_receiver_already_taken(io.take_decided_at_rx(), "decided_at"); } #[test] diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs index d5a7b992..f365d4fc 100644 --- a/crates/consensus/src/qbft/qbft_run_test.rs +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -28,20 +28,25 @@ use crate::timer::{RoundTimer, RoundTimerFunc, RoundTimerFuture, TimerType}; #[tokio::test] async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { assert!(threshold <= cluster_nodes); - run_qbft_consensus(threshold, false, unsigned_value).await; + run_qbft_consensus(threshold, cluster_nodes, 
false, unsigned_value).await; } #[tokio::test] async fn qbft_consensus_attester_compare_enabled() { - run_qbft_consensus(3, true, |_| attester_value(0)).await; + run_qbft_consensus(3, 3, true, |_| attester_value(0)).await; } #[tokio::test] async fn qbft_consensus_attester_compare_mismatch_does_not_decide() { let threshold = 3; let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); - let active_nodes = - in_memory_network(threshold, true, Some(Duration::from_millis(20)), sniffed_tx); + let active_nodes = in_memory_network( + threshold, + threshold, + true, + Some(Duration::from_millis(20)), + sniffed_tx, + ); let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); let ct = CancellationToken::new(); @@ -94,11 +99,22 @@ async fn qbft_consensus_attester_compare_mismatch_does_not_decide() { async fn run_qbft_consensus( threshold: usize, + cluster_nodes: usize, compare_attestations: bool, value: fn(usize) -> pbcore::UnsignedDataSet, ) { let (sniffed_tx, mut sniffed_rx) = mpsc::unbounded_channel(); - let active_nodes = in_memory_network(threshold, compare_attestations, None, sniffed_tx); + // Silent-peer cases intentionally exercise leader rotation, so keep their + // round timers short enough for the test suite. 
+ let round_timeout = (cluster_nodes > threshold).then_some(Duration::from_millis(100)); + let nodes = in_memory_network( + cluster_nodes, + threshold, + compare_attestations, + round_timeout, + sniffed_tx, + ); + let active_nodes = &nodes[..threshold]; let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); let ct = CancellationToken::new(); @@ -133,11 +149,15 @@ async fn run_qbft_consensus( decided.push(recv_one(&mut decided_rx).await); } - while let Some(result) = tasks.join_next().await { - result - .expect("consensus task panicked") - .expect("consensus task failed"); - } + tokio::time::timeout(Duration::from_secs(1), async { + while let Some(result) = tasks.join_next().await { + result + .expect("consensus task panicked") + .expect("consensus task failed"); + } + }) + .await + .expect("consensus tasks did not stop after decision"); decided.sort_by_key(|(node_idx, ..)| *node_idx); assert_eq!(decided.len(), threshold); @@ -235,10 +255,12 @@ fn pubkey(seed: u8) -> String { fn in_memory_network( count: usize, + active_count: usize, compare_attestations: bool, round_timeout: Option, sniffed_tx: mpsc::UnboundedSender<(usize, usize)>, ) -> Vec> { + assert!(active_count <= count); let peers = (0..count) .map(|index| Peer { index: i64::try_from(index).expect("test peer index fits i64"), @@ -259,7 +281,7 @@ fn in_memory_network( Box::pin(async move { let peer_idx = msg.msg.as_ref().map_or(-1, |msg| msg.peer_idx); let peers = network.lock().unwrap().clone(); - for (index, consensus) in peers.into_iter().enumerate() { + for (index, consensus) in peers.into_iter().take(active_count).enumerate() { if i64::try_from(index).expect("test peer index fits i64") == peer_idx { continue; } diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs index 982a904d..65c58ceb 100644 --- a/crates/consensus/src/qbft/runner.rs +++ b/crates/consensus/src/qbft/runner.rs @@ -9,11 +9,7 @@ use 
cancellation::CancellationTokenSource; use crossbeam::channel as mpmc; use prost::{Message, Name}; use prost_types::Any; -use tokio::{ - sync::mpsc, - task::JoinError, - time::{Duration, Instant}, -}; +use tokio::{sync::mpsc, task::JoinError, time::Duration}; use tokio_util::sync::CancellationToken; use crate::instance::{self, InstanceIo, RunnerError, RunnerResult}; @@ -279,12 +275,10 @@ async fn run_instance_inner( let decide_callback: DecideCallback = { let decided = Arc::clone(&decided); - let decided_at_tx = inst.decided_at_tx.clone(); let instance_ct = instance_ct.clone(); let core_cts = Arc::clone(&core_cts); Arc::new(move |_qcommit| { decided.store(true, Ordering::Relaxed); - let _ = decided_at_tx.try_send(Instant::now()); instance_ct.cancel(); core_cts.cancel(); }) From 6f3355517179161008fd50ace92c02a6c9d61f6b Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 14:49:25 +0700 Subject: [PATCH 16/21] test: add strategy sim test --- crates/consensus/src/qbft/component.rs | 19 + crates/consensus/src/qbft/mod.rs | 2 + crates/consensus/src/qbft/p2p.rs | 189 +++- crates/consensus/src/qbft/qbft_run_test.rs | 318 ++++++- crates/consensus/src/qbft/runner.rs | 13 +- .../consensus/src/qbft/strategy_sim_test.rs | 889 ++++++++++++++++++ 6 files changed, 1404 insertions(+), 26 deletions(-) create mode 100644 crates/consensus/src/qbft/strategy_sim_test.rs diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 5ea0bcf8..19999d2c 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -863,6 +863,25 @@ pub(crate) mod tests { ); } + #[tokio::test] + async fn handle_accepts_same_duty_justification() { + let consensus = consensus(0, true); + let inst = consensus.get_instance_io(duty()); + let mut justification = unsigned_msg(0); + justification.r#type = i64::from(qbft::MSG_ROUND_CHANGE); + justification.value_hash = Bytes::new(); + let mut outer = valid_consensus_msg(0); + 
outer.justification = vec![sign_for_peer(justification, 0)]; + + consensus + .handle(&CancellationToken::new(), Some(outer)) + .await + .unwrap(); + + let mut recv_rx = inst.take_recv_rx().unwrap(); + assert_eq!(recv_rx.try_recv().unwrap().justification().len(), 1); + } + #[test] fn values_by_hash_rejects_invalid_type_url() { let err = values_by_hash(&[Any { diff --git a/crates/consensus/src/qbft/mod.rs b/crates/consensus/src/qbft/mod.rs index 926ccdce..598f3bc2 100644 --- a/crates/consensus/src/qbft/mod.rs +++ b/crates/consensus/src/qbft/mod.rs @@ -21,3 +21,5 @@ pub(crate) mod transport; #[cfg(test)] mod qbft_run_test; +#[cfg(test)] +mod strategy_sim_test; diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index 825b5338..6d7b7d24 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -717,8 +717,9 @@ impl NetworkBehaviour for Behaviour { #[cfg(test)] mod tests { use std::{ - collections::HashSet, + collections::{BTreeMap, HashSet}, error::Error as StdError, + sync::OnceLock, task::{Context, Poll}, }; @@ -735,7 +736,10 @@ mod tests { }; use prost::{Message, bytes::Bytes}; use prost_types::Any; - use tokio::sync::{mpsc, oneshot}; + use tokio::{ + sync::{mpsc, oneshot}, + task::JoinSet, + }; use crate::{ protocols::QBFT_V2_PROTOCOL_ID, @@ -750,6 +754,7 @@ mod tests { use pluto_core::{ corepb::v1::{consensus as pbconsensus, core as pbcore}, qbft, + types::Duty, }; use pluto_p2p::{ behaviours::pluto::PlutoBehaviourEvent, @@ -984,9 +989,9 @@ mod tests { let (event_tx, mut event_rx) = mpsc::unbounded_channel(); let (task_err_tx, mut task_err_rx) = mpsc::unbounded_channel(); let running = spawn_nodes(nodes, listen_tx, conn_tx, event_tx, task_err_tx)?; - let addrs = wait_for_listen_addrs(&mut listen_rx, &mut task_err_rx).await?; + let addrs = wait_for_listen_addrs(2, &mut listen_rx, &mut task_err_rx).await?; dial_forward_pairs(&running, &addrs)?; - wait_for_connections(&mut conn_rx, &peer_ids).await?; + 
wait_for_connections(&mut conn_rx, &peer_ids[..2]).await?; let network_msg = signed_consensus_msg(&duty(), 0)?; handle @@ -1016,8 +1021,70 @@ mod tests { Ok(()) } + #[tokio::test] + async fn real_libp2p_loopback_runs_consensus() -> TestResult<()> { + let keys = test_keys()?; + let peer_ids = peer_ids(&keys)?; + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let nodes = build_consensus_nodes(keys, peer_ids.clone(), decided_tx)?; + let consensuses = nodes + .iter() + .map(|node| Arc::clone(&node.consensus)) + .collect::>(); + + let (listen_tx, mut listen_rx) = mpsc::unbounded_channel(); + let (conn_tx, mut conn_rx) = mpsc::unbounded_channel(); + let (event_tx, _event_rx) = mpsc::unbounded_channel(); + let (task_err_tx, mut task_err_rx) = mpsc::unbounded_channel(); + let running = spawn_nodes(nodes, listen_tx, conn_tx, event_tx, task_err_tx)?; + let addrs = wait_for_listen_addrs(peer_ids.len(), &mut listen_rx, &mut task_err_rx).await?; + dial_forward_pairs(&running, &addrs)?; + wait_for_connections(&mut conn_rx, &peer_ids).await?; + + let ct = CancellationToken::new(); + let duty = duty(); + let mut tasks = JoinSet::new(); + for (index, consensus) in consensuses.iter().enumerate() { + let consensus = Arc::clone(consensus); + let duty = duty.clone(); + let ct = ct.clone(); + tasks.spawn(async move { consensus.propose(&ct, duty, unsigned_value(index)).await }); + } + + let mut decided = Vec::with_capacity(consensuses.len()); + for _ in 0..consensuses.len() { + decided.push( + tokio::time::timeout(TEST_TIMEOUT, decided_rx.recv()) + .await? 
+ .ok_or_else(|| std::io::Error::other("decided channel closed"))?, + ); + } + + tokio::time::timeout(TEST_TIMEOUT, async { + while let Some(result) = tasks.join_next().await { + result + .expect("consensus task panicked") + .expect("consensus task failed"); + } + }) + .await + .map_err(|_| std::io::Error::other("timeout waiting for consensus tasks"))?; + + ct.cancel(); + stop_nodes(running).await?; + + let (_, _, expected) = decided.first().expect("at least one decision").clone(); + for (node_index, decided_duty, value) in decided { + assert_eq!(decided_duty, duty, "node {node_index} decided wrong duty"); + assert_eq!(value, expected, "node {node_index} decided different value"); + } + + Ok(()) + } + struct LocalNode { node: Node, + consensus: Arc, handle: Handle, recv_rx: Option>, } @@ -1036,13 +1103,13 @@ mod tests { keys: Vec, peer_ids: Vec, ) -> TestResult> { - let mut nodes = Vec::with_capacity(2); + let mut nodes = Vec::with_capacity(keys.len()); for (index, key) in keys.into_iter().enumerate() { let p2p_context = P2PContext::new(peer_ids.iter().copied()); let consensus = Arc::new(consensus_for_cluster(index, peer_ids.len(), true)?); let mut recv_rx = Some(consensus.get_instance_io(duty()).take_recv_rx()?); let (behaviour, handle) = Behaviour::new(Config { - consensus, + consensus: Arc::clone(&consensus), p2p_context: p2p_context.clone(), peers: peer_ids.clone(), local_peer_id: peer_ids[index], @@ -1060,6 +1127,7 @@ mod tests { nodes.push(LocalNode { node, + consensus, handle, recv_rx: recv_rx.take(), }); @@ -1068,11 +1136,84 @@ mod tests { Ok(nodes) } + fn build_consensus_nodes( + keys: Vec, + peer_ids: Vec, + decided_tx: mpsc::UnboundedSender<(usize, Duty, pbcore::UnsignedDataSet)>, + ) -> TestResult> { + let mut nodes = Vec::with_capacity(keys.len()); + for (index, key) in keys.into_iter().enumerate() { + let p2p_context = P2PContext::new(peer_ids.iter().copied()); + let handle_slot = Arc::new(OnceLock::::new()); + let broadcaster = { + let handle_slot = 
Arc::clone(&handle_slot); + Arc::new(move |ct, msg| { + let handle = handle_slot + .get() + .expect("test p2p handle initialized") + .clone(); + Box::pin(async move { handle.broadcast(ct, msg).await }) + as futures::future::BoxFuture<'static, BroadcastResult> + }) + }; + let mut config = config_for_cluster(index, peer_ids.len(), true)?; + config.broadcaster = broadcaster; + let consensus = Arc::new(Consensus::new(config)?); + let decided_tx = decided_tx.clone(); + consensus.subscribe(move |duty, value| { + let _ = decided_tx.send((index, duty, value)); + Ok(()) + }); + + let (behaviour, handle) = Behaviour::new(Config { + consensus: Arc::clone(&consensus), + p2p_context: p2p_context.clone(), + peers: peer_ids.clone(), + local_peer_id: peer_ids[index], + cancellation: CancellationToken::new(), + })?; + handle_slot + .set(handle.clone()) + .map_err(|_| std::io::Error::other("test p2p handle set twice"))?; + let node = Node::new_server( + P2PConfig::default(), + key, + NodeType::TCP, + false, + p2p_context, + None, + move |builder, _keypair| builder.with_inner(behaviour), + )?; + + nodes.push(LocalNode { + node, + consensus, + handle, + recv_rx: None, + }); + } + + Ok(nodes) + } + fn consensus_for_cluster( local_peer_idx: usize, peer_count: usize, duty_allowed: bool, ) -> TestResult { + Consensus::new(config_for_cluster( + local_peer_idx, + peer_count, + duty_allowed, + )?) 
+ .map_err(|error| Box::new(error) as _) + } + + fn config_for_cluster( + local_peer_idx: usize, + peer_count: usize, + duty_allowed: bool, + ) -> TestResult { let mut config = config_base(false); config.peers = (0..peer_count) .map(|index| { @@ -1097,7 +1238,7 @@ mod tests { config.privkey = test_secret_key(seed)?; config.duty_gater = Arc::new(move |_| duty_allowed); - Consensus::new(config).map_err(|error| Box::new(error) as _) + Ok(config) } fn spawn_nodes( @@ -1130,8 +1271,10 @@ mod tests { node.dial(target)?; } } - event = node.select_next_some() => { - match event { + event = node.next() => { + match event.ok_or_else(|| { + std::io::Error::other("node swarm ended") + })? { SwarmEvent::NewListenAddr { address, .. } => { let _ = listen_tx.send((index, address)); } @@ -1169,16 +1312,24 @@ mod tests { } async fn wait_for_listen_addrs( + node_count: usize, listen_rx: &mut mpsc::UnboundedReceiver<(usize, Multiaddr)>, task_err_rx: &mut mpsc::UnboundedReceiver<(usize, String)>, ) -> TestResult> { tokio::time::timeout(LIBP2P_SETUP_TIMEOUT, async { - let mut addrs = vec![None, None]; + let mut addrs = vec![None; node_count]; while addrs.iter().any(Option::is_none) { tokio::select! 
{ result = listen_rx.recv() => { - let (index, addr) = result - .ok_or_else(|| std::io::Error::other("listen channel closed"))?; + let Some((index, addr)) = result else { + if let Ok((index, error)) = task_err_rx.try_recv() { + return Err(Box::new(std::io::Error::other(format!( + "node {index} exited before listen: {error}" + ))) as Box); + } + return Err(Box::new(std::io::Error::other("listen channel closed")) + as Box); + }; if index < addrs.len() && addrs[index].is_none() { addrs[index] = Some(addr); } @@ -1226,8 +1377,9 @@ mod tests { peer_ids: &[PeerId], ) -> TestResult<()> { tokio::time::timeout(LIBP2P_SETUP_TIMEOUT, async { - let mut seen = [HashSet::new(), HashSet::new()]; - while seen.iter().any(|peers| peers.is_empty()) { + let mut seen = vec![HashSet::new(); peer_ids.len()]; + let expected_connections = peer_ids.len().saturating_sub(1); + while seen.iter().any(|peers| peers.len() < expected_connections) { let (index, peer_id) = conn_rx .recv() .await @@ -1303,6 +1455,15 @@ mod tests { Ok(context) } + fn unsigned_value(seed: usize) -> pbcore::UnsignedDataSet { + let mut set = BTreeMap::new(); + set.insert( + format!("validator-{seed}"), + Bytes::from(format!("unsigned-{seed}")), + ); + pbcore::UnsignedDataSet { set } + } + fn test_keys() -> TestResult> { test_keys_n(3) } diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs index f365d4fc..362dd411 100644 --- a/crates/consensus/src/qbft/qbft_run_test.rs +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -5,12 +5,16 @@ use std::{ time::Duration, }; +use cancellation::CancellationTokenSource; +use crossbeam::channel as mpmc; use pluto_core::{ - corepb::v1::core as pbcore, + corepb::v1::{consensus as pbconsensus, core as pbcore, priority as pbpriority}, + qbft, types::{Duty, DutyType, SlotNumber}, }; use pluto_eth2api::spec::phase0; use prost::bytes::Bytes; +use prost_types::Any; use test_case::test_case; use tokio::{sync::mpsc, task::JoinSet}; use 
tokio_util::sync::CancellationToken; @@ -18,6 +22,8 @@ use tokio_util::sync::CancellationToken; use super::{ Peer, component::{self, Config, Consensus}, + definition::{self, DefinitionConfig}, + msg::{self, ConsensusQbftTypes}, }; use crate::timer::{RoundTimer, RoundTimerFunc, RoundTimerFuture, TimerType}; @@ -36,6 +42,168 @@ async fn qbft_consensus_attester_compare_enabled() { run_qbft_consensus(3, 3, true, |_| attester_value(0)).await; } +#[tokio::test] +async fn qbft_sniffed_instance_replay_decides() { + let sniffed = run_qbft_consensus(4, 4, false, unsigned_value).await; + let instance = sniffed + .into_iter() + .find(|(node_idx, _)| *node_idx == 0) + .expect("node zero emitted sniffed instance") + .1; + + replay_sniffed_instance_decides(instance).await; +} + +#[tokio::test] +async fn qbft_priority_consensus() { + let threshold = 3; + let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); + let active_nodes = in_memory_network(threshold, threshold, false, None, sniffed_tx); + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let duty = Duty::new(SlotNumber::new(1), DutyType::InfoSync); + let ct = CancellationToken::new(); + let start_ct = CancellationToken::new(); + let mut expired_txs = Vec::with_capacity(active_nodes.len()); + let mut start_tasks = Vec::with_capacity(active_nodes.len()); + + for (node_idx, node) in active_nodes.iter().enumerate() { + let decided_tx = decided_tx.clone(); + node.subscribe_priority(move |duty, value| { + let _ = decided_tx.send((node_idx, duty, value)); + Ok(()) + }); + + let (expired_tx, expired_rx) = mpsc::channel(1); + expired_txs.push(expired_tx); + start_tasks.push(Arc::clone(node).start(start_ct.clone(), expired_rx)); + } + drop(decided_tx); + + let mut tasks = JoinSet::new(); + for (node_idx, node) in active_nodes.iter().enumerate() { + let node = Arc::clone(node); + let duty = duty.clone(); + let value = priority_value(&duty, node_idx); + let ct = ct.clone(); + tasks.spawn(async move { 
node.propose_priority(&ct, duty, value).await }); + } + + let mut decided = Vec::with_capacity(active_nodes.len()); + for _ in 0..active_nodes.len() { + decided.push(recv_one(&mut decided_rx).await); + } + let (_, _, expected_value) = decided.first().expect("at least one decided value").clone(); + for (node_idx, decided_duty, decided_value) in decided { + assert_eq!(decided_duty, duty, "node {node_idx} decided wrong duty"); + assert_eq!( + decided_value, expected_value, + "node {node_idx} decided different priority value" + ); + } + + tokio::time::timeout(Duration::from_secs(1), async { + while let Some(result) = tasks.join_next().await { + result + .expect("priority consensus task panicked") + .expect("priority consensus task failed"); + } + }) + .await + .expect("priority consensus tasks did not stop after decision"); + + ct.cancel(); + start_ct.cancel(); + drop(expired_txs); + for task in start_tasks { + task.await.unwrap(); + } +} + +#[tokio::test] +async fn qbft_consensus_participate_then_late_propose() { + let threshold = 4; + let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); + let active_nodes = in_memory_network(threshold, threshold, false, None, sniffed_tx); + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let duty = Duty::new(SlotNumber::new(1), DutyType::Attester); + let ct = CancellationToken::new(); + let start_ct = CancellationToken::new(); + let mut expired_txs = Vec::with_capacity(active_nodes.len()); + let mut start_tasks = Vec::with_capacity(active_nodes.len()); + + for (node_idx, node) in active_nodes.iter().enumerate() { + let decided_tx = decided_tx.clone(); + node.subscribe(move |duty, value| { + let _ = decided_tx.send((node_idx, duty, value)); + Ok(()) + }); + + let (expired_tx, expired_rx) = mpsc::channel(1); + expired_txs.push(expired_tx); + start_tasks.push(Arc::clone(node).start(start_ct.clone(), expired_rx)); + } + drop(decided_tx); + + let mut tasks = JoinSet::new(); + for node in &active_nodes { + let node = 
Arc::clone(node); + let duty = duty.clone(); + let ct = ct.clone(); + tasks.spawn(async move { node.participate(&ct, duty).await }); + } + + tokio::time::timeout(Duration::from_secs(1), async { + loop { + if active_nodes + .iter() + .all(|node| node.get_instance_io(duty.clone()).has_started()) + { + return; + } + tokio::task::yield_now().await; + } + }) + .await + .expect("participants did not start consensus instances"); + + for (node_idx, node) in active_nodes.iter().enumerate() { + let node = Arc::clone(node); + let duty = duty.clone(); + let ct = ct.clone(); + tasks.spawn(async move { node.propose(&ct, duty, unsigned_value(node_idx)).await }); + } + + let mut decided = Vec::with_capacity(active_nodes.len()); + for _ in 0..active_nodes.len() { + decided.push(recv_one(&mut decided_rx).await); + } + let (_, _, expected_value) = decided.first().expect("at least one decided value").clone(); + for (node_idx, decided_duty, decided_value) in decided { + assert_eq!(decided_duty, duty, "node {node_idx} decided wrong duty"); + assert_eq!( + decided_value, expected_value, + "node {node_idx} decided different value" + ); + } + + tokio::time::timeout(Duration::from_secs(1), async { + while let Some(result) = tasks.join_next().await { + result + .expect("consensus task panicked") + .expect("consensus task failed"); + } + }) + .await + .expect("consensus tasks did not stop after decision"); + + ct.cancel(); + start_ct.cancel(); + drop(expired_txs); + for task in start_tasks { + task.await.unwrap(); + } +} + #[tokio::test] async fn qbft_consensus_attester_compare_mismatch_does_not_decide() { let threshold = 3; @@ -102,7 +270,7 @@ async fn run_qbft_consensus( cluster_nodes: usize, compare_attestations: bool, value: fn(usize) -> pbcore::UnsignedDataSet, -) { +) -> Vec<(usize, pbconsensus::SniffedConsensusInstance)> { let (sniffed_tx, mut sniffed_rx) = mpsc::unbounded_channel(); // Silent-peer cases intentionally exercise leader rotation, so keep their // round timers short enough 
for the test suite. @@ -182,9 +350,136 @@ async fn run_qbft_consensus( sniffed.push(recv_one(&mut sniffed_rx).await); } sniffed.sort_by_key(|(node_idx, _)| *node_idx); - for (node_idx, msg_count) in sniffed { - assert_ne!(msg_count, 0, "node {node_idx} sniffer was empty"); + for (node_idx, instance) in &sniffed { + assert_ne!(instance.msgs.len(), 0, "node {node_idx} sniffer was empty"); } + + sniffed +} + +async fn replay_sniffed_instance_decides(instance: pbconsensus::SniffedConsensusInstance) { + assert!(!instance.msgs.is_empty()); + + let first_msg = instance + .msgs + .iter() + .filter_map(|sniffed| sniffed.msg.as_ref()) + .filter_map(|outer| outer.msg.as_ref()) + .next() + .expect("sniffed instance has inner message"); + let duty = Duty::try_from(first_msg.duty.as_ref().expect("sniffed message has duty")) + .expect("sniffed message duty converts"); + let input_hash = sniffed_input_hash(&instance); + let input_source = sniffed_input_source(&instance); + let nodes = usize::try_from(instance.nodes).expect("sniffed node count fits usize"); + let peer_idx = instance.peer_idx; + + let (recv_tx, recv_rx) = mpmc::bounded(instance.msgs.len()); + for sniffed in instance.msgs { + let outer = sniffed.msg.expect("sniffed entry has outer message"); + let raw = outer.msg.expect("sniffed outer message has inner message"); + let values = component::values_by_hash(&outer.values).expect("sniffed values decode"); + let wrapped = msg::Msg::new(raw, outer.justification, Arc::new(values)) + .expect("sniffed message wraps"); + let wrapped: qbft::Msg = Arc::new(wrapped); + recv_tx + .send(wrapped) + .expect("replay receive buffer accepts"); + } + drop(recv_tx); + + let (input_hash_tx, input_hash_rx) = mpmc::bounded(1); + input_hash_tx + .send(input_hash) + .expect("replay input hash channel accepts"); + drop(input_hash_tx); + + let (input_source_tx, input_source_rx) = mpmc::bounded(1); + input_source_tx + .send(input_source) + .expect("replay input source channel accepts"); + 
drop(input_source_tx); + + let cts = Arc::new(CancellationTokenSource::new()); + let core_ct = cts.token().clone(); + let callback_cts = Arc::clone(&cts); + let (decided_tx, decided_rx) = mpmc::bounded(1); + let def = definition::new_definition(DefinitionConfig { + nodes, + subscribers: component::SubscriberSet::default(), + round_timer: Box::new(ShortRoundTimer { + timeout: Duration::from_secs(1), + }), + decide_callback: Arc::new(move |_| { + let _ = decided_tx.try_send(()); + callback_cts.cancel(); + }), + compare_attestations: false, + runtime: tokio::runtime::Handle::current(), + }); + let transport = qbft::Transport { + broadcast: Box::new(|_| Ok(())), + receive: recv_rx, + }; + + let run_task = tokio::task::spawn_blocking(move || { + qbft::run( + &core_ct, + &def, + &transport, + &duty, + peer_idx, + input_hash_rx, + input_source_rx, + ) + }); + tokio::time::timeout(Duration::from_secs(1), async { + loop { + if decided_rx.try_recv().is_ok() { + return; + } + tokio::task::yield_now().await; + } + }) + .await + .expect("sniffed replay did not decide"); + + cts.cancel(); + let result = tokio::time::timeout(Duration::from_secs(1), run_task) + .await + .expect("sniffed replay core did not stop") + .expect("sniffed replay core task panicked"); + assert!( + matches!(result, Ok(()) | Err(qbft::QbftError::ContextCanceled)), + "unexpected sniffed replay result: {result:?}" + ); +} + +fn sniffed_input_hash(instance: &pbconsensus::SniffedConsensusInstance) -> [u8; 32] { + instance + .msgs + .iter() + .filter_map(|sniffed| sniffed.msg.as_ref()) + .filter_map(|outer| outer.msg.as_ref()) + .filter_map(|msg| hash32(&msg.value_hash)) + .next() + .expect("sniffed instance has value hash") +} + +fn sniffed_input_source(instance: &pbconsensus::SniffedConsensusInstance) -> Any { + instance + .msgs + .iter() + .filter_map(|sniffed| sniffed.msg.as_ref()) + .flat_map(|outer| outer.values.iter()) + .next() + .cloned() + .expect("sniffed instance has value source") +} + +fn 
hash32(value: &[u8]) -> Option<[u8; 32]> { + let hash: [u8; 32] = value.try_into().ok()?; + (hash != [0; 32]).then_some(hash) } async fn recv_one(rx: &mut mpsc::UnboundedReceiver) -> T { @@ -209,6 +504,17 @@ fn attester_value(seed: usize) -> pbcore::UnsignedDataSet { pbcore::UnsignedDataSet { set } } +fn priority_value(duty: &Duty, seed: usize) -> pbpriority::PriorityResult { + pbpriority::PriorityResult { + msgs: vec![pbpriority::PriorityMsg { + duty: Some(pbcore::Duty::try_from(duty).expect("test duty converts to proto")), + peer_id: format!("peer-{seed}"), + ..Default::default() + }], + ..Default::default() + } +} + fn attestation_json_bytes(data: &phase0::AttestationData) -> Bytes { let value = serde_json::json!({ "attestation_data": data, @@ -258,7 +564,7 @@ fn in_memory_network( active_count: usize, compare_attestations: bool, round_timeout: Option, - sniffed_tx: mpsc::UnboundedSender<(usize, usize)>, + sniffed_tx: mpsc::UnboundedSender<(usize, pbconsensus::SniffedConsensusInstance)>, ) -> Vec> { assert!(active_count <= count); let peers = (0..count) @@ -309,7 +615,7 @@ fn in_memory_network( sniffer: { let sniffed_tx = sniffed_tx.clone(); Arc::new(move |instance| { - let _ = sniffed_tx.send((index, instance.msgs.len())); + let _ = sniffed_tx.send((index, instance)); }) }, ..component::tests::config_base(false) diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs index 65c58ceb..86b50eb4 100644 --- a/crates/consensus/src/qbft/runner.rs +++ b/crates/consensus/src/qbft/runner.rs @@ -84,7 +84,7 @@ pub enum Error { /// Running instance completed with an error. 
#[error("runner result: {0}")] - RunnerResult(String), + RunnerResult(#[source] RunnerError), } #[derive(Debug, thiserror::Error)] @@ -398,7 +398,7 @@ async fn wait_instance_result(inst: &InstanceIo) -> Result<()> { let mut err_rx = inst.take_err_rx()?; match err_rx.recv().await { Some(Ok(())) => Ok(()), - Some(Err(err)) => Err(Error::RunnerResult(err.to_string())), + Some(Err(err)) => Err(Error::RunnerResult(err)), None => Err(Error::RunnerResultChannelClosed), } } @@ -697,10 +697,11 @@ mod tests { .await .unwrap_err(); - assert!(matches!( - err, - Error::RunnerResult(ref message) if message == "consensus timeout" - )); + let Error::RunnerResult(source) = err else { + panic!("unexpected error: {err:?}"); + }; + assert_eq!(source.to_string(), "consensus timeout"); + assert!(source.source().is_none()); assert!(Arc::ptr_eq(&retained, &consensus.get_instance_io(duty))); } diff --git a/crates/consensus/src/qbft/strategy_sim_test.rs b/crates/consensus/src/qbft/strategy_sim_test.rs new file mode 100644 index 00000000..05710f24 --- /dev/null +++ b/crates/consensus/src/qbft/strategy_sim_test.rs @@ -0,0 +1,889 @@ +use std::{ + collections::{BTreeMap, HashMap, HashSet, hash_map::Entry}, + sync::{Arc, Mutex, PoisonError}, + time::Duration, +}; + +use pluto_core::{ + corepb::v1::{consensus as pbconsensus, core as pbcore}, + qbft, + types::{Duty, DutyType, SlotNumber}, +}; +use prost::bytes::Bytes; +use tokio::{sync::mpsc, task::JoinSet, time::Instant}; +use tokio_util::sync::CancellationToken; + +use super::{ + Peer, + component::{self, Config, Consensus}, +}; +use crate::timer::{ + INC_ROUND_INCREASE, INC_ROUND_START, LINEAR_ROUND_INC, RoundTimer, RoundTimerFunc, + RoundTimerFuture, TimerType, +}; + +const SIM_TIMEOUT: Duration = Duration::from_secs(12); +const TICK: Duration = Duration::from_millis(10); +const DISABLED: Duration = Duration::from_secs(999 * 60 * 60); + +#[tokio::test(start_paused = true)] +async fn strategy_simulator_once() { + let results = 
run_strategy_simulator(SimConfig { + label: None, + seed: 0, + latency_jitter: Duration::from_millis(50), + latency_per_peer: BTreeMap::from([ + (0, Duration::from_millis(100)), + (1, Duration::from_millis(100)), + (2, Duration::from_millis(100)), + (3, Duration::from_millis(100)), + ]), + start_by_peer: BTreeMap::new(), + timer_strategy: TimerStrategy::Increasing, + timeout: SIM_TIMEOUT, + }) + .await; + + assert_eq!(results.len(), 4); + assert!( + !is_undecided(&results), + "expected all peers to decide: {results:?}" + ); +} + +#[ignore = "diagnostic matrix is intentionally skipped by default"] +#[tokio::test(start_paused = true)] +async fn strategy_simulator_matrix() { + let configs = matrix_configs(1); + assert!(!configs.is_empty()); + + let total_configs = configs.len(); + let mut summaries = BTreeMap::::new(); + for (index, config) in configs.into_iter().enumerate() { + let peer_count = config.latency_per_peer.len(); + let label = config.label.expect("matrix config has label"); + let key = MatrixKey { + size: label.size, + distribution: label.distribution, + timer: config.timer_strategy.name(), + }; + let results = run_strategy_simulator(config).await; + assert_eq!(results.len(), peer_count); + + let summary = summaries.entry(key).or_default(); + summary.total = summary + .total + .checked_add(1) + .expect("matrix summary total fits usize"); + if is_undecided(&results) { + summary.undecided = summary + .undecided + .checked_add(1) + .expect("matrix summary undecided count fits usize"); + } else { + summary.durations.push(quorum_decided_duration(&results)); + summary.rounds.push(decided_round(&results)); + } + + let completed = index + .checked_add(1) + .expect("matrix config index increments"); + if completed.checked_rem(100).expect("non-zero divisor") == 0 { + println!("Completed {completed}/{total_configs}"); + } + } + + print_matrix_summaries(&summaries); + print_timer_aggregates(&summaries); +} + +#[tokio::test(start_paused = true)] +async fn 
strategy_exp_timer_smoke() { + let timer = StrategyRoundTimer::new(TimerStrategy::Exp { + base: LINEAR_ROUND_INC, + }); + + for round in 1..5 { + let timeout = timer.timer(round).expect("timer constructs"); + drop(timeout); + } +} + +#[derive(Debug, Clone)] +struct SimConfig { + label: Option, + seed: u64, + latency_jitter: Duration, + latency_per_peer: BTreeMap, + start_by_peer: BTreeMap, + timer_strategy: TimerStrategy, + timeout: Duration, +} + +#[derive(Debug, Clone, Copy)] +struct MatrixLabel { + size: &'static str, + distribution: &'static str, +} + +#[derive(Debug, Default)] +struct MatrixSummary { + total: usize, + undecided: usize, + rounds: Vec, + durations: Vec, +} + +impl MatrixSummary { + fn undecided_percent(&self) -> f64 { + if self.total == 0 { + return 0.0; + } + + 100.0 * f64_from_usize(self.undecided) / f64_from_usize(self.total) + } + + fn avg_round(&self) -> f64 { + if self.rounds.is_empty() { + return 0.0; + } + + self.rounds.iter().copied().map(f64_from_i64).sum::() + / f64_from_usize(self.rounds.len()) + } + + fn avg_duration(&self) -> Duration { + if self.durations.is_empty() { + return Duration::ZERO; + } + + let total = self + .durations + .iter() + .copied() + .fold(Duration::ZERO, |sum, duration| { + sum.checked_add(duration) + .expect("test matrix duration sum fits Duration") + }); + total + .checked_div(u32::try_from(self.durations.len()).expect("duration count fits u32")) + .expect("non-empty duration count") + } + + fn stddev_duration(&self) -> Duration { + if self.durations.is_empty() { + return Duration::ZERO; + } + + let mean = self.avg_duration().as_secs_f64(); + let variance = self + .durations + .iter() + .map(|duration| { + let diff = duration.as_secs_f64() - mean; + diff * diff + }) + .sum::() + / f64_from_usize(self.durations.len()); + + Duration::from_secs_f64(variance.sqrt()) + } +} + +#[derive(Debug, Eq, Ord, PartialEq, PartialOrd)] +struct MatrixKey { + size: &'static str, + distribution: &'static str, + timer: String, 
+} + +#[derive(Debug, Clone, Copy)] +enum TimerStrategy { + Increasing, + Exp { base: Duration }, + ExpDouble { base: Duration }, + Linear { base: Duration }, + LinearDouble { base: Duration }, +} + +impl TimerStrategy { + fn duration(self, round: i64) -> Duration { + let round = u32::try_from(round).expect("test round fits u32"); + match self { + Self::Increasing => INC_ROUND_START + .checked_add( + INC_ROUND_INCREASE + .checked_mul(round) + .expect("test increasing timer increment fits"), + ) + .expect("test increasing timer duration fits"), + Self::Exp { base } | Self::ExpDouble { base } => { + let exponent = round.checked_sub(1).expect("test round is positive"); + let multiplier = 2u32 + .checked_pow(exponent) + .expect("test exp timer multiplier fits u32"); + base.checked_mul(multiplier) + .expect("test exp timer duration fits") + } + Self::Linear { base } | Self::LinearDouble { base } => base + .checked_mul(round) + .expect("test linear timer duration fits"), + } + } + + fn double(self) -> bool { + matches!(self, Self::ExpDouble { .. } | Self::LinearDouble { .. }) + } + + fn timer_type(self) -> TimerType { + match self { + Self::Increasing => TimerType::Increasing, + Self::Exp { .. } + | Self::ExpDouble { .. } + | Self::Linear { .. } + | Self::LinearDouble { .. 
} => TimerType::EagerDoubleLinear, + } + } + + fn name(self) -> String { + match self { + Self::Increasing => "increasing".to_owned(), + Self::Exp { base } => format!("exp_{}", base.as_millis()), + Self::ExpDouble { base } => format!("edouble_{}", base.as_millis()), + Self::Linear { base } => format!("linear_{}", base.as_millis()), + Self::LinearDouble { base } => format!("ldouble_{}", base.as_millis()), + } + } +} + +#[derive(Debug, Clone)] +struct SimResult { + peer_idx: usize, + decided: bool, + round: Option, + duration: Option, +} + +async fn run_strategy_simulator(config: SimConfig) -> Vec { + let peer_count = config.latency_per_peer.len(); + let (round_tx, mut round_rx) = mpsc::unbounded_channel(); + let network = SimNetwork::new(peer_count, &config, round_tx); + let duty = Duty::new(SlotNumber::new(config.seed), DutyType::Attester); + let ct = CancellationToken::new(); + let (decided_tx, mut decided_rx) = mpsc::unbounded_channel(); + let start = Instant::now(); + + for (peer_idx, node) in network.nodes().iter().enumerate() { + let decided_tx = decided_tx.clone(); + node.subscribe(move |_, _| { + let _ = decided_tx.send((peer_idx, start.elapsed())); + Ok(()) + }); + } + drop(decided_tx); + + let mut tasks = JoinSet::new(); + for (peer_idx, node) in network.nodes().iter().enumerate() { + let start_delay = config + .start_by_peer + .get(&peer_idx) + .copied() + .unwrap_or_default(); + if start_delay == DISABLED { + continue; + } + + let node = Arc::clone(node); + let duty = duty.clone(); + let ct = ct.clone(); + tasks.spawn(async move { + if !start_delay.is_zero() { + tokio::time::sleep(start_delay).await; + } + let result = node.propose(&ct, duty, unsigned_value(peer_idx)).await; + (peer_idx, result) + }); + } + + let mut results = (0..peer_count) + .map(|peer_idx| SimResult { + peer_idx, + decided: false, + round: None, + duration: None, + }) + .collect::>(); + + while start.elapsed() < config.timeout && !all_started_peers_decided(&results, &config) { + 
drain_decisions(&mut decided_rx, &mut results); + + if all_started_peers_decided(&results, &config) { + break; + } + + tokio::time::advance(TICK).await; + tokio::task::yield_now().await; + } + + drain_decisions(&mut decided_rx, &mut results); + + ct.cancel(); + network.cancel(); + while let Some(joined) = tasks.join_next().await { + let (_peer_idx, result) = joined.expect("strategy simulator task panicked"); + if let Err(err) = result { + assert!( + matches!(err, super::runner::Error::ConsensusTimeout), + "unexpected simulator error: {err}" + ); + } + } + + while let Ok((peer_idx, round)) = round_rx.try_recv() { + if let Some(result) = results.get_mut(peer_idx) { + result.round = Some(round); + } + } + + results +} + +fn is_undecided(results: &[SimResult]) -> bool { + let decided = results.iter().filter(|result| result.decided).count(); + decided < quorum(results.len()) +} + +fn quorum(nodes: usize) -> usize { + nodes + .checked_mul(2) + .and_then(|nodes| nodes.checked_add(2)) + .and_then(|nodes| nodes.checked_div(3)) + .expect("test node count permits quorum calculation") +} + +fn drain_decisions( + decided_rx: &mut mpsc::UnboundedReceiver<(usize, Duration)>, + results: &mut [SimResult], +) { + while let Ok((peer_idx, duration)) = decided_rx.try_recv() { + if let Some(result) = results.get_mut(peer_idx) { + result.decided = true; + result.duration = Some(duration); + } + } +} + +fn decided_round(results: &[SimResult]) -> i64 { + results + .iter() + .find(|result| result.decided) + .and_then(|result| result.round) + .expect("decided result has sniffed commit round") +} + +fn quorum_decided_duration(results: &[SimResult]) -> Duration { + let mut durations = results + .iter() + .filter(|result| result.decided) + .map(|result| result.duration.expect("decided result has duration")) + .collect::>(); + assert!( + durations.len() >= quorum(results.len()), + "not enough decided durations" + ); + + durations.sort(); + let quorum_index = quorum(results.len()) + 
.checked_sub(1) + .expect("quorum for non-empty results is positive"); + durations[quorum_index] +} + +fn f64_from_usize(value: usize) -> f64 { + f64::from(u32::try_from(value).expect("test matrix count fits u32")) +} + +fn f64_from_i64(value: i64) -> f64 { + f64::from(i32::try_from(value).expect("test round fits i32")) +} + +fn all_started_peers_decided(results: &[SimResult], config: &SimConfig) -> bool { + results.iter().all(|result| { + result.decided + || config + .start_by_peer + .get(&result.peer_idx) + .is_some_and(|delay| *delay == DISABLED) + }) +} + +struct SimNetwork { + nodes: Arc>>>, + delivery_ct: CancellationToken, +} + +impl SimNetwork { + fn new( + peer_count: usize, + config: &SimConfig, + round_tx: mpsc::UnboundedSender<(usize, i64)>, + ) -> Self { + let nodes = Arc::new(Mutex::new(Vec::with_capacity(peer_count))); + let delivery_ct = CancellationToken::new(); + let peers = peers(peer_count); + let rng = Arc::new(Mutex::new(TestRng::new(config.seed))); + let latency_per_peer = Arc::new(config.latency_per_peer.clone()); + + for peer_idx in 0..peer_count { + let network = Arc::clone(&nodes); + let rng = Arc::clone(&rng); + let latency_per_peer = Arc::clone(&latency_per_peer); + let latency_jitter = config.latency_jitter; + let delivery_ct = delivery_ct.clone(); + let broadcaster: component::Broadcaster = Arc::new(move |ct, msg| { + let network = Arc::clone(&network); + let rng = Arc::clone(&rng); + let latency_per_peer = Arc::clone(&latency_per_peer); + let delivery_ct = delivery_ct.clone(); + Box::pin(async move { + broadcast_with_latency( + network, + rng, + latency_per_peer, + latency_jitter, + delivery_ct, + ct, + msg, + ) + .await + }) + }); + + let consensus = Arc::new( + Consensus::new(Config { + peers: peers.clone(), + local_peer_idx: i64::try_from(peer_idx).expect("test peer index fits i64"), + privkey: component::tests::secret_key( + u8::try_from(peer_idx.checked_add(1).expect("test peer index increments")) + .expect("test peer index fits 
u8"), + ), + broadcaster, + timer_func: timer_func(config.timer_strategy), + sniffer: { + let round_tx = round_tx.clone(); + Arc::new(move |instance| { + if let Some(round) = decided_round_from_sniffer(&instance) { + let _ = round_tx.send((peer_idx, round)); + } + }) + }, + ..component::tests::config_base(false) + }) + .unwrap(), + ); + nodes + .lock() + .unwrap_or_else(PoisonError::into_inner) + .push(consensus); + } + + Self { nodes, delivery_ct } + } + + fn nodes(&self) -> Vec> { + self.nodes + .lock() + .unwrap_or_else(PoisonError::into_inner) + .clone() + } + + fn cancel(&self) { + self.delivery_ct.cancel(); + } +} + +fn decided_round_from_sniffer(instance: &pbconsensus::SniffedConsensusInstance) -> Option { + instance + .msgs + .iter() + .filter_map(|sniffed| sniffed.msg.as_ref()) + .filter_map(|outer| outer.msg.as_ref()) + .filter(|msg| msg.r#type == i64::from(qbft::MSG_COMMIT)) + .map(|msg| msg.round) + .max() +} + +async fn broadcast_with_latency( + network: Arc>>>, + rng: Arc>, + latency_per_peer: Arc>, + latency_jitter: Duration, + delivery_ct: CancellationToken, + sender_ct: CancellationToken, + msg: pbconsensus::QbftConsensusMsg, +) -> component::BroadcastResult { + if sender_ct.is_cancelled() { + return Ok(()); + } + + let source = msg.msg.as_ref().map_or(-1, |msg| msg.peer_idx); + let nodes = network + .lock() + .unwrap_or_else(PoisonError::into_inner) + .clone(); + + for (peer_idx, node) in nodes.into_iter().enumerate() { + if i64::try_from(peer_idx).expect("test peer index fits i64") == source { + continue; + } + + let Some(mean) = latency_per_peer.get(&peer_idx).copied() else { + continue; + }; + let delay = { + let mut rng = rng.lock().unwrap_or_else(PoisonError::into_inner); + jittered_latency(mean, latency_jitter, &mut rng) + }; + let delivery_ct = delivery_ct.clone(); + let msg = msg.clone(); + + tokio::spawn(async move { + tokio::select! 
{ + () = delivery_ct.cancelled() => {} + () = tokio::time::sleep(delay) => { + let _ = node.handle(&delivery_ct, Some(msg)).await; + } + } + }); + } + + Ok(()) +} + +fn timer_func(strategy: TimerStrategy) -> RoundTimerFunc { + Box::new(move |_| Box::new(StrategyRoundTimer::new(strategy))) +} + +#[derive(Debug)] +struct StrategyRoundTimer { + strategy: TimerStrategy, + deadlines: Mutex>, +} + +impl StrategyRoundTimer { + fn new(strategy: TimerStrategy) -> Self { + Self { + strategy, + deadlines: Mutex::new(HashMap::new()), + } + } +} + +impl RoundTimer for StrategyRoundTimer { + fn timer_type(&self) -> TimerType { + self.strategy.timer_type() + } + + fn timer(&self, round: i64) -> crate::timer::Result { + let duration = self.strategy.duration(round); + let mut deadlines = self + .deadlines + .lock() + .unwrap_or_else(PoisonError::into_inner); + let deadline = match deadlines.entry(round) { + Entry::Occupied(mut entry) if self.strategy.double() => { + let deadline = entry + .get() + .checked_add(duration) + .expect("test timer deadline fits"); + entry.insert(deadline); + deadline + } + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let deadline = Instant::now() + .checked_add(duration) + .expect("test timer deadline fits"); + entry.insert(deadline); + deadline + } + }; + + Ok(Box::pin(async move { + tokio::time::sleep_until(deadline).await; + deadline + })) + } +} + +fn matrix_configs(iters_per_config: usize) -> Vec { + let sizes = [ + ("small-all", 4usize, 4usize), + ("small-min", 3, 4), + ("medium-all", 6, 6), + ("medium-min", 4, 6), + ("large-all", 9, 9), + ("large-min", 6, 9), + ]; + let distributions = [ + ( + "colocated", + vec![Duration::from_millis(5), Duration::from_millis(10)], + vec![ + Duration::from_millis(5), + Duration::from_millis(10), + Duration::from_millis(25), + Duration::from_millis(50), + ], + ), + ( + "regional", + vec![Duration::from_millis(10), Duration::from_millis(25)], + vec![ + Duration::from_millis(50), + 
Duration::from_millis(100), + Duration::from_millis(250), + ], + ), + ( + "global", + vec![Duration::from_millis(50), Duration::from_millis(100)], + vec![ + Duration::from_millis(250), + Duration::from_millis(250), + Duration::from_millis(500), + Duration::from_millis(500), + Duration::from_millis(750), + ], + ), + ]; + let timers = [ + TimerStrategy::Increasing, + TimerStrategy::Exp { + base: Duration::from_millis(1_000), + }, + TimerStrategy::ExpDouble { + base: Duration::from_millis(1_000), + }, + TimerStrategy::Linear { + base: Duration::from_millis(1_000), + }, + TimerStrategy::LinearDouble { + base: Duration::from_millis(1_000), + }, + ]; + + let mut configs = Vec::new(); + for (size, up, nodes) in sizes { + for (distribution, jitters, latencies) in &distributions { + for timer_strategy in timers { + let disabled_count = nodes.checked_sub(up).expect("up count is bounded by nodes"); + let mut timer_configs = random_configs( + MatrixLabel { size, distribution }, + nodes, + iters_per_config, + timer_strategy, + jitters, + latencies, + ); + disable_random_nodes(&mut timer_configs, disabled_count); + configs.extend(timer_configs); + } + } + } + + configs +} + +fn print_matrix_summaries(summaries: &BTreeMap) { + print_summary_header(); + for (key, summary) in summaries { + print_summary(key.size, key.distribution, &key.timer, summary); + } +} + +fn print_timer_aggregates(summaries: &BTreeMap) { + println!("\n\nTimer aggregate results\n"); + + let mut aggregates = BTreeMap::::new(); + for (key, summary) in summaries { + let aggregate = aggregates.entry(key.timer.clone()).or_default(); + aggregate.total = aggregate + .total + .checked_add(summary.total) + .expect("aggregate total fits usize"); + aggregate.undecided = aggregate + .undecided + .checked_add(summary.undecided) + .expect("aggregate undecided count fits usize"); + aggregate.rounds.extend(summary.rounds.iter().copied()); + aggregate + .durations + .extend(summary.durations.iter().copied()); + } + + 
print_summary_header(); + for (timer, summary) in aggregates { + print_summary("", "", &timer, &summary); + } +} + +fn print_summary_header() { + println!("Size\tDistribution\tTimer\tTotal\tUndecided\tAvgRound\tMeanDuration\tStdDevDuration"); +} + +fn print_summary(size: &str, distribution: &str, timer: &str, summary: &MatrixSummary) { + println!( + "{size}\t{distribution}\t{timer}\t{}\t{:.2}%\t{:.2}\t{:.2}s\t{:.2}s", + summary.total, + summary.undecided_percent(), + summary.avg_round(), + summary.avg_duration().as_secs_f64(), + summary.stddev_duration().as_secs_f64() + ); +} + +fn random_configs( + label: MatrixLabel, + peer_count: usize, + count: usize, + timer_strategy: TimerStrategy, + jitters: &[Duration], + latencies: &[Duration], +) -> Vec { + let mut rng = TestRng::new(0); + let mut configs = Vec::with_capacity(count); + + for seed in 0..count { + let mut latency_per_peer = BTreeMap::new(); + for peer_idx in 0..peer_count { + latency_per_peer.insert(peer_idx, latencies[rng.gen_range(latencies.len())]); + } + + configs.push(SimConfig { + label: Some(label), + seed: u64::try_from(seed).expect("test seed fits u64"), + latency_jitter: jitters[seed.checked_rem(jitters.len()).expect("non-empty jitters")], + latency_per_peer, + start_by_peer: jittered_start_latencies(peer_count, &mut rng), + timer_strategy, + timeout: SIM_TIMEOUT, + }); + } + + configs +} + +fn disable_random_nodes(configs: &mut [SimConfig], count: usize) { + let mut rng = TestRng::new(0); + + for config in configs { + let peer_count = config.latency_per_peer.len(); + assert!(count <= peer_count); + + let mut disabled = HashSet::with_capacity(count); + while disabled.len() < count { + disabled.insert(rng.gen_range(peer_count)); + } + + for peer_idx in disabled { + config.start_by_peer.insert(peer_idx, DISABLED); + } + } +} + +fn jittered_start_latencies(peer_count: usize, rng: &mut TestRng) -> BTreeMap { + let mut starts = BTreeMap::new(); + for peer_idx in 0..peer_count { + starts.insert( + 
peer_idx, + jittered_latency(Duration::from_millis(463), Duration::from_millis(273), rng), + ); + } + + starts +} + +fn jittered_latency(mean: Duration, jitter: Duration, rng: &mut TestRng) -> Duration { + if jitter.is_zero() { + return mean; + } + + let spread = u64::try_from(jitter.as_nanos()).expect("test jitter fits u64 nanos"); + let range = spread + .checked_mul(2) + .and_then(|value| value.checked_add(1)) + .expect("test jitter range fits u64"); + let sample = rng + .next_u64() + .checked_rem(range) + .expect("test jitter range is non-zero"); + + if sample <= spread { + mean.checked_sub(Duration::from_nanos( + spread + .checked_sub(sample) + .expect("sample is bounded by spread"), + )) + .unwrap_or(Duration::ZERO) + } else { + mean.checked_add(Duration::from_nanos( + sample + .checked_sub(spread) + .expect("sample is greater than spread"), + )) + .expect("test jittered latency fits Duration") + } +} + +struct TestRng { + state: u64, +} + +impl TestRng { + fn new(seed: u64) -> Self { + Self { state: seed } + } + + fn next_u64(&mut self) -> u64 { + self.state = self + .state + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1); + self.state + } + + fn gen_range(&mut self, end: usize) -> usize { + let end = u64::try_from(end).expect("test range fits u64"); + assert_ne!(end, 0); + usize::try_from( + self.next_u64() + .checked_rem(end) + .expect("test range is non-zero"), + ) + .expect("test sample fits usize") + } +} + +fn peers(count: usize) -> Vec { + (0..count) + .map(|index| Peer { + index: i64::try_from(index).expect("test peer index fits i64"), + name: format!("node-{index}"), + public_key: component::tests::secret_key( + u8::try_from(index.checked_add(1).expect("test peer index increments")) + .expect("test peer index fits u8"), + ) + .public_key(), + }) + .collect() +} + +fn unsigned_value(seed: usize) -> pbcore::UnsignedDataSet { + let mut set = BTreeMap::new(); + set.insert( + format!("validator-{seed}"), + 
Bytes::from(format!("unsigned-{seed}")), + ); + pbcore::UnsignedDataSet { set } +} From 4ec4a18f291c1ec0fe459cc827d5e04a79bc3c12 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 15:00:49 +0700 Subject: [PATCH 17/21] fix: add featureset to core --- Cargo.lock | 1 + crates/core/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 785dbfec..71823135 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5640,6 +5640,7 @@ dependencies = [ "pluto-crypto", "pluto-eth2api", "pluto-eth2util", + "pluto-featureset", "pluto-ssz", "pluto-testutil", "pluto-tracing", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 90f58ce6..c9bc25aa 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -18,6 +18,7 @@ hex.workspace = true vise.workspace = true pluto-crypto.workspace = true pluto-eth2api.workspace = true +pluto-featureset.workspace = true prost.workspace = true prost-types.workspace = true regex.workspace = true From 0e1c43421138e66b7458a4689797b4e060081ab7 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 16:05:33 +0700 Subject: [PATCH 18/21] refactor: simplify logic component --- crates/consensus/src/qbft/component.rs | 65 +++++++++----------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 19999d2c..37763ac2 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -258,7 +258,6 @@ impl SubscriberSet { /// QBFT consensus component. pub struct Consensus { peers: Vec, - #[cfg(test)] peer_labels: Vec, pubkeys: HashMap, local_peer_idx: i64, @@ -277,13 +276,11 @@ impl Consensus { /// Creates a new QBFT consensus component. 
pub fn new(config: Config) -> Result { let mut pubkeys = HashMap::with_capacity(config.peers.len()); - #[cfg(test)] let mut peer_labels = Vec::with_capacity(config.peers.len()); for (index, peer) in config.peers.iter().enumerate() { let peer_idx = i64::try_from(index).map_err(|_| Error::PeerIndexOverflow { index })?; pubkeys.insert(peer_idx, peer.public_key); - #[cfg(test)] peer_labels.push(format!("{}:{}", peer.index, peer.name)); } @@ -295,7 +292,6 @@ impl Consensus { Ok(Self { peers: config.peers, - #[cfg(test)] peer_labels, pubkeys, local_peer_idx: config.local_peer_idx, @@ -336,15 +332,14 @@ impl Consensus { pub async fn handle( &self, ct: &CancellationToken, - req: Option, + pb_msg: pbconsensus::QbftConsensusMsg, ) -> Result<()> { - let pb_msg = req.ok_or(Error::InvalidConsensusMessage)?; let msg = pb_msg.msg.as_ref().ok_or(Error::InvalidConsensusMessage)?; self.verify_msg(msg)?; let duty = duty_from_msg(msg)?; - if !self.duty_allowed(&duty) { + if !(self.duty_gater)(&duty) { return Err(Error::InvalidDuty); } @@ -470,11 +465,6 @@ impl Consensus { self.pubkeys.get(&peer_idx) } - /// Returns whether local policy admits consensus for the duty. - pub(crate) fn duty_allowed(&self, duty: &Duty) -> bool { - (self.duty_gater)(duty) - } - /// Registers the duty with the deadline scheduler. 
pub(crate) async fn add_deadline(&self, duty: Duty) -> AddOutcome { self.deadliner.add(duty).await @@ -705,22 +695,12 @@ pub(crate) mod tests { ); } - #[tokio::test] - async fn handle_rejects_invalid_outer_message() { - let err = consensus(0, true) - .handle(&CancellationToken::new(), None) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "invalid consensus message"); - } - #[tokio::test] async fn handle_rejects_missing_inner_message() { let err = consensus(0, true) .handle( &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg::default()), + pbconsensus::QbftConsensusMsg::default(), ) .await .unwrap_err(); @@ -810,10 +790,7 @@ pub(crate) mod tests { #[tokio::test] async fn handle_rejects_duty_gate_false() { let err = consensus(0, false) - .handle( - &CancellationToken::new(), - Some(consensus_msg(signed_msg(0))), - ) + .handle(&CancellationToken::new(), consensus_msg(signed_msg(0))) .await .unwrap_err(); @@ -831,7 +808,7 @@ pub(crate) mod tests { }; let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(outer)) + .handle(&CancellationToken::new(), outer) .await .unwrap_err(); @@ -853,7 +830,7 @@ pub(crate) mod tests { }; let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(outer)) + .handle(&CancellationToken::new(), outer) .await .unwrap_err(); @@ -874,7 +851,7 @@ pub(crate) mod tests { outer.justification = vec![sign_for_peer(justification, 0)]; consensus - .handle(&CancellationToken::new(), Some(outer)) + .handle(&CancellationToken::new(), outer) .await .unwrap(); @@ -921,7 +898,7 @@ pub(crate) mod tests { let msg = sign_for_peer(msg, 0); let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .handle(&CancellationToken::new(), consensus_msg(msg)) .await .unwrap_err(); @@ -939,7 +916,7 @@ pub(crate) mod tests { let msg = sign_for_peer(msg, 0); let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .handle(&CancellationToken::new(), 
consensus_msg(msg)) .await .unwrap_err(); @@ -959,7 +936,7 @@ pub(crate) mod tests { let msg = sign_for_peer(msg, 0); let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .handle(&CancellationToken::new(), consensus_msg(msg)) .await .unwrap_err(); @@ -975,7 +952,7 @@ pub(crate) mod tests { let msg = sign_for_peer(msg, 0); let err = consensus(0, true) - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .handle(&CancellationToken::new(), consensus_msg(msg)) .await .unwrap_err(); @@ -996,7 +973,7 @@ pub(crate) mod tests { let inst = consensus.get_instance_io(duty()); consensus - .handle(&CancellationToken::new(), Some(consensus_msg(msg))) + .handle(&CancellationToken::new(), consensus_msg(msg)) .await .unwrap(); @@ -1022,11 +999,11 @@ pub(crate) mod tests { consensus .handle( &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg { msg: Some(msg), justification: vec![], values: vec![any], - }), + }, ) .await .unwrap(); @@ -1046,7 +1023,7 @@ pub(crate) mod tests { .unwrap(); let err = consensus - .handle(&CancellationToken::new(), Some(valid_consensus_msg(0))) + .handle(&CancellationToken::new(), valid_consensus_msg(0)) .await .unwrap_err(); @@ -1059,7 +1036,7 @@ pub(crate) mod tests { ct.cancel(); let err = consensus(0, true) - .handle(&ct, Some(valid_consensus_msg(0))) + .handle(&ct, valid_consensus_msg(0)) .await .unwrap_err(); @@ -1076,7 +1053,7 @@ pub(crate) mod tests { } let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(valid_consensus_msg(0))); + let handle = consensus.handle(&ct, valid_consensus_msg(0)); tokio::pin!(handle); tokio::select! { @@ -1103,7 +1080,7 @@ pub(crate) mod tests { } let ct = CancellationToken::new(); - let handle = consensus.handle(&ct, Some(valid_consensus_msg(0))); + let handle = consensus.handle(&ct, valid_consensus_msg(0)); tokio::pin!(handle); tokio::select! 
{ @@ -1138,11 +1115,11 @@ pub(crate) mod tests { consensus .handle( &CancellationToken::new(), - Some(pbconsensus::QbftConsensusMsg { + pbconsensus::QbftConsensusMsg { msg: Some(msg), justification: vec![], values: vec![any], - }), + }, ) .await .unwrap(); @@ -1159,7 +1136,7 @@ pub(crate) mod tests { .expect("recv receiver should be available"); consensus - .handle(&CancellationToken::new(), Some(reference_consensus_msg())) + .handle(&CancellationToken::new(), reference_consensus_msg()) .await .expect("reference message should be admitted"); From 99b8f9b6c1f7a0474ebb429a9c17f4669e598efb Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 17:08:10 +0700 Subject: [PATCH 19/21] refactor: simplify logic runner --- crates/consensus/src/qbft/runner.rs | 100 ++++++++++++---------------- 1 file changed, 44 insertions(+), 56 deletions(-) diff --git a/crates/consensus/src/qbft/runner.rs b/crates/consensus/src/qbft/runner.rs index 86b50eb4..4d3763a7 100644 --- a/crates/consensus/src/qbft/runner.rs +++ b/crates/consensus/src/qbft/runner.rs @@ -9,7 +9,11 @@ use cancellation::CancellationTokenSource; use crossbeam::channel as mpmc; use prost::{Message, Name}; use prost_types::Any; -use tokio::{sync::mpsc, task::JoinError, time::Duration}; +use tokio::{ + sync::mpsc, + task::{JoinError, JoinSet}, + time::Duration, +}; use tokio_util::sync::CancellationToken; use crate::instance::{self, InstanceIo, RunnerError, RunnerResult}; @@ -159,7 +163,11 @@ pub(crate) async fn participate( return Ok(()); } - if !consensus_participate_enabled() { + if !pluto_featureset::GLOBAL_STATE + .read() + .expect("global feature set lock poisoned") + .enabled(pluto_featureset::Feature::ConsensusParticipate) + { return Ok(()); } @@ -182,7 +190,10 @@ pub(crate) async fn run_instance( inst: Arc>, ) -> Result<()> { let result = run_instance_inner(consensus, parent_ct, duty.clone(), Arc::clone(&inst)).await; - let runner_result = to_runner_result(&result); + let runner_result: RunnerResult = result 
+ .as_ref() + .map_err(|err| Box::new(RunnerResultError(err.to_string())) as RunnerError) + .copied(); let _ = inst.err_tx.send(runner_result).await; result @@ -205,7 +216,7 @@ async fn run_instance_inner( let value_rx = inst.take_value_rx()?; let verify_rx = inst.take_verify_rx()?; - let instance_ct = CancellationToken::new(); + let instance_ct = parent_ct.child_token(); let core_cts = Arc::new(CancellationTokenSource::new()); let core_ct = core_cts.token().clone(); let decided = Arc::new(AtomicBool::new(false)); @@ -227,50 +238,43 @@ async fn run_instance_inner( Sniffer::new(i64::try_from(nodes).expect("node count fits i64"), peer_idx), )); - let mut tasks = vec![ - tokio::spawn(bridge_mpsc_to_crossbeam( - instance_ct.clone(), - inner_recv_rx, - core_recv_tx, - )), - tokio::spawn(bridge_mpsc_to_crossbeam( - instance_ct.clone(), - hash_rx, - core_hash_tx, - )), - tokio::spawn(bridge_mpsc_to_crossbeam( - instance_ct.clone(), - verify_rx, - core_verify_tx, - )), - ]; + let mut tasks = JoinSet::new(); + tasks.spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + inner_recv_rx, + core_recv_tx, + )); + tasks.spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + hash_rx, + core_hash_tx, + )); + tasks.spawn(bridge_mpsc_to_crossbeam( + instance_ct.clone(), + verify_rx, + core_verify_tx, + )); { let transport = Arc::clone(&transport); let instance_ct = instance_ct.clone(); let transport_error = Arc::clone(&transport_error); - tasks.push(tokio::spawn(async move { + tasks.spawn(async move { if let Err(err) = transport.process_receives(instance_ct, outer_rx).await { *transport_error .lock() .unwrap_or_else(PoisonError::into_inner) = Some(err.to_string()); } - })); + }); } { - let parent_ct = parent_ct.clone(); let instance_ct = instance_ct.clone(); let core_cts = Arc::clone(&core_cts); - tasks.push(tokio::spawn(async move { - tokio::select! 
{ - () = parent_ct.cancelled() => { - instance_ct.cancel(); - core_cts.cancel(); - } - () = instance_ct.cancelled() => core_cts.cancel(), - } - })); + tasks.spawn(async move { + instance_ct.cancelled().await; + core_cts.cancel(); + }); } let decide_callback: DecideCallback = { @@ -299,7 +303,7 @@ async fn run_instance_inner( let runtime = runtime.clone(); let instance_ct = instance_ct.clone(); let transport_error = Arc::clone(&transport_error); - move |request| { + move |request: qbft::BroadcastRequest<'_, ConsensusQbftTypes>| { let justification = request.justification.cloned().unwrap_or_default(); let result = runtime.block_on(transport.broadcast(transport::BroadcastRequest { ct: instance_ct.clone(), @@ -327,27 +331,25 @@ async fn run_instance_inner( receive: core_recv_rx, }; - let duty_for_run = duty.clone(); let core_ct_for_run = core_ct.clone(); let core_result = tokio::task::spawn_blocking(move || { qbft::run( &core_ct_for_run, &def, &core_transport, - &duty_for_run, + &duty, peer_idx, core_hash_rx, core_verify_rx, ) }) - .await - .map_err(Error::Join)?; + .await; let canceled_before_teardown = parent_ct.is_cancelled() || instance_ct.is_cancelled() || core_ct.is_canceled(); instance_ct.cancel(); - for task in tasks { - let _ = task.await; + while let Some(result) = tasks.join_next().await { + let _ = result; } let sniffer = consensus.sniffer(); @@ -361,6 +363,8 @@ async fn run_instance_inner( return Err(Error::Transport(err)); } + let core_result = core_result.map_err(Error::Join)?; + match core_result { Ok(()) => Ok(()), Err(qbft::QbftError::ContextCanceled) if decided.load(Ordering::Relaxed) => Ok(()), @@ -450,22 +454,6 @@ fn transport_broadcaster(broadcaster: super::component::Broadcaster) -> transpor }) } -/// Converts a runner result into the channel payload shared with joiners. 
-fn to_runner_result(result: &Result<()>) -> RunnerResult { - match result { - Ok(()) => Ok(()), - Err(err) => Err(Box::new(RunnerResultError(err.to_string())) as RunnerError), - } -} - -/// Returns whether passive consensus participation is enabled. -fn consensus_participate_enabled() -> bool { - pluto_featureset::GLOBAL_STATE - .read() - .expect("global feature set lock poisoned") - .enabled(pluto_featureset::Feature::ConsensusParticipate) -} - #[cfg(test)] mod tests { use std::{ From 37118ef7a5212ff1b71deb7a590f261d6229b831 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 17:46:12 +0700 Subject: [PATCH 20/21] refactor: test: --- crates/consensus/src/qbft/p2p.rs | 35 ++++++------------- crates/consensus/src/qbft/qbft_run_test.rs | 2 +- .../consensus/src/qbft/strategy_sim_test.rs | 2 +- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/crates/consensus/src/qbft/p2p.rs b/crates/consensus/src/qbft/p2p.rs index 6d7b7d24..d548a13c 100644 --- a/crates/consensus/src/qbft/p2p.rs +++ b/crates/consensus/src/qbft/p2p.rs @@ -132,14 +132,7 @@ pub struct Handle { impl Handle { /// Enqueues a QBFT message for async broadcast to every non-self peer. - /// - /// The token is accepted for the shared broadcaster shape. After enqueue, - /// network fanout is best-effort and is not cancelled by this token. - pub async fn broadcast( - &self, - _ct: CancellationToken, - msg: pbconsensus::QbftConsensusMsg, - ) -> BroadcastResult { + pub async fn broadcast(&self, msg: pbconsensus::QbftConsensusMsg) -> BroadcastResult { let request_id = self.next_request_id.fetch_add(1, Ordering::Relaxed); self.cmd_tx .send(BroadcastCommand { request_id, msg }) @@ -149,9 +142,9 @@ impl Handle { /// Returns a consensus broadcaster callback backed by this handle. 
pub fn broadcaster(&self) -> super::Broadcaster { let handle = self.clone(); - Arc::new(move |ct, msg| { + Arc::new(move |_ct, msg| { let handle = handle.clone(); - Box::pin(async move { handle.broadcast(ct, msg).await }) + Box::pin(async move { handle.broadcast(msg).await }) }) } } @@ -353,7 +346,7 @@ where .map_err(|error| error.to_string())?; consensus - .handle(&cancellation, Some(msg)) + .handle(&cancellation, msg) .await .map_err(|error| error.to_string()) }) @@ -846,9 +839,7 @@ mod tests { cancellation: CancellationToken::new(), })?; - handle - .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 1)?) - .await?; + handle.broadcast(signed_consensus_msg(&duty(), 1)?).await?; let events = drain_behaviour_events(&mut behaviour); let targets = events @@ -890,9 +881,7 @@ mod tests { local_peer_id, cancellation: CancellationToken::new(), })?; - handle - .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) - .await?; + handle.broadcast(signed_consensus_msg(&duty(), 0)?).await?; let _ = drain_behaviour_events(&mut behaviour); let error = DialError::DialPeerConditionFalse(PeerCondition::DisconnectedAndNotDialing); @@ -926,9 +915,7 @@ mod tests { local_peer_id, cancellation: CancellationToken::new(), })?; - handle - .broadcast(CancellationToken::new(), signed_consensus_msg(&duty(), 0)?) - .await?; + handle.broadcast(signed_consensus_msg(&duty(), 0)?).await?; let _ = drain_behaviour_events(&mut behaviour); let error = DialError::NoAddresses; @@ -994,9 +981,7 @@ mod tests { wait_for_connections(&mut conn_rx, &peer_ids[..2]).await?; let network_msg = signed_consensus_msg(&duty(), 0)?; - handle - .broadcast(CancellationToken::new(), network_msg.clone()) - .await?; + handle.broadcast(network_msg.clone()).await?; wait_for_event(&mut event_rx, 1, |event| { matches!(event, Event::Received { .. 
}) @@ -1147,12 +1132,12 @@ mod tests { let handle_slot = Arc::new(OnceLock::::new()); let broadcaster = { let handle_slot = Arc::clone(&handle_slot); - Arc::new(move |ct, msg| { + Arc::new(move |_ct, msg| { let handle = handle_slot .get() .expect("test p2p handle initialized") .clone(); - Box::pin(async move { handle.broadcast(ct, msg).await }) + Box::pin(async move { handle.broadcast(msg).await }) as futures::future::BoxFuture<'static, BroadcastResult> }) }; diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs index 362dd411..4ec5e22a 100644 --- a/crates/consensus/src/qbft/qbft_run_test.rs +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -591,7 +591,7 @@ fn in_memory_network( if i64::try_from(index).expect("test peer index fits i64") == peer_idx { continue; } - if let Err(err) = consensus.handle(&ct, Some(msg.clone())).await { + if let Err(err) = consensus.handle(&ct, msg.clone()).await { return Err(Box::new(err) as Box); } } diff --git a/crates/consensus/src/qbft/strategy_sim_test.rs b/crates/consensus/src/qbft/strategy_sim_test.rs index 05710f24..df54a94e 100644 --- a/crates/consensus/src/qbft/strategy_sim_test.rs +++ b/crates/consensus/src/qbft/strategy_sim_test.rs @@ -551,7 +551,7 @@ async fn broadcast_with_latency( tokio::select! 
{ () = delivery_ct.cancelled() => {} () = tokio::time::sleep(delay) => { - let _ = node.handle(&delivery_ct, Some(msg)).await; + let _ = node.handle(&delivery_ct, msg).await; } } }); From 032dd763407684e6c3b17ee219205173a994cca3 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 3 Jun 2026 18:12:27 +0700 Subject: [PATCH 21/21] test: mutex to avoid timeout test --- crates/consensus/examples/qbft.rs | 4 ++-- crates/consensus/src/qbft/component.rs | 4 ++++ crates/consensus/src/qbft/qbft_run_test.rs | 24 ++++++++++++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/crates/consensus/examples/qbft.rs b/crates/consensus/examples/qbft.rs index 1f3af47a..53255abe 100644 --- a/crates/consensus/examples/qbft.rs +++ b/crates/consensus/examples/qbft.rs @@ -576,14 +576,14 @@ fn build_consensus( ) -> Result { let handle_slot = Arc::new(OnceLock::::new()); let broadcaster_slot = Arc::clone(&handle_slot); - let broadcaster: qbft::Broadcaster = Arc::new(move |ct, msg| { + let broadcaster: qbft::Broadcaster = Arc::new(move |_ct, msg| { let broadcaster_slot = Arc::clone(&broadcaster_slot); Box::pin(async move { let Some(handle) = broadcaster_slot.get() else { let err = std::io::Error::other("qbft p2p handle not initialized"); return Err(Box::new(err) as Box); }; - handle.broadcast(ct, msg).await + handle.broadcast(msg).await }) }); diff --git a/crates/consensus/src/qbft/component.rs b/crates/consensus/src/qbft/component.rs index 37763ac2..c04ff54a 100644 --- a/crates/consensus/src/qbft/component.rs +++ b/crates/consensus/src/qbft/component.rs @@ -258,6 +258,7 @@ impl SubscriberSet { /// QBFT consensus component. pub struct Consensus { peers: Vec, + #[cfg(test)] peer_labels: Vec, pubkeys: HashMap, local_peer_idx: i64, @@ -276,11 +277,13 @@ impl Consensus { /// Creates a new QBFT consensus component. 
pub fn new(config: Config) -> Result { let mut pubkeys = HashMap::with_capacity(config.peers.len()); + #[cfg(test)] let mut peer_labels = Vec::with_capacity(config.peers.len()); for (index, peer) in config.peers.iter().enumerate() { let peer_idx = i64::try_from(index).map_err(|_| Error::PeerIndexOverflow { index })?; pubkeys.insert(peer_idx, peer.public_key); + #[cfg(test)] peer_labels.push(format!("{}:{}", peer.index, peer.name)); } @@ -292,6 +295,7 @@ impl Consensus { Ok(Self { peers: config.peers, + #[cfg(test)] peer_labels, pubkeys, local_peer_idx: config.local_peer_idx, diff --git a/crates/consensus/src/qbft/qbft_run_test.rs b/crates/consensus/src/qbft/qbft_run_test.rs index 4ec5e22a..511fceda 100644 --- a/crates/consensus/src/qbft/qbft_run_test.rs +++ b/crates/consensus/src/qbft/qbft_run_test.rs @@ -16,7 +16,10 @@ use pluto_eth2api::spec::phase0; use prost::bytes::Bytes; use prost_types::Any; use test_case::test_case; -use tokio::{sync::mpsc, task::JoinSet}; +use tokio::{ + sync::{Mutex as AsyncMutex, mpsc}, + task::JoinSet, +}; use tokio_util::sync::CancellationToken; use super::{ @@ -27,23 +30,29 @@ use super::{ }; use crate::timer::{RoundTimer, RoundTimerFunc, RoundTimerFuture, TimerType}; +const CONSENSUS_RECV_TIMEOUT: Duration = Duration::from_secs(5); +static FULL_RUN_TEST_LOCK: AsyncMutex<()> = AsyncMutex::const_new(()); + #[test_case(2, 3 ; "two_of_three")] #[test_case(3, 4 ; "three_of_four")] #[test_case(4, 4 ; "four_of_four")] #[test_case(4, 6 ; "four_of_six")] #[tokio::test] async fn qbft_consensus(threshold: usize, cluster_nodes: usize) { + let _guard = full_run_test_guard().await; assert!(threshold <= cluster_nodes); run_qbft_consensus(threshold, cluster_nodes, false, unsigned_value).await; } #[tokio::test] async fn qbft_consensus_attester_compare_enabled() { + let _guard = full_run_test_guard().await; run_qbft_consensus(3, 3, true, |_| attester_value(0)).await; } #[tokio::test] async fn qbft_sniffed_instance_replay_decides() { + let _guard = 
full_run_test_guard().await; let sniffed = run_qbft_consensus(4, 4, false, unsigned_value).await; let instance = sniffed .into_iter() @@ -56,6 +65,7 @@ async fn qbft_sniffed_instance_replay_decides() { #[tokio::test] async fn qbft_priority_consensus() { + let _guard = full_run_test_guard().await; let threshold = 3; let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); let active_nodes = in_memory_network(threshold, threshold, false, None, sniffed_tx); @@ -121,6 +131,7 @@ async fn qbft_priority_consensus() { #[tokio::test] async fn qbft_consensus_participate_then_late_propose() { + let _guard = full_run_test_guard().await; let threshold = 4; let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); let active_nodes = in_memory_network(threshold, threshold, false, None, sniffed_tx); @@ -206,6 +217,7 @@ async fn qbft_consensus_participate_then_late_propose() { #[tokio::test] async fn qbft_consensus_attester_compare_mismatch_does_not_decide() { + let _guard = full_run_test_guard().await; let threshold = 3; let (sniffed_tx, _sniffed_rx) = mpsc::unbounded_channel(); let active_nodes = in_memory_network( @@ -357,6 +369,12 @@ async fn run_qbft_consensus( sniffed } +async fn full_run_test_guard() -> tokio::sync::MutexGuard<'static, ()> { + // Each test spins up an in-memory multi-node cluster. Running several of + // them concurrently can turn liveness checks into scheduler-load flakes. + FULL_RUN_TEST_LOCK.lock().await +} + async fn replay_sniffed_instance_decides(instance: pbconsensus::SniffedConsensusInstance) { assert!(!instance.msgs.is_empty()); @@ -483,7 +501,9 @@ fn hash32(value: &[u8]) -> Option<[u8; 32]> { } async fn recv_one(rx: &mut mpsc::UnboundedReceiver) -> T { - tokio::time::timeout(Duration::from_secs(1), rx.recv()) + // Consensus liveness is tested by receiving a decision, not by a tight + // wall-clock bound. Keep a guard for hangs while allowing scheduler load. 
+ tokio::time::timeout(CONSENSUS_RECV_TIMEOUT, rx.recv()) .await .expect("receiver timed out") .expect("receiver closed")