From 4c980c179a3df13abcec2c6da7c7f56122b63431 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:04:11 +0100 Subject: [PATCH 01/34] docs: add repository agent guide --- AGENTS.md | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..c0e0271b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,85 @@ +# RustiFlow Agent Guide + +This repository is a Rust workspace for a network flow extractor. The main crates are: + +- `rustiflow`: user-space CLI, pcap reader, realtime capture, flow extraction, CSV/TUI output +- `common`: shared packet/event structs used by user space and eBPF programs +- `xtask`: helper commands for building and running the project +- `ebpf-ipv4` / `ebpf-ipv6`: Linux eBPF programs used for realtime capture + +## Remote Machine Guardrails + +- Remote Linux machines reachable over SSH may be used only for this RustiFlow project. +- On those machines, run only RustiFlow-related commands, builds, checks, and tests. +- Do not use those machines for unrelated exploration, installs, experiments, or general development tasks. +- If remote work needs a dedicated workspace or directory, ask the user to create/provide it first. +- If any software or dependency needs to be installed on those machines, ask the user to do it. +- If there is any uncertainty about whether a command is appropriate to run on those machines, ask the user before running it. + +## Working Principles + +- Prefer small, targeted changes over broad rewrites. +- Keep flow logic modular. Shared measurement logic belongs in `rustiflow/src/flows/features/`; exporter-specific schema logic belongs in the relevant flow type. +- Preserve output compatibility unless a schema change is intentional and documented. 
+- When changing CLI behavior, config structure, or CSV headers, update the README and any related examples. +- When using `format!`, inline variables into `{}` when possible. +- Prefer exhaustive `match` statements when practical; avoid wildcard arms that hide protocol or feature cases. +- Avoid bool-heavy APIs that create unclear call sites. Prefer enums or named methods when that improves clarity. +- Prefer comparing whole values in tests instead of asserting many individual fields when feasible. +- Do not add one-off helper functions that are only used once unless they make a complex block substantially clearer. + +## Rust Style + +- Follow `rustfmt` and Clippy guidance. +- Collapse nested `if` statements when it improves readability. +- Inline `format!` arguments when possible. +- Use method references instead of trivial closures when that is clearer. +- Keep modules from growing unnecessarily large. Prefer extracting a focused submodule instead of adding more unrelated logic to an already large file. + +## RustiFlow-Specific Guidance + +- Treat the offline pcap path and the realtime eBPF path as two distinct ingestion modes that should stay semantically aligned. +- Be careful with timing-related features. Realtime and offline timestamp sources differ, so changes to timing, IAT, active/idle, or expiration logic should be validated deliberately. +- Be careful with packet length semantics. Realtime and offline paths may observe slightly different length fields. +- `BasicFlow` owns flow lifecycle and termination behavior. Do not duplicate expiration or TCP teardown logic in higher-level flow types unless there is a strong reason. +- If you add a new feature family, first decide whether it belongs in: + - a reusable `FlowFeature` implementation, or + - one exporter only +- If you change contamination-free exports, keep in mind that these outputs intentionally avoid raw identifiers such as exact ports/IPs. 
+ +## Platform Notes + +- Realtime eBPF support is Linux-specific. +- macOS may be usable for some read-only work, formatting, and limited code inspection, but Linux is the source of truth for full build and runtime validation. +- Do not assume that successful macOS builds imply realtime correctness. +- When touching `aya`/eBPF/realtime code, prefer validating on Linux or in a Linux container/VM. + +## Commands + +Use the smallest command that gives confidence: + +- Format: + - `cargo fmt` +- Check the main crate: + - `cargo check -p rustiflow` +- Run Rust tests for the main crate: + - `cargo test -p rustiflow` +- Build eBPF programs: + - `cargo xtask ebpf-ipv4` + - `cargo xtask ebpf-ipv6` +- Run in dev mode: + - `cargo xtask run -- [OPTIONS] <COMMAND>` + +If a change touches shared code used by multiple crates, prefer checking the workspace as needed. + +## Validation Expectations + +- After Rust code changes, run `cargo fmt`. +- Run the narrowest relevant check/test command for the code you changed. +- If you change dependencies, run at least `cargo check` again after the dependency update. +- If you change CSV headers, config behavior, or user-facing commands, verify the corresponding documentation and examples. + +## Notes On Existing Tests + +- Treat the current test suite carefully: some tests may be stale or incomplete relative to the active code. +- When adding or repairing tests, prefer tests that reflect the current flow architecture and public behavior rather than resurrecting outdated internal field expectations. 
From 47960a8b9bdf492e29ccfaed05585e6e7d7dd944 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:04:14 +0100 Subject: [PATCH 02/34] build: support non-linux userspace development --- rustiflow/Cargo.toml | 8 +++-- rustiflow/src/main.rs | 6 ++++ rustiflow/src/packet_features.rs | 7 ++++- rustiflow/src/realtime.rs | 51 ++++++++++++++++++++------------ rustiflow/src/realtime_stub.rs | 22 ++++++++++++++ 5 files changed, 71 insertions(+), 23 deletions(-) create mode 100644 rustiflow/src/realtime_stub.rs diff --git a/rustiflow/Cargo.toml b/rustiflow/Cargo.toml index 12238023..388f38b7 100644 --- a/rustiflow/Cargo.toml +++ b/rustiflow/Cargo.toml @@ -10,9 +10,7 @@ publish = false clap = { version = "4.5.0", features = ["derive"] } csv = "1.3.0" serde = { version = "1.0.196", features = ["derive"] } -aya = { version = "0.13.0", features = ["async_tokio"] } -aya-log = "0.2.1" -common = { path = "../common", features = ["user"] } +common = { path = "../common" } anyhow = "1" log = "0.4" tokio = { version = "1.25", features = [ @@ -39,6 +37,10 @@ tui = "0.19" strum = "0.26.3" strum_macros = "0.26.4" +[target.'cfg(target_os = "linux")'.dependencies] +aya = { version = "0.13.0", features = ["async_tokio"] } +aya-log = "0.2.1" + [[bin]] name = "rustiflow" path = "src/main.rs" diff --git a/rustiflow/src/main.rs b/rustiflow/src/main.rs index 99de0203..c785082c 100644 --- a/rustiflow/src/main.rs +++ b/rustiflow/src/main.rs @@ -1,11 +1,17 @@ mod args; mod flow_table; +#[cfg(target_os = "linux")] mod flow_tui; mod flows; mod output; +#[cfg(target_os = "linux")] mod packet_counts; mod packet_features; mod pcap; +#[cfg(target_os = "linux")] +mod realtime; +#[cfg(not(target_os = "linux"))] +#[path = "realtime_stub.rs"] mod realtime; mod tests; mod tui; diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index 5944ea8d..0277af72 100644 --- a/rustiflow/src/packet_features.rs +++ 
b/rustiflow/src/packet_features.rs @@ -1,6 +1,9 @@ -use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; +#[cfg(target_os = "linux")] +use std::net::Ipv6Addr; +use std::net::{IpAddr, Ipv4Addr}; use chrono::Utc; +#[cfg(target_os = "linux")] use common::{EbpfEventIpv4, EbpfEventIpv6}; use log::debug; use pnet::packet::{ @@ -80,6 +83,7 @@ pub struct PacketFeatures { } impl PacketFeatures { + #[cfg(target_os = "linux")] // Constructor to create PacketFeatures from EbpfEventIpv4 pub fn from_ebpf_event_ipv4(event: &EbpfEventIpv4) -> Self { PacketFeatures { @@ -117,6 +121,7 @@ impl PacketFeatures { } } + #[cfg(target_os = "linux")] // Constructor to create PacketFeatures from EbpfEventIpv6 pub fn from_ebpf_event_ipv6(event: &EbpfEventIpv6) -> Self { PacketFeatures { diff --git a/rustiflow/src/realtime.rs b/rustiflow/src/realtime.rs index 6b1bdd8a..7546e6bb 100644 --- a/rustiflow/src/realtime.rs +++ b/rustiflow/src/realtime.rs @@ -1,12 +1,13 @@ use std::hash::{DefaultHasher, Hash, Hasher}; +use std::path::PathBuf; use std::sync::Arc; use crate::debug; use crate::flow_tui::launch_packet_tui; use crate::packet_counts::PacketCountPerSecond; use crate::{flow_table::FlowTable, flows::flow::Flow, packet_features::PacketFeatures}; +use anyhow::Context; use aya::{ - include_bytes_aligned, maps::{PerCpuArray, RingBuf}, programs::{tc, SchedClassifier, TcAttachType}, Ebpf, @@ -272,16 +273,30 @@ fn bump_memlock_rlimit() { } } +fn ebpf_binary_path(program_name: &str) -> PathBuf { + let target_dir = std::env::var_os("CARGO_TARGET_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../target")); + let profile = if cfg!(debug_assertions) { + "debug" + } else { + "release" + }; + + target_dir + .join("bpfel-unknown-none") + .join(profile) + .join(program_name) +} + fn load_ebpf_ipv4(interface: &str, tc_attach_type: TcAttachType) -> Result { - // Loading the eBPF program, the macros make sure the correct file is loaded - #[cfg(debug_assertions)] - let mut 
bpf_ipv4 = Ebpf::load(include_bytes_aligned!( - "../../target/bpfel-unknown-none/debug/rustiflow-ebpf-ipv4" - ))?; - #[cfg(not(debug_assertions))] - let mut bpf_ipv4 = Ebpf::load(include_bytes_aligned!( - "../../target/bpfel-unknown-none/release/rustiflow-ebpf-ipv4" - ))?; + let binary_path = ebpf_binary_path("rustiflow-ebpf-ipv4"); + let mut bpf_ipv4 = Ebpf::load_file(&binary_path).with_context(|| { + format!( + "Failed to load eBPF IPv4 binary from {}. Build it first with `cargo xtask ebpf-ipv4`.", + binary_path.display() + ) + })?; // Attach the eBPF program function let _ = EbpfLogger::init(&mut bpf_ipv4); @@ -304,15 +319,13 @@ fn load_ebpf_ipv4(interface: &str, tc_attach_type: TcAttachType) -> Result Result { - // Loading the eBPF program, the macros make sure the correct file is loaded - #[cfg(debug_assertions)] - let mut bpf_ipv6 = Ebpf::load(include_bytes_aligned!( - "../../target/bpfel-unknown-none/debug/rustiflow-ebpf-ipv6" - ))?; - #[cfg(not(debug_assertions))] - let mut bpf_ipv6 = Ebpf::load(include_bytes_aligned!( - "../../target/bpfel-unknown-none/release/rustiflow-ebpf-ipv6" - ))?; + let binary_path = ebpf_binary_path("rustiflow-ebpf-ipv6"); + let mut bpf_ipv6 = Ebpf::load_file(&binary_path).with_context(|| { + format!( + "Failed to load eBPF IPv6 binary from {}. Build it first with `cargo xtask ebpf-ipv6`.", + binary_path.display() + ) + })?; // Attach the eBPF program function let _ = EbpfLogger::init(&mut bpf_ipv6); diff --git a/rustiflow/src/realtime_stub.rs b/rustiflow/src/realtime_stub.rs new file mode 100644 index 00000000..d3f658c9 --- /dev/null +++ b/rustiflow/src/realtime_stub.rs @@ -0,0 +1,22 @@ +use crate::flows::flow::Flow; +use tokio::sync::mpsc::Sender; + +/// Realtime capture depends on Aya/eBPF and is only available on Linux. 
+pub async fn handle_realtime( + interface: &str, + _output_channel: Sender, + _num_threads: u8, + _active_timeout: u64, + _idle_timeout: u64, + _early_export: Option, + _expiration_check_interval: u64, + _ingress_only: bool, + _performance_mode_disabled: bool, +) -> Result +where + T: Flow, +{ + Err(anyhow::anyhow!( + "Realtime capture on interface {interface:?} is only supported on Linux" + )) +} From b3eed7ac4c31fd1dd626c5cf23f0949bb4fdd9b7 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:04:17 +0100 Subject: [PATCH 03/34] test: restore focused flow coverage --- rustiflow/src/flows/basic_flow.rs | 4 +- rustiflow/src/tests/flows/basic_flow_test.rs | 322 ++++-------------- rustiflow/src/tests/flows/flow_table_test.rs | 44 +++ rustiflow/src/tests/flows/mod.rs | 8 +- .../src/tests/flows/packet_features_test.rs | 34 ++ 5 files changed, 140 insertions(+), 272 deletions(-) create mode 100644 rustiflow/src/tests/flows/flow_table_test.rs create mode 100644 rustiflow/src/tests/flows/packet_features_test.rs diff --git a/rustiflow/src/flows/basic_flow.rs b/rustiflow/src/flows/basic_flow.rs index 1e6d1774..f207e6e6 100644 --- a/rustiflow/src/flows/basic_flow.rs +++ b/rustiflow/src/flows/basic_flow.rs @@ -159,8 +159,8 @@ impl Flow for BasicFlow { false } - fn close_flow(&mut self, _timestamp_us: i64, _cause: FlowExpireCause) -> () { - // No active state to close + fn close_flow(&mut self, _timestamp_us: i64, cause: FlowExpireCause) -> () { + self.flow_expire_cause = cause; } fn dump(&self) -> String { diff --git a/rustiflow/src/tests/flows/basic_flow_test.rs b/rustiflow/src/tests/flows/basic_flow_test.rs index 43fa002f..7fbb4161 100644 --- a/rustiflow/src/tests/flows/basic_flow_test.rs +++ b/rustiflow/src/tests/flows/basic_flow_test.rs @@ -1,290 +1,82 @@ #[cfg(test)] mod tests { - use chrono::{Duration, Utc}; use std::net::{IpAddr, Ipv4Addr}; use crate::{ - flows::{ - basic_flow::{BasicFlow, FlowState}, - 
flow::Flow, - }, + flows::{basic_flow::BasicFlow, flow::Flow, util::FlowExpireCause}, packet_features::PacketFeatures, }; - #[test] - fn test_basic_flow_creation() { - let ip_src = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ip_dst = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let flow = BasicFlow::new( - "flow1".to_string(), - ip_src, - 8080, - ip_dst, - 80, - 6, // TCP protocol - Utc::now(), - ); - - assert_eq!(flow.flow_key, "flow1"); - assert_eq!(flow.ip_source, ip_src); - assert_eq!(flow.port_source, 8080); - assert_eq!(flow.ip_destination, ip_dst); - assert_eq!(flow.port_destination, 80); - assert_eq!(flow.protocol, 6); - assert_eq!(flow.fwd_packet_count, 0); - assert_eq!(flow.bwd_packet_count, 0); - } - - #[test] - fn test_basic_flow_update_forward() { - let ip_src = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ip_dst = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let mut flow = BasicFlow::new( - "flow1".to_string(), - ip_src, - 8080, - ip_dst, - 80, - 6, // TCP protocol - Utc::now(), - ); - - let packet = PacketFeatures { - source_ip: ip_src, - destination_ip: ip_dst, - source_port: 8080, - destination_port: 80, - protocol: 6, - timestamp: Utc::now(), - fin_flag: 0, - syn_flag: 1, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 1, - sequence_number_ack: 0, - }; - - let flow_ended = flow.update_flow(&packet, true); - assert!(!flow_ended); - assert_eq!(flow.fwd_packet_count, 1); - assert_eq!(flow.fwd_syn_flag_count, 1); - } - - #[test] - fn test_basic_flow_update_backward() { - let ip_src = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ip_dst = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let mut flow = BasicFlow::new( - "flow1".to_string(), - ip_src, - 8080, - ip_dst, - 80, - 6, // TCP protocol - Utc::now(), - ); - - let packet = PacketFeatures { - source_ip: ip_dst, - destination_ip: ip_src, - source_port: 80, - 
destination_port: 8080, + fn build_packet( + source_ip: IpAddr, + source_port: u16, + destination_ip: IpAddr, + destination_port: u16, + timestamp_us: i64, + ) -> PacketFeatures { + PacketFeatures { + source_ip, + destination_ip, + source_port, + destination_port, protocol: 6, - timestamp: Utc::now(), - fin_flag: 0, - syn_flag: 1, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 1, - sequence_number_ack: 0, - }; - - let flow_ended = flow.update_flow(&packet, false); - assert!(!flow_ended); - assert_eq!(flow.bwd_packet_count, 1); - assert_eq!(flow.bwd_syn_flag_count, 1); + timestamp_us, + ..Default::default() + } } #[test] - fn test_tcp_flow_termination() { - let ip_src = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ip_dst = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); + fn close_flow_records_expiration_cause() { + let ip_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 10)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 20)); let mut flow = BasicFlow::new( - "flow1".to_string(), - ip_src, - 8080, - ip_dst, - 80, - 6, // TCP protocol - Utc::now(), + "flow-1".to_string(), + ip_source, + 4242, + ip_destination, + 443, + 6, + 1_000_000, ); - // Forward FIN - let packet_fin_fwd = PacketFeatures { - source_ip: ip_src, - destination_ip: ip_dst, - source_port: 8080, - destination_port: 80, - protocol: 6, - timestamp: Utc::now(), - fin_flag: 1, - syn_flag: 0, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 100, - sequence_number_ack: 0, - }; - flow.update_flow(&packet_fin_fwd, true); - - // Backward ACK for FIN - let packet_ack_bwd = PacketFeatures { - source_ip: ip_dst, - destination_ip: ip_src, - source_port: 80, - destination_port: 8080, - protocol: 6, - timestamp: Utc::now(), - fin_flag: 0, 
- syn_flag: 0, - rst_flag: 0, - psh_flag: 0, - ack_flag: 1, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 200, - sequence_number_ack: 101, - }; - flow.update_flow(&packet_ack_bwd, false); - - // Backward FIN - let packet_fin_bwd = PacketFeatures { - source_ip: ip_dst, - destination_ip: ip_src, - source_port: 80, - destination_port: 8080, - protocol: 6, - timestamp: Utc::now(), - fin_flag: 1, - syn_flag: 0, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 300, - sequence_number_ack: 0, - }; - flow.update_flow(&packet_fin_bwd, false); - - // Forward ACK for FIN - let packet_ack_fwd = PacketFeatures { - source_ip: ip_src, - destination_ip: ip_dst, - source_port: 8080, - destination_port: 80, - protocol: 6, - timestamp: Utc::now(), - fin_flag: 0, - syn_flag: 0, - rst_flag: 0, - psh_flag: 0, - ack_flag: 1, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 400, - sequence_number_ack: 301, - }; - let flow_ended = flow.update_flow(&packet_ack_fwd, true); + flow.close_flow(2_000_000, FlowExpireCause::IdleTimeout); - assert!(flow_ended); - assert_eq!(flow.state_fwd, FlowState::FinAcked); - assert_eq!(flow.state_bwd, FlowState::FinAcked); + assert_eq!(flow.flow_expire_cause, FlowExpireCause::IdleTimeout); } #[test] - fn test_flow_expiry() { - let ip_src = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ip_dst = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let first_timestamp = Utc::now(); + fn tcp_fin_handshake_terminates_flow() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)); let mut flow = BasicFlow::new( - "flow1".to_string(), - ip_src, - 8080, - ip_dst, + "flow-2".to_string(), + ip_source, 
+ 50000, + ip_destination, 80, - 6, // TCP protocol - first_timestamp, + 6, + 1_000_000, ); - // Active timeout should expire - let timestamp = first_timestamp + Duration::seconds(61); - assert!(flow.is_expired(timestamp, 60, 30)); - - // Idle timeout should expire - let packet = PacketFeatures { - source_ip: ip_src, - destination_ip: ip_dst, - source_port: 8080, - destination_port: 80, - protocol: 6, - timestamp: first_timestamp, - fin_flag: 0, - syn_flag: 1, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 0, - header_length: 20, - length: 40, - window_size: 1024, - sequence_number: 1, - sequence_number_ack: 0, - }; - flow.update_flow(&packet, true); - - let timestamp = first_timestamp + Duration::seconds(31); - assert!(flow.is_expired(timestamp, 60, 30)); + let mut fin_fwd = build_packet(ip_source, 50000, ip_destination, 80, 1_000_100); + fin_fwd.fin_flag = 1; + fin_fwd.sequence_number = 100; + assert!(!flow.update_flow(&fin_fwd, true)); + + let mut ack_bwd = build_packet(ip_destination, 80, ip_source, 50000, 1_000_200); + ack_bwd.ack_flag = 1; + ack_bwd.sequence_number_ack = 101; + assert!(!flow.update_flow(&ack_bwd, false)); + + let mut fin_bwd = build_packet(ip_destination, 80, ip_source, 50000, 1_000_300); + fin_bwd.fin_flag = 1; + fin_bwd.sequence_number = 200; + assert!(!flow.update_flow(&fin_bwd, false)); + + let mut ack_fwd = build_packet(ip_source, 50000, ip_destination, 80, 1_000_400); + ack_fwd.ack_flag = 1; + ack_fwd.sequence_number_ack = 201; + assert!(flow.update_flow(&ack_fwd, true)); + assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); } } diff --git a/rustiflow/src/tests/flows/flow_table_test.rs b/rustiflow/src/tests/flows/flow_table_test.rs new file mode 100644 index 00000000..99cb2f16 --- /dev/null +++ b/rustiflow/src/tests/flows/flow_table_test.rs @@ -0,0 +1,44 @@ +#[cfg(test)] +mod tests { + use std::net::{IpAddr, Ipv4Addr}; + + use tokio::sync::mpsc; + + use crate::{ 
+ flow_table::FlowTable, + flows::{basic_flow::BasicFlow, util::FlowExpireCause}, + packet_features::PacketFeatures, + }; + + fn build_packet(timestamp_us: i64) -> PacketFeatures { + PacketFeatures { + source_ip: IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)), + destination_ip: IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)), + source_port: 12345, + destination_port: 443, + protocol: 6, + timestamp_us, + ..Default::default() + } + } + + #[tokio::test] + async fn exports_idle_timed_out_flow_with_idle_timeout_cause() { + let (tx, mut rx) = mpsc::channel::(4); + let mut flow_table = FlowTable::new(3600, 1, None, tx, 60); + + flow_table.process_packet(&build_packet(1_000_000)).await; + flow_table.export_expired_flows(3_000_000).await; + + let exported_flow = rx.recv().await.expect("expected an exported flow"); + + assert_eq!( + exported_flow.flow_expire_cause, + FlowExpireCause::IdleTimeout + ); + assert_eq!( + exported_flow.flow_key, + "192.168.1.1:12345-192.168.1.2:443-6".to_string() + ); + } +} diff --git a/rustiflow/src/tests/flows/mod.rs b/rustiflow/src/tests/flows/mod.rs index 4e25a00f..fd662600 100644 --- a/rustiflow/src/tests/flows/mod.rs +++ b/rustiflow/src/tests/flows/mod.rs @@ -1,5 +1,3 @@ -// mod basic_flow_test; -// mod cic_flow_test; -// mod cidds_flow_test; -// mod nf_flow_test; -// mod rusti_flow_test; +mod basic_flow_test; +mod flow_table_test; +mod packet_features_test; diff --git a/rustiflow/src/tests/flows/packet_features_test.rs b/rustiflow/src/tests/flows/packet_features_test.rs new file mode 100644 index 00000000..e6b73011 --- /dev/null +++ b/rustiflow/src/tests/flows/packet_features_test.rs @@ -0,0 +1,34 @@ +#[cfg(test)] +mod tests { + use std::net::{IpAddr, Ipv4Addr}; + + use crate::packet_features::PacketFeatures; + + fn build_packet( + source_ip: IpAddr, + source_port: u16, + destination_ip: IpAddr, + destination_port: u16, + ) -> PacketFeatures { + PacketFeatures { + source_ip, + destination_ip, + source_port, + destination_port, + protocol: 6, + 
timestamp_us: 1_000_000, + ..Default::default() + } + } + + #[test] + fn biflow_key_is_direction_invariant() { + let client_ip = IpAddr::V4(Ipv4Addr::new(192, 168, 0, 10)); + let server_ip = IpAddr::V4(Ipv4Addr::new(192, 168, 0, 20)); + + let forward = build_packet(client_ip, 55000, server_ip, 443); + let backward = build_packet(server_ip, 443, client_ip, 55000); + + assert_eq!(forward.biflow_key(), backward.biflow_key()); + } +} From c161f86a99df52c531feb8abad1d375f355c6119 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:04:22 +0100 Subject: [PATCH 04/34] chore: clean workspace and ebpf build warnings --- Cargo.toml | 3 ++- ebpf-ipv4/Cargo.toml | 4 ++-- ebpf-ipv6/Cargo.toml | 4 ++-- rustiflow/src/flow_tui.rs | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d40cb3e3..6760849c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,3 @@ [workspace] -members = ["rustiflow", "common", "xtask"] \ No newline at end of file +members = ["rustiflow", "common", "xtask"] +resolver = "2" diff --git a/ebpf-ipv4/Cargo.toml b/ebpf-ipv4/Cargo.toml index 36ffd898..608d3021 100644 --- a/ebpf-ipv4/Cargo.toml +++ b/ebpf-ipv4/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] aya-ebpf = "0.1.1" -aya-log-ebpf = "0.1.1" +aya-log-ebpf = "0.1.0" common = { path = "../common" } network-types = "0.0.7" @@ -30,4 +30,4 @@ panic = "abort" codegen-units = 1 [workspace] -members = [] \ No newline at end of file +members = [] diff --git a/ebpf-ipv6/Cargo.toml b/ebpf-ipv6/Cargo.toml index 2784227f..db5764b7 100644 --- a/ebpf-ipv6/Cargo.toml +++ b/ebpf-ipv6/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] aya-ebpf = "0.1.1" -aya-log-ebpf = "0.1.1" +aya-log-ebpf = "0.1.0" common = { path = "../common" } network-types = "0.0.7" @@ -30,4 +30,4 @@ panic = "abort" codegen-units = 1 [workspace] -members = [] \ No newline at end of file +members = [] diff --git 
a/rustiflow/src/flow_tui.rs b/rustiflow/src/flow_tui.rs index 047537d9..e4b5ef68 100644 --- a/rustiflow/src/flow_tui.rs +++ b/rustiflow/src/flow_tui.rs @@ -41,7 +41,7 @@ impl App { fn get_bar_data(&self) -> Vec { self.packet_data .iter() - .map(|data| (data.count)) + .map(|data| data.count) .take(self.max_visible_intervals) .collect() } From 7d5c098e5e4b187a1230de4f6694f41cf73b93ef Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:33:54 +0100 Subject: [PATCH 05/34] docs: add concise protocol coverage guide (#88) --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 313e3200..fe8ea744 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,23 @@ This tool is engineered for robust and efficient feature extraction, particularl See the [wiki](https://github.com/idlab-discover/RustiFlow/wiki) for the different feature sets available. +## Supported Packet/Header Coverage + +RustiFlow currently extracts flows from the following protocol/header combinations: + +| Layer | Offline pcap | Realtime eBPF | +| --- | --- | --- | +| Link | Ethernet, Linux cooked capture, 802.1Q VLAN | Ethernet | +| Network | IPv4, IPv6 | IPv4, IPv6 | +| IPv6 extras | Extension headers supported before transport parsing | Extension headers supported before transport parsing | +| Transport | TCP, UDP, ICMP, ICMPv6 | TCP, UDP, ICMP, ICMPv6 | + +Notes: + +- Realtime support is Linux-only. +- Offline and realtime aim to expose the same flow semantics, but timestamp and packet-length sources can differ slightly. +- Realtime VLAN parsing is not implemented yet. 
+ ## Architecture ### Realtime processing From ec87d279e7ee0e4a155cf35682e944fc0a6a178a Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:34:03 +0100 Subject: [PATCH 06/34] test: restore current exporter coverage (#78) --- rustiflow/src/flows/cidds_flow.rs | 2 +- rustiflow/src/tests/flows/cidds_flow_test.rs | 134 +++-------- rustiflow/src/tests/flows/mod.rs | 3 + rustiflow/src/tests/flows/nf_flow_test.rs | 241 ++----------------- rustiflow/src/tests/flows/rusti_flow_test.rs | 180 ++------------ 5 files changed, 85 insertions(+), 475 deletions(-) diff --git a/rustiflow/src/flows/cidds_flow.rs b/rustiflow/src/flows/cidds_flow.rs index fead6934..96f93aeb 100644 --- a/rustiflow/src/flows/cidds_flow.rs +++ b/rustiflow/src/flows/cidds_flow.rs @@ -111,9 +111,9 @@ impl Flow for CiddsFlow { fn dump_without_contamination(&self) -> String { format!( "{},{},{},{},{},{},{}", - self.format_protocol(self.basic_flow.protocol), iana_port_mapping(self.basic_flow.port_source), iana_port_mapping(self.basic_flow.port_destination), + self.format_protocol(self.basic_flow.protocol), self.basic_flow.get_flow_duration_msec(), self.packet_stats.flow_total(), self.packet_stats.flow_count(), diff --git a/rustiflow/src/tests/flows/cidds_flow_test.rs b/rustiflow/src/tests/flows/cidds_flow_test.rs index cbccf1ef..b3fec1b2 100644 --- a/rustiflow/src/tests/flows/cidds_flow_test.rs +++ b/rustiflow/src/tests/flows/cidds_flow_test.rs @@ -1,120 +1,62 @@ #[cfg(test)] mod tests { - use crate::flows::{cidds_flow::CiddsFlow, flow::Flow}; - use crate::packet_features::PacketFeatures; - use chrono::Utc; use std::net::{IpAddr, Ipv4Addr}; - fn setup_ciddsflow() -> CiddsFlow { + use crate::{ + flows::{cidds_flow::CiddsFlow, flow::Flow}, + packet_features::{PacketFeatures, SYN_FLAG}, + }; + + fn setup_cidds_flow() -> CiddsFlow { CiddsFlow::new( - "test_flow_id".to_string(), - IpAddr::V4(Ipv4Addr::from(1)), - 80, - IpAddr::V4(Ipv4Addr::from(2)), - 
8080, + "cidds-flow".to_string(), + IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + 443, + IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)), + 51515, 6, - chrono::Utc::now(), + 1_000_000, ) } - #[test] - fn test_get_flags_string() { - let mut flow = setup_ciddsflow(); - assert_eq!(flow.get_flags_string(), "......"); - - flow.basic_flow.fwd_urg_flag_count = 1; - assert_eq!(flow.get_flags_string(), "U....."); - - flow.basic_flow.fwd_fin_flag_count = 1; - assert_eq!(flow.get_flags_string(), "U....F"); - - flow.basic_flow.fwd_ack_flag_count = 1; - assert_eq!(flow.get_flags_string(), "UA...F"); - - flow.basic_flow.fwd_psh_flag_count = 1; - assert_eq!(flow.get_flags_string(), "UAP..F"); - - flow.basic_flow.fwd_rst_flag_count = 1; - assert_eq!(flow.get_flags_string(), "UAPR.F"); - - flow.basic_flow.fwd_syn_flag_count = 1; - assert_eq!(flow.get_flags_string(), "UAPRSF"); + fn count_csv_fields(row: &str) -> usize { + row.split(',').count() } #[test] - fn test_new_flow() { - let flow = setup_ciddsflow(); - assert_eq!(flow.basic_flow.ip_source, IpAddr::V4(Ipv4Addr::from(1))); - assert_eq!(flow.basic_flow.port_source, 80); + fn dump_matches_feature_headers() { + let flow = setup_cidds_flow(); + + assert_eq!( + count_csv_fields(&flow.dump()), + count_csv_fields(&CiddsFlow::get_features()) + ); assert_eq!( - flow.basic_flow.ip_destination, - IpAddr::V4(Ipv4Addr::from(2)) + count_csv_fields(&flow.dump_without_contamination()), + count_csv_fields(&CiddsFlow::get_features_without_contamination()) ); - assert_eq!(flow.basic_flow.port_destination, 8080); - assert_eq!(flow.basic_flow.protocol, 6); - assert_eq!(flow.bytes, 0); } #[test] - fn test_update_flow() { - let mut flow = setup_ciddsflow(); + fn update_flow_tracks_bytes_packets_and_flags() { + let mut flow = setup_cidds_flow(); let packet = PacketFeatures { - length: 100, + source_ip: flow.basic_flow.ip_source, + destination_ip: flow.basic_flow.ip_destination, + source_port: flow.basic_flow.port_source, + destination_port: 
flow.basic_flow.port_destination, + protocol: flow.basic_flow.protocol, + timestamp_us: 1_000_500, + length: 128, + syn_flag: 1, + flags: SYN_FLAG, ..Default::default() }; - let end = flow.update_flow(&packet, true); - assert!(!end); - assert_eq!(flow.bytes, 100); - assert_eq!(flow.basic_flow.fwd_packet_count, 1); - } - - #[test] - fn test_dump() { - let flow = setup_ciddsflow(); - let dumped = flow.dump(); - assert!(dumped.contains(&flow.basic_flow.ip_source.to_string())); - assert!(dumped.contains(&flow.basic_flow.ip_destination.to_string())); - assert!(dumped.contains("TCP")); - } - #[test] - fn test_get_features() { - let features = CiddsFlow::get_features(); - assert_eq!(features, "FIRST_TIMESTAMP,LAST_TIMESTAMP,PROTOCOL,SOURCE_IP,SOURCE_PORT,DESTINATION_IP,DESTINATION_PORT,PACKET_COUNT,BYTES,FLAGS"); - } - - #[test] - fn test_dump_without_contamination() { - let flow = setup_ciddsflow(); - let dumped = flow.dump_without_contamination(); - assert!(dumped.contains("TCP")); - assert!(dumped.contains("0")); // packet count and bytes are 0 initially - } - - #[test] - fn test_get_features_without_contamination() { - let features = CiddsFlow::get_features_without_contamination(); - assert_eq!(features, "DURATION,PROTOCOL,PACKET_COUNT,BYTES,FLAGS"); - } + assert!(!flow.update_flow(&packet, true)); - #[test] - fn test_get_first_timestamp() { - let flow = setup_ciddsflow(); - let first_timestamp = flow.get_first_timestamp(); - assert_eq!(first_timestamp, flow.basic_flow.first_timestamp); - } - - #[test] - fn test_is_expired() { - let flow = setup_ciddsflow(); - let now = Utc::now(); - let expired = flow.is_expired(now, 10000, 5000); - assert!(!expired); - } - - #[test] - fn test_flow_key() { - let flow = setup_ciddsflow(); - assert_eq!(flow.flow_key(), &flow.basic_flow.flow_key); + assert_eq!(flow.packet_stats.flow_total(), 128.0); + assert_eq!(flow.packet_stats.flow_count(), 1); + assert_eq!(flow.tcp_flag_stats.get_flags(), "....S."); } } diff --git 
a/rustiflow/src/tests/flows/mod.rs b/rustiflow/src/tests/flows/mod.rs index fd662600..a8afa429 100644 --- a/rustiflow/src/tests/flows/mod.rs +++ b/rustiflow/src/tests/flows/mod.rs @@ -1,3 +1,6 @@ mod basic_flow_test; +mod cidds_flow_test; mod flow_table_test; +mod nf_flow_test; mod packet_features_test; +mod rusti_flow_test; diff --git a/rustiflow/src/tests/flows/nf_flow_test.rs b/rustiflow/src/tests/flows/nf_flow_test.rs index f64a1a49..9e598572 100644 --- a/rustiflow/src/tests/flows/nf_flow_test.rs +++ b/rustiflow/src/tests/flows/nf_flow_test.rs @@ -1,239 +1,50 @@ #[cfg(test)] mod tests { - use crate::{ - flows::{flow::Flow, nf_flow::NfFlow}, - packet_features::PacketFeatures, - }; - - use chrono::{DateTime, Utc}; use std::net::{IpAddr, Ipv4Addr}; - fn create_packet_features( - ip_source: IpAddr, - ip_destination: IpAddr, - timestamp: DateTime, - fwd: bool, - ) -> PacketFeatures { - PacketFeatures { - source_ip: ip_source, - destination_ip: ip_destination, - source_port: if fwd { 12345 } else { 80 }, - destination_port: if fwd { 80 } else { 12345 }, - protocol: 6, // TCP - timestamp, - fin_flag: 0, - syn_flag: 1, - rst_flag: 0, - psh_flag: 0, - ack_flag: 0, - urg_flag: 0, - cwr_flag: 0, - ece_flag: 0, - data_length: 100, - header_length: 20, - length: 120, - window_size: 1000, - sequence_number: 123456, - sequence_number_ack: 654321, - } - } - - #[test] - fn test_nf_flow_initialization() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); + use crate::flows::{flow::Flow, nf_flow::NfFlow, util::FlowExpireCause}; - let flow = NfFlow::new( - flow_id.clone(), - ipv4_source, + fn setup_nf_flow() -> NfFlow { + NfFlow::new( + "nf-flow".to_string(), + IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)), 12345, - ipv4_destination, + IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)), 80, 6, - timestamp, - ); - - 
assert_eq!(flow.cic_flow.basic_flow.flow_key, flow_id); - assert_eq!(flow.cic_flow.basic_flow.ip_source, ipv4_source); - assert_eq!(flow.cic_flow.basic_flow.ip_destination, ipv4_destination); - assert_eq!(flow.cic_flow.basic_flow.first_timestamp, timestamp); - assert_eq!(flow.cic_flow.basic_flow.last_timestamp, timestamp); - assert_eq!(flow.cic_flow.basic_flow.first_timestamp, timestamp); - assert_eq!(flow.fwd_last_timestamp, timestamp); - assert!(flow.bwd_first_timestamp.is_none()); - assert!(flow.bwd_last_timestamp.is_none()); + 1_000_000, + ) } - #[test] - fn test_nf_flow_update_forward() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); - - let mut flow = NfFlow::new( - flow_id, - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, - ); - - let packet = create_packet_features(ipv4_source, ipv4_destination, timestamp, true); - let is_terminated = flow.update_flow(&packet, true); - - assert_eq!(flow.fwd_last_timestamp, packet.timestamp); - assert!(!is_terminated); + fn count_csv_fields(row: &str) -> usize { + row.split(',').count() } #[test] - fn test_nf_flow_update_backward() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); + fn dump_matches_feature_headers() { + let flow = setup_nf_flow(); - let mut flow = NfFlow::new( - flow_id, - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, + assert_eq!( + count_csv_fields(&flow.dump()), + count_csv_fields(&NfFlow::get_features()) ); - - let packet = create_packet_features(ipv4_destination, ipv4_source, timestamp, false); - let is_terminated = flow.update_flow(&packet, false); - - assert_eq!(flow.bwd_first_timestamp, Some(packet.timestamp)); - assert_eq!(flow.bwd_last_timestamp, 
Some(packet.timestamp)); - assert!(!is_terminated); - } - - #[test] - fn test_get_bwd_duration() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); - - let mut flow = NfFlow::new( - flow_id, - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, + assert_eq!( + count_csv_fields(&flow.dump_without_contamination()), + count_csv_fields(&NfFlow::get_features_without_contamination()) ); - - let bwd_first = timestamp + chrono::Duration::milliseconds(100); - let bwd_last = timestamp + chrono::Duration::milliseconds(200); - flow.bwd_first_timestamp = Some(bwd_first); - flow.bwd_last_timestamp = Some(bwd_last); - - assert_eq!(flow.get_bwd_duration(), 100); } #[test] - fn test_get_first_bwd_timestamp() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); - - let mut flow = NfFlow::new( - flow_id, - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, - ); - - let bwd_first = timestamp + chrono::Duration::milliseconds(100); - flow.bwd_first_timestamp = Some(bwd_first); - - assert_eq!(flow.get_first_bwd_timestamp(), bwd_first.timestamp_millis()); - } - - #[test] - fn test_dump() { - let flow_id = "flow-1".to_string(); - let ipv4_source = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); - - let flow = NfFlow::new( - flow_id.clone(), - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, - ); - - let dump = flow.dump(); - assert!(dump.contains(&flow_id)); - assert!(dump.contains(&ipv4_source.to_string())); - assert!(dump.contains(&ipv4_destination.to_string())); - } - - #[test] - fn test_is_expired() { - let flow_id = "flow-1".to_string(); - let ipv4_source 
= IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)); - let ipv4_destination = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 2)); - let timestamp = Utc::now(); - - let mut flow = NfFlow::new( - flow_id, - ipv4_source, - 12345, - ipv4_destination, - 80, - 6, - timestamp, - ); - - // Active timeout: 60 seconds, Idle timeout: 30 seconds - let active_timeout = 60; - let idle_timeout = 30; - - // Case 1: Not expired, within active and idle timeout - let within_active_and_idle = timestamp + chrono::Duration::seconds(29); - assert!(!flow.is_expired(within_active_and_idle, active_timeout, idle_timeout)); - - // Case 2: Idle timeout exceeded - let idle_timeout_exceeded = timestamp + chrono::Duration::seconds(31); - assert!(flow.is_expired(idle_timeout_exceeded, active_timeout, idle_timeout)); + fn expiration_id_maps_from_close_cause() { + let mut flow = setup_nf_flow(); - // Case 3: Active timeout exceeded - let active_timeout_exceeded = timestamp + chrono::Duration::seconds(61); - assert!(flow.is_expired(active_timeout_exceeded, active_timeout, idle_timeout)); + flow.close_flow(2_000_000, FlowExpireCause::ActiveTimeout); + assert_eq!(flow.get_expiration_id(), 1); - // Case 4: Update the last timestamp to reset idle timeout - flow.cic_flow.basic_flow.last_timestamp = timestamp + chrono::Duration::seconds(40); - flow.cic_flow.basic_flow.first_timestamp = flow.cic_flow.basic_flow.last_timestamp; // Reset first timestamp - let after_update_within_idle = - flow.cic_flow.basic_flow.last_timestamp + chrono::Duration::seconds(29); - let exp = flow.is_expired(after_update_within_idle, active_timeout, idle_timeout); - assert!(!exp); + flow.close_flow(3_000_000, FlowExpireCause::IdleTimeout); + assert_eq!(flow.get_expiration_id(), 0); - // Case 5: Idle timeout exceeded after update - let after_update_idle_timeout_exceeded = - flow.cic_flow.basic_flow.last_timestamp + chrono::Duration::seconds(31); - assert!(flow.is_expired( - after_update_idle_timeout_exceeded, - active_timeout, - idle_timeout - )); 
+ flow.close_flow(4_000_000, FlowExpireCause::TcpReset); + assert_eq!(flow.get_expiration_id(), -1); } } diff --git a/rustiflow/src/tests/flows/rusti_flow_test.rs b/rustiflow/src/tests/flows/rusti_flow_test.rs index 3cca2bb4..362fd403 100644 --- a/rustiflow/src/tests/flows/rusti_flow_test.rs +++ b/rustiflow/src/tests/flows/rusti_flow_test.rs @@ -6,177 +6,31 @@ mod tests { fn setup_rusti_flow() -> RustiFlow { RustiFlow::new( - "".to_string(), - IpAddr::V4(Ipv4Addr::from(1)), - 80, - IpAddr::V4(Ipv4Addr::from(2)), - 8080, + "rusti-flow".to_string(), + IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1)), + 44444, + IpAddr::V4(Ipv4Addr::new(172, 16, 0, 2)), + 443, 6, - chrono::Utc::now(), + 1_000_000, ) } - #[test] - fn test_update_fwd_pkt_len_stats() { - let mut rusti_flow = setup_rusti_flow(); - - rusti_flow.cic_flow.basic_flow.fwd_packet_count = 1; - - rusti_flow.update_fwd_header_len_stats(100); - - assert_eq!(rusti_flow.fwd_header_len_max, 100); - assert_eq!(rusti_flow.fwd_header_len_min, 100); - assert_eq!(rusti_flow.fwd_header_len_mean, 100.0); - assert_eq!(rusti_flow.fwd_header_len_std, 0.0); - assert_eq!(rusti_flow.cic_flow.fwd_header_length, 100); - - rusti_flow.cic_flow.basic_flow.fwd_packet_count = 2; - - rusti_flow.update_fwd_header_len_stats(50); - - assert_eq!(rusti_flow.fwd_header_len_max, 100); - assert_eq!(rusti_flow.fwd_header_len_min, 50); - assert_eq!(rusti_flow.fwd_header_len_mean, 75.0); - assert_eq!(rusti_flow.fwd_header_len_std, 25.0); - assert_eq!(rusti_flow.cic_flow.fwd_header_length, 150); - - rusti_flow.cic_flow.basic_flow.fwd_packet_count = 3; - - rusti_flow.update_fwd_header_len_stats(0); - - assert_eq!(rusti_flow.fwd_header_len_max, 100); - assert_eq!(rusti_flow.fwd_header_len_min, 0); - assert_eq!(rusti_flow.fwd_header_len_mean, 50.0); - assert_eq!(rusti_flow.fwd_header_len_std, 40.824829046386306); - assert_eq!(rusti_flow.cic_flow.fwd_header_length, 150); - } - - #[test] - fn test_update_bwd_pkt_len_stats() { - let mut rusti_flow = 
setup_rusti_flow(); - - rusti_flow.cic_flow.basic_flow.bwd_packet_count = 1; - - rusti_flow.update_bwd_header_len_stats(100); - - assert_eq!(rusti_flow.bwd_header_len_max, 100); - assert_eq!(rusti_flow.bwd_header_len_min, 100); - assert_eq!(rusti_flow.bwd_header_len_mean, 100.0); - assert_eq!(rusti_flow.bwd_header_len_std, 0.0); - assert_eq!(rusti_flow.cic_flow.bwd_header_length, 100); - - rusti_flow.cic_flow.basic_flow.bwd_packet_count = 2; - - rusti_flow.update_bwd_header_len_stats(50); - - assert_eq!(rusti_flow.bwd_header_len_max, 100); - assert_eq!(rusti_flow.bwd_header_len_min, 50); - assert_eq!(rusti_flow.bwd_header_len_mean, 75.0); - assert_eq!(rusti_flow.bwd_header_len_std, 25.0); - assert_eq!(rusti_flow.cic_flow.bwd_header_length, 150); - - rusti_flow.cic_flow.basic_flow.bwd_packet_count = 3; - - rusti_flow.update_bwd_header_len_stats(0); - - assert_eq!(rusti_flow.bwd_header_len_max, 100); - assert_eq!(rusti_flow.bwd_header_len_min, 0); - assert_eq!(rusti_flow.bwd_header_len_mean, 50.0); - assert_eq!(rusti_flow.bwd_header_len_std, 40.824829046386306); - assert_eq!(rusti_flow.cic_flow.bwd_header_length, 150); - } - - #[test] - fn test_get_fwd_header_length_min() { - let mut cic_flow = setup_rusti_flow(); - - assert_eq!(cic_flow.get_fwd_header_length_min(), 0); - - cic_flow.fwd_header_len_min = 50; - - assert_eq!(cic_flow.get_fwd_header_length_min(), 50); - } - - #[test] - fn test_get_bwd_header_length_min() { - let mut cic_flow = setup_rusti_flow(); - - assert_eq!(cic_flow.get_bwd_header_length_min(), 0); - - cic_flow.bwd_header_len_min = 100; - - assert_eq!(cic_flow.get_bwd_header_length_min(), 100); - } - - #[test] - fn test_get_flow_packet_length_min() { - let mut cic_flow = setup_rusti_flow(); - - cic_flow.fwd_header_len_min = 100; - cic_flow.bwd_header_len_min = 50; - - assert_eq!(cic_flow.get_flow_header_length_min(), 50); + fn count_csv_fields(row: &str) -> usize { + row.split(',').count() } #[test] - fn test_get_flow_packet_length_max() { - let mut 
cic_flow = setup_rusti_flow(); - - cic_flow.fwd_header_len_max = 100; - cic_flow.bwd_header_len_max = 50; - - assert_eq!(cic_flow.get_flow_header_length_max(), 100); - } - - #[test] - fn test_get_flow_packet_length_mean() { - let mut cic_flow = setup_rusti_flow(); - - //let forward_iat = [10, 20, 30, 40, 50]; - //let backward_iat = [15, 25, 35]; - - cic_flow.fwd_header_len_mean = 30.0; - cic_flow.bwd_header_len_mean = 25.0; - - cic_flow.cic_flow.basic_flow.fwd_packet_count = 5; - cic_flow.cic_flow.basic_flow.bwd_packet_count = 3; + fn dump_matches_feature_headers() { + let flow = setup_rusti_flow(); - assert_eq!(cic_flow.get_flow_header_length_mean(), 28.125); - } - - #[test] - fn test_get_flow_packet_length_variance() { - let mut cic_flow = setup_rusti_flow(); - - //let forward_iat = [10, 20, 30, 40, 50]; - //let backward_iat = [15, 25, 35]; - - cic_flow.fwd_header_len_std = 14.142135623731; - cic_flow.bwd_header_len_std = 8.1649658092773; - - cic_flow.cic_flow.basic_flow.fwd_packet_count = 5; - cic_flow.cic_flow.basic_flow.bwd_packet_count = 3; - - assert_eq!(cic_flow.get_flow_header_length_variance() as u32, 155); // removing everything behind the comma because of arithmetic errors - } - - #[test] - fn test_get_flow_packet_length_std() { - let mut cic_flow = setup_rusti_flow(); - let epsilon = 1e-1; // floating-point arithmetic is not exact, here we have a lot of casting and the formula is also an approximation - - //let forward_iat = [10, 20, 30, 40, 50]; - //let backward_iat = [15, 25, 35]; - - cic_flow.fwd_header_len_std = 14.142135623731; - cic_flow.bwd_header_len_std = 8.1649658092773; - - cic_flow.cic_flow.basic_flow.fwd_packet_count = 5; - cic_flow.cic_flow.basic_flow.bwd_packet_count = 3; - - assert!( - (cic_flow.get_flow_header_length_std() - 12.484365222149).abs() < epsilon, - "get_flow_packet_length_std is not within the expected range" + assert_eq!( + count_csv_fields(&flow.dump()), + count_csv_fields(&RustiFlow::get_features()) + ); + assert_eq!( + 
count_csv_fields(&flow.dump_without_contamination()), + count_csv_fields(&RustiFlow::get_features_without_contamination()) ); } } From a292df4b295e81330d9c029a03115dbe82446636 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:34:10 +0100 Subject: [PATCH 07/34] fix: parse IPv6 extension headers in pcap path (#86) --- rustiflow/src/packet_features.rs | 57 ++++++++++++++++- .../src/tests/flows/packet_features_test.rs | 62 +++++++++++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index 0277af72..7ff68149 100644 --- a/rustiflow/src/packet_features.rs +++ b/rustiflow/src/packet_features.rs @@ -173,13 +173,14 @@ impl PacketFeatures { // Constructor to create PacketFeatures from an IPv6 packet pub fn from_ipv6_packet(packet: &Ipv6Packet, timestamp_us: i64) -> Option { + let (protocol, payload) = skip_ipv6_extension_headers(packet)?; extract_packet_features_transport( packet.get_source().into(), packet.get_destination().into(), - packet.get_next_header(), + protocol, timestamp_us, packet.packet().len() as u16, - packet.payload(), + payload, ) } @@ -241,6 +242,58 @@ fn get_tcp_flag(value: u8, flag: u8) -> u8 { ((value & flag) != 0) as u8 } +fn skip_ipv6_extension_headers<'a>( + packet: &'a Ipv6Packet<'a>, +) -> Option<(IpNextHeaderProtocol, &'a [u8])> { + const MAX_EXTENSION_HEADERS: usize = 8; + const HOP_BY_HOP: u8 = 0; + const ROUTING: u8 = 43; + const FRAGMENT: u8 = 44; + const ESP: u8 = 50; + const AUTHENTICATION: u8 = 51; + const DESTINATION_OPTIONS: u8 = 60; + const MOBILITY: u8 = 135; + const HIP: u8 = 139; + const SHIM6: u8 = 140; + + let mut next_header = packet.get_next_header(); + let mut payload = packet.payload(); + + for _ in 0..MAX_EXTENSION_HEADERS { + let header_len = match next_header.0 { + HOP_BY_HOP | ROUTING | DESTINATION_OPTIONS | MOBILITY | HIP | SHIM6 => { + if payload.len() < 8 { + return 
None; + } + (usize::from(payload[1]) + 1) * 8 + } + FRAGMENT => { + if payload.len() < 8 { + return None; + } + 8 + } + AUTHENTICATION => { + if payload.len() < 8 { + return None; + } + (usize::from(payload[1]) + 2) * 4 + } + ESP => return None, + _ => return Some((next_header, payload)), + }; + + if payload.len() < header_len { + return None; + } + + next_header = IpNextHeaderProtocol(payload[0]); + payload = &payload[header_len..]; + } + + Some((next_header, payload)) +} + fn extract_packet_features_transport( source_ip: IpAddr, destination_ip: IpAddr, diff --git a/rustiflow/src/tests/flows/packet_features_test.rs b/rustiflow/src/tests/flows/packet_features_test.rs index e6b73011..403fa383 100644 --- a/rustiflow/src/tests/flows/packet_features_test.rs +++ b/rustiflow/src/tests/flows/packet_features_test.rs @@ -1,5 +1,6 @@ #[cfg(test)] mod tests { + use pnet::packet::{ip::IpNextHeaderProtocols, ipv6::Ipv6Packet}; use std::net::{IpAddr, Ipv4Addr}; use crate::packet_features::PacketFeatures; @@ -31,4 +32,65 @@ mod tests { assert_eq!(forward.biflow_key(), backward.biflow_key()); } + + fn build_ipv6_packet(next_header: u8, payload: &[u8]) -> Vec { + let mut packet = vec![0_u8; 40 + payload.len()]; + packet[0] = 0x60; + packet[4..6].copy_from_slice(&(payload.len() as u16).to_be_bytes()); + packet[6] = next_header; + packet[7] = 64; + packet[8..24] + .copy_from_slice(&[0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]); + packet[24..40] + .copy_from_slice(&[0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]); + packet[40..].copy_from_slice(payload); + packet + } + + #[test] + fn ipv6_hop_by_hop_extension_is_skipped_before_tcp_parse() { + let mut payload = vec![0_u8; 8 + 20]; + payload[0] = IpNextHeaderProtocols::Tcp.0; + payload[1] = 0; + payload[8..10].copy_from_slice(&12345_u16.to_be_bytes()); + payload[10..12].copy_from_slice(&443_u16.to_be_bytes()); + payload[20] = 0x50; + payload[21] = 0x02; + + let bytes = 
build_ipv6_packet(IpNextHeaderProtocols::Hopopt.0, &payload); + let packet = Ipv6Packet::new(&bytes).unwrap(); + let features = PacketFeatures::from_ipv6_packet(&packet, 42).unwrap(); + + assert_eq!(features.protocol, IpNextHeaderProtocols::Tcp.0); + assert_eq!(features.source_port, 12345); + assert_eq!(features.destination_port, 443); + } + + #[test] + fn ipv6_fragment_extension_is_skipped_before_udp_parse() { + let mut payload = vec![0_u8; 8 + 8]; + payload[0] = IpNextHeaderProtocols::Udp.0; + payload[8..10].copy_from_slice(&5353_u16.to_be_bytes()); + payload[10..12].copy_from_slice(&53_u16.to_be_bytes()); + payload[12..14].copy_from_slice(&8_u16.to_be_bytes()); + + let bytes = build_ipv6_packet(IpNextHeaderProtocols::Ipv6Frag.0, &payload); + let packet = Ipv6Packet::new(&bytes).unwrap(); + let features = PacketFeatures::from_ipv6_packet(&packet, 99).unwrap(); + + assert_eq!(features.protocol, IpNextHeaderProtocols::Udp.0); + assert_eq!(features.source_port, 5353); + assert_eq!(features.destination_port, 53); + } + + #[test] + fn truncated_ipv6_extension_header_is_rejected() { + let bytes = build_ipv6_packet( + IpNextHeaderProtocols::Hopopt.0, + &[IpNextHeaderProtocols::Tcp.0], + ); + let packet = Ipv6Packet::new(&bytes).unwrap(); + + assert!(PacketFeatures::from_ipv6_packet(&packet, 7).is_none()); + } } From 05df785bcbbd8db9241c74b71e02384414d56423 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:01:56 +0100 Subject: [PATCH 08/34] test: add feature module invariants --- .../src/tests/flows/feature_modules_test.rs | 167 ++++++++++++++++++ rustiflow/src/tests/flows/mod.rs | 2 + 2 files changed, 169 insertions(+) create mode 100644 rustiflow/src/tests/flows/feature_modules_test.rs diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs new file mode 100644 index 00000000..9a4aa5e9 --- /dev/null +++ 
b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -0,0 +1,167 @@ +#[cfg(test)] +mod tests { + use pnet::packet::ip::IpNextHeaderProtocols; + + use crate::{ + flows::{ + features::{ + active_idle_stats::ActiveIdleStats, icmp_stats::IcmpStats, + payload_stats::PayloadLengthStats, retransmission_stats::RetransmissionStats, + subflow_stats::SubflowStats, util::FlowFeature, window_size_stats::WindowSizeStats, + }, + util::FlowExpireCause, + }, + packet_features::{PacketFeatures, ACK_FLAG}, + }; + + fn packet(timestamp_us: i64) -> PacketFeatures { + PacketFeatures { + timestamp_us, + ..Default::default() + } + } + + #[test] + fn icmp_stats_only_keep_first_packet_type_and_code() { + let mut stats = IcmpStats::new(); + + let mut first = packet(1_000_000); + first.protocol = IpNextHeaderProtocols::Icmp.0; + first.icmp_type = Some(8); + first.icmp_code = Some(0); + stats.update(&first, true, first.timestamp_us); + + let mut second = packet(2_000_000); + second.protocol = IpNextHeaderProtocols::Icmp.0; + second.icmp_type = Some(3); + second.icmp_code = Some(1); + stats.update(&second, false, first.timestamp_us); + + assert_eq!(stats.get_type(), 8); + assert_eq!(stats.get_code(), 0); + } + + #[test] + fn retransmission_stats_skip_pure_acks_and_icmp_and_track_duplicates_by_direction() { + let mut stats = RetransmissionStats::new(); + + let mut pure_ack = packet(1_000_000); + pure_ack.protocol = IpNextHeaderProtocols::Tcp.0; + pure_ack.flags = ACK_FLAG; + pure_ack.ack_flag = 1; + pure_ack.sequence_number = 11; + stats.update(&pure_ack, true, pure_ack.timestamp_us); + stats.update(&pure_ack, true, pure_ack.timestamp_us); + + let mut icmp = packet(1_500_000); + icmp.protocol = IpNextHeaderProtocols::Icmp.0; + icmp.sequence_number = 22; + stats.update(&icmp, true, pure_ack.timestamp_us); + stats.update(&icmp, true, pure_ack.timestamp_us); + + let mut fwd = packet(2_000_000); + fwd.protocol = IpNextHeaderProtocols::Tcp.0; + fwd.sequence_number = 100; + stats.update(&fwd, true, 
pure_ack.timestamp_us); + stats.update(&fwd, true, pure_ack.timestamp_us); + + let mut bwd = packet(2_500_000); + bwd.protocol = IpNextHeaderProtocols::Tcp.0; + bwd.sequence_number = 200; + stats.update(&bwd, false, fwd.timestamp_us); + stats.update(&bwd, false, fwd.timestamp_us); + + assert_eq!(stats.fwd_retransmission_count, 1); + assert_eq!(stats.bwd_retransmission_count, 1); + assert_eq!(stats.dump(), "2,1,1"); + } + + #[test] + fn window_size_stats_capture_initial_sizes_for_each_direction() { + let mut stats = WindowSizeStats::new(); + + let mut fwd_first = packet(1_000_000); + fwd_first.window_size = 1_024; + stats.update(&fwd_first, true, fwd_first.timestamp_us); + + let mut bwd_first = packet(1_100_000); + bwd_first.window_size = 2_048; + stats.update(&bwd_first, false, fwd_first.timestamp_us); + + let mut fwd_second = packet(1_200_000); + fwd_second.window_size = 4_096; + stats.update(&fwd_second, true, bwd_first.timestamp_us); + + let mut bwd_second = packet(1_300_000); + bwd_second.window_size = 8_192; + stats.update(&bwd_second, false, fwd_second.timestamp_us); + + assert_eq!(stats.fwd_init_window_size, 1_024); + assert_eq!(stats.bwd_init_window_size, 2_048); + assert_eq!(stats.fwd_window_size.get_count(), 2); + assert_eq!(stats.bwd_window_size.get_count(), 2); + } + + #[test] + fn payload_stats_count_non_zero_payload_packets_per_direction() { + let mut stats = PayloadLengthStats::new(); + + let mut fwd_zero = packet(1_000_000); + fwd_zero.data_length = 0; + stats.update(&fwd_zero, true, fwd_zero.timestamp_us); + + let mut fwd_payload = packet(1_100_000); + fwd_payload.data_length = 37; + stats.update(&fwd_payload, true, fwd_zero.timestamp_us); + + let mut bwd_zero = packet(1_200_000); + bwd_zero.data_length = 0; + stats.update(&bwd_zero, false, fwd_payload.timestamp_us); + + let mut bwd_payload = packet(1_300_000); + bwd_payload.data_length = 19; + stats.update(&bwd_payload, false, bwd_zero.timestamp_us); + + 
assert_eq!(stats.fwd_non_zero_payload_packets, 1); + assert_eq!(stats.bwd_non_zero_payload_packets, 1); + assert_eq!(stats.payload_len.get_count(), 4); + } + + #[test] + fn subflow_stats_increment_only_on_gaps_greater_than_one_second() { + let mut stats = SubflowStats::new(); + + let first_ts = 1_000_000; + let second_ts = 2_000_000; + let third_ts = 3_000_001; + + let first = packet(first_ts); + stats.update(&first, true, first_ts); + + let second = packet(second_ts); + stats.update(&second, false, first_ts); + + let third = packet(third_ts); + stats.update(&third, true, second_ts); + + assert_eq!(stats.subflow_count, 1); + } + + #[test] + fn active_idle_stats_record_active_and_idle_periods_on_gap_and_close() { + let mut stats = ActiveIdleStats::new(0); + + let first = packet(1_000_000); + stats.update(&first, true, 0); + + let second = packet(7_000_000); + stats.update(&second, false, first.timestamp_us); + + stats.close(10_000_000, FlowExpireCause::IdleTimeout); + + assert_eq!(stats.active_stats.get_total(), 1_000.0); + assert_eq!(stats.active_stats.get_count(), 1); + assert_eq!(stats.idle_stats.get_total(), 9_000.0); + assert_eq!(stats.idle_stats.get_count(), 2); + } +} diff --git a/rustiflow/src/tests/flows/mod.rs b/rustiflow/src/tests/flows/mod.rs index a8afa429..cd6471ad 100644 --- a/rustiflow/src/tests/flows/mod.rs +++ b/rustiflow/src/tests/flows/mod.rs @@ -1,6 +1,8 @@ mod basic_flow_test; mod cidds_flow_test; +mod feature_modules_test; mod flow_table_test; mod nf_flow_test; mod packet_features_test; +mod pcap_fixture_test; mod rusti_flow_test; From e3fc34acb218ac63b3521fb8fa5e785d95a7a03a Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:02:07 +0100 Subject: [PATCH 09/34] test: add tiny pcap regression fixture --- .../src/tests/flows/pcap_fixture_test.rs | 92 ++++++++++++++++++ .../tests/data/nmap_tcp_syn_version.pcap | Bin 0 -> 3653 bytes 2 files changed, 92 insertions(+) create mode 
100644 rustiflow/src/tests/flows/pcap_fixture_test.rs create mode 100644 rustiflow/tests/data/nmap_tcp_syn_version.pcap diff --git a/rustiflow/src/tests/flows/pcap_fixture_test.rs b/rustiflow/src/tests/flows/pcap_fixture_test.rs new file mode 100644 index 00000000..41d0600b --- /dev/null +++ b/rustiflow/src/tests/flows/pcap_fixture_test.rs @@ -0,0 +1,92 @@ +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use tokio::sync::mpsc; + + use crate::{ + flows::{flow::Flow, rusti_flow::RustiFlow, util::FlowExpireCause}, + pcap::read_pcap_file, + }; + + fn fixture_path(name: &str) -> String { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(name) + .to_string_lossy() + .into_owned() + } + + fn count_csv_fields(row: &str) -> usize { + row.split(',').count() + } + + #[tokio::test] + async fn tiny_concap_tcp_syn_fixture_extracts_expected_flows() { + let (tx, mut rx) = mpsc::channel::(64); + + read_pcap_file::( + &fixture_path("nmap_tcp_syn_version.pcap"), + tx, + 1, + 3600, + 120, + None, + 60, + ) + .await + .expect("fixture pcap should parse successfully"); + + let mut flows = Vec::new(); + while let Some(flow) = rx.recv().await { + flows.push(flow); + } + + assert_eq!(flows.len(), 17); + + for flow in &flows { + assert_eq!( + count_csv_fields(&flow.dump()), + count_csv_fields(&RustiFlow::get_features()) + ); + } + + let established_http_flow = flows + .iter() + .find(|flow| flow.basic_flow.flow_key == "192.168.126.228:54122-192.168.126.224:80-6") + .expect("expected established HTTP flow in fixture"); + assert_eq!(established_http_flow.packet_len_stats.flow_count(), 10); + assert_eq!( + established_http_flow + .packet_len_stats + .fwd_packet_len + .get_count(), + 5 + ); + assert_eq!( + established_http_flow + .packet_len_stats + .bwd_packet_len + .get_count(), + 5 + ); + assert_eq!( + established_http_flow.basic_flow.flow_expire_cause, + FlowExpireCause::TcpTermination + ); + 
assert_eq!(established_http_flow.tcp_flags_stats.get_flags(), ".AP.SF"); + + let icmp_flow = flows + .iter() + .find(|flow| flow.basic_flow.flow_key == "192.168.126.228:0-192.168.126.224:0-1") + .expect("expected ICMP flow in fixture"); + assert_eq!(icmp_flow.packet_len_stats.flow_count(), 4); + assert_eq!(icmp_flow.icmp_stats.get_type(), 8); + assert_eq!(icmp_flow.icmp_stats.get_code(), 0); + assert_eq!( + icmp_flow.basic_flow.flow_expire_cause, + FlowExpireCause::ExporterShutdown + ); + } +} diff --git a/rustiflow/tests/data/nmap_tcp_syn_version.pcap b/rustiflow/tests/data/nmap_tcp_syn_version.pcap new file mode 100644 index 0000000000000000000000000000000000000000..d5a439996bdd68515692baf907ff790ffefb6da8 GIT binary patch literal 3653 zcmb7{eM}o=7{H%{?$#ALH(cgsG&>ngn3U3TeAv_qI11%sG*G5vLMY`*nXCnR(=8@K zG$s;FMp*X2B7{veFgA6BX*OJ9;vXium>FPmWF|9N=08TGe{7kXna_LI^S;QGm&m~1qv5(ka?0F# z9^xI^_N6rD*bFt_JMh;Hd>GnOOpvh?AjPz5j@V}*sThAJP3Mq<22e)NUz8lE`LP*7 zLE6v$4FI;g;!Ii;6rE4b`$2q?erLDr1O5c%hJT$2p6e1RoeIjMUBEv!BhAW>sj>OT9Rsot)GB?-&&od6lk$ODwNL9V(Z||nlzB)4VCmIo6h1zc z&u6j^gCr?mkB$%cG(NA0KFx+cm(l@p>E_c4pL&%KR=2c+g_85B*Z2fPpW}u;O&w$h zn-3^_cB*_%%RVQQ^VzBKX%c-F4SlxV11O+xT~heesC+(^eSS&Kr$*!Rvgq@4zft{s z-wu#ZSL{;wRI7YGmwjwVN}_(MH9osVpMFCh&jA1jec7Y%X;Ar0%RX->=hL9^Nu1@Y z;zsc){f*S~_3soubt<1Tvd=~&B@v%GjZd=}p9Vvp7Th^Y>FHU8k5}b0EBm~foR3%I zgF7|ZpX+#|`1DPZ{rPZ)!lz2*b58bYK~fU&snYo1=}LUIM$(}M&k)jd=Y`v6CZ|q( zT=H4i3;v!%nYcCqCQ|VH+9CK%m{QD^w)Y-j%mJv>)YXLSRgiW2qdRdjeu8B2tr6uA}Y`;Mf!;`V#VX zbRC{d2iyRKrXzZN7J)djB5@NE@jX%?4n4j|h*gyV#?Dj+0)D$Azrbp-TF4mZa16;> z7Y{amqzvCMt2b-nk&5vMqxBuhdTDvvl{OrQ5YJDgLkM5LNz)Rk=J-)U^xj>OxQV(m z*eY5h@kc^T_&OM8K>^cHYqjhM@|=tD_IKwo&SIu!e-GYqviQfv7P|^rrm8+*wKQ@4 zuXFt_ro1;8ig0#kKAT@y%H%XhyAR}AEp@>-Z>x`Wc89yUP8U-ji{>#72eP7MloG~K z>MAO9ITEbOK(NcjWI8)-Y@st)#IeDW!eD7Zrqxm&3UR%>tuYww;^Ho*;~>w)t(KQ! 
zQJ#zPwmL4_#YbEWTO|AzLi}X63Sw|sw_wnuM%Qrqt(bUU2r7-P6;oPi6u=O jvj^8J9XC*tW0``ug5z<)aihjD8y&kerTd_5Xan^xx_0NX literal 0 HcmV?d00001 From 68ec27075ee699d76da6d61dbffd75ceaee5d71d Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:12:06 +0100 Subject: [PATCH 10/34] test: add concap udp fixture coverage --- .../src/tests/flows/pcap_fixture_test.rs | 85 +++++++++++++++--- rustiflow/tests/data/nmap_udp_version.pcap | Bin 0 -> 6302 bytes 2 files changed, 72 insertions(+), 13 deletions(-) create mode 100644 rustiflow/tests/data/nmap_udp_version.pcap diff --git a/rustiflow/src/tests/flows/pcap_fixture_test.rs b/rustiflow/src/tests/flows/pcap_fixture_test.rs index 41d0600b..6741bccb 100644 --- a/rustiflow/src/tests/flows/pcap_fixture_test.rs +++ b/rustiflow/src/tests/flows/pcap_fixture_test.rs @@ -22,27 +22,25 @@ mod tests { row.split(',').count() } - #[tokio::test] - async fn tiny_concap_tcp_syn_fixture_extracts_expected_flows() { + async fn extract_fixture(name: &str) -> Vec { let (tx, mut rx) = mpsc::channel::(64); - read_pcap_file::( - &fixture_path("nmap_tcp_syn_version.pcap"), - tx, - 1, - 3600, - 120, - None, - 60, - ) - .await - .expect("fixture pcap should parse successfully"); + read_pcap_file::(&fixture_path(name), tx, 1, 3600, 120, None, 60) + .await + .expect("fixture pcap should parse successfully"); let mut flows = Vec::new(); while let Some(flow) = rx.recv().await { flows.push(flow); } + flows + } + + #[tokio::test] + async fn tiny_concap_tcp_syn_fixture_extracts_expected_flows() { + let flows = extract_fixture("nmap_tcp_syn_version.pcap").await; + assert_eq!(flows.len(), 17); for flow in &flows { @@ -89,4 +87,65 @@ mod tests { FlowExpireCause::ExporterShutdown ); } + + #[tokio::test] + async fn tiny_concap_udp_fixture_extracts_expected_protocol_mix() { + let flows = extract_fixture("nmap_udp_version.pcap").await; + + assert_eq!(flows.len(), 56); + + for flow in &flows { + assert_eq!( + 
count_csv_fields(&flow.dump()), + count_csv_fields(&RustiFlow::get_features()) + ); + } + + assert_eq!( + flows + .iter() + .filter(|flow| flow.basic_flow.protocol == 17) + .count(), + 53 + ); + assert_eq!( + flows + .iter() + .filter(|flow| flow.basic_flow.protocol == 6) + .count(), + 2 + ); + assert_eq!( + flows + .iter() + .filter(|flow| flow.basic_flow.protocol == 1) + .count(), + 1 + ); + + let tcp_port_80_flow = flows + .iter() + .find(|flow| flow.basic_flow.flow_key == "192.168.177.151:48385-192.168.126.204:80-6") + .expect("expected TCP port 80 response flow in fixture"); + assert_eq!(tcp_port_80_flow.packet_len_stats.flow_count(), 2); + assert_eq!( + tcp_port_80_flow.basic_flow.flow_expire_cause, + FlowExpireCause::TcpTermination + ); + assert_eq!(tcp_port_80_flow.tcp_flags_stats.get_flags(), ".A.R.."); + + let icmp_flow = flows + .iter() + .find(|flow| flow.basic_flow.flow_key == "192.168.177.151:0-192.168.126.204:0-1") + .expect("expected ICMP response flow in fixture"); + assert_eq!(icmp_flow.packet_len_stats.flow_count(), 22); + assert_eq!(icmp_flow.packet_len_stats.fwd_packet_len.get_count(), 2); + assert_eq!(icmp_flow.packet_len_stats.bwd_packet_len.get_count(), 20); + assert_eq!(icmp_flow.icmp_stats.get_type(), 8); + assert_eq!(icmp_flow.icmp_stats.get_code(), 0); + assert_eq!( + icmp_flow.basic_flow.flow_expire_cause, + FlowExpireCause::ExporterShutdown + ); + } } diff --git a/rustiflow/tests/data/nmap_udp_version.pcap b/rustiflow/tests/data/nmap_udp_version.pcap new file mode 100644 index 0000000000000000000000000000000000000000..6f415fe467bf6a2f451074bb2dd193c93a9b4b5c GIT binary patch literal 6302 zcmcIo3s6*L6h3!%VL=hOh6t&MqJX48B9GOC1#x}g1BD!%97Tqeun`^9M8VWj8*CCB zbu2Uotc=Xa7lTF+NT#UyNag*=V)B((I++GS=iGJgUb?%#T5!i7S>XH5cfND}*S%7H zI9CBGFg+^Z@I&j!?1gEs*@HKJPh(F_zgt%eTZG33CIAc=WD78oD|Ro=|RsVvZ~N z>vB^r{xQTUR4_FZK%ufaO6oPp6o@~kA=BW63P5X}(ch?nD=Mb60onFOIRYf!Fl4et z!Xne-Rd5A8u=bc}(gRUXSaPErRG^PFu;fcvo~{L=iaBFeQCOO!1Ov;55*E3ZBsIWG 
z+zJNgZu?MDV|vJ!LJ5mpOIV{&n{cBq_E7ebk%fD7ZWxU=WO2KrVDv^{KlFv) zXJyn<}>xo{1{+3zh#v~ z&s5Y?M|vK!^$ZmB_?y$iVm*$m!Kj;ttXAHm4&toh56m;^;fla1@osQJSt@sI*V#-; z8AzR7ivnWsdm1vgbm6Xox}g$vHH1XBo+HK^7-KP_XvnnAC#}QyL&H01jl3||sl9_j z2~Qx-9a<&m@os-i|D~_gx^6x_BqztYW1G#H13eOaD~2R|W0U8U;o^tCR)8-jBzO>m zXW$QL$P!EoA4CamY%wMHl|B9;CD`4aTJ{GG^moL7lHeoj4m= ztdsAu^en!Pt1_<-km#9)dTL2e3=>&Z3E;~sbQ0ubL^cv~J`=gW2*8i`Ss+2`5P6M| zNlfGkoO&bp7f(x&(-B!i$O0yEX z^>mNvv)bCYESDf@jtP}_OlSGE98Wm`e4MicN%Oj% zVoqct-G88YU4Umv!zr{x3AC0}wx*V!zHw~d75?g*>N!~fFpB4+C3?t%I?}WG zQ&uK1p434b{*)le_iFITUbxrOmf=AlcCCibHg&D7P_arvUP7d04Rp2}!`6O3-tgh1?NKJ5}g+BGL+IwW#JWla3=j7#>_ZK&!>?OGKnv zKvojc^=J>42YBMsa&Mn9=K+iL!OETtJqeMtX0N#<>gg$XP(^yikF{g0ydikh4CJ@p z6_HkWmvhoKbzXUiXHJJ5LqY2+bk}HZlJL;lUr)&J@?H!D!|(v^&mYVck!nFfgIvMQ zO{|;wvG=GZE?Xr$q@Y%T3OsBKcM{rxWM_Tv75Ljxe7v=Yv=tQ8byh$;*?11LeBz9b zruBSXg@lI`R1-2TlVf;rdNrK^9JhWheq-1Dc zgzqCf`IQ?*q?NGiZzJRbL^5XeQK6%Eo9k#6tGiAGv#=gCWHpJ%bk{-%-zuI6&Nrh3 zzl&)p?E_yx06hxeHst`fGu`m9^HR$h)_J3sv1__xvFYZ>OV_`hdvO7%!@HE-y@zmc1E63TH!nv-2-d9fZ{)T z)NkrjmL)AoR>dbJCjdBrl6cdQ7e5I@cFmlv(@^EsKay1LCUN04^MAzb8_mka`CG~* zCDfQp{SI{9W4iw7xdb6EBNu#ihPga6m&<>T>4q^a_vk0{DVJZxS-aI8>+5Il|EwKE z71CmaWb~YdtU_@*VaHs5N0Qb3Cl!k5is%bsRE&2ZO&2J?FKmsq+P??;F0y%a7rX1JW>ud-a3*-EYii}$tqRP5l$!*jaGl^_hx zaAlm}3NDLsSGrkP8Va+#MrGY0SN?V|_R4P>oC_Wy$Jq1Q4%;9ip5=B>3h_pZC+=Ox ekvH;w?=Engg0H~fmffDThTLA8WzOwx6!s5#dKVY~ literal 0 HcmV?d00001 From 088dbacfa95d9debf24b5756e6f4a4797e6cc211 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:12:09 +0100 Subject: [PATCH 11/34] test: add concap smoke helper --- README.md | 14 ++++++++ scripts/concap_smoke.sh | 80 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100755 scripts/concap_smoke.sh diff --git a/README.md b/README.md index fe8ea744..fbf449a1 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,20 @@ Options: RUST_LOG=info cargo xtask run -- ``` +Run the focused Rust test suite 
with: + +```bash +cargo test -p rustiflow +``` + +If you also have the sibling `concap` repository and a reachable Kubernetes cluster, +you can run a tiny ConCap-backed smoke check and then reprocess the downloaded pcap +with your current local RustiFlow checkout: + +```bash +./scripts/concap_smoke.sh ../concap nmap-tcp-syn-version.yaml +``` + ### Binary ```bash diff --git a/scripts/concap_smoke.sh b/scripts/concap_smoke.sh new file mode 100755 index 00000000..6dfe7a14 --- /dev/null +++ b/scripts/concap_smoke.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: scripts/concap_smoke.sh [CONCAP_DIR] [SCENARIO_FILE] + +Run a tiny ConCap scenario, then reprocess the downloaded pcap with the current +local RustiFlow checkout. + +Arguments: + CONCAP_DIR Path to the ConCap repository (default: ../concap) + SCENARIO_FILE Scenario YAML filename from ConCap example/scenarios + (default: nmap-tcp-syn-version.yaml) +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +concap_dir="${1:-"$repo_root/../concap"}" +scenario_file="${2:-nmap-tcp-syn-version.yaml}" +scenario_name="${scenario_file%.yaml}" +smoke_dir="$concap_dir/rustiflow-smoke" +completed_dir="$smoke_dir/completed/$scenario_name" +pcap_path="$completed_dir/dump.pcap" +output_csv="$completed_dir/rustiflow-current.csv" + +if [[ ! -x "$concap_dir/concap" ]]; then + echo "error: expected ConCap binary at $concap_dir/concap" >&2 + exit 1 +fi + +if [[ ! 
-f "$concap_dir/example/scenarios/$scenario_file" ]]; then + echo "error: missing scenario file $concap_dir/example/scenarios/$scenario_file" >&2 + exit 1 +fi + +mkdir -p "$smoke_dir/scenarios" "$smoke_dir/processingpods" "$smoke_dir/completed" +rm -rf "$completed_dir" +cp "$concap_dir/example/scenarios/$scenario_file" "$smoke_dir/scenarios/" +cp "$concap_dir/example/processingpods/rustiflow.yaml" "$smoke_dir/processingpods/" + +echo "Running ConCap scenario $scenario_file" +"$concap_dir/concap" -d "$smoke_dir" -s "$scenario_file" -w 1 + +if [[ ! -f "$pcap_path" ]]; then + echo "error: expected downloaded pcap at $pcap_path" >&2 + exit 1 +fi + +echo "Reprocessing $pcap_path with current local RustiFlow" +cargo run -p rustiflow -- \ + -f rustiflow \ + --header \ + --idle-timeout 120 \ + --active-timeout 3600 \ + --output csv \ + --export-path "$output_csv" \ + pcap "$pcap_path" + +python3 - "$output_csv" <<'PY' +import csv +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +with path.open() as handle: + rows = list(csv.reader(handle)) + +header = rows[0] +widths = {len(row) for row in rows[1:]} + +print(f"Current RustiFlow rows: {len(rows) - 1}") +print(f"Current RustiFlow columns: {len(header)}") +print(f"Stable row width: {widths == {len(header)}}") +PY From 5460a7b5a91c8d88e7c0a2cd6906c5f6dc318ae0 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:35:23 +0100 Subject: [PATCH 12/34] docs: add agent priorities and performance roadmap --- AGENTS.md | 86 ++++++++++++++ docs/performance-roadmap.md | 224 ++++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) create mode 100644 docs/performance-roadmap.md diff --git a/AGENTS.md b/AGENTS.md index c0e0271b..7ac7799b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,6 +16,20 @@ This repository is a Rust workspace for a network flow extractor. 
The main crate - If any software or dependency needs to be installed on those machines, ask the user to do it. - If there is any uncertainty about whether a command is appropriate to run on those machines, ask the user before running it. +## Non-Negotiables + +- When writing or editing Rust in this repository, always apply the Rust guidance in this file first. Treat it as an active coding standard, not optional reading. +- Prefer changes that are small, local, and easy to review. Avoid broad opportunistic refactors unless the task specifically calls for them. +- Preserve the existing human-made structure of the codebase where possible. Fit new work into current boundaries before creating new ones. + +## Commit Hygiene + +- Keep commits clean, bounded, and purpose-specific. +- Prefer one logical change per commit. Do not mix unrelated fixes, refactors, docs updates, and test rewrites unless they are tightly coupled. +- When work spans multiple concerns, split it into a short chain of commits with readable messages. +- Before committing, check that the diff matches the stated purpose of the commit and does not include unrelated workspace noise. +- If a change is exploratory or lower confidence, prefer using a separate branch until it is trusted. + ## Working Principles - Prefer small, targeted changes over broad rewrites. @@ -83,3 +97,75 @@ If a change touches shared code used by multiple crates, prefer checking the wor - Treat the current test suite carefully: some tests may be stale or incomplete relative to the active code. - When adding or repairing tests, prefer tests that reflect the current flow architecture and public behavior rather than resurrecting outdated internal field expectations. + +## Feature Engineering Priorities + +When deciding where to spend effort on RustiFlow features, prefer this order: + +1. Semantic correctness across ingestion modes +2. Quality of existing transport and timing features +3. 
New diagnostic features with clear operational value +4. Exporter-specific schema completeness + +### 1. Keep offline and realtime semantically aligned + +Highest-leverage files: + +- `rustiflow/src/packet_features.rs` +- `rustiflow/src/pcap.rs` +- `rustiflow/src/realtime.rs` +- `common/src/lib.rs` +- `ebpf-ipv4/src/main.rs` +- `ebpf-ipv6/src/main.rs` + +Focus areas: + +- Prefer improving packet metadata fidelity before adding many new derived features. +- Treat timestamp semantics, packet length semantics, and parser coverage as foundational. +- If offline and realtime do not mean the same thing, higher-level flow features are less trustworthy. + +### 2. Strengthen existing weak feature families before adding many new columns + +Most likely to benefit from refinement: + +- `rustiflow/src/flows/features/retransmission_stats.rs` +- `rustiflow/src/flows/features/iat_stats.rs` +- `rustiflow/src/flows/features/timing_stats.rs` +- `rustiflow/src/flows/features/active_idle_stats.rs` +- `rustiflow/src/flows/features/bulk_stats.rs` +- `rustiflow/src/flows/features/icmp_stats.rs` +- `rustiflow/src/flows/basic_flow.rs` + +Concrete priorities: + +- Upgrade retransmission logic beyond simple repeated TCP sequence numbers. +- Preserve more timing precision for short flows and LAN traffic. +- Revisit hardcoded active/idle and subflow thresholds. +- Make TCP lifecycle quality more explicit: handshake success, reset behavior, close style. +- Expand ICMP handling beyond first seen type/code. + +### 3. Highest-value new feature families + +Prefer new features that explain behavior, not just more totals and moments. + +Best candidates: + +- TCP quality: handshake completion, SYN-to-SYN/ACK timing, duplicate ACKs, zero-window events, reset phase, close style. +- IP/path signals: TTL or hop-limit stats, DSCP/ECN stats, IPv4 fragmentation, IPv6 fragment behavior. +- ICMP behavior: request/response balance, error classes, unreachable subtype counts. 
+- Optional lightweight application-aware features: DNS, TLS, HTTP, QUIC handshake metadata without deep DPI dependence. +- Better contamination-free abstractions than only coarse IANA port buckets. + +### 4. Exporter-specific gaps worth filling + +- `rustiflow/src/flows/nf_flow.rs` still documents missing fields such as `ip_version`, `vlan_id`, and `tunnel_id`. +- Prefer filling exporter gaps after the underlying packet metadata exists in both offline and realtime paths. + +### Working rule + +Before adding a new feature, ask: + +- Is the underlying packet metadata trustworthy in both offline and realtime modes? +- Does this feature improve diagnostics or model usefulness more than refining an existing weak feature? +- Can it be expressed as a reusable `FlowFeature` instead of exporter-specific logic? +- Can it be tested with a tiny deterministic fixture? diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md new file mode 100644 index 00000000..fff966b9 --- /dev/null +++ b/docs/performance-roadmap.md @@ -0,0 +1,224 @@ +# Performance Roadmap + +This file tracks performance work for pushing RustiFlow beyond already-successful +`10Gbps` realtime capture. + +Use this as an execution checklist, not as a design essay. + +## Ground Rules + +- [ ] Measure before and after every meaningful optimization. +- [ ] Prefer hot-path wins over broad rewrites. +- [ ] Do not trade away feature correctness for speed without making that trade explicit. +- [ ] Keep performance work in clean, bounded commits. + +## Phase 0: Baseline And Profiling + +- [ ] Establish a repeatable Linux benchmark setup on a real target machine. +- [ ] Capture baseline numbers for: + - packets per second + - dropped packets + - CPU usage by userspace and kernel path + - active flow count + - export throughput +- [ ] Collect flamegraphs or equivalent profiling for: + - realtime ingest + - flow-table updates + - export path +- [ ] Separate ingress-only and ingress+egress benchmark modes. 
+ +Why this matters: +The current implementation already performs well. Past this point, guessing is expensive. + +## Phase 1: Biggest Likely Wins + +### 1. Typed Flow Keys + +- [ ] Replace string-based packet keys with compact typed keys. +- [ ] Remove repeated `String` creation from: + - `flow_key()` + - `flow_key_bwd()` + - `biflow_key()` +- [ ] Use typed keys in shard selection and flow-table lookup. +- [ ] Keep string formatting only for export. + +Primary files: + +- `rustiflow/src/packet_features.rs` +- `rustiflow/src/flow_table.rs` +- `rustiflow/src/realtime.rs` + +Expected value: +Very high. This is a likely hot-path allocation and hashing tax. + +### 2. Cheaper Running Statistics + +- [ ] Replace per-update standard deviation work with a running variance form such as Welford. +- [ ] Store enough state to compute `std` at dump/close time. +- [ ] Benchmark impact across heavily used feature families. + +Primary files: + +- `rustiflow/src/flows/features/util.rs` +- `rustiflow/src/flows/features/*.rs` + +Expected value: +High. Many feature modules pay this cost on every packet. + +### 3. Realtime Timestamp Fix + +- [ ] Carry capture timestamps from kernel to userspace instead of calling `Utc::now()` per event. +- [ ] Keep timestamp semantics aligned with offline mode as much as possible. +- [ ] Re-benchmark after this change because it affects both correctness and overhead. + +Primary files: + +- `common/src/lib.rs` +- `ebpf-ipv4/src/main.rs` +- `ebpf-ipv6/src/main.rs` +- `rustiflow/src/packet_features.rs` + +Expected value: +High. Improves correctness and removes per-event userspace time acquisition. + +## Phase 2: Hot-Path Structural Cleanup + +### 4. FlowTable Access Patterns + +- [ ] Reduce repeated hashing and key rebuilding in flow lookup. +- [ ] Avoid `contains_key` plus `remove` plus `insert` churn where possible. +- [ ] Revisit direction resolution after typed keys are introduced. 
+ +Primary file: + +- `rustiflow/src/flow_table.rs` + +Expected value: +High once typed keys exist. + +### 5. Export Without Cloning Full Flow State + +- [ ] Reduce or remove full-flow cloning for early export and termination export. +- [ ] Consider separating mutable hot-path state from export snapshots. +- [ ] Measure clone cost for `RustiFlow` specifically before redesigning too far. + +Primary files: + +- `rustiflow/src/flow_table.rs` +- `rustiflow/src/flows/flow.rs` +- `rustiflow/src/flows/*.rs` + +Expected value: +Medium to high depending on export rate and flow size. + +### 6. Performance Mode Should Mean Performance + +- [ ] Make sure high-throughput runs bypass packet-TUI work completely. +- [ ] Audit mutexes, watch channels, and per-packet UI accounting in performance-sensitive modes. +- [ ] Keep observability available, but not in the critical path by default. + +Primary files: + +- `rustiflow/src/realtime.rs` +- `rustiflow/src/packet_counts.rs` +- `rustiflow/src/flow_tui.rs` + +Expected value: +Medium to high for very fast realtime capture. + +## Phase 3: Throughput Scaling + +### 7. Batching Between Stages + +- [ ] Benchmark per-packet `mpsc` overhead. +- [ ] Try batched ring-buffer draining. +- [ ] Try batched shard submission. +- [ ] Keep changes narrow until measurement proves batching is worth the complexity. + +Primary file: + +- `rustiflow/src/realtime.rs` + +Expected value: +Medium, possibly high at very large packet rates. + +### 8. Faster Internal Hashing + +- [ ] Benchmark a faster internal hasher after typed keys are in place. +- [ ] Prefer a fast non-adversarial hasher only for internal packet-processing paths. +- [ ] Keep any public or security-sensitive hashing decisions separate. + +Primary files: + +- `rustiflow/src/realtime.rs` +- `rustiflow/src/flow_table.rs` + +Expected value: +Medium. Probably not worth doing first. + +### 9. Smarter Expiration Scheduling + +- [ ] Benchmark expiration scans at high concurrent flow counts. 
+- [ ] If scans become costly, evaluate timing buckets or a timer wheel. +- [ ] Do not build a more complex expiry structure before profiling says the scan is a real bottleneck. + +Primary file: + +- `rustiflow/src/flow_table.rs` + +Expected value: +Medium, but workload-dependent. + +## Phase 4: Export Path Optimization + +### 10. Cheaper Serialization + +- [ ] Measure cost of giant `format!`-based CSV assembly. +- [ ] Consider more streaming-oriented serialization for high export rates. +- [ ] Keep export-path changes isolated from flow semantics. + +Primary files: + +- `rustiflow/src/output.rs` +- `rustiflow/src/flows/*.rs` + +Expected value: +Medium. Important once export volume becomes the limiter. + +## Operational Metrics To Add + +- [ ] Per-source ring buffer drain rate +- [ ] Per-shard queue depth or backlog +- [ ] Active flow count over time +- [ ] Export throughput and export lag +- [ ] Dropped packet counters split by ingress/egress and IPv4/IPv6 +- [ ] A lightweight performance summary mode for realtime runs + +Why this matters: +If RustiFlow is going to chase higher link rates, it needs good self-observability. + +## Not Early Priorities + +- [ ] Do not start with micro-optimizing individual feature modules before fixing keying and stats math. +- [ ] Do not move large parts of flow aggregation into eBPF without profiling evidence. +- [ ] Do not do broad architecture rewrites before collecting hard measurements. +- [ ] Do not let exporter churn distract from realtime hot-path costs. 
+ +## Current Best Order + +- [ ] Phase 0: Baseline and profiling +- [ ] Phase 1.1: Typed flow keys +- [ ] Phase 1.2: Cheaper running statistics +- [ ] Phase 1.3: Kernel-carried timestamps +- [ ] Phase 2.4: FlowTable access cleanup +- [ ] Phase 2.5: Export without cloning +- [ ] Phase 2.6: Strict performance mode +- [ ] Phase 3.7: Batching +- [ ] Phase 3.8: Faster hashing +- [ ] Phase 3.9: Smarter expiration scheduling +- [ ] Phase 4.10: Serialization optimization + +## Progress Notes + +- Use short dated notes here when a measurement or optimization changes priorities. +- If a planned optimization turns out not to matter, mark it done and note that it was ruled out. From 3867f4e544fe0ba5371d50f1e54092ca2cd6295f Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:35:28 +0100 Subject: [PATCH 13/34] realtime: preserve kernel capture timestamps --- common/src/lib.rs | 10 ++++++++-- ebpf-ipv4/src/main.rs | 2 ++ ebpf-ipv6/src/main.rs | 2 ++ rustiflow/src/packet_features.rs | 16 ++++++++++------ rustiflow/src/realtime.rs | 30 ++++++++++++++++++++++++++++-- 5 files changed, 50 insertions(+), 10 deletions(-) diff --git a/common/src/lib.rs b/common/src/lib.rs index 4aa969e5..aeaf20d6 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -2,10 +2,11 @@ pub use network_types::{icmp::IcmpHdr, tcp::TcpHdr, udp::UdpHdr}; -/// BasicFeaturesIpv4 is a struct collection all ipv4 traffic data and is 32 bytes in size. +/// BasicFeaturesIpv4 is a struct collection all ipv4 traffic data. 
#[repr(C, packed)] #[derive(Copy, Clone)] pub struct EbpfEventIpv4 { + pub timestamp_ns: u64, pub ipv4_destination: u32, pub ipv4_source: u32, pub port_destination: u16, @@ -25,6 +26,7 @@ pub struct EbpfEventIpv4 { impl EbpfEventIpv4 { pub fn new( + timestamp_ns: u64, ipv4_destination: u32, ipv4_source: u32, port_destination: u16, @@ -41,6 +43,7 @@ impl EbpfEventIpv4 { icmp_code: u8, ) -> Self { EbpfEventIpv4 { + timestamp_ns, ipv4_destination, ipv4_source, port_destination, @@ -63,10 +66,11 @@ impl EbpfEventIpv4 { #[cfg(feature = "user")] unsafe impl aya::Pod for EbpfEventIpv4 {} -/// BasicFeaturesIpv6 is a struct collection all ipv6 traffic data and is 64 bytes in size. +/// BasicFeaturesIpv6 is a struct collection all ipv6 traffic data. #[repr(C, packed)] #[derive(Clone, Copy)] pub struct EbpfEventIpv6 { + pub timestamp_ns: u64, pub ipv6_destination: u128, pub ipv6_source: u128, pub port_destination: u16, @@ -86,6 +90,7 @@ pub struct EbpfEventIpv6 { impl EbpfEventIpv6 { pub fn new( + timestamp_ns: u64, ipv6_destination: u128, ipv6_source: u128, port_destination: u16, @@ -102,6 +107,7 @@ impl EbpfEventIpv6 { icmp_code: u8, ) -> Self { EbpfEventIpv6 { + timestamp_ns, ipv6_destination, ipv6_source, port_destination, diff --git a/ebpf-ipv4/src/main.rs b/ebpf-ipv4/src/main.rs index d056fea0..2c0dd98f 100644 --- a/ebpf-ipv4/src/main.rs +++ b/ebpf-ipv4/src/main.rs @@ -4,6 +4,7 @@ use aya_ebpf::{ bindings::TC_ACT_PIPE, + helpers::gen::bpf_ktime_get_ns, macros::{classifier, map}, maps::{PerCpuArray, RingBuf}, programs::TcContext, @@ -104,6 +105,7 @@ impl PacketInfo { #[inline(always)] fn to_packet_log(&self, header: &T) -> EbpfEventIpv4 { EbpfEventIpv4::new( + unsafe { bpf_ktime_get_ns() }, self.ipv4_destination, self.ipv4_source, header.destination_port(), diff --git a/ebpf-ipv6/src/main.rs b/ebpf-ipv6/src/main.rs index 1341c85e..b094dc1a 100644 --- a/ebpf-ipv6/src/main.rs +++ b/ebpf-ipv6/src/main.rs @@ -4,6 +4,7 @@ use aya_ebpf::{ bindings::TC_ACT_PIPE, + 
helpers::gen::bpf_ktime_get_ns, macros::{classifier, map}, maps::{PerCpuArray, RingBuf}, programs::TcContext, @@ -157,6 +158,7 @@ impl PacketInfo { #[inline(always)] fn to_packet_log(&self, header: &T) -> EbpfEventIpv6 { EbpfEventIpv6::new( + unsafe { bpf_ktime_get_ns() }, self.ipv6_destination, self.ipv6_source, header.destination_port(), diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index 7ff68149..b1f146ba 100644 --- a/rustiflow/src/packet_features.rs +++ b/rustiflow/src/packet_features.rs @@ -2,7 +2,6 @@ use std::net::Ipv6Addr; use std::net::{IpAddr, Ipv4Addr}; -use chrono::Utc; #[cfg(target_os = "linux")] use common::{EbpfEventIpv4, EbpfEventIpv6}; use log::debug; @@ -34,7 +33,7 @@ impl Default for PacketFeatures { source_port: 0, destination_port: 0, protocol: 0, - timestamp_us: Utc::now().timestamp_micros(), + timestamp_us: 0, fin_flag: 0, syn_flag: 0, rst_flag: 0, @@ -85,14 +84,14 @@ pub struct PacketFeatures { impl PacketFeatures { #[cfg(target_os = "linux")] // Constructor to create PacketFeatures from EbpfEventIpv4 - pub fn from_ebpf_event_ipv4(event: &EbpfEventIpv4) -> Self { + pub fn from_ebpf_event_ipv4(event: &EbpfEventIpv4, realtime_offset_us: i64) -> Self { PacketFeatures { source_ip: IpAddr::V4(Ipv4Addr::from(event.ipv4_source.to_be())), destination_ip: IpAddr::V4(Ipv4Addr::from(event.ipv4_destination.to_be())), source_port: event.port_source, destination_port: event.port_destination, protocol: event.protocol, - timestamp_us: chrono::Utc::now().timestamp_micros(), + timestamp_us: monotonic_ns_to_epoch_us(event.timestamp_ns, realtime_offset_us), fin_flag: get_tcp_flag(event.combined_flags, FIN_FLAG), syn_flag: get_tcp_flag(event.combined_flags, SYN_FLAG), rst_flag: get_tcp_flag(event.combined_flags, RST_FLAG), @@ -123,14 +122,14 @@ impl PacketFeatures { #[cfg(target_os = "linux")] // Constructor to create PacketFeatures from EbpfEventIpv6 - pub fn from_ebpf_event_ipv6(event: &EbpfEventIpv6) -> Self { + pub fn 
from_ebpf_event_ipv6(event: &EbpfEventIpv6, realtime_offset_us: i64) -> Self { PacketFeatures { source_ip: IpAddr::V6(Ipv6Addr::from(event.ipv6_source.to_be())), destination_ip: IpAddr::V6(Ipv6Addr::from(event.ipv6_destination.to_be())), source_port: event.port_source, destination_port: event.port_destination, protocol: event.protocol, - timestamp_us: chrono::Utc::now().timestamp_micros(), + timestamp_us: monotonic_ns_to_epoch_us(event.timestamp_ns, realtime_offset_us), fin_flag: get_tcp_flag(event.combined_flags, FIN_FLAG), syn_flag: get_tcp_flag(event.combined_flags, SYN_FLAG), rst_flag: get_tcp_flag(event.combined_flags, RST_FLAG), @@ -242,6 +241,11 @@ fn get_tcp_flag(value: u8, flag: u8) -> u8 { ((value & flag) != 0) as u8 } +#[cfg(target_os = "linux")] +fn monotonic_ns_to_epoch_us(timestamp_ns: u64, realtime_offset_us: i64) -> i64 { + realtime_offset_us + (timestamp_ns / 1_000) as i64 +} + fn skip_ipv6_extension_headers<'a>( packet: &'a Ipv6Packet<'a>, ) -> Option<(IpNextHeaderProtocol, &'a [u8])> { diff --git a/rustiflow/src/realtime.rs b/rustiflow/src/realtime.rs index 7546e6bb..34d01a0c 100644 --- a/rustiflow/src/realtime.rs +++ b/rustiflow/src/realtime.rs @@ -1,4 +1,5 @@ use std::hash::{DefaultHasher, Hash, Hasher}; +use std::io; use std::path::PathBuf; use std::sync::Arc; @@ -43,6 +44,8 @@ where // Needed for older kernels bump_memlock_rlimit(); + let realtime_offset_us = compute_realtime_offset_us()?; + // Load the eBPF programs and attach to the event arrays let mut bpf_ingress_ipv4 = load_ebpf_ipv4(interface, TcAttachType::Ingress)?; let mut bpf_ingress_ipv6 = load_ebpf_ipv6(interface, TcAttachType::Ingress)?; @@ -122,6 +125,7 @@ where let shard_senders_clone = shard_senders.clone(); let packet_counter_clone = Arc::clone(&packet_counter); let packet_tx_clone = packet_tx.clone(); + let realtime_offset_us = realtime_offset_us; handle_set.spawn(async move { // Wrap the RingBuf in AsyncFd to poll it with tokio @@ -142,7 +146,8 @@ where } let 
ebpf_event_ipv4: EbpfEventIpv4 = unsafe { std::ptr::read(event.as_ptr() as *const _) }; - let packet_features = PacketFeatures::from_ebpf_event_ipv4(&ebpf_event_ipv4); + let packet_features = + PacketFeatures::from_ebpf_event_ipv4(&ebpf_event_ipv4, realtime_offset_us); let flow_key = packet_features.biflow_key(); let shard_index = compute_shard_index(&flow_key, num_threads); @@ -164,6 +169,7 @@ where let shard_senders_clone = shard_senders.clone(); let packet_counter_clone = Arc::clone(&packet_counter); let packet_tx_clone = packet_tx.clone(); + let realtime_offset_us = realtime_offset_us; handle_set.spawn(async move { // Wrap the RingBuf in AsyncFd to poll it with tokio @@ -184,7 +190,8 @@ where } let ebpf_event_ipv6: EbpfEventIpv6 = unsafe { std::ptr::read(event.as_ptr() as *const _) }; - let packet_features = PacketFeatures::from_ebpf_event_ipv6(&ebpf_event_ipv6); + let packet_features = + PacketFeatures::from_ebpf_event_ipv6(&ebpf_event_ipv6, realtime_offset_us); let flow_key = packet_features.biflow_key(); let shard_index = compute_shard_index(&flow_key, num_threads); @@ -260,6 +267,25 @@ fn compute_shard_index(flow_key: &str, num_shards: u8) -> usize { (hash % num_shards as u64) as usize } +fn compute_realtime_offset_us() -> Result { + let realtime_us = read_clock_us(libc::CLOCK_REALTIME)?; + let monotonic_us = read_clock_us(libc::CLOCK_MONOTONIC)?; + Ok(realtime_us - monotonic_us) +} + +fn read_clock_us(clock_id: libc::clockid_t) -> Result { + let mut ts = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + + if unsafe { libc::clock_gettime(clock_id, &mut ts) } != 0 { + return Err(io::Error::last_os_error().into()); + } + + Ok(ts.tv_sec * 1_000_000 + ts.tv_nsec / 1_000) +} + fn bump_memlock_rlimit() { // Bump the memlock rlimit. 
This is needed for older kernels that don't use the // new memcg based accounting, see https://lwn.net/Articles/837122/ From 9910fafd686c7e3008cd44a0e6a7d903e9592549 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:43:38 +0100 Subject: [PATCH 14/34] realtime: align ebpf packet length fields --- common/src/lib.rs | 2 +- ebpf-ipv4/src/main.rs | 21 ++++++++++++++------- ebpf-ipv6/src/main.rs | 26 +++++++++++++++++++------- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/common/src/lib.rs b/common/src/lib.rs index aeaf20d6..4c24d2dd 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -163,7 +163,7 @@ impl NetworkHeader for TcpHdr { | ((self.cwr() as u8) << 7) } fn header_length(&self) -> u8 { - TcpHdr::LEN as u8 + (self.doff() * 4) as u8 } fn sequence_number(&self) -> u32 { self.seq diff --git a/ebpf-ipv4/src/main.rs b/ebpf-ipv4/src/main.rs index 2c0dd98f..8dc5a846 100644 --- a/ebpf-ipv4/src/main.rs +++ b/ebpf-ipv4/src/main.rs @@ -47,8 +47,8 @@ fn process_packet(ctx: &TcContext) -> Result { } let ipv4hdr = ctx.load::(EthHdr::LEN).map_err(|_| ())?; - let packet_info = PacketInfo::new(&ipv4hdr, ctx.data_end() - ctx.data())?; let ip_header_length = (ipv4hdr.ihl() as usize) * 4; + let packet_info = PacketInfo::new(&ipv4hdr, ip_header_length)?; let hdr_offset = EthHdr::LEN + ip_header_length; match ipv4hdr.proto { @@ -88,34 +88,41 @@ fn process_transport_packet( struct PacketInfo { ipv4_source: u32, ipv4_destination: u32, - data_length: u16, + total_length: u16, + network_header_length: u16, protocol: u8, } impl PacketInfo { - fn new(ipv4hdr: &Ipv4Hdr, data_length: usize) -> Result { + fn new(ipv4hdr: &Ipv4Hdr, network_header_length: usize) -> Result { Ok(Self { ipv4_source: ipv4hdr.src_addr, ipv4_destination: ipv4hdr.dst_addr, - data_length: data_length as u16, + total_length: u16::from_be(ipv4hdr.tot_len), + network_header_length: network_header_length as u16, protocol: ipv4hdr.proto 
as u8, }) } #[inline(always)] fn to_packet_log(&self, header: &T) -> EbpfEventIpv4 { + let header_length = header.header_length(); + let data_length = self + .total_length + .saturating_sub(self.network_header_length + u16::from(header_length)); + EbpfEventIpv4::new( unsafe { bpf_ktime_get_ns() }, self.ipv4_destination, self.ipv4_source, header.destination_port(), header.source_port(), - self.data_length, - self.data_length + header.header_length() as u16, + data_length, + self.total_length, header.window_size(), header.combined_flags(), self.protocol, - header.header_length(), + header_length, header.sequence_number(), header.sequence_number_ack(), header.icmp_type(), diff --git a/ebpf-ipv6/src/main.rs b/ebpf-ipv6/src/main.rs index b094dc1a..a1d69c34 100644 --- a/ebpf-ipv6/src/main.rs +++ b/ebpf-ipv6/src/main.rs @@ -99,7 +99,8 @@ fn process_packet(ctx: &TcContext) -> Result { // 2) Build packet_info for IPv6 let ipv6hdr = ctx.load::(EthHdr::LEN).map_err(|_| ())?; - let packet_info = PacketInfo::new(&ipv6hdr, ctx.data_end() - ctx.data(), final_proto)?; + let network_header_length = offset_after_ext - EthHdr::LEN; + let packet_info = PacketInfo::new(&ipv6hdr, final_proto, network_header_length)?; // 3) Dispatch on the final protocol match final_proto { @@ -141,34 +142,45 @@ fn process_transport_packet( struct PacketInfo { ipv6_source: u128, ipv6_destination: u128, - data_length: u16, + total_length: u16, + network_header_length: u16, protocol: u8, } impl PacketInfo { - fn new(ipv6hdr: &Ipv6Hdr, data_length: usize, protocol: IpProto) -> Result { + fn new( + ipv6hdr: &Ipv6Hdr, + protocol: IpProto, + network_header_length: usize, + ) -> Result { Ok(Self { ipv6_source: u128::from_be_bytes(unsafe { ipv6hdr.src_addr.in6_u.u6_addr8 }), ipv6_destination: u128::from_be_bytes(unsafe { ipv6hdr.dst_addr.in6_u.u6_addr8 }), - data_length: data_length as u16, + total_length: Ipv6Hdr::LEN as u16 + u16::from_be(ipv6hdr.payload_len), + network_header_length: network_header_length as 
u16, protocol: protocol as u8, }) } #[inline(always)] fn to_packet_log(&self, header: &T) -> EbpfEventIpv6 { + let header_length = header.header_length(); + let data_length = self + .total_length + .saturating_sub(self.network_header_length + u16::from(header_length)); + EbpfEventIpv6::new( unsafe { bpf_ktime_get_ns() }, self.ipv6_destination, self.ipv6_source, header.destination_port(), header.source_port(), - self.data_length, - self.data_length + header.header_length() as u16, + data_length, + self.total_length, header.window_size(), header.combined_flags(), self.protocol, - header.header_length(), + header_length, header.sequence_number(), header.sequence_number_ack(), header.icmp_type(), From 278d56429d76a8522da5c5c3d6ea5002d1c4319f Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:03:51 +0100 Subject: [PATCH 15/34] docs: note deferred stress testing options --- docs/performance-roadmap.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md index fff966b9..60d7b921 100644 --- a/docs/performance-roadmap.md +++ b/docs/performance-roadmap.md @@ -11,6 +11,7 @@ Use this as an execution checklist, not as a design essay. - [ ] Prefer hot-path wins over broad rewrites. - [ ] Do not trade away feature correctness for speed without making that trade explicit. - [ ] Keep performance work in clean, bounded commits. +- [ ] After recent ingestion-semantics fixes, stabilize and measure before expanding the eBPF event payload further. ## Phase 0: Baseline And Profiling @@ -197,6 +198,34 @@ Medium. Important once export volume becomes the limiter. Why this matters: If RustiFlow is going to chase higher link rates, it needs good self-observability. +## Deferred Stress Testing Notes + +- [ ] Remember that `10Gbps+` software-path testing is possible without a physical external link. 
+- [ ] Prefer doing this on an actual Linux development machine instead of macOS. +- [ ] Treat software-only stress testing as useful for RustiFlow and eBPF/userspace throughput, but not as a full substitute for real NIC validation. + +Practical options for later: + +- `veth` pair + network namespaces + RustiFlow on one side +- Linux `pktgen` for high packet-rate stress +- TRex for more realistic replay and traffic profiles +- MoonGen for high-rate scripted generation + +What this is good for: + +- packet-rate pressure +- flow-table pressure +- eBPF event rate +- userspace queueing and dropped packets +- export throughput + +What this does not fully prove: + +- physical NIC behavior +- PCIe and DMA effects +- hardware offloads +- real RSS / hardware queue behavior + ## Not Early Priorities - [ ] Do not start with micro-optimizing individual feature modules before fixing keying and stats math. @@ -222,3 +251,4 @@ If RustiFlow is going to chase higher link rates, it needs good self-observabili - Use short dated notes here when a measurement or optimization changes priorities. - If a planned optimization turns out not to matter, mark it done and note that it was ruled out. +- 2026-03-25: Decision: stabilize and measure after the current timestamp and length/header-length alignment work before adding more packet metadata to eBPF events. 
From b787c8612553d550c7de77eafaad40be8ef33244 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:03:54 +0100 Subject: [PATCH 16/34] features: preserve sub-millisecond timing precision --- rustiflow/src/flows/features/iat_stats.rs | 30 ++++---- rustiflow/src/flows/features/timing_stats.rs | 74 ++++++++++++------- rustiflow/src/flows/nf_flow.rs | 12 +-- .../src/tests/flows/feature_modules_test.rs | 46 +++++++++++- 4 files changed, 110 insertions(+), 52 deletions(-) diff --git a/rustiflow/src/flows/features/iat_stats.rs b/rustiflow/src/flows/features/iat_stats.rs index 9338b351..b5f90850 100644 --- a/rustiflow/src/flows/features/iat_stats.rs +++ b/rustiflow/src/flows/features/iat_stats.rs @@ -7,9 +7,9 @@ pub struct IATStats { pub fwd_iat: FeatureStats, pub bwd_iat: FeatureStats, pub iat: FeatureStats, - last_timestamp_fwd_ms: Option, - last_timestamp_bwd_ms: Option, - last_timestamp_ms: Option, + last_timestamp_fwd_us: Option, + last_timestamp_bwd_us: Option, + last_timestamp_us: Option, } impl IATStats { @@ -18,36 +18,36 @@ impl IATStats { fwd_iat: FeatureStats::new(), bwd_iat: FeatureStats::new(), iat: FeatureStats::new(), - last_timestamp_fwd_ms: None, - last_timestamp_bwd_ms: None, - last_timestamp_ms: None, + last_timestamp_fwd_us: None, + last_timestamp_bwd_us: None, + last_timestamp_us: None, } } } impl FlowFeature for IATStats { fn update(&mut self, packet: &PacketFeatures, is_forward: bool, _last_timestamp_us: i64) { - let current_ts_ms = packet.timestamp_us / 1000; + let current_ts_us = packet.timestamp_us; - let duration_ms = |last_timestamp_ms: Option| { - last_timestamp_ms.map(|ts_ms| (current_ts_ms - ts_ms) as f64) + let duration_ms = |last_timestamp_us: Option| { + last_timestamp_us.map(|ts_us| (current_ts_us - ts_us) as f64 / 1_000.0) }; - if let Some(dur) = duration_ms(self.last_timestamp_ms) { + if let Some(dur) = duration_ms(self.last_timestamp_us) { self.iat.add_value(dur); } - 
self.last_timestamp_ms = Some(current_ts_ms); + self.last_timestamp_us = Some(current_ts_us); if is_forward { - if let Some(dur) = duration_ms(self.last_timestamp_fwd_ms) { + if let Some(dur) = duration_ms(self.last_timestamp_fwd_us) { self.fwd_iat.add_value(dur); } - self.last_timestamp_fwd_ms = Some(current_ts_ms); + self.last_timestamp_fwd_us = Some(current_ts_us); } else { - if let Some(dur) = duration_ms(self.last_timestamp_bwd_ms) { + if let Some(dur) = duration_ms(self.last_timestamp_bwd_us) { self.bwd_iat.add_value(dur); } - self.last_timestamp_bwd_ms = Some(current_ts_ms); + self.last_timestamp_bwd_us = Some(current_ts_us); } } diff --git a/rustiflow/src/flows/features/timing_stats.rs b/rustiflow/src/flows/features/timing_stats.rs index ea85dfac..a6c1ecea 100644 --- a/rustiflow/src/flows/features/timing_stats.rs +++ b/rustiflow/src/flows/features/timing_stats.rs @@ -4,54 +4,74 @@ use super::util::FlowFeature; #[derive(Clone)] pub struct TimingStats { - pub first_timestamp_fwd_ms: Option, - pub first_timestamp_bwd_ms: Option, - pub last_timestamp_fwd_ms: Option, - pub last_timestamp_bwd_ms: Option, + first_timestamp_fwd_us: Option, + first_timestamp_bwd_us: Option, + last_timestamp_fwd_us: Option, + last_timestamp_bwd_us: Option, } impl TimingStats { pub fn new() -> Self { TimingStats { - first_timestamp_fwd_ms: None, - first_timestamp_bwd_ms: None, - last_timestamp_fwd_ms: None, - last_timestamp_bwd_ms: None, + first_timestamp_fwd_us: None, + first_timestamp_bwd_us: None, + last_timestamp_fwd_us: None, + last_timestamp_bwd_us: None, } } - pub fn get_fwd_duration(&self) -> i64 { - if let (Some(first), Some(last)) = (self.first_timestamp_fwd_ms, self.last_timestamp_fwd_ms) + pub fn first_timestamp_fwd_ms(&self) -> f64 { + self.first_timestamp_fwd_us + .map_or(0.0, |timestamp_us| timestamp_us as f64 / 1_000.0) + } + + pub fn first_timestamp_bwd_ms(&self) -> f64 { + self.first_timestamp_bwd_us + .map_or(0.0, |timestamp_us| timestamp_us as f64 / 1_000.0) + } + 
+ pub fn last_timestamp_fwd_ms(&self) -> f64 { + self.last_timestamp_fwd_us + .map_or(0.0, |timestamp_us| timestamp_us as f64 / 1_000.0) + } + + pub fn last_timestamp_bwd_ms(&self) -> f64 { + self.last_timestamp_bwd_us + .map_or(0.0, |timestamp_us| timestamp_us as f64 / 1_000.0) + } + + pub fn get_fwd_duration(&self) -> f64 { + if let (Some(first), Some(last)) = (self.first_timestamp_fwd_us, self.last_timestamp_fwd_us) { - last - first + (last - first) as f64 / 1_000.0 } else { - 0 + 0.0 } } - pub fn get_bwd_duration(&self) -> i64 { - if let (Some(first), Some(last)) = (self.first_timestamp_bwd_ms, self.last_timestamp_bwd_ms) + pub fn get_bwd_duration(&self) -> f64 { + if let (Some(first), Some(last)) = (self.first_timestamp_bwd_us, self.last_timestamp_bwd_us) { - last - first + (last - first) as f64 / 1_000.0 } else { - 0 + 0.0 } } } impl FlowFeature for TimingStats { fn update(&mut self, packet: &PacketFeatures, is_forward: bool, _last_timestamp_us: i64) { - let current_ts = packet.timestamp_us / 1000; + let current_ts = packet.timestamp_us; if is_forward { - if self.first_timestamp_fwd_ms.is_none() { - self.first_timestamp_fwd_ms = Some(current_ts); + if self.first_timestamp_fwd_us.is_none() { + self.first_timestamp_fwd_us = Some(current_ts); } - self.last_timestamp_fwd_ms = Some(current_ts); + self.last_timestamp_fwd_us = Some(current_ts); } else { - if self.first_timestamp_bwd_ms.is_none() { - self.first_timestamp_bwd_ms = Some(current_ts); + if self.first_timestamp_bwd_us.is_none() { + self.first_timestamp_bwd_us = Some(current_ts); } - self.last_timestamp_bwd_ms = Some(current_ts); + self.last_timestamp_bwd_us = Some(current_ts); } } @@ -62,10 +82,10 @@ impl FlowFeature for TimingStats { fn dump(&self) -> String { format!( "{},{},{},{},{},{}", - self.first_timestamp_fwd_ms.unwrap_or(0), - self.first_timestamp_bwd_ms.unwrap_or(0), - self.last_timestamp_fwd_ms.unwrap_or(0), - self.last_timestamp_bwd_ms.unwrap_or(0), + self.first_timestamp_fwd_ms(), + 
self.first_timestamp_bwd_ms(), + self.last_timestamp_fwd_ms(), + self.last_timestamp_bwd_ms(), self.get_fwd_duration(), self.get_bwd_duration() ) diff --git a/rustiflow/src/flows/nf_flow.rs b/rustiflow/src/flows/nf_flow.rs index 27b43dda..38eaf82b 100644 --- a/rustiflow/src/flows/nf_flow.rs +++ b/rustiflow/src/flows/nf_flow.rs @@ -145,17 +145,13 @@ impl Flow for NfFlow { self.basic_flow.get_flow_duration_msec(), self.packet_len_stats.flow_count(), self.packet_len_stats.flow_total(), - self.timing_stats - .first_timestamp_fwd_ms - .unwrap_or_else(|| 0), - self.timing_stats.last_timestamp_fwd_ms.unwrap_or_else(|| 0), + self.timing_stats.first_timestamp_fwd_ms(), + self.timing_stats.last_timestamp_fwd_ms(), self.timing_stats.get_fwd_duration(), self.packet_len_stats.fwd_packet_len.get_count(), self.packet_len_stats.fwd_packet_len.get_total(), - self.timing_stats - .first_timestamp_bwd_ms - .unwrap_or_else(|| 0), - self.timing_stats.last_timestamp_bwd_ms.unwrap_or_else(|| 0), + self.timing_stats.first_timestamp_bwd_ms(), + self.timing_stats.last_timestamp_bwd_ms(), self.timing_stats.get_bwd_duration(), self.packet_len_stats.bwd_packet_len.get_count(), self.packet_len_stats.bwd_packet_len.get_total(), diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index 9a4aa5e9..273bc294 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -5,9 +5,10 @@ mod tests { use crate::{ flows::{ features::{ - active_idle_stats::ActiveIdleStats, icmp_stats::IcmpStats, + active_idle_stats::ActiveIdleStats, iat_stats::IATStats, icmp_stats::IcmpStats, payload_stats::PayloadLengthStats, retransmission_stats::RetransmissionStats, - subflow_stats::SubflowStats, util::FlowFeature, window_size_stats::WindowSizeStats, + subflow_stats::SubflowStats, timing_stats::TimingStats, util::FlowFeature, + window_size_stats::WindowSizeStats, }, util::FlowExpireCause, }, @@ -164,4 
+165,45 @@ mod tests { assert_eq!(stats.idle_stats.get_total(), 9_000.0); assert_eq!(stats.idle_stats.get_count(), 2); } + + #[test] + fn iat_stats_preserve_sub_millisecond_precision() { + let mut stats = IATStats::new(); + + let first = packet(1_000_000); + stats.update(&first, true, first.timestamp_us); + + let second = packet(1_000_500); + stats.update(&second, true, first.timestamp_us); + + let third = packet(1_001_250); + stats.update(&third, false, second.timestamp_us); + + assert_eq!(stats.fwd_iat.get_count(), 1); + assert!((stats.fwd_iat.get_mean() - 0.5).abs() < f64::EPSILON); + assert_eq!(stats.iat.get_count(), 2); + assert!((stats.iat.get_total() - 1.25).abs() < f64::EPSILON); + } + + #[test] + fn timing_stats_preserve_sub_millisecond_precision() { + let mut stats = TimingStats::new(); + + let first = packet(1_000_000); + stats.update(&first, true, first.timestamp_us); + + let second = packet(1_000_750); + stats.update(&second, true, first.timestamp_us); + + let third = packet(1_001_250); + stats.update(&third, false, second.timestamp_us); + + let fourth = packet(1_002_125); + stats.update(&fourth, false, third.timestamp_us); + + assert!((stats.first_timestamp_fwd_ms() - 1_000.0).abs() < f64::EPSILON); + assert!((stats.last_timestamp_fwd_ms() - 1_000.75).abs() < f64::EPSILON); + assert!((stats.get_fwd_duration() - 0.75).abs() < f64::EPSILON); + assert!((stats.get_bwd_duration() - 0.875).abs() < f64::EPSILON); + } } From 1c5cde266a94d0f8bb0d69673449971d4cf21aa5 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:11:13 +0100 Subject: [PATCH 17/34] docs: condense agent workplan --- AGENTS.md | 62 +++++++++++++++------------------------ docs/engineering-notes.md | 18 ++++++++++++ 2 files changed, 42 insertions(+), 38 deletions(-) create mode 100644 docs/engineering-notes.md diff --git a/AGENTS.md b/AGENTS.md index 7ac7799b..73d73eeb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -98,18 +98,18 @@ If 
a change touches shared code used by multiple crates, prefer checking the wor - Treat the current test suite carefully: some tests may be stale or incomplete relative to the active code. - When adding or repairing tests, prefer tests that reflect the current flow architecture and public behavior rather than resurrecting outdated internal field expectations. -## Feature Engineering Priorities +## Engineering Checklist -When deciding where to spend effort on RustiFlow features, prefer this order: +Keep this section short and current. Longer decision history belongs in +`docs/engineering-notes.md`. -1. Semantic correctness across ingestion modes -2. Quality of existing transport and timing features -3. New diagnostic features with clear operational value -4. Exporter-specific schema completeness +### 1. Ingestion semantics -### 1. Keep offline and realtime semantically aligned +- [x] Preserve kernel capture timestamps in realtime events. +- [x] Align realtime packet, header, and payload length semantics with offline mode. +- [ ] Stabilize and measure before expanding the eBPF event payload further. -Highest-leverage files: +Primary files: - `rustiflow/src/packet_features.rs` - `rustiflow/src/pcap.rs` @@ -118,15 +118,15 @@ Highest-leverage files: - `ebpf-ipv4/src/main.rs` - `ebpf-ipv6/src/main.rs` -Focus areas: +### 2. Existing feature families -- Prefer improving packet metadata fidelity before adding many new derived features. -- Treat timestamp semantics, packet length semantics, and parser coverage as foundational. -- If offline and realtime do not mean the same thing, higher-level flow features are less trustworthy. +- [x] Preserve sub-millisecond timing and IAT precision. +- [ ] Improve retransmission detection beyond exact duplicate TCP sequence numbers. +- [ ] Revisit active/idle and subflow threshold behavior. +- [ ] Expand ICMP behavior beyond first seen type/code. +- [ ] Make TCP lifecycle quality more explicit. -### 2. 
Strengthen existing weak feature families before adding many new columns - -Most likely to benefit from refinement: +Primary files: - `rustiflow/src/flows/features/retransmission_stats.rs` - `rustiflow/src/flows/features/iat_stats.rs` @@ -136,36 +136,22 @@ Most likely to benefit from refinement: - `rustiflow/src/flows/features/icmp_stats.rs` - `rustiflow/src/flows/basic_flow.rs` -Concrete priorities: - -- Upgrade retransmission logic beyond simple repeated TCP sequence numbers. -- Preserve more timing precision for short flows and LAN traffic. -- Revisit hardcoded active/idle and subflow thresholds. -- Make TCP lifecycle quality more explicit: handshake success, reset behavior, close style. -- Expand ICMP handling beyond first seen type/code. - -### 3. Highest-value new feature families - -Prefer new features that explain behavior, not just more totals and moments. - -Best candidates: +### 3. New diagnostic features -- TCP quality: handshake completion, SYN-to-SYN/ACK timing, duplicate ACKs, zero-window events, reset phase, close style. -- IP/path signals: TTL or hop-limit stats, DSCP/ECN stats, IPv4 fragmentation, IPv6 fragment behavior. -- ICMP behavior: request/response balance, error classes, unreachable subtype counts. -- Optional lightweight application-aware features: DNS, TLS, HTTP, QUIC handshake metadata without deep DPI dependence. -- Better contamination-free abstractions than only coarse IANA port buckets. +- [ ] TCP quality signals: handshake completion, duplicate ACKs, zero-window events, reset phase, close style. +- [ ] IP and path signals: TTL or hop-limit, DSCP or ECN, fragmentation behavior. +- [ ] Optional lightweight application-aware metadata: DNS, TLS, HTTP, QUIC. +- [ ] Better contamination-free abstractions than only coarse IANA port buckets. -### 4. Exporter-specific gaps worth filling +### 4. Exporter gaps -- `rustiflow/src/flows/nf_flow.rs` still documents missing fields such as `ip_version`, `vlan_id`, and `tunnel_id`. 
-- Prefer filling exporter gaps after the underlying packet metadata exists in both offline and realtime paths. +- [ ] Fill `nf_flow` gaps such as `ip_version`, `vlan_id`, and `tunnel_id` once packet metadata exists in both ingestion modes. ### Working rule Before adding a new feature, ask: - Is the underlying packet metadata trustworthy in both offline and realtime modes? -- Does this feature improve diagnostics or model usefulness more than refining an existing weak feature? -- Can it be expressed as a reusable `FlowFeature` instead of exporter-specific logic? +- Does this improve diagnostics more than refining an existing weak feature? +- Can it live in a reusable `FlowFeature`? - Can it be tested with a tiny deterministic fixture? diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md new file mode 100644 index 00000000..ba9bb6d9 --- /dev/null +++ b/docs/engineering-notes.md @@ -0,0 +1,18 @@ +# Engineering Notes + +This file keeps short-lived design choices and execution notes that would make +`AGENTS.md` too long. + +## 2026-03-25 + +- Use branch `codex/ingestion-semantics-foundation` for the AGENTS-driven + improvement track instead of landing exploratory changes directly on `main`. +- Prefer performance-aware correctness for ingestion work so foundational + metadata changes do not need to be redone later. +- Realtime packet events now carry kernel monotonic timestamps and aligned + packet/header/payload length semantics. Stabilize and measure before adding + more event fields. +- Timing and IAT features now preserve sub-millisecond precision internally. +- Retransmission work should stay bounded: fix non-TCP false positives, move + beyond exact duplicate sequence numbers, and leave richer TCP quality signals + such as duplicate ACKs and handshake analysis for later checklist items. 
From 6673889cd801400777a3f5031713cb82c6d2ce47 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:11:24 +0100 Subject: [PATCH 18/34] features: refine retransmission tracking --- AGENTS.md | 2 +- docs/engineering-notes.md | 3 + .../flows/features/retransmission_stats.rs | 115 ++++++++++++++---- .../src/tests/flows/feature_modules_test.rs | 52 +++++++- 4 files changed, 144 insertions(+), 28 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 73d73eeb..f13a1622 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -121,7 +121,7 @@ Primary files: ### 2. Existing feature families - [x] Preserve sub-millisecond timing and IAT precision. -- [ ] Improve retransmission detection beyond exact duplicate TCP sequence numbers. +- [x] Improve retransmission detection beyond exact duplicate TCP sequence numbers. - [ ] Revisit active/idle and subflow threshold behavior. - [ ] Expand ICMP behavior beyond first seen type/code. - [ ] Make TCP lifecycle quality more explicit. diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index ba9bb6d9..ef77481c 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -16,3 +16,6 @@ This file keeps short-lived design choices and execution notes that would make - Retransmission work should stay bounded: fix non-TCP false positives, move beyond exact duplicate sequence numbers, and leave richer TCP quality signals such as duplicate ACKs and handshake analysis for later checklist items. +- Retransmission stats now stay TCP-only and count overlap in TCP sequence + space, including SYN and FIN sequence-number use, instead of only exact + duplicate sequence numbers. 
diff --git a/rustiflow/src/flows/features/retransmission_stats.rs b/rustiflow/src/flows/features/retransmission_stats.rs index a25e1a1b..ee728f8e 100644 --- a/rustiflow/src/flows/features/retransmission_stats.rs +++ b/rustiflow/src/flows/features/retransmission_stats.rs @@ -1,10 +1,8 @@ -use std::collections::HashSet; - use pnet::packet::ip::IpNextHeaderProtocols; use crate::{ flows::util::FlowExpireCause, - packet_features::{PacketFeatures, ACK_FLAG}, + packet_features::{PacketFeatures, ACK_FLAG, FIN_FLAG, SYN_FLAG}, }; use super::util::FlowFeature; @@ -13,48 +11,82 @@ use super::util::FlowFeature; pub struct RetransmissionStats { pub fwd_retransmission_count: u32, pub bwd_retransmission_count: u32, - // We store seen sequence numbers in each direction - fwd_seen_seqs: HashSet, - bwd_seen_seqs: HashSet, + fwd_seen_ranges: Vec, + bwd_seen_ranges: Vec, +} + +#[derive(Clone, Copy)] +struct SequenceRange { + start: u32, + end: u32, +} + +impl SequenceRange { + fn overlaps(self, other: SequenceRange) -> bool { + self.start < other.end && other.start < self.end + } } -/// Our implementation only tracks full retransmissions, i.e., packets that are -/// sent more than once. We do not track partial retransmissions or overlapping segments. +/// Retransmissions are tracked for TCP sequence-space segments only. +/// +/// This counts one retransmission when a TCP packet's sequence-space range +/// overlaps data or SYN/FIN sequence space that was already seen in the same +/// direction. It still does not try to model full TCP stream reassembly or +/// sequence-number wraparound. 
impl RetransmissionStats { pub fn new() -> Self { RetransmissionStats { fwd_retransmission_count: 0, bwd_retransmission_count: 0, - fwd_seen_seqs: HashSet::new(), - bwd_seen_seqs: HashSet::new(), + fwd_seen_ranges: Vec::new(), + bwd_seen_ranges: Vec::new(), } } + + fn update_direction( + ranges: &mut Vec, + retransmission_count: &mut u32, + packet: &PacketFeatures, + ) { + let Some(packet_range) = sequence_range(packet) else { + return; + }; + + if ranges + .iter() + .any(|seen_range| seen_range.overlaps(packet_range)) + { + *retransmission_count += 1; + } + + insert_sequence_range(ranges, packet_range); + } } impl FlowFeature for RetransmissionStats { fn update(&mut self, packet: &PacketFeatures, is_forward: bool, _last_timestamp_us: i64) { - let seq = packet.sequence_number; - - if packet.protocol == IpNextHeaderProtocols::Icmp.0 - || packet.protocol == IpNextHeaderProtocols::Icmpv6.0 - { - // Skip ICMP packets + if packet.protocol != IpNextHeaderProtocols::Tcp.0 { + // Retransmission stats are TCP-only. return; } - // Exclude pure ACKs (only ACK flag set, no data length) + // Exclude pure ACKs (only ACK flag set, no data length). 
if packet.flags == ACK_FLAG && packet.data_length == 0 { return; } if is_forward { - if !self.fwd_seen_seqs.insert(seq) { - self.fwd_retransmission_count += 1; - } + Self::update_direction( + &mut self.fwd_seen_ranges, + &mut self.fwd_retransmission_count, + packet, + ); } else { - if !self.bwd_seen_seqs.insert(seq) { - self.bwd_retransmission_count += 1; - } + Self::update_direction( + &mut self.bwd_seen_ranges, + &mut self.bwd_retransmission_count, + packet, + ); } } @@ -75,3 +107,40 @@ impl FlowFeature for RetransmissionStats { "flow_retransmission_count,fwd_retransmission_count,bwd_retransmission_count".to_string() } } + +fn sequence_range(packet: &PacketFeatures) -> Option { + let control_length = + u32::from((packet.flags & SYN_FLAG) != 0) + u32::from((packet.flags & FIN_FLAG) != 0); + let segment_length = u32::from(packet.data_length) + control_length; + + if segment_length == 0 { + return None; + } + + Some(SequenceRange { + start: packet.sequence_number, + end: packet.sequence_number.saturating_add(segment_length), + }) +} + +fn insert_sequence_range(ranges: &mut Vec, mut new_range: SequenceRange) { + let mut index = 0; + + while index < ranges.len() { + let current = ranges[index]; + if current.end < new_range.start { + index += 1; + continue; + } + + if new_range.end < current.start { + break; + } + + new_range.start = new_range.start.min(current.start); + new_range.end = new_range.end.max(current.end); + ranges.remove(index); + } + + ranges.insert(index, new_range); +} diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index 273bc294..f779f019 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -12,7 +12,7 @@ mod tests { }, util::FlowExpireCause, }, - packet_features::{PacketFeatures, ACK_FLAG}, + packet_features::{PacketFeatures, ACK_FLAG, FIN_FLAG, SYN_FLAG}, }; fn packet(timestamp_us: i64) -> PacketFeatures { @@ -43,7 
+43,7 @@ mod tests { } #[test] - fn retransmission_stats_skip_pure_acks_and_icmp_and_track_duplicates_by_direction() { + fn retransmission_stats_only_track_tcp_overlap_by_direction() { let mut stats = RetransmissionStats::new(); let mut pure_ack = packet(1_000_000); @@ -54,6 +54,12 @@ mod tests { stats.update(&pure_ack, true, pure_ack.timestamp_us); stats.update(&pure_ack, true, pure_ack.timestamp_us); + let mut udp = packet(1_250_000); + udp.protocol = IpNextHeaderProtocols::Udp.0; + udp.sequence_number = 0; + stats.update(&udp, true, pure_ack.timestamp_us); + stats.update(&udp, true, pure_ack.timestamp_us); + let mut icmp = packet(1_500_000); icmp.protocol = IpNextHeaderProtocols::Icmp.0; icmp.sequence_number = 22; @@ -63,20 +69,58 @@ mod tests { let mut fwd = packet(2_000_000); fwd.protocol = IpNextHeaderProtocols::Tcp.0; fwd.sequence_number = 100; + fwd.data_length = 100; stats.update(&fwd, true, pure_ack.timestamp_us); - stats.update(&fwd, true, pure_ack.timestamp_us); + + let mut partial_fwd = packet(2_100_000); + partial_fwd.protocol = IpNextHeaderProtocols::Tcp.0; + partial_fwd.sequence_number = 150; + partial_fwd.data_length = 100; + stats.update(&partial_fwd, true, fwd.timestamp_us); let mut bwd = packet(2_500_000); bwd.protocol = IpNextHeaderProtocols::Tcp.0; + bwd.flags = SYN_FLAG; + bwd.syn_flag = 1; bwd.sequence_number = 200; stats.update(&bwd, false, fwd.timestamp_us); - stats.update(&bwd, false, fwd.timestamp_us); + + let mut duplicate_syn = packet(2_600_000); + duplicate_syn.protocol = IpNextHeaderProtocols::Tcp.0; + duplicate_syn.flags = SYN_FLAG; + duplicate_syn.syn_flag = 1; + duplicate_syn.sequence_number = 200; + stats.update(&duplicate_syn, false, bwd.timestamp_us); assert_eq!(stats.fwd_retransmission_count, 1); assert_eq!(stats.bwd_retransmission_count, 1); assert_eq!(stats.dump(), "2,1,1"); } + #[test] + fn retransmission_stats_treat_fin_sequence_space_as_retransmittable() { + let mut stats = RetransmissionStats::new(); + + let mut fin = 
packet(1_000_000); + fin.protocol = IpNextHeaderProtocols::Tcp.0; + fin.flags = FIN_FLAG | ACK_FLAG; + fin.fin_flag = 1; + fin.ack_flag = 1; + fin.sequence_number = 500; + stats.update(&fin, true, fin.timestamp_us); + + let mut duplicate_fin = packet(1_100_000); + duplicate_fin.protocol = IpNextHeaderProtocols::Tcp.0; + duplicate_fin.flags = FIN_FLAG | ACK_FLAG; + duplicate_fin.fin_flag = 1; + duplicate_fin.ack_flag = 1; + duplicate_fin.sequence_number = 500; + stats.update(&duplicate_fin, true, fin.timestamp_us); + + assert_eq!(stats.fwd_retransmission_count, 1); + assert_eq!(stats.bwd_retransmission_count, 0); + } + #[test] fn window_size_stats_capture_initial_sizes_for_each_direction() { let mut stats = WindowSizeStats::new(); From 51f37f581915fb96e5e626af6ea7867569017fa3 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:16:31 +0100 Subject: [PATCH 19/34] features: tighten active and subflow thresholds --- AGENTS.md | 2 +- docs/engineering-notes.md | 3 +++ .../src/flows/features/active_idle_stats.rs | 15 +++++++------ rustiflow/src/flows/features/subflow_stats.rs | 12 +++++++++- .../src/tests/flows/feature_modules_test.rs | 22 +++++++++++++++++-- 5 files changed, 43 insertions(+), 11 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index f13a1622..427944fd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -122,7 +122,7 @@ Primary files: - [x] Preserve sub-millisecond timing and IAT precision. - [x] Improve retransmission detection beyond exact duplicate TCP sequence numbers. -- [ ] Revisit active/idle and subflow threshold behavior. +- [x] Revisit active/idle and subflow threshold behavior. - [ ] Expand ICMP behavior beyond first seen type/code. - [ ] Make TCP lifecycle quality more explicit. 
diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index ef77481c..d7ae0690 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -19,3 +19,6 @@ This file keeps short-lived design choices and execution notes that would make - Retransmission stats now stay TCP-only and count overlap in TCP sequence space, including SYN and FIN sequence-number use, instead of only exact duplicate sequence numbers. +- Active/idle tracking now compares thresholds in microseconds before converting + to exported millisecond values, and subflow counting now represents actual + subflows instead of only counting gap boundaries after the first packet. diff --git a/rustiflow/src/flows/features/active_idle_stats.rs b/rustiflow/src/flows/features/active_idle_stats.rs index 9e801d28..b8643756 100644 --- a/rustiflow/src/flows/features/active_idle_stats.rs +++ b/rustiflow/src/flows/features/active_idle_stats.rs @@ -2,7 +2,7 @@ use crate::{flows::util::FlowExpireCause, packet_features::PacketFeatures}; use super::util::{FeatureStats, FlowFeature}; -const ACTIVE_IDLE_TIMEOUT: i64 = 5_000; // 5s +const ACTIVE_IDLE_TIMEOUT_US: i64 = 5_000_000; // 5s #[derive(Clone)] pub struct ActiveIdleStats { @@ -26,14 +26,15 @@ impl ActiveIdleStats { impl FlowFeature for ActiveIdleStats { fn update(&mut self, packet: &PacketFeatures, _is_forward: bool, _last_timestamp_us: i64) { let current_ts = packet.timestamp_us; - let duration_ms = (current_ts - self.active_end) / 1_000; // Convert to milliseconds + let idle_gap_us = current_ts - self.active_end; - if duration_ms > ACTIVE_IDLE_TIMEOUT { - let active_duration = (self.active_end - self.active_start) / 1_000; - if active_duration > 0 { - self.active_stats.add_value(active_duration as f64); + if idle_gap_us > ACTIVE_IDLE_TIMEOUT_US { + let active_duration_us = self.active_end - self.active_start; + if active_duration_us > 0 { + self.active_stats + .add_value(active_duration_us as f64 / 1_000.0); } - 
self.idle_stats.add_value(duration_ms as f64); + self.idle_stats.add_value(idle_gap_us as f64 / 1_000.0); self.active_start = current_ts; } self.active_end = current_ts; diff --git a/rustiflow/src/flows/features/subflow_stats.rs b/rustiflow/src/flows/features/subflow_stats.rs index bc9b883b..5a8790b5 100644 --- a/rustiflow/src/flows/features/subflow_stats.rs +++ b/rustiflow/src/flows/features/subflow_stats.rs @@ -7,16 +7,26 @@ const SUBFLOW_TIMEOUT: i64 = 1_000_000; // 1s #[derive(Clone)] pub struct SubflowStats { pub subflow_count: u32, + seen_packet: bool, } impl SubflowStats { pub fn new() -> Self { - SubflowStats { subflow_count: 0 } + SubflowStats { + subflow_count: 0, + seen_packet: false, + } } } impl FlowFeature for SubflowStats { fn update(&mut self, packet: &PacketFeatures, _is_forward: bool, last_timestamp_us: i64) { + if !self.seen_packet { + self.subflow_count = 1; + self.seen_packet = true; + return; + } + let current_ts = packet.timestamp_us; if (current_ts - last_timestamp_us) > SUBFLOW_TIMEOUT { self.subflow_count += 1; diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index f779f019..234aed49 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -173,7 +173,7 @@ mod tests { } #[test] - fn subflow_stats_increment_only_on_gaps_greater_than_one_second() { + fn subflow_stats_count_initial_subflow_and_increment_only_on_gaps_greater_than_one_second() { let mut stats = SubflowStats::new(); let first_ts = 1_000_000; @@ -189,7 +189,7 @@ mod tests { let third = packet(third_ts); stats.update(&third, true, second_ts); - assert_eq!(stats.subflow_count, 1); + assert_eq!(stats.subflow_count, 2); } #[test] @@ -210,6 +210,24 @@ mod tests { assert_eq!(stats.idle_stats.get_count(), 2); } + #[test] + fn active_idle_stats_preserve_gap_precision_and_exact_threshold_behavior() { + let first = packet(1_000_000); + let mut stats = 
ActiveIdleStats::new(first.timestamp_us); + stats.update(&first, true, 0); + + let exact_threshold = packet(6_000_000); + stats.update(&exact_threshold, false, first.timestamp_us); + + let over_threshold = packet(11_000_500); + stats.update(&over_threshold, true, exact_threshold.timestamp_us); + + assert_eq!(stats.active_stats.get_count(), 1); + assert!((stats.active_stats.get_total() - 5_000.0).abs() < f64::EPSILON); + assert_eq!(stats.idle_stats.get_count(), 1); + assert!((stats.idle_stats.get_total() - 5_000.5).abs() < f64::EPSILON); + } + #[test] fn iat_stats_preserve_sub_millisecond_precision() { let mut stats = IATStats::new(); From e5ee6dd739e6ada2025526fbd01e4e6407ee63a3 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:23:47 +0100 Subject: [PATCH 20/34] features: expand icmp behavior tracking --- AGENTS.md | 2 +- README.md | 4 +- docs/engineering-notes.md | 3 + rustiflow/src/flows/cic_flow.rs | 20 ++++++- rustiflow/src/flows/custom_flow.rs | 20 +++++-- rustiflow/src/flows/features/icmp_stats.rs | 57 ++++++++++++++++++- .../src/tests/flows/feature_modules_test.rs | 19 ++++++- 7 files changed, 110 insertions(+), 15 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 427944fd..4ceed600 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -123,7 +123,7 @@ Primary files: - [x] Preserve sub-millisecond timing and IAT precision. - [x] Improve retransmission detection beyond exact duplicate TCP sequence numbers. - [x] Revisit active/idle and subflow threshold behavior. -- [ ] Expand ICMP behavior beyond first seen type/code. +- [x] Expand ICMP behavior beyond first seen type/code. - [ ] Make TCP lifecycle quality more explicit. 
Primary files: diff --git a/README.md b/README.md index fbf449a1..2fcbffdc 100644 --- a/README.md +++ b/README.md @@ -217,10 +217,10 @@ Options: Possible values: - basic: A basic flow that stores the basic features of a flow - - cic: Represents the CIC Flow, giving 83 features + - cic: Represents the CIC Flow, giving 87 features - cidds: Represents the CIDDS Flow, giving 10 features - nfstream: Represents a nfstream inspired flow, giving 69 features - - rustiflow: Represents the Rusti Flow, giving 120 features + - rustiflow: Represents the Rusti Flow, giving 124 features - custom: Represents a flow that you can implement yourself --active-timeout diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index d7ae0690..73b950ca 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -22,3 +22,6 @@ This file keeps short-lived design choices and execution notes that would make - Active/idle tracking now compares thresholds in microseconds before converting to exported millisecond values, and subflow counting now represents actual subflows instead of only counting gap boundaries after the first packet. +- ICMP stats now keep the original first seen type and code, but also track + echo request and reply counts plus error and destination-unreachable counts + across ICMPv4 and ICMPv6 traffic. 
diff --git a/rustiflow/src/flows/cic_flow.rs b/rustiflow/src/flows/cic_flow.rs index 375faccd..b2bae8e8 100644 --- a/rustiflow/src/flows/cic_flow.rs +++ b/rustiflow/src/flows/cic_flow.rs @@ -117,7 +117,7 @@ impl Flow for CicFlow { {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{}", + {},{},{},{},{},{},{}", // Basic Info self.basic_flow.flow_key, self.basic_flow.ip_source, @@ -252,6 +252,10 @@ impl Flow for CicFlow { // ICMP Stats self.icmp_stats.get_code(), self.icmp_stats.get_type(), + self.icmp_stats.echo_request_count, + self.icmp_stats.echo_reply_count, + self.icmp_stats.error_count, + self.icmp_stats.destination_unreachable_count, // Retransmission Stats self.retransmission_stats.fwd_retransmission_count, self.retransmission_stats.bwd_retransmission_count, @@ -354,6 +358,10 @@ impl Flow for CicFlow { "Idle Min", "ICMP Code", "ICMP Type", + "ICMP Echo Request Count", + "ICMP Echo Reply Count", + "ICMP Error Count", + "ICMP Destination Unreachable Count", "Fwd TCP Retrans. Count", "Bwd TCP Retrans. Count", "Total TCP Retrans. 
Count", @@ -372,7 +380,7 @@ impl Flow for CicFlow { {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{}", + {},{},{},{},{},{},{},{},{},{},{},{},{}", // Basic Info iana_port_mapping(self.basic_flow.port_source), iana_port_mapping(self.basic_flow.port_destination), @@ -503,6 +511,10 @@ impl Flow for CicFlow { // ICMP Stats self.icmp_stats.get_code(), self.icmp_stats.get_type(), + self.icmp_stats.echo_request_count, + self.icmp_stats.echo_reply_count, + self.icmp_stats.error_count, + self.icmp_stats.destination_unreachable_count, // Retransmission Stats self.retransmission_stats.fwd_retransmission_count, self.retransmission_stats.bwd_retransmission_count, @@ -601,6 +613,10 @@ impl Flow for CicFlow { "Idle Min", "ICMP Code", "ICMP Type", + "ICMP Echo Request Count", + "ICMP Echo Reply Count", + "ICMP Error Count", + "ICMP Destination Unreachable Count", "Fwd TCP Retrans. Count", "Bwd TCP Retrans. Count", "Total TCP Retrans. Count", diff --git a/rustiflow/src/flows/custom_flow.rs b/rustiflow/src/flows/custom_flow.rs index c707a213..a495c982 100644 --- a/rustiflow/src/flows/custom_flow.rs +++ b/rustiflow/src/flows/custom_flow.rs @@ -68,30 +68,38 @@ impl Flow for CustomFlow { fn dump(&self) -> String { // Add here the dump of the custom flow. format!( - "{},{},{}", + "{},{},{},{},{},{},{}", self.basic_flow.flow_key, self.icmp_stats.get_type(), - self.icmp_stats.get_code() + self.icmp_stats.get_code(), + self.icmp_stats.echo_request_count, + self.icmp_stats.echo_reply_count, + self.icmp_stats.error_count, + self.icmp_stats.destination_unreachable_count ) } fn get_features() -> String { // Add here the features of the custom flow. 
- format!("flow_id,icmp_type,icmp_code") + format!("flow_id,icmp_type,icmp_code,icmp_echo_request_count,icmp_echo_reply_count,icmp_error_count,icmp_destination_unreachable_count") } fn dump_without_contamination(&self) -> String { // Add here the dump of the custom flow without contaminant features. format!( - "{},{}", + "{},{},{},{},{},{}", self.icmp_stats.get_type(), - self.icmp_stats.get_code() + self.icmp_stats.get_code(), + self.icmp_stats.echo_request_count, + self.icmp_stats.echo_reply_count, + self.icmp_stats.error_count, + self.icmp_stats.destination_unreachable_count ) } fn get_features_without_contamination() -> String { // Add here the features of the custom flow without contaminant features. - format!("icmp_type,icmp_code") + format!("icmp_type,icmp_code,icmp_echo_request_count,icmp_echo_reply_count,icmp_error_count,icmp_destination_unreachable_count") } fn get_first_timestamp_us(&self) -> i64 { diff --git a/rustiflow/src/flows/features/icmp_stats.rs b/rustiflow/src/flows/features/icmp_stats.rs index 0c25b8f4..b5abe435 100644 --- a/rustiflow/src/flows/features/icmp_stats.rs +++ b/rustiflow/src/flows/features/icmp_stats.rs @@ -1,4 +1,5 @@ use crate::{flows::util::FlowExpireCause, packet_features::PacketFeatures}; +use pnet::packet::ip::IpNextHeaderProtocols; use super::util::FlowFeature; @@ -7,6 +8,10 @@ pub struct IcmpStats { pub first_packet: bool, pub icmp_type: Option, pub icmp_code: Option, + pub echo_request_count: u32, + pub echo_reply_count: u32, + pub error_count: u32, + pub destination_unreachable_count: u32, } impl IcmpStats { @@ -15,6 +20,10 @@ impl IcmpStats { first_packet: true, icmp_type: None, icmp_code: None, + echo_request_count: 0, + echo_reply_count: 0, + error_count: 0, + destination_unreachable_count: 0, } } @@ -25,16 +34,50 @@ impl IcmpStats { pub fn get_code(&self) -> i16 { self.icmp_code.map(|v| v as i16).unwrap_or(-1) } + + fn record_behavior(&mut self, packet: &PacketFeatures) { + match packet.protocol { + protocol if protocol 
== IpNextHeaderProtocols::Icmp.0 => match packet.icmp_type { + Some(8) => self.echo_request_count += 1, + Some(0) => self.echo_reply_count += 1, + Some(3) => { + self.error_count += 1; + self.destination_unreachable_count += 1; + } + Some(4 | 5 | 11 | 12) => self.error_count += 1, + _ => {} + }, + protocol if protocol == IpNextHeaderProtocols::Icmpv6.0 => match packet.icmp_type { + Some(128) => self.echo_request_count += 1, + Some(129) => self.echo_reply_count += 1, + Some(1) => { + self.error_count += 1; + self.destination_unreachable_count += 1; + } + Some(2 | 3 | 4) => self.error_count += 1, + _ => {} + }, + _ => {} + } + } } impl FlowFeature for IcmpStats { fn update(&mut self, packet: &PacketFeatures, _is_forward: bool, _last_timestamp_us: i64) { - // Set ICMP type and code for the first packet + if packet.protocol != IpNextHeaderProtocols::Icmp.0 + && packet.protocol != IpNextHeaderProtocols::Icmpv6.0 + { + return; + } + + // Set ICMP type and code for the first ICMP packet. if self.first_packet { self.icmp_type = packet.icmp_type; self.icmp_code = packet.icmp_code; self.first_packet = false; } + + self.record_behavior(packet); } fn close(&mut self, _last_timestamp_us: i64, _cause: FlowExpireCause) { @@ -42,10 +85,18 @@ impl FlowFeature for IcmpStats { } fn dump(&self) -> String { - format!("{},{}", self.get_type(), self.get_code()) + format!( + "{},{},{},{},{},{}", + self.get_type(), + self.get_code(), + self.echo_request_count, + self.echo_reply_count, + self.error_count, + self.destination_unreachable_count + ) } fn headers() -> String { - "icmp_type,icmp_code".to_string() + "icmp_type,icmp_code,icmp_echo_request_count,icmp_echo_reply_count,icmp_error_count,icmp_destination_unreachable_count".to_string() } } diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index 234aed49..9cd38c1b 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ 
b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -23,7 +23,7 @@ mod tests { } #[test] - fn icmp_stats_only_keep_first_packet_type_and_code() { + fn icmp_stats_keep_first_type_code_and_track_behavior_counts() { let mut stats = IcmpStats::new(); let mut first = packet(1_000_000); @@ -38,8 +38,25 @@ mod tests { second.icmp_code = Some(1); stats.update(&second, false, first.timestamp_us); + let mut third = packet(3_000_000); + third.protocol = IpNextHeaderProtocols::Icmpv6.0; + third.icmp_type = Some(129); + third.icmp_code = Some(0); + stats.update(&third, true, second.timestamp_us); + + let mut fourth = packet(4_000_000); + fourth.protocol = IpNextHeaderProtocols::Icmpv6.0; + fourth.icmp_type = Some(1); + fourth.icmp_code = Some(4); + stats.update(&fourth, false, third.timestamp_us); + assert_eq!(stats.get_type(), 8); assert_eq!(stats.get_code(), 0); + assert_eq!(stats.echo_request_count, 1); + assert_eq!(stats.echo_reply_count, 1); + assert_eq!(stats.error_count, 2); + assert_eq!(stats.destination_unreachable_count, 2); + assert_eq!(stats.dump(), "8,0,1,1,2,2"); } #[test] From c7c092c233ec14a9f54e2322a5d78b8bbdfb2c93 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:44:41 +0100 Subject: [PATCH 21/34] features: expose tcp lifecycle quality --- AGENTS.md | 2 +- README.md | 4 +- docs/engineering-notes.md | 3 + rustiflow/src/flows/basic_flow.rs | 54 +++++++++- rustiflow/src/flows/cic_flow.rs | 16 ++- rustiflow/src/flows/rusti_flow.rs | 20 +++- rustiflow/src/tests/flows/basic_flow_test.rs | 103 +++++++++++++++++++ 7 files changed, 192 insertions(+), 10 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 4ceed600..c7213263 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -124,7 +124,7 @@ Primary files: - [x] Improve retransmission detection beyond exact duplicate TCP sequence numbers. - [x] Revisit active/idle and subflow threshold behavior. - [x] Expand ICMP behavior beyond first seen type/code. 
-- [ ] Make TCP lifecycle quality more explicit. +- [x] Make TCP lifecycle quality more explicit. Primary files: diff --git a/README.md b/README.md index 2fcbffdc..2d5899ea 100644 --- a/README.md +++ b/README.md @@ -217,10 +217,10 @@ Options: Possible values: - basic: A basic flow that stores the basic features of a flow - - cic: Represents the CIC Flow, giving 87 features + - cic: Represents the CIC Flow, giving 90 features - cidds: Represents the CIDDS Flow, giving 10 features - nfstream: Represents a nfstream inspired flow, giving 69 features - - rustiflow: Represents the Rusti Flow, giving 124 features + - rustiflow: Represents the Rusti Flow, giving 127 features - custom: Represents a flow that you can implement yourself --active-timeout diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 73b950ca..62777cfc 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -25,3 +25,6 @@ This file keeps short-lived design choices and execution notes that would make - ICMP stats now keep the original first seen type and code, but also track echo request and reply counts plus error and destination-unreachable counts across ICMPv4 and ICMPv6 traffic. +- TCP lifecycle export now distinguishes observed handshake completion from + resets seen before or after that observed handshake, so richer flow schemas + do not have to infer lifecycle quality from flag totals alone. 
diff --git a/rustiflow/src/flows/basic_flow.rs b/rustiflow/src/flows/basic_flow.rs index f207e6e6..0f9dc3aa 100644 --- a/rustiflow/src/flows/basic_flow.rs +++ b/rustiflow/src/flows/basic_flow.rs @@ -1,6 +1,7 @@ use std::net::IpAddr; use chrono::{DateTime, Utc}; +use pnet::packet::ip::IpNextHeaderProtocols; use crate::{flows::util::iana_port_mapping, packet_features::PacketFeatures}; @@ -40,9 +41,48 @@ pub struct BasicFlow { pub(crate) state_bwd: FlowState, expected_ack_seq_fwd: Option, expected_ack_seq_bwd: Option, + saw_syn_fwd: bool, + saw_syn_ack_bwd: bool, + expected_handshake_ack_seq_fwd: Option, + pub tcp_handshake_completed: bool, + pub tcp_reset_before_handshake: bool, + pub tcp_reset_after_handshake: bool, } impl BasicFlow { + fn is_tcp(&self) -> bool { + self.protocol == IpNextHeaderProtocols::Tcp.0 + } + + fn observe_tcp_handshake(&mut self, packet: &PacketFeatures, forward: bool) { + if !self.is_tcp() || self.tcp_handshake_completed { + return; + } + + if forward && packet.syn_flag > 0 && packet.ack_flag == 0 { + self.saw_syn_fwd = true; + self.saw_syn_ack_bwd = false; + self.expected_handshake_ack_seq_fwd = None; + return; + } + + if !forward && self.saw_syn_fwd && packet.syn_flag > 0 && packet.ack_flag > 0 { + self.saw_syn_ack_bwd = true; + self.expected_handshake_ack_seq_fwd = Some(packet.sequence_number + 1); + return; + } + + if forward + && self.saw_syn_fwd + && self.saw_syn_ack_bwd + && packet.ack_flag > 0 + && packet.syn_flag == 0 + && Some(packet.sequence_number_ack) == self.expected_handshake_ack_seq_fwd + { + self.tcp_handshake_completed = true; + } + } + /// Checks if the flow is finished. 
/// /// A flow is considered finished when both FIN flags are set and the last ACK is received, @@ -140,18 +180,30 @@ impl Flow for BasicFlow { state_bwd: FlowState::Established, expected_ack_seq_fwd: None, expected_ack_seq_bwd: None, + saw_syn_fwd: false, + saw_syn_ack_bwd: false, + expected_handshake_ack_seq_fwd: None, + tcp_handshake_completed: false, + tcp_reset_before_handshake: false, + tcp_reset_after_handshake: false, } } fn update_flow(&mut self, packet: &PacketFeatures, fwd: bool) -> bool { self.last_timestamp_us = packet.timestamp_us; + self.observe_tcp_handshake(packet, fwd); if self.is_tcp_finished(packet, fwd) { self.flow_expire_cause = FlowExpireCause::TcpTermination; return true; } - if packet.rst_flag > 0 { + if self.is_tcp() && packet.rst_flag > 0 { + if self.tcp_handshake_completed { + self.tcp_reset_after_handshake = true; + } else { + self.tcp_reset_before_handshake = true; + } self.flow_expire_cause = FlowExpireCause::TcpReset; return true; } diff --git a/rustiflow/src/flows/cic_flow.rs b/rustiflow/src/flows/cic_flow.rs index b2bae8e8..98013f2a 100644 --- a/rustiflow/src/flows/cic_flow.rs +++ b/rustiflow/src/flows/cic_flow.rs @@ -117,7 +117,7 @@ impl Flow for CicFlow { {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{}", + {},{},{},{},{},{},{},{},{},{}", // Basic Info self.basic_flow.flow_key, self.basic_flow.ip_source, @@ -127,6 +127,9 @@ impl Flow for CicFlow { self.basic_flow.protocol, self.basic_flow.get_first_timestamp(), self.basic_flow.get_flow_duration_usec(), + u8::from(self.basic_flow.tcp_handshake_completed), + u8::from(self.basic_flow.tcp_reset_before_handshake), + u8::from(self.basic_flow.tcp_reset_after_handshake), // Packet Length Stats (fwd & bwd) self.payload_len_stats.fwd_payload_len.get_count(), self.payload_len_stats.bwd_payload_len.get_count(), @@ -277,6 +280,9 @@ impl Flow for CicFlow { "Protocol", "Timestamp", "Flow Duration", + "TCP Handshake 
Completed", + "TCP Reset Before Handshake", + "TCP Reset After Handshake", "Total Fwd Packet", "Total Bwd packets", "Total Length of Fwd Packet", @@ -380,12 +386,15 @@ impl Flow for CicFlow { {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{},{},{},{}", + {},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", // Basic Info iana_port_mapping(self.basic_flow.port_source), iana_port_mapping(self.basic_flow.port_destination), self.basic_flow.protocol, self.basic_flow.get_flow_duration_usec(), + u8::from(self.basic_flow.tcp_handshake_completed), + u8::from(self.basic_flow.tcp_reset_before_handshake), + u8::from(self.basic_flow.tcp_reset_after_handshake), // Packet Length Stats (fwd & bwd) self.payload_len_stats.fwd_payload_len.get_count(), self.payload_len_stats.bwd_payload_len.get_count(), @@ -532,6 +541,9 @@ impl Flow for CicFlow { "Dst Port (IANA)", "Protocol", "Flow Duration", + "TCP Handshake Completed", + "TCP Reset Before Handshake", + "TCP Reset After Handshake", "Total Fwd Packet", "Total Bwd packets", "Total Length of Fwd Packet", diff --git a/rustiflow/src/flows/rusti_flow.rs b/rustiflow/src/flows/rusti_flow.rs index fdcdbb6b..a2079e23 100644 --- a/rustiflow/src/flows/rusti_flow.rs +++ b/rustiflow/src/flows/rusti_flow.rs @@ -121,7 +121,7 @@ impl Flow for RustiFlow { "{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{}", + {},{},{},{},{},{}", // Basic Info self.basic_flow.flow_key, self.basic_flow.ip_source, @@ -133,6 +133,9 @@ impl Flow for RustiFlow { self.basic_flow.get_last_timestamp(), duration_us, self.basic_flow.flow_expire_cause.as_str(), + u8::from(self.basic_flow.tcp_handshake_completed), + u8::from(self.basic_flow.tcp_reset_before_handshake), + u8::from(self.basic_flow.tcp_reset_after_handshake), // Timing Stats self.timing_stats.dump(), // IAT Stats @@ -211,7 +214,7 @@ impl Flow for RustiFlow { 
"{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ - {},{},{}", + {},{},{},{},{},{}", // Basic Info "flow_id", "source_ip", @@ -223,6 +226,9 @@ impl Flow for RustiFlow { "timestamp_last", "flow_duration_us", "flow_expire_cause", + "tcp_handshake_completed", + "tcp_reset_before_handshake", + "tcp_reset_after_handshake", // Timing Stats TimingStats::headers(), // IAT Stats @@ -268,13 +274,16 @@ impl Flow for RustiFlow { format!( "{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{}\ - ,{},{},{},{},{},{},{},{},{}", + ,{},{},{},{},{},{},{},{},{},{},{},{}", // Basic Info iana_port_mapping(self.basic_flow.port_source), iana_port_mapping(self.basic_flow.port_destination), self.basic_flow.protocol, duration_us, self.basic_flow.flow_expire_cause.as_str(), + u8::from(self.basic_flow.tcp_handshake_completed), + u8::from(self.basic_flow.tcp_reset_before_handshake), + u8::from(self.basic_flow.tcp_reset_after_handshake), // Timing Stats self.timing_stats.get_fwd_duration(), self.timing_stats.get_bwd_duration(), @@ -353,13 +362,16 @@ impl Flow for RustiFlow { format!( "{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{}\ - ,{},{},{},{},{},{},{},{},{}", + ,{},{},{},{},{},{},{},{},{},{},{},{}", // Basic Info "source_port_iana", "destination_port_iana", "protocol", "flow_duration_us", "flow_expire_cause", + "tcp_handshake_completed", + "tcp_reset_before_handshake", + "tcp_reset_after_handshake", // Timing Stats "fwd_duration_ms", "bwd_duration_ms", diff --git a/rustiflow/src/tests/flows/basic_flow_test.rs b/rustiflow/src/tests/flows/basic_flow_test.rs index 7fbb4161..f653654c 100644 --- a/rustiflow/src/tests/flows/basic_flow_test.rs +++ b/rustiflow/src/tests/flows/basic_flow_test.rs @@ -79,4 +79,107 @@ mod tests { assert!(flow.update_flow(&ack_fwd, true)); assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); } + + #[test] + fn tcp_handshake_completion_is_tracked() { + let ip_source = 
IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)); + let mut flow = BasicFlow::new( + "flow-3".to_string(), + ip_source, + 50001, + ip_destination, + 443, + 6, + 1_000_000, + ); + + let mut syn = build_packet(ip_source, 50001, ip_destination, 443, 1_000_100); + syn.syn_flag = 1; + assert!(!flow.update_flow(&syn, true)); + assert!(!flow.tcp_handshake_completed); + + let mut syn_ack = build_packet(ip_destination, 443, ip_source, 50001, 1_000_200); + syn_ack.syn_flag = 1; + syn_ack.ack_flag = 1; + syn_ack.sequence_number = 700; + assert!(!flow.update_flow(&syn_ack, false)); + assert!(!flow.tcp_handshake_completed); + + let mut ack = build_packet(ip_source, 50001, ip_destination, 443, 1_000_300); + ack.ack_flag = 1; + ack.sequence_number_ack = 701; + assert!(!flow.update_flow(&ack, true)); + + assert!(flow.tcp_handshake_completed); + assert!(!flow.tcp_reset_before_handshake); + assert!(!flow.tcp_reset_after_handshake); + } + + #[test] + fn tcp_reset_before_handshake_is_classified() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)); + let mut flow = BasicFlow::new( + "flow-4".to_string(), + ip_source, + 50002, + ip_destination, + 22, + 6, + 1_000_000, + ); + + let mut syn = build_packet(ip_source, 50002, ip_destination, 22, 1_000_100); + syn.syn_flag = 1; + assert!(!flow.update_flow(&syn, true)); + + let mut rst = build_packet(ip_destination, 22, ip_source, 50002, 1_000_200); + rst.rst_flag = 1; + assert!(flow.update_flow(&rst, false)); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpReset); + assert!(!flow.tcp_handshake_completed); + assert!(flow.tcp_reset_before_handshake); + assert!(!flow.tcp_reset_after_handshake); + } + + #[test] + fn tcp_reset_after_handshake_is_classified() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)); + let mut flow = BasicFlow::new( + 
"flow-5".to_string(), + ip_source, + 50003, + ip_destination, + 22, + 6, + 1_000_000, + ); + + let mut syn = build_packet(ip_source, 50003, ip_destination, 22, 1_000_100); + syn.syn_flag = 1; + assert!(!flow.update_flow(&syn, true)); + + let mut syn_ack = build_packet(ip_destination, 22, ip_source, 50003, 1_000_200); + syn_ack.syn_flag = 1; + syn_ack.ack_flag = 1; + syn_ack.sequence_number = 900; + assert!(!flow.update_flow(&syn_ack, false)); + + let mut ack = build_packet(ip_source, 50003, ip_destination, 22, 1_000_300); + ack.ack_flag = 1; + ack.sequence_number_ack = 901; + assert!(!flow.update_flow(&ack, true)); + + let mut rst = build_packet(ip_destination, 22, ip_source, 50003, 1_000_400); + rst.rst_flag = 1; + assert!(flow.update_flow(&rst, false)); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpReset); + assert!(flow.tcp_handshake_completed); + assert!(!flow.tcp_reset_before_handshake); + assert!(flow.tcp_reset_after_handshake); + } } From d95f4df3b091e82f60b9be047257878b41c913a5 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:59:41 +0100 Subject: [PATCH 22/34] test: harden parser and lifecycle edge cases --- AGENTS.md | 1 + docs/engineering-notes.md | 8 ++ rustiflow/src/packet_features.rs | 7 ++ rustiflow/src/pcap.rs | 5 + rustiflow/src/tests/flows/basic_flow_test.rs | 115 ++++++++++++++++++ rustiflow/src/tests/flows/mod.rs | 1 + .../src/tests/flows/packet_features_test.rs | 42 +++++++ .../src/tests/flows/pcap_fixture_test.rs | 4 + rustiflow/src/tests/flows/pcap_reader_test.rs | 64 ++++++++++ 9 files changed, 247 insertions(+) create mode 100644 rustiflow/src/tests/flows/pcap_reader_test.rs diff --git a/AGENTS.md b/AGENTS.md index c7213263..a04f619d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -97,6 +97,7 @@ If a change touches shared code used by multiple crates, prefer checking the wor - Treat the current test suite carefully: some tests may be stale or incomplete relative to 
the active code. - When adding or repairing tests, prefer tests that reflect the current flow architecture and public behavior rather than resurrecting outdated internal field expectations. +- Before adding more feature work, prefer adversarial deterministic tests around TCP lifecycle, parser edge cases, and tiny offline fixtures that prove exported semantics. ## Engineering Checklist diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 62777cfc..68f1182f 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -28,3 +28,11 @@ This file keeps short-lived design choices and execution notes that would make - TCP lifecycle export now distinguishes observed handshake completion from resets seen before or after that observed handshake, so richer flow schemas do not have to infer lifecycle quality from flag totals alone. +- Current test-hardening focus is to add adversarial deterministic cases before + more feature work: false handshake completion, teardown edge cases, parser + rejection behavior, and tiny fixture assertions that prove exported + lifecycle semantics. +- Test hardening already exposed two parser quirks worth locking down: short + unsupported offline frames must not panic the reader, and non-first IPv6 + fragments should be dropped instead of being treated like fresh transport + headers. 
diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index b1f146ba..81cfd936 100644 --- a/rustiflow/src/packet_features.rs +++ b/rustiflow/src/packet_features.rs @@ -275,6 +275,13 @@ fn skip_ipv6_extension_headers<'a>( if payload.len() < 8 { return None; } + + let fragment_offset_field = u16::from_be_bytes([payload[2], payload[3]]); + let fragment_offset = (fragment_offset_field & 0xfff8) >> 3; + if fragment_offset > 0 { + return None; + } + 8 } AUTHENTICATION => { diff --git a/rustiflow/src/pcap.rs b/rustiflow/src/pcap.rs index c26e2141..28b32b6a 100644 --- a/rustiflow/src/pcap.rs +++ b/rustiflow/src/pcap.rs @@ -132,6 +132,11 @@ where } _ => { // Check if it is a Linux cooked capture + if packet.data.len() < 16 { + debug!("Packet too short for Linux cooked capture header"); + continue; + } + let ethertype = u16::from_be_bytes([packet.data[14], packet.data[15]]); match ethertype { SLL_IPV4 => { diff --git a/rustiflow/src/tests/flows/basic_flow_test.rs b/rustiflow/src/tests/flows/basic_flow_test.rs index f653654c..a82a5952 100644 --- a/rustiflow/src/tests/flows/basic_flow_test.rs +++ b/rustiflow/src/tests/flows/basic_flow_test.rs @@ -182,4 +182,119 @@ mod tests { assert!(!flow.tcp_reset_before_handshake); assert!(flow.tcp_reset_after_handshake); } + + #[test] + fn ack_only_packet_does_not_complete_tcp_handshake() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 1, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 1, 2)); + let mut flow = BasicFlow::new( + "flow-6".to_string(), + ip_source, + 50010, + ip_destination, + 443, + 6, + 1_000_000, + ); + + let mut ack = build_packet(ip_source, 50010, ip_destination, 443, 1_000_100); + ack.ack_flag = 1; + ack.sequence_number_ack = 42; + assert!(!flow.update_flow(&ack, true)); + + assert!(!flow.tcp_handshake_completed); + assert_eq!(flow.flow_expire_cause, FlowExpireCause::None); + } + + #[test] + fn syn_ack_without_initial_syn_does_not_complete_handshake() { + let ip_source = 
IpAddr::V4(Ipv4Addr::new(10, 0, 2, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 2, 2)); + let mut flow = BasicFlow::new( + "flow-7".to_string(), + ip_source, + 50011, + ip_destination, + 443, + 6, + 1_000_000, + ); + + let mut syn_ack = build_packet(ip_destination, 443, ip_source, 50011, 1_000_100); + syn_ack.syn_flag = 1; + syn_ack.ack_flag = 1; + syn_ack.sequence_number = 1000; + assert!(!flow.update_flow(&syn_ack, false)); + + let mut ack = build_packet(ip_source, 50011, ip_destination, 443, 1_000_200); + ack.ack_flag = 1; + ack.sequence_number_ack = 1001; + assert!(!flow.update_flow(&ack, true)); + + assert!(!flow.tcp_handshake_completed); + assert_eq!(flow.flow_expire_cause, FlowExpireCause::None); + } + + #[test] + fn fin_with_payload_uses_payload_sequence_space_for_termination() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 3, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 3, 2)); + let mut flow = BasicFlow::new( + "flow-8".to_string(), + ip_source, + 50012, + ip_destination, + 80, + 6, + 1_000_000, + ); + + let mut fin_fwd = build_packet(ip_source, 50012, ip_destination, 80, 1_000_100); + fin_fwd.fin_flag = 1; + fin_fwd.sequence_number = 100; + fin_fwd.data_length = 20; + assert!(!flow.update_flow(&fin_fwd, true)); + + let mut ack_bwd = build_packet(ip_destination, 80, ip_source, 50012, 1_000_200); + ack_bwd.ack_flag = 1; + ack_bwd.sequence_number_ack = 121; + assert!(!flow.update_flow(&ack_bwd, false)); + + let mut fin_bwd = build_packet(ip_destination, 80, ip_source, 50012, 1_000_300); + fin_bwd.fin_flag = 1; + fin_bwd.sequence_number = 200; + assert!(!flow.update_flow(&fin_bwd, false)); + + let mut ack_fwd = build_packet(ip_source, 50012, ip_destination, 80, 1_000_400); + ack_fwd.ack_flag = 1; + ack_fwd.sequence_number_ack = 201; + assert!(flow.update_flow(&ack_fwd, true)); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); + } + + #[test] + fn non_tcp_flows_ignore_rst_classification() { + let 
ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 4, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 4, 2)); + let mut flow = BasicFlow::new( + "flow-9".to_string(), + ip_source, + 53000, + ip_destination, + 53, + 17, + 1_000_000, + ); + + let mut packet = build_packet(ip_source, 53000, ip_destination, 53, 1_000_100); + packet.protocol = 17; + packet.rst_flag = 1; + assert!(!flow.update_flow(&packet, true)); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::None); + assert!(!flow.tcp_handshake_completed); + assert!(!flow.tcp_reset_before_handshake); + assert!(!flow.tcp_reset_after_handshake); + } } diff --git a/rustiflow/src/tests/flows/mod.rs b/rustiflow/src/tests/flows/mod.rs index cd6471ad..b2536cb8 100644 --- a/rustiflow/src/tests/flows/mod.rs +++ b/rustiflow/src/tests/flows/mod.rs @@ -5,4 +5,5 @@ mod flow_table_test; mod nf_flow_test; mod packet_features_test; mod pcap_fixture_test; +mod pcap_reader_test; mod rusti_flow_test; diff --git a/rustiflow/src/tests/flows/packet_features_test.rs b/rustiflow/src/tests/flows/packet_features_test.rs index 403fa383..38d4397e 100644 --- a/rustiflow/src/tests/flows/packet_features_test.rs +++ b/rustiflow/src/tests/flows/packet_features_test.rs @@ -83,6 +83,21 @@ mod tests { assert_eq!(features.destination_port, 53); } + #[test] + fn ipv6_non_first_fragment_is_rejected() { + let mut payload = vec![0_u8; 8 + 8]; + payload[0] = IpNextHeaderProtocols::Udp.0; + payload[2..4].copy_from_slice(&0x0008_u16.to_be_bytes()); + payload[8..10].copy_from_slice(&5353_u16.to_be_bytes()); + payload[10..12].copy_from_slice(&53_u16.to_be_bytes()); + payload[12..14].copy_from_slice(&8_u16.to_be_bytes()); + + let bytes = build_ipv6_packet(IpNextHeaderProtocols::Ipv6Frag.0, &payload); + let packet = Ipv6Packet::new(&bytes).unwrap(); + + assert!(PacketFeatures::from_ipv6_packet(&packet, 100).is_none()); + } + #[test] fn truncated_ipv6_extension_header_is_rejected() { let bytes = build_ipv6_packet( @@ -93,4 +108,31 @@ mod tests { 
assert!(PacketFeatures::from_ipv6_packet(&packet, 7).is_none()); } + + #[test] + fn ipv6_auth_extension_is_skipped_before_tcp_parse() { + let mut payload = vec![0_u8; 12 + 20]; + payload[0] = IpNextHeaderProtocols::Tcp.0; + payload[1] = 1; + payload[12..14].copy_from_slice(&42424_u16.to_be_bytes()); + payload[14..16].copy_from_slice(&443_u16.to_be_bytes()); + payload[24] = 0x50; + payload[25] = 0x10; + + let bytes = build_ipv6_packet(IpNextHeaderProtocols::Ah.0, &payload); + let packet = Ipv6Packet::new(&bytes).unwrap(); + let features = PacketFeatures::from_ipv6_packet(&packet, 123).unwrap(); + + assert_eq!(features.protocol, IpNextHeaderProtocols::Tcp.0); + assert_eq!(features.source_port, 42424); + assert_eq!(features.destination_port, 443); + } + + #[test] + fn ipv6_esp_extension_is_rejected() { + let bytes = build_ipv6_packet(IpNextHeaderProtocols::Esp.0, &[0_u8; 16]); + let packet = Ipv6Packet::new(&bytes).unwrap(); + + assert!(PacketFeatures::from_ipv6_packet(&packet, 55).is_none()); + } } diff --git a/rustiflow/src/tests/flows/pcap_fixture_test.rs b/rustiflow/src/tests/flows/pcap_fixture_test.rs index 6741bccb..dfd16406 100644 --- a/rustiflow/src/tests/flows/pcap_fixture_test.rs +++ b/rustiflow/src/tests/flows/pcap_fixture_test.rs @@ -73,6 +73,9 @@ mod tests { established_http_flow.basic_flow.flow_expire_cause, FlowExpireCause::TcpTermination ); + assert!(established_http_flow.basic_flow.tcp_handshake_completed); + assert!(!established_http_flow.basic_flow.tcp_reset_before_handshake); + assert!(!established_http_flow.basic_flow.tcp_reset_after_handshake); assert_eq!(established_http_flow.tcp_flags_stats.get_flags(), ".AP.SF"); let icmp_flow = flows @@ -82,6 +85,7 @@ mod tests { assert_eq!(icmp_flow.packet_len_stats.flow_count(), 4); assert_eq!(icmp_flow.icmp_stats.get_type(), 8); assert_eq!(icmp_flow.icmp_stats.get_code(), 0); + assert!(!icmp_flow.basic_flow.tcp_handshake_completed); assert_eq!( icmp_flow.basic_flow.flow_expire_cause, 
FlowExpireCause::ExporterShutdown diff --git a/rustiflow/src/tests/flows/pcap_reader_test.rs b/rustiflow/src/tests/flows/pcap_reader_test.rs new file mode 100644 index 00000000..7917f977 --- /dev/null +++ b/rustiflow/src/tests/flows/pcap_reader_test.rs @@ -0,0 +1,64 @@ +#[cfg(test)] +mod tests { + use std::{ + fs, + path::PathBuf, + time::{SystemTime, UNIX_EPOCH}, + }; + + use tokio::sync::mpsc; + + use crate::{flows::rusti_flow::RustiFlow, pcap::read_pcap_file}; + + fn temp_pcap_path(name: &str) -> PathBuf { + let unique = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("rustiflow-{name}-{unique}.pcap")) + } + + fn malformed_short_frame_pcap() -> Vec<u8> { + let mut bytes = Vec::new(); + + // pcap global header, little-endian, Ethernet linktype. + bytes.extend_from_slice(&0xa1b2c3d4_u32.to_le_bytes()); + bytes.extend_from_slice(&2_u16.to_le_bytes()); + bytes.extend_from_slice(&4_u16.to_le_bytes()); + bytes.extend_from_slice(&0_i32.to_le_bytes()); + bytes.extend_from_slice(&0_u32.to_le_bytes()); + bytes.extend_from_slice(&65535_u32.to_le_bytes()); + bytes.extend_from_slice(&1_u32.to_le_bytes()); + + let packet_data = [ + 0, 1, 2, 3, 4, 5, // dst mac + 6, 7, 8, 9, 10, 11, // src mac + 0x12, 0x34, // unsupported ethertype, forces fallback branch + 0x99, // only one byte after ethertype, so old code could panic at [15] + ]; + + bytes.extend_from_slice(&1_u32.to_le_bytes()); + bytes.extend_from_slice(&0_u32.to_le_bytes()); + bytes.extend_from_slice(&(packet_data.len() as u32).to_le_bytes()); + bytes.extend_from_slice(&(packet_data.len() as u32).to_le_bytes()); + bytes.extend_from_slice(&packet_data); + + bytes + } + + #[tokio::test] + async fn short_unsupported_frame_does_not_panic_offline_reader() { + let path = temp_pcap_path("short-unsupported-frame"); + fs::write(&path, malformed_short_frame_pcap()).expect("pcap should be written"); + + let (tx, mut rx) = mpsc::channel::<RustiFlow>(8); + + let result = +
read_pcap_file::<RustiFlow>(path.to_str().unwrap(), tx, 1, 3600, 120, None, 60).await; + + fs::remove_file(&path).ok(); + + assert!(result.is_ok()); + assert!(rx.try_recv().is_err()); + } +} From 8eb37b2983b916e1a8e533daa32b32e7ca44b734 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:01:31 +0100 Subject: [PATCH 23/34] fix: preserve flow table termination causes --- docs/engineering-notes.md | 3 ++ rustiflow/src/flow_table.rs | 7 ++-- rustiflow/src/tests/flows/flow_table_test.rs | 36 +++++++++++++++++++ .../src/tests/flows/pcap_fixture_test.rs | 5 ++- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 68f1182f..5c15cbb7 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -36,3 +36,6 @@ This file keeps short-lived design choices and execution notes that would make unsupported offline frames must not panic the reader, and non-first IPv6 fragments should be dropped instead of being treated like fresh transport headers. +- Test hardening also exposed a real `FlowTable` lifecycle bug: packet-driven + termination export could overwrite `TcpReset` with `TcpTermination`, and a + first-packet-terminated flow could be left behind for duplicate export. diff --git a/rustiflow/src/flow_table.rs b/rustiflow/src/flow_table.rs index ec59b5ad..46c387a4 100644 --- a/rustiflow/src/flow_table.rs +++ b/rustiflow/src/flow_table.rs @@ -79,8 +79,10 @@ where packet.protocol, packet.timestamp_us, ); - self.update_flow_with_packet(&mut new_flow, packet).await; - self.flow_map.insert(packet.flow_key(), new_flow); + let is_terminated = self.update_flow_with_packet(&mut new_flow, packet).await; + if !is_terminated { + self.flow_map.insert(packet.flow_key(), new_flow); + } } /// Updates a flow with a packet and exports flow if terminated.
@@ -92,7 +94,6 @@ where if flow_terminated { // If terminated, export the flow - flow.close_flow(packet.timestamp_us, FlowExpireCause::TcpTermination); self.export_flow(flow.clone()).await; } else if let Some(early_export) = self.early_export { // If flow duration is greater than early export, export the flow immediately (without deletion from the flow table) diff --git a/rustiflow/src/tests/flows/flow_table_test.rs b/rustiflow/src/tests/flows/flow_table_test.rs index 99cb2f16..56610dd0 100644 --- a/rustiflow/src/tests/flows/flow_table_test.rs +++ b/rustiflow/src/tests/flows/flow_table_test.rs @@ -41,4 +41,40 @@ mod tests { "192.168.1.1:12345-192.168.1.2:443-6".to_string() ); } + + #[tokio::test] + async fn preserves_tcp_reset_cause_when_packet_terminates_flow() { + let (tx, mut rx) = mpsc::channel::<BasicFlow>(4); + let mut flow_table = FlowTable::new(3600, 120, None, tx, 60); + + let mut syn = build_packet(1_000_000); + syn.syn_flag = 1; + flow_table.process_packet(&syn).await; + + let mut rst = build_packet(1_100_000); + rst.rst_flag = 1; + flow_table.process_packet(&rst).await; + + let exported_flow = rx.recv().await.expect("expected exported reset flow"); + assert_eq!(exported_flow.flow_expire_cause, FlowExpireCause::TcpReset); + assert!(!exported_flow.tcp_handshake_completed); + assert!(exported_flow.tcp_reset_before_handshake); + } + + #[tokio::test] + async fn does_not_reexport_first_packet_terminated_flow() { + let (tx, mut rx) = mpsc::channel::<BasicFlow>(4); + let mut flow_table = FlowTable::new(3600, 120, None, tx, 60); + + let mut rst = build_packet(1_000_000); + rst.rst_flag = 1; + flow_table.process_packet(&rst).await; + + let exported_flow = rx.recv().await.expect("expected first export"); + assert_eq!(exported_flow.flow_expire_cause, FlowExpireCause::TcpReset); + + flow_table.export_all_flows(2_000_000).await; + + assert!(rx.try_recv().is_err()); + } } diff --git a/rustiflow/src/tests/flows/pcap_fixture_test.rs b/rustiflow/src/tests/flows/pcap_fixture_test.rs index
dfd16406..40d2a423 100644 --- a/rustiflow/src/tests/flows/pcap_fixture_test.rs +++ b/rustiflow/src/tests/flows/pcap_fixture_test.rs @@ -134,8 +134,11 @@ mod tests { assert_eq!(tcp_port_80_flow.packet_len_stats.flow_count(), 2); assert_eq!( tcp_port_80_flow.basic_flow.flow_expire_cause, - FlowExpireCause::TcpTermination + FlowExpireCause::TcpReset ); + assert!(!tcp_port_80_flow.basic_flow.tcp_handshake_completed); + assert!(tcp_port_80_flow.basic_flow.tcp_reset_before_handshake); + assert!(!tcp_port_80_flow.basic_flow.tcp_reset_after_handshake); assert_eq!(tcp_port_80_flow.tcp_flags_stats.get_flags(), ".A.R.."); let icmp_flow = flows From 04b79043eedc3fbe0551e37b237258024bf85685 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:39:20 +0100 Subject: [PATCH 24/34] test: harden tcp lifecycle and flow integration cases --- rustiflow/src/tests/flows/basic_flow_test.rs | 51 ++++++++++++ .../src/tests/flows/feature_modules_test.rs | 27 ++++++ rustiflow/src/tests/flows/rusti_flow_test.rs | 83 ++++++++++++++++++- 3 files changed, 160 insertions(+), 1 deletion(-) diff --git a/rustiflow/src/tests/flows/basic_flow_test.rs b/rustiflow/src/tests/flows/basic_flow_test.rs index a82a5952..79528456 100644 --- a/rustiflow/src/tests/flows/basic_flow_test.rs +++ b/rustiflow/src/tests/flows/basic_flow_test.rs @@ -297,4 +297,55 @@ mod tests { assert!(!flow.tcp_reset_before_handshake); assert!(!flow.tcp_reset_after_handshake); } + + #[test] + fn simultaneous_tcp_close_terminates_after_final_ack() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 5, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 5, 2)); + let mut flow = BasicFlow::new( + "flow-10".to_string(), + ip_source, + 50013, + ip_destination, + 443, + 6, + 1_000_000, + ); + + let mut syn = build_packet(ip_source, 50013, ip_destination, 443, 1_000_100); + syn.syn_flag = 1; + assert!(!flow.update_flow(&syn, true)); + + let mut syn_ack = 
build_packet(ip_destination, 443, ip_source, 50013, 1_000_200); + syn_ack.syn_flag = 1; + syn_ack.ack_flag = 1; + syn_ack.sequence_number = 700; + assert!(!flow.update_flow(&syn_ack, false)); + + let mut ack = build_packet(ip_source, 50013, ip_destination, 443, 1_000_300); + ack.ack_flag = 1; + ack.sequence_number_ack = 701; + assert!(!flow.update_flow(&ack, true)); + assert!(flow.tcp_handshake_completed); + + let mut fin_fwd = build_packet(ip_source, 50013, ip_destination, 443, 1_000_400); + fin_fwd.fin_flag = 1; + fin_fwd.sequence_number = 100; + assert!(!flow.update_flow(&fin_fwd, true)); + + let mut fin_ack_bwd = build_packet(ip_destination, 443, ip_source, 50013, 1_000_500); + fin_ack_bwd.fin_flag = 1; + fin_ack_bwd.ack_flag = 1; + fin_ack_bwd.sequence_number = 200; + fin_ack_bwd.sequence_number_ack = 101; + assert!(!flow.update_flow(&fin_ack_bwd, false)); + + let mut final_ack = build_packet(ip_source, 50013, ip_destination, 443, 1_000_600); + final_ack.ack_flag = 1; + final_ack.sequence_number_ack = 201; + assert!(flow.update_flow(&final_ack, true)); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); + assert!(flow.tcp_handshake_completed); + } } diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index 9cd38c1b..f043d428 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -138,6 +138,33 @@ mod tests { assert_eq!(stats.bwd_retransmission_count, 0); } + #[test] + fn retransmission_stats_ignore_adjacent_tcp_segments_but_count_later_overlap() { + let mut stats = RetransmissionStats::new(); + + let mut first = packet(1_000_000); + first.protocol = IpNextHeaderProtocols::Tcp.0; + first.sequence_number = 100; + first.data_length = 100; + stats.update(&first, true, first.timestamp_us); + + let mut adjacent = packet(1_050_000); + adjacent.protocol = IpNextHeaderProtocols::Tcp.0; + adjacent.sequence_number = 200; + 
adjacent.data_length = 100; + stats.update(&adjacent, true, first.timestamp_us); + + let mut overlap = packet(1_100_000); + overlap.protocol = IpNextHeaderProtocols::Tcp.0; + overlap.sequence_number = 150; + overlap.data_length = 100; + stats.update(&overlap, true, adjacent.timestamp_us); + + assert_eq!(stats.fwd_retransmission_count, 1); + assert_eq!(stats.bwd_retransmission_count, 0); + assert_eq!(stats.dump(), "1,1,0"); + } + #[test] fn window_size_stats_capture_initial_sizes_for_each_direction() { let mut stats = WindowSizeStats::new(); diff --git a/rustiflow/src/tests/flows/rusti_flow_test.rs b/rustiflow/src/tests/flows/rusti_flow_test.rs index 362fd403..942d4e6b 100644 --- a/rustiflow/src/tests/flows/rusti_flow_test.rs +++ b/rustiflow/src/tests/flows/rusti_flow_test.rs @@ -2,7 +2,10 @@ mod tests { use std::net::{IpAddr, Ipv4Addr}; - use crate::flows::{flow::Flow, rusti_flow::RustiFlow}; + use crate::{ + flows::{flow::Flow, rusti_flow::RustiFlow}, + packet_features::{PacketFeatures, ACK_FLAG, SYN_FLAG}, + }; fn setup_rusti_flow() -> RustiFlow { RustiFlow::new( @@ -20,6 +23,24 @@ mod tests { row.split(',').count() } + fn packet( + source_ip: IpAddr, + source_port: u16, + destination_ip: IpAddr, + destination_port: u16, + timestamp_us: i64, + ) -> PacketFeatures { + PacketFeatures { + source_ip, + destination_ip, + source_port, + destination_port, + protocol: 6, + timestamp_us, + ..Default::default() + } + } + #[test] fn dump_matches_feature_headers() { let flow = setup_rusti_flow(); @@ -33,4 +54,64 @@ mod tests { count_csv_fields(&RustiFlow::get_features_without_contamination()) ); } + + #[test] + fn rusti_flow_updates_lifecycle_timing_and_retransmission_features_together() { + let source_ip = IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1)); + let destination_ip = IpAddr::V4(Ipv4Addr::new(172, 16, 0, 2)); + let mut flow = setup_rusti_flow(); + + let mut syn = packet(source_ip, 44444, destination_ip, 443, 1_000_100); + syn.syn_flag = 1; + syn.flags = SYN_FLAG; + 
assert!(!flow.update_flow(&syn, true)); + + let mut syn_ack = packet(destination_ip, 443, source_ip, 44444, 1_000_200); + syn_ack.syn_flag = 1; + syn_ack.ack_flag = 1; + syn_ack.flags = SYN_FLAG | ACK_FLAG; + syn_ack.sequence_number = 700; + assert!(!flow.update_flow(&syn_ack, false)); + + let mut ack = packet(source_ip, 44444, destination_ip, 443, 1_000_300); + ack.ack_flag = 1; + ack.flags = ACK_FLAG; + ack.sequence_number_ack = 701; + assert!(!flow.update_flow(&ack, true)); + + let mut data = packet(source_ip, 44444, destination_ip, 443, 1_001_000); + data.ack_flag = 1; + data.flags = ACK_FLAG; + data.sequence_number = 100; + data.sequence_number_ack = 701; + data.data_length = 100; + assert!(!flow.update_flow(&data, true)); + + let mut ack_bwd = packet(destination_ip, 443, source_ip, 44444, 1_001_500); + ack_bwd.ack_flag = 1; + ack_bwd.flags = ACK_FLAG; + ack_bwd.sequence_number = 701; + ack_bwd.sequence_number_ack = 200; + assert!(!flow.update_flow(&ack_bwd, false)); + + let mut overlap = packet(source_ip, 44444, destination_ip, 443, 1_001_800); + overlap.ack_flag = 1; + overlap.flags = ACK_FLAG; + overlap.sequence_number = 150; + overlap.sequence_number_ack = 701; + overlap.data_length = 100; + assert!(!flow.update_flow(&overlap, true)); + + assert!(flow.basic_flow.tcp_handshake_completed); + assert_eq!(flow.retransmission_stats.fwd_retransmission_count, 1); + assert_eq!(flow.retransmission_stats.bwd_retransmission_count, 0); + assert_eq!(flow.iat_stats.iat.get_count(), 5); + assert_eq!(flow.iat_stats.fwd_iat.get_count(), 3); + assert_eq!(flow.iat_stats.bwd_iat.get_count(), 1); + assert_eq!(flow.subflow_stats.subflow_count, 1); + assert!((flow.timing_stats.get_fwd_duration() - 1.7).abs() < f64::EPSILON); + assert!((flow.timing_stats.get_bwd_duration() - 1.3).abs() < f64::EPSILON); + assert_eq!(flow.payload_len_stats.fwd_non_zero_payload_packets, 2); + assert_eq!(flow.payload_len_stats.bwd_non_zero_payload_packets, 0); + } } From 
a7e46b2fe272b59c68e56ed08e16e40987b13662 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:39:25 +0100 Subject: [PATCH 25/34] fix: reject non-first ipv4 fragments offline --- docs/engineering-notes.md | 7 ++++ rustiflow/src/packet_features.rs | 4 ++ .../src/tests/flows/packet_features_test.rs | 39 ++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 5c15cbb7..0e5ea943 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -39,3 +39,10 @@ This file keeps short-lived design choices and execution notes that would make - Test hardening also exposed a real `FlowTable` lifecycle bug: packet-driven termination export could overwrite `TcpReset` with `TcpTermination`, and a first-packet-terminated flow could be left behind for duplicate export. +- The next adversarial test layer should prefer integrated semantics over raw + test count: simultaneous close teardown, contiguous-versus-overlapping TCP + segments, and wrapper-level feature coordination in `RustiFlow`. +- That test layer exposed another real parser bug: offline IPv4 parsing was + treating non-first IPv4 fragments as if they started with a fresh transport + header. Non-first IPv4 fragments should now be dropped while first fragments + still parse their transport header normally. 
diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index 81cfd936..85698767 100644 --- a/rustiflow/src/packet_features.rs +++ b/rustiflow/src/packet_features.rs @@ -160,6 +160,10 @@ impl PacketFeatures { // Constructor to create PacketFeatures from an IPv4 packet pub fn from_ipv4_packet(packet: &Ipv4Packet, timestamp_us: i64) -> Option<Self> { + if packet.get_fragment_offset() > 0 { + return None; + } + extract_packet_features_transport( packet.get_source().into(), packet.get_destination().into(), diff --git a/rustiflow/src/tests/flows/packet_features_test.rs b/rustiflow/src/tests/flows/packet_features_test.rs index 38d4397e..8c88e042 100644 --- a/rustiflow/src/tests/flows/packet_features_test.rs +++ b/rustiflow/src/tests/flows/packet_features_test.rs @@ -1,6 +1,6 @@ #[cfg(test)] mod tests { - use pnet::packet::{ip::IpNextHeaderProtocols, ipv6::Ipv6Packet}; + use pnet::packet::{ip::IpNextHeaderProtocols, ipv4::Ipv4Packet, ipv6::Ipv6Packet}; use std::net::{IpAddr, Ipv4Addr}; use crate::packet_features::PacketFeatures; @@ -47,6 +47,19 @@ mod tests { packet } + fn build_ipv4_packet(protocol: u8, fragment_offset: u16, payload: &[u8]) -> Vec<u8> { + let mut packet = vec![0_u8; 20 + payload.len()]; + packet[0] = 0x45; + packet[2..4].copy_from_slice(&((20 + payload.len()) as u16).to_be_bytes()); + packet[6..8].copy_from_slice(&(fragment_offset & 0x1fff).to_be_bytes()); + packet[8] = 64; + packet[9] = protocol; + packet[12..16].copy_from_slice(&[192, 0, 2, 1]); + packet[16..20].copy_from_slice(&[192, 0, 2, 2]); + packet[20..].copy_from_slice(payload); + packet + } + #[test] fn ipv6_hop_by_hop_extension_is_skipped_before_tcp_parse() { let mut payload = vec![0_u8; 8 + 20]; @@ -135,4 +148,28 @@ mod tests { assert!(PacketFeatures::from_ipv6_packet(&packet, 55).is_none()); } + + #[test] + fn ipv4_non_first_fragment_is_rejected() { + let bytes = build_ipv4_packet(IpNextHeaderProtocols::Udp.0, 1, &[0_u8; 8]); + let packet = Ipv4Packet::new(&bytes).unwrap(); +
+ assert!(PacketFeatures::from_ipv4_packet(&packet, 88).is_none()); + } + + #[test] + fn ipv4_first_fragment_still_parses_transport_header() { + let mut payload = vec![0_u8; 8]; + payload[0..2].copy_from_slice(&5353_u16.to_be_bytes()); + payload[2..4].copy_from_slice(&53_u16.to_be_bytes()); + payload[4..6].copy_from_slice(&8_u16.to_be_bytes()); + + let bytes = build_ipv4_packet(IpNextHeaderProtocols::Udp.0, 0, &payload); + let packet = Ipv4Packet::new(&bytes).unwrap(); + let features = PacketFeatures::from_ipv4_packet(&packet, 89).unwrap(); + + assert_eq!(features.protocol, IpNextHeaderProtocols::Udp.0); + assert_eq!(features.source_port, 5353); + assert_eq!(features.destination_port, 53); + } } From 63d6c89e70ed3742bda71ada9b89dc20514fde66 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 23:00:27 +0100 Subject: [PATCH 26/34] docs: refresh agent engineering checklist --- AGENTS.md | 40 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index a04f619d..ce811483 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,14 +101,16 @@ If a change touches shared code used by multiple crates, prefer checking the wor ## Engineering Checklist -Keep this section short and current. Longer decision history belongs in -`docs/engineering-notes.md`. +Keep this section short and current. Completed work and decision history belong +in `docs/engineering-notes.md`. -### 1. Ingestion semantics +### Current Focus -- [x] Preserve kernel capture timestamps in realtime events. -- [x] Align realtime packet, header, and payload length semantics with offline mode. - [ ] Stabilize and measure before expanding the eBPF event payload further. +- [ ] Finish the remaining TCP quality signals that current metadata already supports: + duplicate ACKs, zero-window events, and close style. 
+- [ ] Add the next IP and path signals once they can be trusted in both offline + and realtime modes. Primary files: @@ -118,35 +120,15 @@ Primary files: - `common/src/lib.rs` - `ebpf-ipv4/src/main.rs` - `ebpf-ipv6/src/main.rs` - -### 2. Existing feature families - -- [x] Preserve sub-millisecond timing and IAT precision. -- [x] Improve retransmission detection beyond exact duplicate TCP sequence numbers. -- [x] Revisit active/idle and subflow threshold behavior. -- [x] Expand ICMP behavior beyond first seen type/code. -- [x] Make TCP lifecycle quality more explicit. - -Primary files: - -- `rustiflow/src/flows/features/retransmission_stats.rs` -- `rustiflow/src/flows/features/iat_stats.rs` -- `rustiflow/src/flows/features/timing_stats.rs` -- `rustiflow/src/flows/features/active_idle_stats.rs` -- `rustiflow/src/flows/features/bulk_stats.rs` -- `rustiflow/src/flows/features/icmp_stats.rs` - `rustiflow/src/flows/basic_flow.rs` +- `rustiflow/src/flows/features/` -### 3. New diagnostic features +### Later Work -- [ ] TCP quality signals: handshake completion, duplicate ACKs, zero-window events, reset phase, close style. -- [ ] IP and path signals: TTL or hop-limit, DSCP or ECN, fragmentation behavior. - [ ] Optional lightweight application-aware metadata: DNS, TLS, HTTP, QUIC. - [ ] Better contamination-free abstractions than only coarse IANA port buckets. - -### 4. Exporter gaps - -- [ ] Fill `nf_flow` gaps such as `ip_version`, `vlan_id`, and `tunnel_id` once packet metadata exists in both ingestion modes. +- [ ] Fill `nf_flow` gaps such as `ip_version`, `vlan_id`, and `tunnel_id` once + packet metadata exists in both ingestion modes. 
### Working rule From 30b29f161f8b7a5ac80212e7d0b32316f668d097 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 23:53:47 +0100 Subject: [PATCH 27/34] test: remove stale unwired cic test --- rustiflow/src/tests/flows/cic_flow_test.rs | 400 --------------------- 1 file changed, 400 deletions(-) delete mode 100644 rustiflow/src/tests/flows/cic_flow_test.rs diff --git a/rustiflow/src/tests/flows/cic_flow_test.rs b/rustiflow/src/tests/flows/cic_flow_test.rs deleted file mode 100644 index e808c22b..00000000 --- a/rustiflow/src/tests/flows/cic_flow_test.rs +++ /dev/null @@ -1,400 +0,0 @@ -#[cfg(test)] -mod tests { - use crate::flows::{cic_flow::CicFlow, flow::Flow}; - use chrono::Utc; - use std::net::{IpAddr, Ipv4Addr}; - - fn setup_cic_flow() -> CicFlow { - CicFlow::new( - "".to_string(), - IpAddr::V4(Ipv4Addr::from(1)), - 80, - IpAddr::V4(Ipv4Addr::from(2)), - 8080, - 6, - chrono::Utc::now(), - ) - } - - #[test] - fn test_increase_fwd_header_length() { - let mut cic_flow = setup_cic_flow(); - - let initial_length = cic_flow.fwd_header_length; - - cic_flow.increase_fwd_header_length(20); - assert_eq!(cic_flow.fwd_header_length, initial_length + 20); - - cic_flow.increase_fwd_header_length(0); - assert_eq!(cic_flow.fwd_header_length, initial_length + 20); - } - - #[test] - fn test_increase_bwd_header_length() { - let mut cic_flow = setup_cic_flow(); - - let initial_length = cic_flow.bwd_header_length; - - cic_flow.increase_bwd_header_length(30); - assert_eq!(cic_flow.bwd_header_length, initial_length + 30); - - cic_flow.increase_bwd_header_length(0); - assert_eq!(cic_flow.bwd_header_length, initial_length + 30); - } - - #[test] - fn test_update_fwd_pkt_len_stats() { - let mut cic_flow = setup_cic_flow(); - - cic_flow.basic_flow.fwd_packet_count = 1; - - cic_flow.update_fwd_pkt_len_stats(100); - - assert_eq!(cic_flow.fwd_pkt_len_max, 100); - assert_eq!(cic_flow.fwd_pkt_len_min, 100); - 
assert_eq!(cic_flow.fwd_pkt_len_mean, 100.0); - assert_eq!(cic_flow.fwd_pkt_len_std, 0.0); - assert_eq!(cic_flow.fwd_pkt_len_tot, 100); - - cic_flow.basic_flow.fwd_packet_count = 2; - - cic_flow.update_fwd_pkt_len_stats(50); - - assert_eq!(cic_flow.fwd_pkt_len_max, 100); - assert_eq!(cic_flow.fwd_pkt_len_min, 50); - assert_eq!(cic_flow.fwd_pkt_len_mean, 75.0); - assert_eq!(cic_flow.fwd_pkt_len_std, 25.0); - assert_eq!(cic_flow.fwd_pkt_len_tot, 150); - - cic_flow.basic_flow.fwd_packet_count = 3; - - cic_flow.update_fwd_pkt_len_stats(0); - - assert_eq!(cic_flow.fwd_pkt_len_max, 100); - assert_eq!(cic_flow.fwd_pkt_len_min, 0); - assert_eq!(cic_flow.fwd_pkt_len_mean, 50.0); - assert_eq!(cic_flow.fwd_pkt_len_std, 40.824829046386306); - assert_eq!(cic_flow.fwd_pkt_len_tot, 150); - } - - #[test] - fn test_update_bwd_pkt_len_stats() { - let mut cic_flow = setup_cic_flow(); - - cic_flow.basic_flow.bwd_packet_count = 1; - - cic_flow.update_bwd_pkt_len_stats(100); - - assert_eq!(cic_flow.bwd_pkt_len_max, 100); - assert_eq!(cic_flow.bwd_pkt_len_min, 100); - assert_eq!(cic_flow.bwd_pkt_len_mean, 100.0); - assert_eq!(cic_flow.bwd_pkt_len_std, 0.0); - assert_eq!(cic_flow.bwd_pkt_len_tot, 100); - - cic_flow.basic_flow.bwd_packet_count = 2; - - cic_flow.update_bwd_pkt_len_stats(50); - - assert_eq!(cic_flow.bwd_pkt_len_max, 100); - assert_eq!(cic_flow.bwd_pkt_len_min, 50); - assert_eq!(cic_flow.bwd_pkt_len_mean, 75.0); - assert_eq!(cic_flow.bwd_pkt_len_std, 25.0); - assert_eq!(cic_flow.bwd_pkt_len_tot, 150); - - cic_flow.basic_flow.bwd_packet_count = 3; - - cic_flow.update_bwd_pkt_len_stats(0); - - assert_eq!(cic_flow.bwd_pkt_len_max, 100); - assert_eq!(cic_flow.bwd_pkt_len_min, 0); - assert_eq!(cic_flow.bwd_pkt_len_mean, 50.0); - assert_eq!(cic_flow.bwd_pkt_len_std, 40.824829046386306); - assert_eq!(cic_flow.bwd_pkt_len_tot, 150); - } - - #[test] - fn test_update_fwd_iat_stats() { - let mut cic_flow = setup_cic_flow(); - let epsilon = 1e-9; // floating-point arithmetic is not exact 
- - cic_flow.basic_flow.fwd_packet_count = 2; - - cic_flow.update_fwd_iat_stats(0.05); - - assert_eq!(cic_flow.fwd_iat_max, 0.05); - assert_eq!(cic_flow.fwd_iat_min, 0.05); - assert_eq!(cic_flow.fwd_iat_mean, 0.05); - assert_eq!(cic_flow.fwd_iat_std, 0.0); - assert_eq!(cic_flow.fwd_iat_total, 0.05); - - cic_flow.basic_flow.fwd_packet_count = 3; - - cic_flow.update_fwd_iat_stats(0.01); - - assert_eq!(cic_flow.fwd_iat_max, 0.05); - assert_eq!(cic_flow.fwd_iat_min, 0.01); - assert!( - (cic_flow.fwd_iat_mean - 0.03).abs() < epsilon, - "fwd_iat_mean is not within the expected range" - ); - assert_eq!(cic_flow.fwd_iat_std, 0.02); - assert!( - (cic_flow.fwd_iat_total - 0.06).abs() < epsilon, - "fwd_iat_total is not within the expected range" - ); - - cic_flow.basic_flow.fwd_packet_count = 4; - - cic_flow.update_fwd_iat_stats(0.698456231458); - - assert_eq!(cic_flow.fwd_iat_max, 0.698456231458); - assert_eq!(cic_flow.fwd_iat_min, 0.01); - assert_eq!(cic_flow.fwd_iat_mean, 0.25281874381933334); - assert_eq!(cic_flow.fwd_iat_std, 0.31553613400230096); - assert_eq!(cic_flow.fwd_iat_total, 0.758456231458); - } - - #[test] - fn test_update_bwd_iat_stats() { - let mut cic_flow = setup_cic_flow(); - let epsilon = 1e-9; // floating-point arithmetic is not exact - - cic_flow.basic_flow.bwd_packet_count = 2; - - cic_flow.update_bwd_iat_stats(0.05); - - assert_eq!(cic_flow.bwd_iat_max, 0.05); - assert_eq!(cic_flow.bwd_iat_min, 0.05); - assert_eq!(cic_flow.bwd_iat_mean, 0.05); - assert_eq!(cic_flow.bwd_iat_std, 0.0); - assert_eq!(cic_flow.bwd_iat_total, 0.05); - - cic_flow.basic_flow.bwd_packet_count = 3; - - cic_flow.update_bwd_iat_stats(0.01); - - assert_eq!(cic_flow.bwd_iat_max, 0.05); - assert_eq!(cic_flow.bwd_iat_min, 0.01); - assert!( - (cic_flow.bwd_iat_mean - 0.03).abs() < epsilon, - "fwd_iat_mean is not within the expected range" - ); - assert_eq!(cic_flow.bwd_iat_std, 0.02); - assert!( - (cic_flow.bwd_iat_total - 0.06).abs() < epsilon, - "fwd_iat_total is not within the 
expected range" - ); - - cic_flow.basic_flow.bwd_packet_count = 4; - - cic_flow.update_bwd_iat_stats(0.698456231458); - - assert_eq!(cic_flow.bwd_iat_max, 0.698456231458); - assert_eq!(cic_flow.bwd_iat_min, 0.01); - assert_eq!(cic_flow.bwd_iat_mean, 0.25281874381933334); - assert_eq!(cic_flow.bwd_iat_std, 0.31553613400230096); - assert_eq!(cic_flow.bwd_iat_total, 0.758456231458); - } - - #[test] - fn test_update_fwd_bulk_stats() { - let mut cic_flow = setup_cic_flow(); - let timestamp = Utc::now(); - let timestamp_2 = Utc::now(); - let timestamp_3 = Utc::now(); - let timestamp_4 = Utc::now(); - - cic_flow.update_fwd_bulk_stats(&timestamp, 100); - - assert_eq!(cic_flow.fwd_bulk_state_count, 0); - assert_eq!(cic_flow.fwd_bulk_packet_count, 0); - assert_eq!(cic_flow.fwd_bulk_size_total, 0); - assert_eq!(cic_flow.fwd_bulk_duration, 0.0); - assert_eq!(cic_flow.fwd_bulk_packet_count_help, 1); - assert_eq!(cic_flow.fwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.fwd_bulk_size_help, 100); - assert_eq!(cic_flow.fwd_last_bulk_timestamp, Some(timestamp)); - - cic_flow.update_fwd_bulk_stats(&timestamp_2, 200); - - assert_eq!(cic_flow.fwd_bulk_state_count, 0); - assert_eq!(cic_flow.fwd_bulk_packet_count, 0); - assert_eq!(cic_flow.fwd_bulk_size_total, 0); - assert_eq!(cic_flow.fwd_bulk_duration, 0.0); - assert_eq!(cic_flow.fwd_bulk_packet_count_help, 2); - assert_eq!(cic_flow.fwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.fwd_bulk_size_help, 300); - assert_eq!(cic_flow.fwd_last_bulk_timestamp, Some(timestamp_2)); - - cic_flow.update_fwd_bulk_stats(&timestamp_3, 150); - - assert_eq!(cic_flow.fwd_bulk_state_count, 0); - assert_eq!(cic_flow.fwd_bulk_packet_count, 0); - assert_eq!(cic_flow.fwd_bulk_size_total, 0); - assert_eq!(cic_flow.fwd_bulk_duration, 0.0); - assert_eq!(cic_flow.fwd_bulk_packet_count_help, 3); - assert_eq!(cic_flow.fwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.fwd_bulk_size_help, 450); - assert_eq!(cic_flow.fwd_last_bulk_timestamp,
Some(timestamp_3)); - - cic_flow.update_fwd_bulk_stats(&timestamp_4, 50); - - assert_eq!(cic_flow.fwd_bulk_state_count, 1); - assert_eq!(cic_flow.fwd_bulk_packet_count, 4); - assert_eq!(cic_flow.fwd_bulk_size_total, 500); - assert_eq!( - cic_flow.fwd_bulk_duration, - timestamp_4 - .signed_duration_since(timestamp) - .num_microseconds() - .unwrap() as f64 - ); - assert_eq!(cic_flow.fwd_bulk_packet_count_help, 4); - assert_eq!(cic_flow.fwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.fwd_bulk_size_help, 500); - assert_eq!(cic_flow.fwd_last_bulk_timestamp, Some(timestamp_4)); - - std::thread::sleep(std::time::Duration::from_secs(1)); - - let new_timestamp = Utc::now(); - - cic_flow.update_fwd_bulk_stats(&new_timestamp, 50); - - assert_eq!(cic_flow.fwd_bulk_state_count, 1); - assert_eq!(cic_flow.fwd_bulk_packet_count, 5); - assert_eq!(cic_flow.fwd_bulk_size_total, 550); - assert_eq!( - cic_flow.fwd_bulk_duration, - new_timestamp - .signed_duration_since(timestamp) - .num_microseconds() - .unwrap() as f64 - ); - assert_eq!(cic_flow.fwd_bulk_packet_count_help, 5); - assert_eq!(cic_flow.fwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.fwd_bulk_size_help, 550); - assert_eq!(cic_flow.fwd_last_bulk_timestamp, Some(new_timestamp)); - } - - #[test] - fn test_update_bwd_bulk_stats() { - let mut cic_flow = setup_cic_flow(); - let timestamp = Utc::now(); - let timestamp_2 = Utc::now(); - let timestamp_3 = Utc::now(); - let timestamp_4 = Utc::now(); - - cic_flow.update_bwd_bulk_stats(&timestamp, 100); - - assert_eq!(cic_flow.bwd_bulk_state_count, 0); - assert_eq!(cic_flow.bwd_bulk_packet_count, 0); - assert_eq!(cic_flow.bwd_bulk_size_total, 0); - assert_eq!(cic_flow.bwd_bulk_duration, 0.0); - assert_eq!(cic_flow.bwd_bulk_packet_count_help, 1); - assert_eq!(cic_flow.bwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.bwd_bulk_size_help, 100); - assert_eq!(cic_flow.bwd_last_bulk_timestamp, Some(timestamp)); - - cic_flow.update_bwd_bulk_stats(&timestamp_2, 200); - -
assert_eq!(cic_flow.bwd_bulk_state_count, 0); - assert_eq!(cic_flow.bwd_bulk_packet_count, 0); - assert_eq!(cic_flow.bwd_bulk_size_total, 0); - assert_eq!(cic_flow.bwd_bulk_duration, 0.0); - assert_eq!(cic_flow.bwd_bulk_packet_count_help, 2); - assert_eq!(cic_flow.bwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.bwd_bulk_size_help, 300); - assert_eq!(cic_flow.bwd_last_bulk_timestamp, Some(timestamp_2)); - - cic_flow.update_bwd_bulk_stats(&timestamp_3, 150); - - assert_eq!(cic_flow.bwd_bulk_state_count, 0); - assert_eq!(cic_flow.bwd_bulk_packet_count, 0); - assert_eq!(cic_flow.bwd_bulk_size_total, 0); - assert_eq!(cic_flow.bwd_bulk_duration, 0.0); - assert_eq!(cic_flow.bwd_bulk_packet_count_help, 3); - assert_eq!(cic_flow.bwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.bwd_bulk_size_help, 450); - assert_eq!(cic_flow.bwd_last_bulk_timestamp, Some(timestamp_3)); - - cic_flow.update_bwd_bulk_stats(&timestamp_4, 50); - - assert_eq!(cic_flow.bwd_bulk_state_count, 1); - assert_eq!(cic_flow.bwd_bulk_packet_count, 4); - assert_eq!(cic_flow.bwd_bulk_size_total, 500); - assert_eq!( - cic_flow.bwd_bulk_duration, - timestamp_4 - .signed_duration_since(timestamp) - .num_microseconds() - .unwrap() as f64 - ); - assert_eq!(cic_flow.bwd_bulk_packet_count_help, 4); - assert_eq!(cic_flow.bwd_bulk_start_help, Some(timestamp)); - assert_eq!(cic_flow.bwd_bulk_size_help, 500); - assert_eq!(cic_flow.bwd_last_bulk_timestamp, Some(timestamp_4)); - - std::thread::sleep(std::time::Duration::from_secs(1)); - - let new_timestamp = Utc::now(); - - cic_flow.update_bwd_bulk_stats(&new_timestamp, 50); - - assert_eq!(cic_flow.bwd_bulk_state_count, 1); - assert_eq!(cic_flow.bwd_bulk_packet_count, 5); - assert_eq!(cic_flow.bwd_bulk_size_total, 550); - assert_eq!( - cic_flow.bwd_bulk_duration, - new_timestamp - .signed_duration_since(timestamp) - .num_microseconds() - .unwrap() as f64 - ); - assert_eq!(cic_flow.bwd_bulk_packet_count_help, 5); - assert_eq!(cic_flow.bwd_bulk_start_help,
Some(timestamp)); - assert_eq!(cic_flow.bwd_bulk_size_help, 550); - assert_eq!(cic_flow.bwd_last_bulk_timestamp, Some(new_timestamp)); - } - - #[test] - fn test_update_active_flow() { - let mut cic_flow = setup_cic_flow(); - - cic_flow.update_active_flow(100.0); - - assert_eq!(cic_flow.active_max, 100.0); - assert_eq!(cic_flow.active_min, 100.0); - assert_eq!(cic_flow.active_mean, 100.0); - assert_eq!(cic_flow.active_std, 0.0); - assert_eq!(cic_flow.active_count, 1); - - cic_flow.update_active_flow(50.0); - - assert_eq!(cic_flow.active_max, 100.0); - assert_eq!(cic_flow.active_min, 50.0); - assert_eq!(cic_flow.active_mean, 75.0); - assert_eq!(cic_flow.active_std, 25.0); - assert_eq!(cic_flow.active_count, 2); - } - - #[test] - fn test_update_idle_flow() { - let mut cic_flow = setup_cic_flow(); - - cic_flow.update_idle_flow(100.0); - - assert_eq!(cic_flow.idle_max, 100.0); - assert_eq!(cic_flow.idle_min, 100.0); - assert_eq!(cic_flow.idle_mean, 100.0); - assert_eq!(cic_flow.idle_std, 0.0); - assert_eq!(cic_flow.idle_count, 1); - - cic_flow.update_idle_flow(50.0); - - assert_eq!(cic_flow.idle_max, 100.0); - assert_eq!(cic_flow.idle_min, 50.0); - assert_eq!(cic_flow.idle_mean, 75.0); - assert_eq!(cic_flow.idle_std, 25.0); - assert_eq!(cic_flow.idle_count, 2); - } -} From cc78a4538c5d54536e99b1b09d36409c6da1e264 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 23:53:52 +0100 Subject: [PATCH 28/34] feat: add tcp quality signals to rustiflow --- AGENTS.md | 2 +- README.md | 3 +- docs/engineering-notes.md | 6 + rustiflow/src/args.rs | 2 +- rustiflow/src/flows/basic_flow.rs | 66 ++++ rustiflow/src/flows/features/mod.rs | 1 + .../src/flows/features/tcp_quality_stats.rs | 123 ++++++ rustiflow/src/flows/rusti_flow.rs | 358 ++++++++---------- rustiflow/src/tests/flows/basic_flow_test.rs | 37 +- .../src/tests/flows/feature_modules_test.rs | 69 +++- rustiflow/src/tests/flows/rusti_flow_test.rs | 29 +- 11 files 
changed, 490 insertions(+), 206 deletions(-) create mode 100644 rustiflow/src/flows/features/tcp_quality_stats.rs diff --git a/AGENTS.md b/AGENTS.md index ce811483..44b9621e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -107,7 +107,7 @@ in `docs/engineering-notes.md`. ### Current Focus - [ ] Stabilize and measure before expanding the eBPF event payload further. -- [ ] Finish the remaining TCP quality signals that current metadata already supports: +- [x] Finish the remaining TCP quality signals that current metadata already supports: duplicate ACKs, zero-window events, and close style. - [ ] Add the next IP and path signals once they can be trusted in both offline and realtime modes. diff --git a/README.md b/README.md index 2d5899ea..98cf285e 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ This tool is engineered for robust and efficient feature extraction, particularl - **Versatile Feature Sets:** Offers a variety of pre-defined feature sets (flows) and the flexibility to create custom feature sets tailored to specific requirements. An example of the custom flow is shown [here](https://github.com/idlab-discover/RustiFlow/blob/main/rustiflow/src/flows/custom_flow.rs). - **Pcap File Support:** Facilitates packet analysis from pcap files, compatible with both Linux and Windows generated files. - **Diverse Output Options:** Features can be outputted to the console, a CSV file, or other formats with minimal effort. +- **Richer TCP Quality Signals:** The RustiFlow feature set exports duplicate ACK counts, zero-window observations, and TCP close style in addition to the existing lifecycle and retransmission fields. 
## Feature sets @@ -220,7 +221,7 @@ Options: - cic: Represents the CIC Flow, giving 90 features - cidds: Represents the CIDDS Flow, giving 10 features - nfstream: Represents a nfstream inspired flow, giving 69 features - - rustiflow: Represents the Rusti Flow, giving 127 features + - rustiflow: Represents the Rusti Flow, giving 199 features - custom: Represents a flow that you can implement yourself --active-timeout diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 0e5ea943..208b30bd 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -28,6 +28,12 @@ This file keeps short-lived design choices and execution notes that would make - TCP lifecycle export now distinguishes observed handshake completion from resets seen before or after that observed handshake, so richer flow schemas do not have to infer lifecycle quality from flag totals alone. +- RustiFlow export now includes duplicate ACK counts, zero-window + observations, and `tcp_close_style`. Duplicate ACKs currently mean repeated + pure ACKs with the same ACK number and advertised window; zero-window events + count TCP packets advertising a zero receive window; close style stays rooted + in `BasicFlow` lifecycle state so timeout/reset/FIN semantics are not + reimplemented in exporter code. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/rustiflow/src/args.rs b/rustiflow/src/args.rs index df5ffbc3..cbc80c31 100644 --- a/rustiflow/src/args.rs +++ b/rustiflow/src/args.rs @@ -170,7 +170,7 @@ pub enum FlowType { /// Represents a nfstream inspired flow, giving 69 features. Nfstream, - /// Represents the Rusti Flow, giving 120 features. + /// Represents the Rusti Flow, giving 199 features. Rustiflow, /// Represents a flow that you can implement yourself. 
diff --git a/rustiflow/src/flows/basic_flow.rs b/rustiflow/src/flows/basic_flow.rs index 0f9dc3aa..5b46090f 100644 --- a/rustiflow/src/flows/basic_flow.rs +++ b/rustiflow/src/flows/basic_flow.rs @@ -14,6 +14,31 @@ pub(crate) enum FlowState { FinAcked, } +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum TcpCloseStyle { + NotApplicable, + None, + HalfClose, + BidirectionalFin, + FourWayFin, + SimultaneousFin, + Reset, +} + +impl TcpCloseStyle { + pub fn as_str(&self) -> &'static str { + match self { + Self::NotApplicable => "not_applicable", + Self::None => "none", + Self::HalfClose => "half_close", + Self::BidirectionalFin => "bidirectional_fin", + Self::FourWayFin => "four_way_fin", + Self::SimultaneousFin => "simultaneous_fin", + Self::Reset => "reset", + } + } +} + /// A basic flow that stores the basic features of a flow. #[derive(Clone)] pub struct BasicFlow { @@ -47,6 +72,10 @@ pub struct BasicFlow { pub tcp_handshake_completed: bool, pub tcp_reset_before_handshake: bool, pub tcp_reset_after_handshake: bool, + pub tcp_close_style: TcpCloseStyle, + saw_fin_fwd: bool, + saw_fin_bwd: bool, + tcp_simultaneous_close: bool, } impl BasicFlow { @@ -83,6 +112,28 @@ impl BasicFlow { } } + fn update_tcp_close_style(&mut self, cause: FlowExpireCause) { + self.tcp_close_style = if !self.is_tcp() { + TcpCloseStyle::NotApplicable + } else if cause == FlowExpireCause::TcpReset { + TcpCloseStyle::Reset + } else if self.saw_fin_fwd && self.saw_fin_bwd { + if self.state_fwd == FlowState::FinAcked && self.state_bwd == FlowState::FinAcked { + if self.tcp_simultaneous_close { + TcpCloseStyle::SimultaneousFin + } else { + TcpCloseStyle::FourWayFin + } + } else { + TcpCloseStyle::BidirectionalFin + } + } else if self.saw_fin_fwd || self.saw_fin_bwd { + TcpCloseStyle::HalfClose + } else { + TcpCloseStyle::None + }; + } + /// Checks if the flow is finished. 
/// /// A flow is considered finished when both FIN flags are set and the last ACK is received, @@ -99,10 +150,18 @@ impl BasicFlow { // Update state when receiving FIN flag if packet.fin_flag > 0 { if forward { + if self.state_bwd == FlowState::FinSent { + self.tcp_simultaneous_close = true; + } + self.saw_fin_fwd = true; self.state_fwd = FlowState::FinSent; self.expected_ack_seq_bwd = Some(packet.sequence_number + packet.data_length as u32 + 1); } else { + if self.state_fwd == FlowState::FinSent { + self.tcp_simultaneous_close = true; + } + self.saw_fin_bwd = true; self.state_bwd = FlowState::FinSent; self.expected_ack_seq_fwd = Some(packet.sequence_number + packet.data_length as u32 + 1); @@ -186,6 +245,10 @@ impl Flow for BasicFlow { tcp_handshake_completed: false, tcp_reset_before_handshake: false, tcp_reset_after_handshake: false, + tcp_close_style: TcpCloseStyle::None, + saw_fin_fwd: false, + saw_fin_bwd: false, + tcp_simultaneous_close: false, } } @@ -195,6 +258,7 @@ impl Flow for BasicFlow { if self.is_tcp_finished(packet, fwd) { self.flow_expire_cause = FlowExpireCause::TcpTermination; + self.update_tcp_close_style(self.flow_expire_cause); return true; } @@ -205,6 +269,7 @@ impl Flow for BasicFlow { self.tcp_reset_before_handshake = true; } self.flow_expire_cause = FlowExpireCause::TcpReset; + self.update_tcp_close_style(self.flow_expire_cause); return true; } @@ -213,6 +278,7 @@ impl Flow for BasicFlow { fn close_flow(&mut self, _timestamp_us: i64, cause: FlowExpireCause) -> () { self.flow_expire_cause = cause; + self.update_tcp_close_style(cause); } fn dump(&self) -> String { diff --git a/rustiflow/src/flows/features/mod.rs b/rustiflow/src/flows/features/mod.rs index 4dc22a4d..6b0ea1e8 100644 --- a/rustiflow/src/flows/features/mod.rs +++ b/rustiflow/src/flows/features/mod.rs @@ -8,6 +8,7 @@ pub mod payload_stats; pub mod retransmission_stats; pub mod subflow_stats; pub mod tcp_flag_stats; +pub mod tcp_quality_stats; pub mod timing_stats; pub mod util; 
pub mod window_size_stats; diff --git a/rustiflow/src/flows/features/tcp_quality_stats.rs b/rustiflow/src/flows/features/tcp_quality_stats.rs new file mode 100644 index 00000000..c93f2860 --- /dev/null +++ b/rustiflow/src/flows/features/tcp_quality_stats.rs @@ -0,0 +1,123 @@ +use pnet::packet::ip::IpNextHeaderProtocols; + +use crate::{ + flows::util::FlowExpireCause, + packet_features::{PacketFeatures, ACK_FLAG}, +}; + +use super::util::FlowFeature; + +#[derive(Clone, Copy)] +struct AckObservation { + ack_number: u32, + window_size: u16, +} + +#[derive(Clone)] +pub struct TcpQualityStats { + pub fwd_duplicate_ack_count: u32, + pub bwd_duplicate_ack_count: u32, + pub fwd_zero_window_count: u32, + pub bwd_zero_window_count: u32, + last_fwd_ack: Option<AckObservation>, + last_bwd_ack: Option<AckObservation>, +} + +impl TcpQualityStats { + pub fn new() -> Self { + Self { + fwd_duplicate_ack_count: 0, + bwd_duplicate_ack_count: 0, + fwd_zero_window_count: 0, + bwd_zero_window_count: 0, + last_fwd_ack: None, + last_bwd_ack: None, + } + } + + fn is_duplicate_ack_candidate(packet: &PacketFeatures) -> bool { + packet.protocol == IpNextHeaderProtocols::Tcp.0 + && packet.flags == ACK_FLAG + && packet.data_length == 0 + } + + fn update_duplicate_ack_state( + last_ack: &mut Option<AckObservation>, + duplicate_ack_count: &mut u32, + packet: &PacketFeatures, + ) { + if !Self::is_duplicate_ack_candidate(packet) { + *last_ack = None; + return; + } + + let observation = AckObservation { + ack_number: packet.sequence_number_ack, + window_size: packet.window_size, + }; + + if last_ack.is_some_and(|last| { + last.ack_number == observation.ack_number && last.window_size == observation.window_size + }) { + *duplicate_ack_count += 1; + } + + *last_ack = Some(observation); + } +} + +impl FlowFeature for TcpQualityStats { + fn update(&mut self, packet: &PacketFeatures, is_forward: bool, _last_timestamp_us: i64) { + if packet.protocol != IpNextHeaderProtocols::Tcp.0 { + return; + } + + if is_forward { + if packet.window_size == 0 { + 
self.fwd_zero_window_count += 1; + } + Self::update_duplicate_ack_state( + &mut self.last_fwd_ack, + &mut self.fwd_duplicate_ack_count, + packet, + ); + } else { + if packet.window_size == 0 { + self.bwd_zero_window_count += 1; + } + Self::update_duplicate_ack_state( + &mut self.last_bwd_ack, + &mut self.bwd_duplicate_ack_count, + packet, + ); + } + } + + fn close(&mut self, _last_timestamp_us: i64, _cause: FlowExpireCause) { + // No active state to close. + } + + fn dump(&self) -> String { + format!( + "{},{},{},{},{},{}", + self.fwd_duplicate_ack_count + self.bwd_duplicate_ack_count, + self.fwd_duplicate_ack_count, + self.bwd_duplicate_ack_count, + self.fwd_zero_window_count + self.bwd_zero_window_count, + self.fwd_zero_window_count, + self.bwd_zero_window_count, + ) + } + + fn headers() -> String { + [ + "flow_duplicate_ack_count", + "fwd_duplicate_ack_count", + "bwd_duplicate_ack_count", + "flow_zero_window_count", + "fwd_zero_window_count", + "bwd_zero_window_count", + ] + .join(",") + } +} diff --git a/rustiflow/src/flows/rusti_flow.rs b/rustiflow/src/flows/rusti_flow.rs index a2079e23..e9438da9 100644 --- a/rustiflow/src/flows/rusti_flow.rs +++ b/rustiflow/src/flows/rusti_flow.rs @@ -14,8 +14,9 @@ use super::{ active_idle_stats::ActiveIdleStats, bulk_stats::BulkStats, header_stats::HeaderLengthStats, iat_stats::IATStats, icmp_stats::IcmpStats, packet_stats::PacketLengthStats, payload_stats::PayloadLengthStats, retransmission_stats::RetransmissionStats, - subflow_stats::SubflowStats, tcp_flag_stats::TcpFlagStats, timing_stats::TimingStats, - util::FlowFeature, window_size_stats::WindowSizeStats, + subflow_stats::SubflowStats, tcp_flag_stats::TcpFlagStats, + tcp_quality_stats::TcpQualityStats, timing_stats::TimingStats, util::FlowFeature, + window_size_stats::WindowSizeStats, }, flow::Flow, util::FlowExpireCause, @@ -35,6 +36,7 @@ pub struct RustiFlow { pub active_idle_stats: ActiveIdleStats, pub icmp_stats: IcmpStats, pub retransmission_stats: 
RetransmissionStats, + pub tcp_quality_stats: TcpQualityStats, pub window_size_stats: WindowSizeStats, pub timing_stats: TimingStats, } @@ -69,6 +71,7 @@ impl Flow for RustiFlow { active_idle_stats: ActiveIdleStats::new(timestamp_us), icmp_stats: IcmpStats::new(), retransmission_stats: RetransmissionStats::new(), + tcp_quality_stats: TcpQualityStats::new(), window_size_stats: WindowSizeStats::new(), timing_stats: TimingStats::new(), } @@ -91,6 +94,8 @@ impl Flow for RustiFlow { self.icmp_stats.update(packet, fwd, last_timestamp_us); self.retransmission_stats .update(packet, fwd, last_timestamp_us); + self.tcp_quality_stats + .update(packet, fwd, last_timestamp_us); self.window_size_stats .update(packet, fwd, last_timestamp_us); self.timing_stats.update(packet, fwd, last_timestamp_us); @@ -111,306 +116,267 @@ impl Flow for RustiFlow { self.active_idle_stats.close(timestamp_us, cause); self.icmp_stats.close(timestamp_us, cause); self.retransmission_stats.close(timestamp_us, cause); + self.tcp_quality_stats.close(timestamp_us, cause); self.window_size_stats.close(timestamp_us, cause); self.timing_stats.close(timestamp_us, cause); } fn dump(&self) -> String { let duration_us = self.basic_flow.get_flow_duration_usec(); - format!( - "{},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{}", - // Basic Info - self.basic_flow.flow_key, - self.basic_flow.ip_source, - self.basic_flow.port_source, - self.basic_flow.ip_destination, - self.basic_flow.port_destination, - self.basic_flow.protocol, - self.basic_flow.get_first_timestamp(), - self.basic_flow.get_last_timestamp(), - duration_us, - self.basic_flow.flow_expire_cause.as_str(), - u8::from(self.basic_flow.tcp_handshake_completed), - u8::from(self.basic_flow.tcp_reset_before_handshake), - u8::from(self.basic_flow.tcp_reset_after_handshake), - // Timing Stats + vec![ + self.basic_flow.flow_key.clone(), + self.basic_flow.ip_source.to_string(), + 
self.basic_flow.port_source.to_string(), + self.basic_flow.ip_destination.to_string(), + self.basic_flow.port_destination.to_string(), + self.basic_flow.protocol.to_string(), + self.basic_flow.get_first_timestamp().to_string(), + self.basic_flow.get_last_timestamp().to_string(), + duration_us.to_string(), + self.basic_flow.flow_expire_cause.as_str().to_string(), + u8::from(self.basic_flow.tcp_handshake_completed).to_string(), + u8::from(self.basic_flow.tcp_reset_before_handshake).to_string(), + u8::from(self.basic_flow.tcp_reset_after_handshake).to_string(), + self.basic_flow.tcp_close_style.as_str().to_string(), self.timing_stats.dump(), - // IAT Stats self.iat_stats.dump(), - // Packet Length Stats self.packet_len_stats.dump(), - // Packet Header Length Stats self.header_len_stats.dump(), - // Payload Length Stats self.payload_len_stats.dump(), - // Bulk Stats self.bulk_stats.dump(), - // Subflow Stats self.subflow_stats.dump(), - // Active Idle Stats self.active_idle_stats.dump(), - // ICMP Stats self.icmp_stats.dump(), - // Retransmission Stats self.retransmission_stats.dump(), - // Window Size Stats + self.tcp_quality_stats.dump(), self.window_size_stats.dump(), - // TCP Flag Stats self.tcp_flags_stats.dump(), - // Rate Stats (per second) safe_per_second_rate( self.payload_len_stats.payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.fwd_payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.fwd_payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.bwd_payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), 
safe_per_second_rate( self.payload_len_stats.bwd_payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.fwd_payload_len.get_count(), - self.subflow_stats.subflow_count - ), + self.subflow_stats.subflow_count, + ) + .to_string(), safe_div( self.payload_len_stats.fwd_payload_len.get_total(), - self.subflow_stats.subflow_count as f64 - ), + self.subflow_stats.subflow_count as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.bwd_payload_len.get_count(), - self.subflow_stats.subflow_count - ), + self.subflow_stats.subflow_count, + ) + .to_string(), safe_div( self.payload_len_stats.bwd_payload_len.get_total(), - self.subflow_stats.subflow_count as f64 - ), - // UP/DOWN Ratio + self.subflow_stats.subflow_count as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.bwd_payload_len.get_count(), - self.payload_len_stats.fwd_payload_len.get_count() - ), - ) + self.payload_len_stats.fwd_payload_len.get_count(), + ) + .to_string(), + ] + .join(",") } fn get_features() -> String { - format!( - "{},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{}", - // Basic Info - "flow_id", - "source_ip", - "source_port", - "destination_ip", - "destination_port", - "protocol", - "timestamp_first", - "timestamp_last", - "flow_duration_us", - "flow_expire_cause", - "tcp_handshake_completed", - "tcp_reset_before_handshake", - "tcp_reset_after_handshake", - // Timing Stats + vec![ + "flow_id".to_string(), + "source_ip".to_string(), + "source_port".to_string(), + "destination_ip".to_string(), + "destination_port".to_string(), + "protocol".to_string(), + "timestamp_first".to_string(), + "timestamp_last".to_string(), + "flow_duration_us".to_string(), + "flow_expire_cause".to_string(), + "tcp_handshake_completed".to_string(), + "tcp_reset_before_handshake".to_string(), + "tcp_reset_after_handshake".to_string(), + 
"tcp_close_style".to_string(), TimingStats::headers(), - // IAT Stats IATStats::headers(), - // Packet Length Stats PacketLengthStats::headers(), - // Packet Header Length Stats HeaderLengthStats::headers(), - // Payload Length Stats PayloadLengthStats::headers(), - // Bulk Stats BulkStats::headers(), - // Subflow Stats SubflowStats::headers(), - // Active Idle Stats ActiveIdleStats::headers(), - // ICMP Stats IcmpStats::headers(), - // Retransmission Stats RetransmissionStats::headers(), - // Window Size Stats + TcpQualityStats::headers(), WindowSizeStats::headers(), - // TCP Flag Stats TcpFlagStats::headers(), - // Rate Stats (per second) - "flow_bytes_s", - "flow_packets_s", - "fwd_bytes_s", - "fwd_packets_s", - "bwd_bytes_s", - "bwd_packets_s", - "fwd_subflow_packets_mean", - "fwd_subflow_bytes_mean", - "bwd_subflow_packets_mean", - "bwd_subflow_bytes_mean", - // UP/DOWN Ratio - "up_down_ratio", - ) + "flow_bytes_s".to_string(), + "flow_packets_s".to_string(), + "fwd_bytes_s".to_string(), + "fwd_packets_s".to_string(), + "bwd_bytes_s".to_string(), + "bwd_packets_s".to_string(), + "fwd_subflow_packets_mean".to_string(), + "fwd_subflow_bytes_mean".to_string(), + "bwd_subflow_packets_mean".to_string(), + "bwd_subflow_bytes_mean".to_string(), + "up_down_ratio".to_string(), + ] + .join(",") } fn dump_without_contamination(&self) -> String { let duration_us = self.basic_flow.get_flow_duration_usec(); - format!( - "{},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{}\ - ,{},{},{},{},{},{},{},{},{},{},{},{}", - // Basic Info - iana_port_mapping(self.basic_flow.port_source), - iana_port_mapping(self.basic_flow.port_destination), - self.basic_flow.protocol, - duration_us, - self.basic_flow.flow_expire_cause.as_str(), - u8::from(self.basic_flow.tcp_handshake_completed), - u8::from(self.basic_flow.tcp_reset_before_handshake), - u8::from(self.basic_flow.tcp_reset_after_handshake), - // Timing Stats - self.timing_stats.get_fwd_duration(), - 
self.timing_stats.get_bwd_duration(), - // IAT Stats + vec![ + iana_port_mapping(self.basic_flow.port_source).to_string(), + iana_port_mapping(self.basic_flow.port_destination).to_string(), + self.basic_flow.protocol.to_string(), + duration_us.to_string(), + self.basic_flow.flow_expire_cause.as_str().to_string(), + u8::from(self.basic_flow.tcp_handshake_completed).to_string(), + u8::from(self.basic_flow.tcp_reset_before_handshake).to_string(), + u8::from(self.basic_flow.tcp_reset_after_handshake).to_string(), + self.basic_flow.tcp_close_style.as_str().to_string(), + self.timing_stats.get_fwd_duration().to_string(), + self.timing_stats.get_bwd_duration().to_string(), self.iat_stats.dump(), - // Packet Length Stats self.packet_len_stats.dump(), - // Packet Header Length Stats self.header_len_stats.dump(), - // Payload Length Stats self.payload_len_stats.dump(), - // Bulk Stats self.bulk_stats.dump(), - // Subflow Stats self.subflow_stats.dump(), - // Active Idle Stats self.active_idle_stats.dump(), - // ICMP Stats self.icmp_stats.dump(), - // Retransmission Stats self.retransmission_stats.dump(), - // Window Size Stats + self.tcp_quality_stats.dump(), self.window_size_stats.dump(), - // TCP Flag Stats self.tcp_flags_stats.dump(), - // Rate Stats (per second) safe_per_second_rate( self.payload_len_stats.payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.fwd_payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.fwd_payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_per_second_rate( self.payload_len_stats.bwd_payload_len.get_total(), - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), 
safe_per_second_rate( self.payload_len_stats.bwd_payload_len.get_count() as f64, - duration_us as f64 - ), + duration_us as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.fwd_payload_len.get_count(), - self.subflow_stats.subflow_count - ), + self.subflow_stats.subflow_count, + ) + .to_string(), safe_div( self.payload_len_stats.fwd_payload_len.get_total(), - self.subflow_stats.subflow_count as f64 - ), + self.subflow_stats.subflow_count as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.bwd_payload_len.get_count(), - self.subflow_stats.subflow_count - ), + self.subflow_stats.subflow_count, + ) + .to_string(), safe_div( self.payload_len_stats.bwd_payload_len.get_total(), - self.subflow_stats.subflow_count as f64 - ), - // UP/DOWN Ratio + self.subflow_stats.subflow_count as f64, + ) + .to_string(), safe_div_int( self.payload_len_stats.bwd_payload_len.get_count(), - self.payload_len_stats.fwd_payload_len.get_count() - ), - ) + self.payload_len_stats.fwd_payload_len.get_count(), + ) + .to_string(), + ] + .join(",") } fn get_features_without_contamination() -> String { - format!( - "{},{},{},{},{},{},{},{},{},{},\ - {},{},{},{},{},{},{},{},{},{}\ - ,{},{},{},{},{},{},{},{},{},{},{},{}", - // Basic Info - "source_port_iana", - "destination_port_iana", - "protocol", - "flow_duration_us", - "flow_expire_cause", - "tcp_handshake_completed", - "tcp_reset_before_handshake", - "tcp_reset_after_handshake", - // Timing Stats - "fwd_duration_ms", - "bwd_duration_ms", - // IAT Stats + vec![ + "source_port_iana".to_string(), + "destination_port_iana".to_string(), + "protocol".to_string(), + "flow_duration_us".to_string(), + "flow_expire_cause".to_string(), + "tcp_handshake_completed".to_string(), + "tcp_reset_before_handshake".to_string(), + "tcp_reset_after_handshake".to_string(), + "tcp_close_style".to_string(), + "fwd_duration_ms".to_string(), + "bwd_duration_ms".to_string(), IATStats::headers(), - // Packet Length Stats PacketLengthStats::headers(), 
- // Packet Header Length Stats HeaderLengthStats::headers(), - // Payload Length Stats PayloadLengthStats::headers(), - // Bulk Stats BulkStats::headers(), - // Subflow Stats SubflowStats::headers(), - // Active Idle Stats ActiveIdleStats::headers(), - // ICMP Stats IcmpStats::headers(), - // Retransmission Stats RetransmissionStats::headers(), - // Window Size Stats + TcpQualityStats::headers(), WindowSizeStats::headers(), - // TCP Flag Stats TcpFlagStats::headers(), - // Rate Stats (per second) - "flow_bytes_s", - "flow_packets_s", - "fwd_bytes_s", - "fwd_packets_s", - "bwd_bytes_s", - "bwd_packets_s", - "fwd_subflow_packets_mean", - "fwd_subflow_bytes_mean", - "bwd_subflow_packets_mean", - "bwd_subflow_bytes_mean", - // UP/DOWN Ratio - "up_down_ratio", - ) + "flow_bytes_s".to_string(), + "flow_packets_s".to_string(), + "fwd_bytes_s".to_string(), + "fwd_packets_s".to_string(), + "bwd_bytes_s".to_string(), + "bwd_packets_s".to_string(), + "fwd_subflow_packets_mean".to_string(), + "fwd_subflow_bytes_mean".to_string(), + "bwd_subflow_packets_mean".to_string(), + "bwd_subflow_bytes_mean".to_string(), + "up_down_ratio".to_string(), + ] + .join(",") } fn get_first_timestamp_us(&self) -> i64 { diff --git a/rustiflow/src/tests/flows/basic_flow_test.rs b/rustiflow/src/tests/flows/basic_flow_test.rs index 79528456..07995522 100644 --- a/rustiflow/src/tests/flows/basic_flow_test.rs +++ b/rustiflow/src/tests/flows/basic_flow_test.rs @@ -3,7 +3,11 @@ mod tests { use std::net::{IpAddr, Ipv4Addr}; use crate::{ - flows::{basic_flow::BasicFlow, flow::Flow, util::FlowExpireCause}, + flows::{ + basic_flow::{BasicFlow, TcpCloseStyle}, + flow::Flow, + util::FlowExpireCause, + }, packet_features::PacketFeatures, }; @@ -78,6 +82,7 @@ mod tests { ack_fwd.sequence_number_ack = 201; assert!(flow.update_flow(&ack_fwd, true)); assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::FourWayFin); } #[test] @@ -142,6 +147,7 @@ mod 
tests { assert!(!flow.tcp_handshake_completed); assert!(flow.tcp_reset_before_handshake); assert!(!flow.tcp_reset_after_handshake); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::Reset); } #[test] @@ -181,6 +187,7 @@ mod tests { assert!(flow.tcp_handshake_completed); assert!(!flow.tcp_reset_before_handshake); assert!(flow.tcp_reset_after_handshake); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::Reset); } #[test] @@ -296,6 +303,8 @@ mod tests { assert!(!flow.tcp_handshake_completed); assert!(!flow.tcp_reset_before_handshake); assert!(!flow.tcp_reset_after_handshake); + flow.close_flow(2_000_000, FlowExpireCause::IdleTimeout); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::NotApplicable); } #[test] @@ -347,5 +356,31 @@ mod tests { assert_eq!(flow.flow_expire_cause, FlowExpireCause::TcpTermination); assert!(flow.tcp_handshake_completed); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::SimultaneousFin); + } + + #[test] + fn half_close_style_is_preserved_when_timeout_happens_after_single_fin() { + let ip_source = IpAddr::V4(Ipv4Addr::new(10, 0, 6, 1)); + let ip_destination = IpAddr::V4(Ipv4Addr::new(10, 0, 6, 2)); + let mut flow = BasicFlow::new( + "flow-11".to_string(), + ip_source, + 50014, + ip_destination, + 443, + 6, + 1_000_000, + ); + + let mut fin_fwd = build_packet(ip_source, 50014, ip_destination, 443, 1_000_100); + fin_fwd.fin_flag = 1; + fin_fwd.sequence_number = 100; + assert!(!flow.update_flow(&fin_fwd, true)); + + flow.close_flow(2_000_000, FlowExpireCause::IdleTimeout); + + assert_eq!(flow.flow_expire_cause, FlowExpireCause::IdleTimeout); + assert_eq!(flow.tcp_close_style, TcpCloseStyle::HalfClose); } } diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index f043d428..54f79974 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -7,8 +7,8 @@ mod tests { features::{ active_idle_stats::ActiveIdleStats, 
iat_stats::IATStats, icmp_stats::IcmpStats, payload_stats::PayloadLengthStats, retransmission_stats::RetransmissionStats, - subflow_stats::SubflowStats, timing_stats::TimingStats, util::FlowFeature, - window_size_stats::WindowSizeStats, + subflow_stats::SubflowStats, tcp_quality_stats::TcpQualityStats, + timing_stats::TimingStats, util::FlowFeature, window_size_stats::WindowSizeStats, }, util::FlowExpireCause, }, @@ -216,6 +216,71 @@ mod tests { assert_eq!(stats.payload_len.get_count(), 4); } + #[test] + fn tcp_quality_stats_count_duplicate_acks_and_zero_window_events() { + let mut stats = TcpQualityStats::new(); + + let mut first_ack = packet(1_000_000); + first_ack.protocol = IpNextHeaderProtocols::Tcp.0; + first_ack.flags = ACK_FLAG; + first_ack.ack_flag = 1; + first_ack.sequence_number_ack = 500; + first_ack.window_size = 4096; + stats.update(&first_ack, true, first_ack.timestamp_us); + + let mut duplicate_ack = packet(1_000_500); + duplicate_ack.protocol = IpNextHeaderProtocols::Tcp.0; + duplicate_ack.flags = ACK_FLAG; + duplicate_ack.ack_flag = 1; + duplicate_ack.sequence_number_ack = 500; + duplicate_ack.window_size = 4096; + stats.update(&duplicate_ack, true, first_ack.timestamp_us); + + let mut changed_window = packet(1_001_000); + changed_window.protocol = IpNextHeaderProtocols::Tcp.0; + changed_window.flags = ACK_FLAG; + changed_window.ack_flag = 1; + changed_window.sequence_number_ack = 500; + changed_window.window_size = 2048; + stats.update(&changed_window, true, duplicate_ack.timestamp_us); + + let mut zero_window = packet(1_001_500); + zero_window.protocol = IpNextHeaderProtocols::Tcp.0; + zero_window.flags = ACK_FLAG; + zero_window.ack_flag = 1; + zero_window.sequence_number_ack = 800; + zero_window.window_size = 0; + stats.update(&zero_window, false, changed_window.timestamp_us); + + let mut zero_window_repeat = packet(1_002_000); + zero_window_repeat.protocol = IpNextHeaderProtocols::Tcp.0; + zero_window_repeat.flags = ACK_FLAG; + 
zero_window_repeat.ack_flag = 1; + zero_window_repeat.sequence_number_ack = 800; + zero_window_repeat.window_size = 0; + stats.update(&zero_window_repeat, false, zero_window.timestamp_us); + + let mut tcp_payload = packet(1_002_500); + tcp_payload.protocol = IpNextHeaderProtocols::Tcp.0; + tcp_payload.ack_flag = 1; + tcp_payload.flags = ACK_FLAG; + tcp_payload.sequence_number_ack = 500; + tcp_payload.data_length = 32; + tcp_payload.window_size = 4096; + stats.update(&tcp_payload, true, zero_window_repeat.timestamp_us); + + let mut udp = packet(1_003_000); + udp.protocol = IpNextHeaderProtocols::Udp.0; + udp.window_size = 0; + stats.update(&udp, false, tcp_payload.timestamp_us); + + assert_eq!(stats.fwd_duplicate_ack_count, 1); + assert_eq!(stats.bwd_duplicate_ack_count, 1); + assert_eq!(stats.fwd_zero_window_count, 0); + assert_eq!(stats.bwd_zero_window_count, 2); + assert_eq!(stats.dump(), "2,1,1,2,0,2"); + } + #[test] fn subflow_stats_count_initial_subflow_and_increment_only_on_gaps_greater_than_one_second() { let mut stats = SubflowStats::new(); diff --git a/rustiflow/src/tests/flows/rusti_flow_test.rs b/rustiflow/src/tests/flows/rusti_flow_test.rs index 942d4e6b..1d641589 100644 --- a/rustiflow/src/tests/flows/rusti_flow_test.rs +++ b/rustiflow/src/tests/flows/rusti_flow_test.rs @@ -3,7 +3,7 @@ mod tests { use std::net::{IpAddr, Ipv4Addr}; use crate::{ - flows::{flow::Flow, rusti_flow::RustiFlow}, + flows::{basic_flow::TcpCloseStyle, flow::Flow, rusti_flow::RustiFlow}, packet_features::{PacketFeatures, ACK_FLAG, SYN_FLAG}, }; @@ -37,6 +37,7 @@ mod tests { destination_port, protocol: 6, timestamp_us, + window_size: 4096, ..Default::default() } } @@ -94,6 +95,21 @@ mod tests { ack_bwd.sequence_number_ack = 200; assert!(!flow.update_flow(&ack_bwd, false)); + let mut duplicate_ack_bwd = packet(destination_ip, 443, source_ip, 44444, 1_001_650); + duplicate_ack_bwd.ack_flag = 1; + duplicate_ack_bwd.flags = ACK_FLAG; + duplicate_ack_bwd.sequence_number = 702; + 
duplicate_ack_bwd.sequence_number_ack = 200; + assert!(!flow.update_flow(&duplicate_ack_bwd, false)); + + let mut zero_window_bwd = packet(destination_ip, 443, source_ip, 44444, 1_001_700); + zero_window_bwd.ack_flag = 1; + zero_window_bwd.flags = ACK_FLAG; + zero_window_bwd.sequence_number = 703; + zero_window_bwd.sequence_number_ack = 200; + zero_window_bwd.window_size = 0; + assert!(!flow.update_flow(&zero_window_bwd, false)); + let mut overlap = packet(source_ip, 44444, destination_ip, 443, 1_001_800); overlap.ack_flag = 1; overlap.flags = ACK_FLAG; @@ -103,14 +119,19 @@ mod tests { assert!(!flow.update_flow(&overlap, true)); assert!(flow.basic_flow.tcp_handshake_completed); + assert_eq!(flow.basic_flow.tcp_close_style, TcpCloseStyle::None); assert_eq!(flow.retransmission_stats.fwd_retransmission_count, 1); assert_eq!(flow.retransmission_stats.bwd_retransmission_count, 0); - assert_eq!(flow.iat_stats.iat.get_count(), 5); + assert_eq!(flow.tcp_quality_stats.fwd_duplicate_ack_count, 0); + assert_eq!(flow.tcp_quality_stats.bwd_duplicate_ack_count, 1); + assert_eq!(flow.tcp_quality_stats.fwd_zero_window_count, 0); + assert_eq!(flow.tcp_quality_stats.bwd_zero_window_count, 1); + assert_eq!(flow.iat_stats.iat.get_count(), 7); assert_eq!(flow.iat_stats.fwd_iat.get_count(), 3); - assert_eq!(flow.iat_stats.bwd_iat.get_count(), 1); + assert_eq!(flow.iat_stats.bwd_iat.get_count(), 3); assert_eq!(flow.subflow_stats.subflow_count, 1); assert!((flow.timing_stats.get_fwd_duration() - 1.7).abs() < f64::EPSILON); - assert!((flow.timing_stats.get_bwd_duration() - 1.3).abs() < f64::EPSILON); + assert!((flow.timing_stats.get_bwd_duration() - 1.5).abs() < f64::EPSILON); assert_eq!(flow.payload_len_stats.fwd_non_zero_payload_packets, 2); assert_eq!(flow.payload_len_stats.bwd_non_zero_payload_packets, 0); } From 153fd9b982b2b4215668cfdef0dd57ca4e65497e Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Wed, 25 Mar 2026 23:55:38 +0100 
Subject: [PATCH 29/34] feat: export ip version in nf flow --- AGENTS.md | 2 +- README.md | 2 +- docs/engineering-notes.md | 4 ++ rustiflow/src/args.rs | 2 +- rustiflow/src/flows/nf_flow.rs | 17 +++++- rustiflow/src/tests/flows/nf_flow_test.rs | 72 ++++++++++++++++++++++- 6 files changed, 91 insertions(+), 8 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 44b9621e..84cce2b7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -127,7 +127,7 @@ Primary files: - [ ] Optional lightweight application-aware metadata: DNS, TLS, HTTP, QUIC. - [ ] Better contamination-free abstractions than only coarse IANA port buckets. -- [ ] Fill `nf_flow` gaps such as `ip_version`, `vlan_id`, and `tunnel_id` once +- [ ] Fill remaining `nf_flow` gaps such as `vlan_id` and `tunnel_id` once packet metadata exists in both ingestion modes. ### Working rule diff --git a/README.md b/README.md index 98cf285e..a51079b7 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Options: - basic: A basic flow that stores the basic features of a flow - cic: Represents the CIC Flow, giving 90 features - cidds: Represents the CIDDS Flow, giving 10 features - - nfstream: Represents a nfstream inspired flow, giving 69 features + - nfstream: Represents a nfstream inspired flow, giving 71 features - rustiflow: Represents the Rusti Flow, giving 199 features - custom: Represents a flow that you can implement yourself diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 208b30bd..4a011e2e 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -34,6 +34,10 @@ This file keeps short-lived design choices and execution notes that would make count TCP packets advertising a zero receive window; close style stays rooted in `BasicFlow` lifecycle state so timeout/reset/FIN semantics are not reimplemented in exporter code. +- `nf_flow` now exports `ip_version` without expanding the eBPF event payload. 
+ The value is derived from the normalized `IpAddr` already shared by offline + and realtime ingestion, and fixture-backed tests lock down the IPv4 path + while direct flow construction locks down the IPv6 path. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/rustiflow/src/args.rs b/rustiflow/src/args.rs index cbc80c31..8bc76e44 100644 --- a/rustiflow/src/args.rs +++ b/rustiflow/src/args.rs @@ -167,7 +167,7 @@ pub enum FlowType { /// Represents the CIDDS Flow, giving 10 features. CIDDS, - /// Represents a nfstream inspired flow, giving 69 features. + /// Represents a nfstream inspired flow, giving 71 features. Nfstream, /// Represents the Rusti Flow, giving 199 features. diff --git a/rustiflow/src/flows/nf_flow.rs b/rustiflow/src/flows/nf_flow.rs index 38eaf82b..a59df0a9 100644 --- a/rustiflow/src/flows/nf_flow.rs +++ b/rustiflow/src/flows/nf_flow.rs @@ -44,6 +44,13 @@ impl NfFlow { _ => -1, } } + + pub fn get_ip_version(&self) -> u8 { + match self.basic_flow.ip_source { + IpAddr::V4(_) => 4, + IpAddr::V6(_) => 6, + } + } } impl Flow for NfFlow { @@ -124,7 +131,7 @@ impl Flow for NfFlow { fn dump(&self) -> String { format!( - "{},{},{},{},{},{},{},{},{},{},\ + "{},{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ @@ -132,7 +139,7 @@ impl Flow for NfFlow { {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{}", // NFlow Core Features - // 7 features are missing: src_mac, src_oui, dst_mac, dst_oui, ip_version, vlan_id, tunner_id + // 6 features are missing: src_mac, src_oui, dst_mac, dst_oui, vlan_id, tunner_id self.basic_flow.flow_key, self.get_expiration_id(), self.basic_flow.ip_source, @@ -140,6 +147,7 @@ impl Flow for NfFlow { self.basic_flow.ip_destination, self.basic_flow.port_destination, 
self.basic_flow.protocol, + self.get_ip_version(), self.basic_flow.first_timestamp_us / 1000, self.basic_flow.last_timestamp_us / 1000, self.basic_flow.get_flow_duration_msec(), @@ -216,6 +224,7 @@ impl Flow for NfFlow { "dst_ip", "dst_port", "protocol", + "ip_version", "bidirectional_first_seen_ms", "bidirectional_last_seen_ms", "bidirectional_duration_ms", @@ -285,7 +294,7 @@ impl Flow for NfFlow { fn dump_without_contamination(&self) -> String { format!( - "{},{},{},{},{},{},{},{},{},{},\ + "{},{},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ {},{},{},{},{},{},{},{},{},{},\ @@ -295,6 +304,7 @@ impl Flow for NfFlow { iana_port_mapping(self.basic_flow.port_source), iana_port_mapping(self.basic_flow.port_destination), self.basic_flow.protocol, + self.get_ip_version(), self.basic_flow.get_flow_duration_msec(), self.packet_len_stats.flow_count(), self.packet_len_stats.flow_total(), @@ -361,6 +371,7 @@ impl Flow for NfFlow { "src_port_iana", "dst_port_iana", "protocol", + "ip_version", "bidirectional_duration_ms", "bidirectional_packets", "bidirectional_bytes", diff --git a/rustiflow/src/tests/flows/nf_flow_test.rs b/rustiflow/src/tests/flows/nf_flow_test.rs index 9e598572..c9881201 100644 --- a/rustiflow/src/tests/flows/nf_flow_test.rs +++ b/rustiflow/src/tests/flows/nf_flow_test.rs @@ -1,8 +1,16 @@ #[cfg(test)] mod tests { - use std::net::{IpAddr, Ipv4Addr}; + use std::{ + net::{IpAddr, Ipv4Addr, Ipv6Addr}, + path::PathBuf, + }; - use crate::flows::{flow::Flow, nf_flow::NfFlow, util::FlowExpireCause}; + use tokio::sync::mpsc; + + use crate::{ + flows::{flow::Flow, nf_flow::NfFlow, util::FlowExpireCause}, + pcap::read_pcap_file, + }; fn setup_nf_flow() -> NfFlow { NfFlow::new( @@ -20,6 +28,19 @@ mod tests { row.split(',').count() } + fn csv_fields(row: &str) -> Vec<&str> { + row.split(',').collect() + } + + fn fixture_path(name: &str) -> String { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + 
.join("data") + .join(name) + .to_string_lossy() + .into_owned() + } + #[test] fn dump_matches_feature_headers() { let flow = setup_nf_flow(); @@ -47,4 +68,51 @@ mod tests { flow.close_flow(4_000_000, FlowExpireCause::TcpReset); assert_eq!(flow.get_expiration_id(), -1); } + + #[test] + fn ip_version_is_exported_for_ipv4_and_ipv6_flows() { + let ipv4_flow = setup_nf_flow(); + assert_eq!(ipv4_flow.get_ip_version(), 4); + assert_eq!(csv_fields(&ipv4_flow.dump())[7], "4"); + assert_eq!(csv_fields(&ipv4_flow.dump_without_contamination())[3], "4"); + + let ipv6_flow = NfFlow::new( + "nf-flow-v6".to_string(), + IpAddr::V6(Ipv6Addr::LOCALHOST), + 12345, + IpAddr::V6(Ipv6Addr::new(0x2001, 0xdb8, 0, 0, 0, 0, 0, 1)), + 80, + 6, + 1_000_000, + ); + assert_eq!(ipv6_flow.get_ip_version(), 6); + assert_eq!(csv_fields(&ipv6_flow.dump())[7], "6"); + assert_eq!(csv_fields(&ipv6_flow.dump_without_contamination())[3], "6"); + } + + #[tokio::test] + async fn offline_fixture_exports_ipv4_version_for_all_flows() { + let (tx, mut rx) = mpsc::channel::(64); + + read_pcap_file::( + &fixture_path("nmap_tcp_syn_version.pcap"), + tx, + 1, + 3600, + 120, + None, + 60, + ) + .await + .expect("fixture pcap should parse successfully"); + + let mut flow_count = 0; + while let Some(flow) = rx.recv().await { + flow_count += 1; + assert_eq!(flow.get_ip_version(), 4); + assert_eq!(csv_fields(&flow.dump())[7], "4"); + } + + assert_eq!(flow_count, 17); + } } From 45324658335a851d01a9363671bd485a816c68e3 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:02:44 +0100 Subject: [PATCH 30/34] perf: use typed flow keys in hot paths --- docs/engineering-notes.md | 3 + docs/performance-roadmap.md | 6 ++ rustiflow/src/flow_key.rs | 66 +++++++++++++++++++ rustiflow/src/flow_table.rs | 49 +++++++++----- rustiflow/src/flows/basic_flow.rs | 4 -- rustiflow/src/flows/cic_flow.rs | 4 -- rustiflow/src/flows/cidds_flow.rs | 4 -- 
rustiflow/src/flows/custom_flow.rs | 4 -- rustiflow/src/flows/flow.rs | 3 - rustiflow/src/flows/nf_flow.rs | 4 -- rustiflow/src/flows/rusti_flow.rs | 4 -- rustiflow/src/main.rs | 1 + rustiflow/src/packet_features.rs | 51 ++------------ rustiflow/src/pcap.rs | 4 +- rustiflow/src/realtime.rs | 6 +- rustiflow/src/tests/flows/flow_table_test.rs | 36 +++++++++- .../src/tests/flows/packet_features_test.rs | 14 +++- 17 files changed, 169 insertions(+), 94 deletions(-) create mode 100644 rustiflow/src/flow_key.rs diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 4a011e2e..51b2ace3 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -38,6 +38,9 @@ This file keeps short-lived design choices and execution notes that would make The value is derived from the normalized `IpAddr` already shared by offline and realtime ingestion, and fixture-backed tests lock down the IPv4 path while direct flow construction locks down the IPv6 path. +- Internal sharding and flow-table lookup now use typed `FlowKey` values + instead of rebuilding formatted strings on the hot path. String flow ids are + still created when a new flow is instantiated for export compatibility. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md index 60d7b921..d61e68ff 100644 --- a/docs/performance-roadmap.md +++ b/docs/performance-roadmap.md @@ -252,3 +252,9 @@ What this does not fully prove: - Use short dated notes here when a measurement or optimization changes priorities. - If a planned optimization turns out not to matter, mark it done and note that it was ruled out. - 2026-03-25: Decision: stabilize and measure after the current timestamp and length/header-length alignment work before adding more packet metadata to eBPF events. 
+- 2026-03-25: Typed internal flow keys now replace string keys in sharding and + flow-table lookup while keeping string formatting only for exported flow ids. + On a locally amplified offline fixture (`nmap_udp_version.pcap` packet + records repeated 400x, ~2.5 MB), warm-cache `--release` runs dropped from + about `39.8 ms` to `19.7 ms` mean over 5 runs on this machine. Treat this as + a local directional signal, not a Linux realtime substitute. diff --git a/rustiflow/src/flow_key.rs b/rustiflow/src/flow_key.rs new file mode 100644 index 00000000..a35cea4a --- /dev/null +++ b/rustiflow/src/flow_key.rs @@ -0,0 +1,66 @@ +use std::{fmt, net::IpAddr}; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct EndpointKey { + pub ip: IpAddr, + pub port: u16, +} + +impl EndpointKey { + pub fn new(ip: IpAddr, port: u16) -> Self { + Self { ip, port } + } +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct FlowKey { + pub source: EndpointKey, + pub destination: EndpointKey, + pub protocol: u8, +} + +impl FlowKey { + pub fn new( + source_ip: IpAddr, + source_port: u16, + destination_ip: IpAddr, + destination_port: u16, + protocol: u8, + ) -> Self { + Self { + source: EndpointKey::new(source_ip, source_port), + destination: EndpointKey::new(destination_ip, destination_port), + protocol, + } + } + + pub fn reverse(self) -> Self { + Self { + source: self.destination, + destination: self.source, + protocol: self.protocol, + } + } + + pub fn canonical(self) -> Self { + if self.source <= self.destination { + self + } else { + self.reverse() + } + } +} + +impl fmt::Display for FlowKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}:{}-{}:{}-{}", + self.source.ip, + self.source.port, + self.destination.ip, + self.destination.port, + self.protocol + ) + } +} diff --git a/rustiflow/src/flow_table.rs b/rustiflow/src/flow_table.rs index 46c387a4..d32df90a 100644 --- a/rustiflow/src/flow_table.rs +++ 
b/rustiflow/src/flow_table.rs @@ -1,11 +1,13 @@ use std::collections::HashMap; -use crate::{flows::util::FlowExpireCause, packet_features::PacketFeatures, Flow}; +use crate::{ + flow_key::FlowKey, flows::util::FlowExpireCause, packet_features::PacketFeatures, Flow, +}; use log::{debug, error}; use tokio::sync::mpsc; pub struct FlowTable { - flow_map: HashMap, // HashMap for fast flow access by key + flow_map: HashMap, // HashMap for fast flow access by key active_timeout: u64, idle_timeout: u64, early_export: Option, @@ -42,14 +44,9 @@ where self.check_and_export_expired_flows(packet.timestamp_us) .await; - // Determine the flow direction and key - let flow_key = if self.flow_map.contains_key(&packet.flow_key_bwd()) { - packet.flow_key_bwd() - } else { - packet.flow_key() - }; + let flow_key = packet.flow_key_value(); + let reverse_flow_key = flow_key.reverse(); - // Update the flow if it exists, otherwise create a new flow if let Some(mut flow) = self.flow_map.remove(&flow_key) { let (is_expired, cause) = flow.is_expired(packet.timestamp_us, self.active_timeout, self.idle_timeout); @@ -58,11 +55,24 @@ where self.export_flow(flow).await; self.create_and_insert_flow(packet).await; } else { - let is_terminated = self.update_flow_with_packet(&mut flow, packet).await; + let is_terminated = self.update_flow_with_packet(&mut flow, packet, true).await; if !is_terminated { self.flow_map.insert(flow_key, flow); } } + } else if let Some(mut flow) = self.flow_map.remove(&reverse_flow_key) { + let (is_expired, cause) = + flow.is_expired(packet.timestamp_us, self.active_timeout, self.idle_timeout); + if is_expired { + flow.close_flow(packet.timestamp_us, cause); + self.export_flow(flow).await; + self.create_and_insert_flow(packet).await; + } else { + let is_terminated = self.update_flow_with_packet(&mut flow, packet, false).await; + if !is_terminated { + self.flow_map.insert(reverse_flow_key, flow); + } + } } else { self.create_and_insert_flow(packet).await; } @@ -70,8 +80,9 @@ 
where /// Create and insert a new flow for the given packet. async fn create_and_insert_flow(&mut self, packet: &PacketFeatures) { + let flow_key = packet.flow_key_value(); let mut new_flow = T::new( - packet.flow_key(), + flow_key.to_string(), packet.source_ip, packet.source_port, packet.destination_ip, @@ -79,17 +90,23 @@ where packet.protocol, packet.timestamp_us, ); - let is_terminated = self.update_flow_with_packet(&mut new_flow, packet).await; + let is_terminated = self + .update_flow_with_packet(&mut new_flow, packet, true) + .await; if !is_terminated { - self.flow_map.insert(packet.flow_key(), new_flow); + self.flow_map.insert(flow_key, new_flow); } } /// Updates a flow with a packet and exports flow if terminated. /// /// Returns a boolean indicating if the flow is terminated. - async fn update_flow_with_packet(&mut self, flow: &mut T, packet: &PacketFeatures) -> bool { - let is_forward = *flow.flow_key() == packet.flow_key(); + async fn update_flow_with_packet( + &mut self, + flow: &mut T, + packet: &PacketFeatures, + is_forward: bool, + ) -> bool { let flow_terminated = flow.update_flow(&packet, is_forward); if flow_terminated { @@ -158,7 +175,7 @@ where let (is_expired, cause) = flow.is_expired(timestamp_us, self.active_timeout, self.idle_timeout); if is_expired { - Some((key.clone(), cause)) + Some((*key, cause)) } else { None } diff --git a/rustiflow/src/flows/basic_flow.rs b/rustiflow/src/flows/basic_flow.rs index 5b46090f..1725c78c 100644 --- a/rustiflow/src/flows/basic_flow.rs +++ b/rustiflow/src/flows/basic_flow.rs @@ -343,8 +343,4 @@ impl Flow for BasicFlow { (false, FlowExpireCause::None) } - - fn flow_key(&self) -> &String { - &self.flow_key - } } diff --git a/rustiflow/src/flows/cic_flow.rs b/rustiflow/src/flows/cic_flow.rs index 98013f2a..2540560c 100644 --- a/rustiflow/src/flows/cic_flow.rs +++ b/rustiflow/src/flows/cic_flow.rs @@ -650,8 +650,4 @@ impl Flow for CicFlow { self.basic_flow .is_expired(timestamp_us, active_timeout, idle_timeout) 
} - - fn flow_key(&self) -> &String { - &self.basic_flow.flow_key - } } diff --git a/rustiflow/src/flows/cidds_flow.rs b/rustiflow/src/flows/cidds_flow.rs index 96f93aeb..6c9c81f6 100644 --- a/rustiflow/src/flows/cidds_flow.rs +++ b/rustiflow/src/flows/cidds_flow.rs @@ -147,8 +147,4 @@ impl Flow for CiddsFlow { self.basic_flow .is_expired(timestamp_us, active_timeout, idle_timeout) } - - fn flow_key(&self) -> &String { - &self.basic_flow.flow_key - } } diff --git a/rustiflow/src/flows/custom_flow.rs b/rustiflow/src/flows/custom_flow.rs index a495c982..2dc2256d 100644 --- a/rustiflow/src/flows/custom_flow.rs +++ b/rustiflow/src/flows/custom_flow.rs @@ -115,8 +115,4 @@ impl Flow for CustomFlow { self.basic_flow .is_expired(timestamp_us, active_timeout, idle_timeout) } - - fn flow_key(&self) -> &String { - &self.basic_flow.flow_key - } } diff --git a/rustiflow/src/flows/flow.rs b/rustiflow/src/flows/flow.rs index 0c1b3f49..b6cfd7c5 100644 --- a/rustiflow/src/flows/flow.rs +++ b/rustiflow/src/flows/flow.rs @@ -38,9 +38,6 @@ pub trait Flow: Send + Sync + 'static + Clone { timestamp_us: i64, ) -> Self; - /// Returns the flow key. - fn flow_key(&self) -> &String; - /// Updates the flow with a new packet. 
/// /// This method processes a packet and updates the internal state of the flow diff --git a/rustiflow/src/flows/nf_flow.rs b/rustiflow/src/flows/nf_flow.rs index a59df0a9..f5421853 100644 --- a/rustiflow/src/flows/nf_flow.rs +++ b/rustiflow/src/flows/nf_flow.rs @@ -446,8 +446,4 @@ impl Flow for NfFlow { self.basic_flow .is_expired(timestamp_us, active_timeout, idle_timeout) } - - fn flow_key(&self) -> &String { - &self.basic_flow.flow_key - } } diff --git a/rustiflow/src/flows/rusti_flow.rs b/rustiflow/src/flows/rusti_flow.rs index e9438da9..4cd5c966 100644 --- a/rustiflow/src/flows/rusti_flow.rs +++ b/rustiflow/src/flows/rusti_flow.rs @@ -392,8 +392,4 @@ impl Flow for RustiFlow { self.basic_flow .is_expired(timestamp_us, active_timeout, idle_timeout) } - - fn flow_key(&self) -> &String { - &self.basic_flow.flow_key - } } diff --git a/rustiflow/src/main.rs b/rustiflow/src/main.rs index c785082c..8093bd3d 100644 --- a/rustiflow/src/main.rs +++ b/rustiflow/src/main.rs @@ -1,4 +1,5 @@ mod args; +mod flow_key; mod flow_table; #[cfg(target_os = "linux")] mod flow_tui; diff --git a/rustiflow/src/packet_features.rs b/rustiflow/src/packet_features.rs index 85698767..17cb5371 100644 --- a/rustiflow/src/packet_features.rs +++ b/rustiflow/src/packet_features.rs @@ -15,6 +15,8 @@ use pnet::packet::{ Packet, }; +use crate::flow_key::FlowKey; + // Define TCP flags pub const FIN_FLAG: u8 = 0b00000001; pub const SYN_FLAG: u8 = 0b00000010; @@ -187,57 +189,18 @@ impl PacketFeatures { ) } - /// Generates a flow key based on IPs, ports, and protocol - pub fn flow_key(&self) -> String { - format!( - "{}:{}-{}:{}-{}", + pub fn flow_key_value(&self) -> FlowKey { + FlowKey::new( self.source_ip, self.source_port, self.destination_ip, self.destination_port, - self.protocol - ) - } - - /// Generates a flow key based on IPs, ports, and protocol in the reverse direction - pub fn flow_key_bwd(&self) -> String { - format!( - "{}:{}-{}:{}-{}", - self.destination_ip, - self.destination_port, - 
self.source_ip, - self.source_port, - self.protocol + self.protocol, ) } - /// Generates a biflow key - pub fn biflow_key(&self) -> String { - // Create tuples of (IP, port) for comparison - let src = (&self.source_ip, self.source_port); - let dst = (&self.destination_ip, self.destination_port); - - // Determine the correct order (src < dst) - if src < dst { - format!( - "{}:{}-{}:{}-{}", - self.source_ip, - self.source_port, - self.destination_ip, - self.destination_port, - self.protocol - ) - } else { - // If destination IP/port is "smaller", swap the order - format!( - "{}:{}-{}:{}-{}", - self.destination_ip, - self.destination_port, - self.source_ip, - self.source_port, - self.protocol - ) - } + pub fn biflow_key_value(&self) -> FlowKey { + self.flow_key_value().canonical() } } diff --git a/rustiflow/src/pcap.rs b/rustiflow/src/pcap.rs index 28b32b6a..b2f47966 100644 --- a/rustiflow/src/pcap.rs +++ b/rustiflow/src/pcap.rs @@ -228,7 +228,7 @@ async fn process_packet( P: Packet, { if let Some(packet_features) = extractor(packet, timestamp_us) { - let flow_key = packet_features.biflow_key(); + let flow_key = packet_features.biflow_key_value(); let shard_index = compute_shard_index(&flow_key, num_shards); if let Err(e) = shard_senders[shard_index].send(packet_features).await { @@ -240,7 +240,7 @@ async fn process_packet( } } -fn compute_shard_index(flow_key: &str, num_shards: u8) -> usize { +fn compute_shard_index(flow_key: &H, num_shards: u8) -> usize { assert!(num_shards > 0, "num_shards must be greater than 0"); let mut hasher = DefaultHasher::new(); flow_key.hash(&mut hasher); diff --git a/rustiflow/src/realtime.rs b/rustiflow/src/realtime.rs index 34d01a0c..e1262a62 100644 --- a/rustiflow/src/realtime.rs +++ b/rustiflow/src/realtime.rs @@ -148,7 +148,7 @@ where unsafe { std::ptr::read(event.as_ptr() as *const _) }; let packet_features = PacketFeatures::from_ebpf_event_ipv4(&ebpf_event_ipv4, realtime_offset_us); - let flow_key = packet_features.biflow_key(); + 
let flow_key = packet_features.biflow_key_value(); let shard_index = compute_shard_index(&flow_key, num_threads); if let Err(e) = shard_senders_clone[shard_index].send(packet_features).await { @@ -192,7 +192,7 @@ where unsafe { std::ptr::read(event.as_ptr() as *const _) }; let packet_features = PacketFeatures::from_ebpf_event_ipv6(&ebpf_event_ipv6, realtime_offset_us); - let flow_key = packet_features.biflow_key(); + let flow_key = packet_features.biflow_key_value(); let shard_index = compute_shard_index(&flow_key, num_threads); if let Err(e) = shard_senders_clone[shard_index].send(packet_features).await { @@ -259,7 +259,7 @@ where Ok(total_dropped) } -fn compute_shard_index(flow_key: &str, num_shards: u8) -> usize { +fn compute_shard_index(flow_key: &H, num_shards: u8) -> usize { assert!(num_shards > 0, "num_shards must be greater than 0"); let mut hasher = DefaultHasher::new(); flow_key.hash(&mut hasher); diff --git a/rustiflow/src/tests/flows/flow_table_test.rs b/rustiflow/src/tests/flows/flow_table_test.rs index 56610dd0..6497f243 100644 --- a/rustiflow/src/tests/flows/flow_table_test.rs +++ b/rustiflow/src/tests/flows/flow_table_test.rs @@ -6,7 +6,7 @@ mod tests { use crate::{ flow_table::FlowTable, - flows::{basic_flow::BasicFlow, util::FlowExpireCause}, + flows::{basic_flow::BasicFlow, cidds_flow::CiddsFlow, util::FlowExpireCause}, packet_features::PacketFeatures, }; @@ -77,4 +77,38 @@ mod tests { assert!(rx.try_recv().is_err()); } + + #[tokio::test] + async fn reverse_direction_packets_stay_in_one_bidirectional_flow() { + let (tx, mut rx) = mpsc::channel::(4); + let mut flow_table = FlowTable::new(3600, 120, None, tx, 60); + + let mut forward = build_packet(1_000_000); + forward.length = 120; + flow_table.process_packet(&forward).await; + + let reverse = PacketFeatures { + source_ip: forward.destination_ip, + destination_ip: forward.source_ip, + source_port: forward.destination_port, + destination_port: forward.source_port, + protocol: forward.protocol, + 
timestamp_us: 1_000_500, + length: 80, + ..Default::default() + }; + flow_table.process_packet(&reverse).await; + + flow_table.export_all_flows(2_000_000).await; + + let exported_flow = rx.recv().await.expect("expected exported flow"); + assert_eq!( + exported_flow.basic_flow.flow_key, + forward.flow_key_value().to_string() + ); + assert_eq!(exported_flow.packet_stats.flow_count(), 2); + assert_eq!(exported_flow.packet_stats.fwd_packet_len.get_count(), 1); + assert_eq!(exported_flow.packet_stats.bwd_packet_len.get_count(), 1); + assert!(rx.try_recv().is_err()); + } } diff --git a/rustiflow/src/tests/flows/packet_features_test.rs b/rustiflow/src/tests/flows/packet_features_test.rs index 8c88e042..ee86c6d0 100644 --- a/rustiflow/src/tests/flows/packet_features_test.rs +++ b/rustiflow/src/tests/flows/packet_features_test.rs @@ -30,7 +30,19 @@ mod tests { let forward = build_packet(client_ip, 55000, server_ip, 443); let backward = build_packet(server_ip, 443, client_ip, 55000); - assert_eq!(forward.biflow_key(), backward.biflow_key()); + assert_eq!(forward.biflow_key_value(), backward.biflow_key_value()); + assert_eq!( + forward.biflow_key_value().to_string(), + backward.biflow_key_value().to_string() + ); + assert_eq!( + backward.flow_key_value().to_string(), + "192.168.0.20:443-192.168.0.10:55000-6" + ); + assert_eq!( + forward.flow_key_value().to_string(), + "192.168.0.10:55000-192.168.0.20:443-6" + ); } fn build_ipv6_packet(next_header: u8, payload: &[u8]) -> Vec { From 65ce1ae2c914dcd9bf766f45b7c1820ee89a5a3f Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:05:00 +0100 Subject: [PATCH 31/34] perf: use welford variance in feature stats --- docs/engineering-notes.md | 4 + docs/performance-roadmap.md | 5 ++ rustiflow/src/flows/features/util.rs | 31 ++++---- .../src/tests/flows/feature_modules_test.rs | 78 ++++++++++++++++++- 4 files changed, 96 insertions(+), 22 deletions(-) diff --git 
a/docs/engineering-notes.md b/docs/engineering-notes.md index 51b2ace3..611e4b8e 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -41,6 +41,10 @@ This file keeps short-lived design choices and execution notes that would make - Internal sharding and flow-table lookup now use typed `FlowKey` values instead of rebuilding formatted strings on the hot path. String flow ids are still created when a new flow is instantiated for export compatibility. +- `FeatureStats` now keeps running variance state (`m2`) and derives standard + deviation on demand instead of updating `std` itself on every packet. + Dedicated tests now lock down population-std semantics, order invariance, and + merged directional variance behavior. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md index d61e68ff..0d3ef9d0 100644 --- a/docs/performance-roadmap.md +++ b/docs/performance-roadmap.md @@ -258,3 +258,8 @@ What this does not fully prove: records repeated 400x, ~2.5 MB), warm-cache `--release` runs dropped from about `39.8 ms` to `19.7 ms` mean over 5 runs on this machine. Treat this as a local directional signal, not a Linux realtime substitute. +- 2026-03-25: `FeatureStats` now uses a Welford-style running variance + accumulator and computes `std` at readout time. On the same local amplified + offline fixture, warm-cache `--release` runs moved from about `21.3 ms` to + `20.5 ms` mean over 5 runs on this machine. Smaller win than typed keys, but + still in the expected direction. 
diff --git a/rustiflow/src/flows/features/util.rs b/rustiflow/src/flows/features/util.rs index 6d4f3dfa..6e92b2c5 100644 --- a/rustiflow/src/flows/features/util.rs +++ b/rustiflow/src/flows/features/util.rs @@ -23,7 +23,7 @@ pub struct FeatureStats { min: f64, max: f64, mean: f64, - std: f64, + m2: f64, count: u32, } @@ -34,7 +34,7 @@ impl FeatureStats { min: f64::MAX, max: f64::MIN, mean: 0.0, - std: 0.0, + m2: 0.0, count: 0, } } @@ -64,24 +64,17 @@ impl FeatureStats { } pub fn get_std(&self) -> f64 { - self.std + if self.count == 0 { + 0.0 + } else { + (self.m2 / self.count as f64).sqrt() + } } pub fn get_count(&self) -> u32 { self.count } - fn update_mean(&mut self, value: f64) { - self.mean = (((self.count - 1) as f64 * self.mean) + value) / self.count as f64; - } - - fn update_std(&mut self, value: f64, old_mean: f64) { - self.std = ((((self.count - 1) as f64 * self.std.powf(2.0)) - + ((value - old_mean) * (value - self.mean))) - / self.count as f64) - .sqrt(); - } - fn update_min(&mut self, value: f64) { if value < self.min { self.min = value; @@ -95,13 +88,15 @@ impl FeatureStats { } pub fn add_value(&mut self, value: f64) { - self.count += 1; self.total += value; self.update_min(value); self.update_max(value); - let old_mean: f64 = self.mean; - self.update_mean(value); - self.update_std(value, old_mean); + self.count += 1; + + let delta = value - self.mean; + self.mean += delta / self.count as f64; + let delta2 = value - self.mean; + self.m2 += delta * delta2; } pub fn dump_headers(prefix: &str) -> String { diff --git a/rustiflow/src/tests/flows/feature_modules_test.rs b/rustiflow/src/tests/flows/feature_modules_test.rs index 54f79974..2db481f2 100644 --- a/rustiflow/src/tests/flows/feature_modules_test.rs +++ b/rustiflow/src/tests/flows/feature_modules_test.rs @@ -5,10 +5,17 @@ mod tests { use crate::{ flows::{ features::{ - active_idle_stats::ActiveIdleStats, iat_stats::IATStats, icmp_stats::IcmpStats, - payload_stats::PayloadLengthStats, 
retransmission_stats::RetransmissionStats, - subflow_stats::SubflowStats, tcp_quality_stats::TcpQualityStats, - timing_stats::TimingStats, util::FlowFeature, window_size_stats::WindowSizeStats, + active_idle_stats::ActiveIdleStats, + iat_stats::IATStats, + icmp_stats::IcmpStats, + packet_stats::PacketLengthStats, + payload_stats::PayloadLengthStats, + retransmission_stats::RetransmissionStats, + subflow_stats::SubflowStats, + tcp_quality_stats::TcpQualityStats, + timing_stats::TimingStats, + util::{FeatureStats, FlowFeature}, + window_size_stats::WindowSizeStats, }, util::FlowExpireCause, }, @@ -22,6 +29,69 @@ mod tests { } } + fn population_std(values: &[f64]) -> f64 { + let mean = values.iter().sum::() / values.len() as f64; + let variance = values + .iter() + .map(|value| (value - mean).powi(2)) + .sum::() + / values.len() as f64; + variance.sqrt() + } + + #[test] + fn feature_stats_match_population_statistics_and_are_order_invariant() { + let values = [100.0, 50.0, 0.0, 75.0, 200.0, 125.0]; + + let mut forward = FeatureStats::new(); + for value in values { + forward.add_value(value); + } + + let mut reverse = FeatureStats::new(); + for value in values.into_iter().rev() { + reverse.add_value(value); + } + + let expected_total = values.iter().sum::(); + let expected_mean = expected_total / values.len() as f64; + let expected_std = population_std(&values); + + for stats in [&forward, &reverse] { + assert_eq!(stats.get_count(), values.len() as u32); + assert!((stats.get_total() - expected_total).abs() < f64::EPSILON); + assert!((stats.get_mean() - expected_mean).abs() < f64::EPSILON); + assert!((stats.get_std() - expected_std).abs() < f64::EPSILON); + assert_eq!(stats.get_min(), 0.0); + assert_eq!(stats.get_max(), 200.0); + } + } + + #[test] + fn packet_length_stats_merge_directional_variance_correctly() { + let mut stats = PacketLengthStats::new(); + + let mut fwd_first = packet(1_000_000); + fwd_first.length = 60; + stats.update(&fwd_first, true, 
fwd_first.timestamp_us); + + let mut bwd_first = packet(1_000_500); + bwd_first.length = 30; + stats.update(&bwd_first, false, fwd_first.timestamp_us); + + let mut fwd_second = packet(1_001_000); + fwd_second.length = 90; + stats.update(&fwd_second, true, bwd_first.timestamp_us); + + let mut bwd_second = packet(1_001_500); + bwd_second.length = 150; + stats.update(&bwd_second, false, fwd_second.timestamp_us); + + let expected = [60.0, 30.0, 90.0, 150.0]; + assert!((stats.flow_mean() - 82.5).abs() < f64::EPSILON); + assert!((stats.flow_std() - population_std(&expected)).abs() < f64::EPSILON); + } + #[test] fn icmp_stats_keep_first_type_code_and_track_behavior_counts() { let mut stats = IcmpStats::new(); From 11d5a6ff5d65fdb258633c582deb17a2c8cf5029 Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:11:31 +0100 Subject: [PATCH 32/34] perf: bypass packet graph state in headless mode --- docs/engineering-notes.md | 4 ++ docs/performance-roadmap.md | 11 ++-- rustiflow/src/main.rs | 6 ++- rustiflow/src/realtime.rs | 58 +++++++++++++-------- rustiflow/src/realtime_mode.rs | 95 ++++++++++++++++++++++++++++++++++ rustiflow/src/realtime_stub.rs | 3 +- 6 files changed, 150 insertions(+), 27 deletions(-) create mode 100644 rustiflow/src/realtime_mode.rs diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index 611e4b8e..ec2f52a9 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -45,6 +45,10 @@ This file keeps short-lived design choices and execution notes that would make deviation on demand instead of updating `std` itself on every packet. Dedicated tests now lock down population-std semantics, order invariance, and merged directional variance behavior. +- Realtime packet-graph mode is now explicit and testable. 
When the graph is + disabled, RustiFlow no longer constructs the packet-count watch channel or + mutex-protected counter state, so high-throughput runs skip that observability + plumbing entirely instead of merely branching around it in the loop body. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md index 0d3ef9d0..39ad741f 100644 --- a/docs/performance-roadmap.md +++ b/docs/performance-roadmap.md @@ -114,9 +114,9 @@ Medium to high depending on export rate and flow size. ### 6. Performance Mode Should Mean Performance -- [ ] Make sure high-throughput runs bypass packet-TUI work completely. -- [ ] Audit mutexes, watch channels, and per-packet UI accounting in performance-sensitive modes. -- [ ] Keep observability available, but not in the critical path by default. +- [x] Make sure high-throughput runs bypass packet-TUI work completely. +- [x] Audit mutexes, watch channels, and per-packet UI accounting in performance-sensitive modes. +- [x] Keep observability available, but not in the critical path by default. Primary files: @@ -258,6 +258,11 @@ What this does not fully prove: records repeated 400x, ~2.5 MB), warm-cache `--release` runs dropped from about `39.8 ms` to `19.7 ms` mean over 5 runs on this machine. Treat this as a local directional signal, not a Linux realtime substitute. +- 2026-03-25: Realtime packet-graph state is now only constructed when the + graph is actually enabled. High-throughput CSV/performance-mode runs no + longer allocate the watch channel, mutex-protected packet counter, or + per-packet UI accounting state on the hot path. Linux-side throughput numbers + are still pending the broader benchmark work in Phase 0. 
- 2026-03-25: `FeatureStats` now uses a Welford-style running variance accumulator and computes `std` at readout time. On the same local amplified offline fixture, warm-cache `--release` runs moved from about `21.3 ms` to diff --git a/rustiflow/src/main.rs b/rustiflow/src/main.rs index 8093bd3d..dbd28108 100644 --- a/rustiflow/src/main.rs +++ b/rustiflow/src/main.rs @@ -14,6 +14,7 @@ mod realtime; #[cfg(not(target_os = "linux"))] #[path = "realtime_stub.rs"] mod realtime; +mod realtime_mode; mod tests; mod tui; @@ -28,6 +29,7 @@ use flows::{ }; use log::{debug, error, info}; use output::OutputWriter; +use realtime_mode::packet_graph_mode; use std::time::Instant; use tokio::sync::mpsc; use tui::{launch_tui, Config}; @@ -102,7 +104,7 @@ async fn run_with_config(config: Config) { macro_rules! execute_realtime { ($flow_ty:ty) => {{ // Create output writer and initialize it - let performance_mode_disabled = config.output.export_path.is_some() && !matches!(std::env::var("RUST_LOG"), Ok(ref val) if val.contains("debug")) && !config.output.performance_mode; + let packet_graph_mode = packet_graph_mode(&config.output); let mut output_writer = OutputWriter::<$flow_ty>::new( config.output.output, @@ -143,7 +145,7 @@ async fn run_with_config(config: Config) { config.config.early_export, config.config.expiration_check_interval, ingress_only, - performance_mode_disabled, + packet_graph_mode, ) .await; diff --git a/rustiflow/src/realtime.rs b/rustiflow/src/realtime.rs index e1262a62..13430cf2 100644 --- a/rustiflow/src/realtime.rs +++ b/rustiflow/src/realtime.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use crate::debug; use crate::flow_tui::launch_packet_tui; use crate::packet_counts::PacketCountPerSecond; +use crate::realtime_mode::PacketGraphMode; use crate::{flow_table::FlowTable, flows::flow::Flow, packet_features::PacketFeatures}; use anyhow::Context; use aya::{ @@ -36,7 +37,7 @@ pub async fn handle_realtime( early_export: Option, expiration_check_interval: u64, ingress_only: 
bool, - performance_mode_disabled: bool, + packet_graph_mode: PacketGraphMode, ) -> Result where T: Flow, @@ -86,9 +87,19 @@ where let buffer_num_packets = 10_000; let mut shard_senders = Vec::with_capacity(num_threads as usize); - - let (packet_tx, packet_rx) = watch::channel(Vec::new()); - let packet_counter = Arc::new(Mutex::new(PacketCountPerSecond::new())); + let (packet_graph, packet_rx) = match packet_graph_mode { + PacketGraphMode::Enabled => { + let (packet_tx, packet_rx) = watch::channel(Vec::new()); + ( + Some(PacketGraphState { + packet_counter: Arc::new(Mutex::new(PacketCountPerSecond::new())), + packet_tx, + }), + Some(packet_rx), + ) + } + PacketGraphMode::Disabled => (None, None), + }; debug!("Creating {} sharded FlowTables...", num_threads); for _ in 0..num_threads { @@ -123,8 +134,7 @@ where for ebpf_event_source in event_sources_v4 { let shard_senders_clone = shard_senders.clone(); - let packet_counter_clone = Arc::clone(&packet_counter); - let packet_tx_clone = packet_tx.clone(); + let packet_graph = packet_graph.clone(); let realtime_offset_us = realtime_offset_us; handle_set.spawn(async move { @@ -137,12 +147,8 @@ where let ring_buf = guard.get_inner_mut(); while let Some(event) = ring_buf.next() { - if performance_mode_disabled { - let mut counter = packet_counter_clone.lock().await; - counter.increment(); - // Send the updated count to the TUI - let recent_counts = counter.get_counts_for_last_intervals(100); - let _ = packet_tx_clone.send(recent_counts); + if let Some(packet_graph) = &packet_graph { + packet_graph.record_packet().await; } let ebpf_event_ipv4: EbpfEventIpv4 = unsafe { std::ptr::read(event.as_ptr() as *const _) }; @@ -167,8 +173,7 @@ where for ebpf_event_source in event_sources_v6 { let shard_senders_clone = shard_senders.clone(); - let packet_counter_clone = Arc::clone(&packet_counter); - let packet_tx_clone = packet_tx.clone(); + let packet_graph = packet_graph.clone(); let realtime_offset_us = realtime_offset_us; 
handle_set.spawn(async move { @@ -181,12 +186,8 @@ where let ring_buf = guard.get_inner_mut(); while let Some(event) = ring_buf.next() { - if performance_mode_disabled { - let mut counter = packet_counter_clone.lock().await; - counter.increment(); - // Send the updated count to the TUI - let recent_counts = counter.get_counts_for_last_intervals(100); - let _ = packet_tx_clone.send(recent_counts); + if let Some(packet_graph) = &packet_graph { + packet_graph.record_packet().await; } let ebpf_event_ipv6: EbpfEventIpv6 = unsafe { std::ptr::read(event.as_ptr() as *const _) }; @@ -211,7 +212,7 @@ where info!("Waiting for Ctrl-C..."); - if performance_mode_disabled { + if let Some(packet_rx) = packet_rx { let _ = launch_packet_tui(packet_rx).await; } @@ -259,6 +260,21 @@ where Ok(total_dropped) } +#[derive(Clone)] +struct PacketGraphState { + packet_counter: Arc>, + packet_tx: watch::Sender>, +} + +impl PacketGraphState { + async fn record_packet(&self) { + let mut counter = self.packet_counter.lock().await; + counter.increment(); + let recent_counts = counter.get_counts_for_last_intervals(100); + let _ = self.packet_tx.send(recent_counts); + } +} + fn compute_shard_index(flow_key: &H, num_shards: u8) -> usize { assert!(num_shards > 0, "num_shards must be greater than 0"); let mut hasher = DefaultHasher::new(); diff --git a/rustiflow/src/realtime_mode.rs b/rustiflow/src/realtime_mode.rs new file mode 100644 index 00000000..1f4fe36e --- /dev/null +++ b/rustiflow/src/realtime_mode.rs @@ -0,0 +1,95 @@ +use crate::args::{ExportMethodType, OutputConfig}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum PacketGraphMode { + Disabled, + Enabled, +} + +pub fn packet_graph_mode(output: &OutputConfig) -> PacketGraphMode { + packet_graph_mode_with_debug(output, debug_logging_enabled()) +} + +fn packet_graph_mode_with_debug( + output: &OutputConfig, + debug_logging_enabled: bool, +) -> PacketGraphMode { + match ( + &output.output, + output.export_path.is_some(), + 
output.performance_mode, + debug_logging_enabled, + ) { + (ExportMethodType::Csv, true, false, false) => PacketGraphMode::Enabled, + _ => PacketGraphMode::Disabled, + } +} + +fn debug_logging_enabled() -> bool { + matches!(std::env::var("RUST_LOG"), Ok(ref value) if value.contains("debug")) +} + +#[cfg(test)] +mod tests { + use super::{packet_graph_mode_with_debug, PacketGraphMode}; + use crate::args::{ExportMethodType, OutputConfig}; + + fn output_config( + output: ExportMethodType, + export_path: Option<&str>, + performance_mode: bool, + ) -> OutputConfig { + OutputConfig { + output, + export_path: export_path.map(str::to_string), + header: false, + drop_contaminant_features: false, + performance_mode, + } + } + + #[test] + fn packet_graph_mode_only_enables_for_csv_exports_without_performance_mode() { + let output = output_config(ExportMethodType::Csv, Some("/tmp/out.csv"), false); + + assert_eq!( + packet_graph_mode_with_debug(&output, false), + PacketGraphMode::Enabled + ); + } + + #[test] + fn packet_graph_mode_stays_disabled_for_csv_performance_mode() { + let output = output_config(ExportMethodType::Csv, Some("/tmp/out.csv"), true); + + assert_eq!( + packet_graph_mode_with_debug(&output, false), + PacketGraphMode::Disabled + ); + } + + #[test] + fn packet_graph_mode_stays_disabled_for_debug_logging() { + let output = output_config(ExportMethodType::Csv, Some("/tmp/out.csv"), false); + + assert_eq!( + packet_graph_mode_with_debug(&output, true), + PacketGraphMode::Disabled + ); + } + + #[test] + fn packet_graph_mode_stays_disabled_without_csv_export_path() { + let csv_without_path = output_config(ExportMethodType::Csv, None, false); + let print_output = output_config(ExportMethodType::Print, None, false); + + assert_eq!( + packet_graph_mode_with_debug(&csv_without_path, false), + PacketGraphMode::Disabled + ); + assert_eq!( + packet_graph_mode_with_debug(&print_output, false), + PacketGraphMode::Disabled + ); + } +} diff --git a/rustiflow/src/realtime_stub.rs 
b/rustiflow/src/realtime_stub.rs index d3f658c9..7acbf332 100644 --- a/rustiflow/src/realtime_stub.rs +++ b/rustiflow/src/realtime_stub.rs @@ -1,4 +1,5 @@ use crate::flows::flow::Flow; +use crate::realtime_mode::PacketGraphMode; use tokio::sync::mpsc::Sender; /// Realtime capture depends on Aya/eBPF and is only available on Linux. @@ -11,7 +12,7 @@ pub async fn handle_realtime( _early_export: Option, _expiration_check_interval: u64, _ingress_only: bool, - _performance_mode_disabled: bool, + _packet_graph_mode: PacketGraphMode, ) -> Result where T: Flow, From c4cd82122783522034f13dae8f04aa1b644202be Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:16:50 +0100 Subject: [PATCH 33/34] feat: add rustiflow ip scope and path locality --- AGENTS.md | 2 +- README.md | 3 +- docs/engineering-notes.md | 6 + rustiflow/src/args.rs | 2 +- rustiflow/src/flows/basic_flow.rs | 26 +++- rustiflow/src/flows/nf_flow.rs | 5 +- rustiflow/src/flows/rusti_flow.rs | 22 +++ rustiflow/src/flows/util.rs | 137 ++++++++++++++++ rustiflow/src/tests/flows/rusti_flow_test.rs | 156 ++++++++++++++++++- 9 files changed, 347 insertions(+), 12 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 84cce2b7..c65436a8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -109,7 +109,7 @@ in `docs/engineering-notes.md`. - [ ] Stabilize and measure before expanding the eBPF event payload further. - [x] Finish the remaining TCP quality signals that current metadata already supports: duplicate ACKs, zero-window events, and close style. -- [ ] Add the next IP and path signals once they can be trusted in both offline +- [x] Add the next IP and path signals once they can be trusted in both offline and realtime modes. 
Primary files: diff --git a/README.md b/README.md index a51079b7..78e34b5d 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ This tool is engineered for robust and efficient feature extraction, particularl - **Pcap File Support:** Facilitates packet analysis from pcap files, compatible with both Linux and Windows generated files. - **Diverse Output Options:** Features can be outputted to the console, a CSV file, or other formats with minimal effort. - **Richer TCP Quality Signals:** The RustiFlow feature set exports duplicate ACK counts, zero-window observations, and TCP close style in addition to the existing lifecycle and retransmission fields. +- **Endpoint-Aware IP Context:** The RustiFlow feature set exports `ip_version`, endpoint IP scope, and coarse `path_locality` derived from normalized addresses without expanding the eBPF event payload. ## Feature sets @@ -221,7 +222,7 @@ Options: - cic: Represents the CIC Flow, giving 90 features - cidds: Represents the CIDDS Flow, giving 10 features - nfstream: Represents a nfstream inspired flow, giving 71 features - - rustiflow: Represents the Rusti Flow, giving 199 features + - rustiflow: Represents the Rusti Flow, giving 203 features - custom: Represents a flow that you can implement yourself --active-timeout diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index ec2f52a9..b46c450f 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -49,6 +49,12 @@ This file keeps short-lived design choices and execution notes that would make disabled, RustiFlow no longer constructs the packet-count watch channel or mutex-protected counter state, so high-throughput runs skip that observability plumbing entirely instead of merely branching around it in the loop body. +- RustiFlow now exports `ip_version`, `source_ip_scope`, + `destination_ip_scope`, and `path_locality` derived from the normalized + `IpAddr` endpoints already shared by offline and realtime ingestion. 
The + adversarial test matrix covers private/shared/link-local/loopback/multicast + cases across IPv4 and IPv6 so these coarse path signals do not depend on + extra kernel event fields. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/rustiflow/src/args.rs b/rustiflow/src/args.rs index 8bc76e44..7f523f05 100644 --- a/rustiflow/src/args.rs +++ b/rustiflow/src/args.rs @@ -170,7 +170,7 @@ pub enum FlowType { /// Represents a nfstream inspired flow, giving 71 features. Nfstream, - /// Represents the Rusti Flow, giving 199 features. + /// Represents the Rusti Flow, giving 203 features. Rustiflow, /// Represents a flow that you can implement yourself. diff --git a/rustiflow/src/flows/basic_flow.rs b/rustiflow/src/flows/basic_flow.rs index 1725c78c..7ec6e5bb 100644 --- a/rustiflow/src/flows/basic_flow.rs +++ b/rustiflow/src/flows/basic_flow.rs @@ -3,7 +3,12 @@ use std::net::IpAddr; use chrono::{DateTime, Utc}; use pnet::packet::ip::IpNextHeaderProtocols; -use crate::{flows::util::iana_port_mapping, packet_features::PacketFeatures}; +use crate::{ + flows::util::{ + classify_ip_scope, classify_path_locality, iana_port_mapping, IpScope, PathLocality, + }, + packet_features::PacketFeatures, +}; use super::{flow::Flow, util::FlowExpireCause}; @@ -213,6 +218,25 @@ impl BasicFlow { pub fn get_first_timestamp(&self) -> DateTime { DateTime::from_timestamp_micros(self.first_timestamp_us).unwrap() } + + pub fn get_ip_version(&self) -> u8 { + match self.ip_source { + IpAddr::V4(_) => 4, + IpAddr::V6(_) => 6, + } + } + + pub fn get_source_ip_scope(&self) -> IpScope { + classify_ip_scope(self.ip_source) + } + + pub fn get_destination_ip_scope(&self) -> IpScope { + classify_ip_scope(self.ip_destination) + } + + pub fn get_path_locality(&self) -> PathLocality { + classify_path_locality(self.ip_source, 
self.ip_destination) + } } impl Flow for BasicFlow { diff --git a/rustiflow/src/flows/nf_flow.rs b/rustiflow/src/flows/nf_flow.rs index f5421853..5f83425a 100644 --- a/rustiflow/src/flows/nf_flow.rs +++ b/rustiflow/src/flows/nf_flow.rs @@ -46,10 +46,7 @@ impl NfFlow { } pub fn get_ip_version(&self) -> u8 { - match self.basic_flow.ip_source { - IpAddr::V4(_) => 4, - IpAddr::V6(_) => 6, - } + self.basic_flow.get_ip_version() } } diff --git a/rustiflow/src/flows/rusti_flow.rs b/rustiflow/src/flows/rusti_flow.rs index 4cd5c966..d1f55902 100644 --- a/rustiflow/src/flows/rusti_flow.rs +++ b/rustiflow/src/flows/rusti_flow.rs @@ -130,6 +130,13 @@ impl Flow for RustiFlow { self.basic_flow.ip_destination.to_string(), self.basic_flow.port_destination.to_string(), self.basic_flow.protocol.to_string(), + self.basic_flow.get_ip_version().to_string(), + self.basic_flow.get_source_ip_scope().as_str().to_string(), + self.basic_flow + .get_destination_ip_scope() + .as_str() + .to_string(), + self.basic_flow.get_path_locality().as_str().to_string(), self.basic_flow.get_first_timestamp().to_string(), self.basic_flow.get_last_timestamp().to_string(), duration_us.to_string(), @@ -218,6 +225,10 @@ impl Flow for RustiFlow { "destination_ip".to_string(), "destination_port".to_string(), "protocol".to_string(), + "ip_version".to_string(), + "source_ip_scope".to_string(), + "destination_ip_scope".to_string(), + "path_locality".to_string(), "timestamp_first".to_string(), "timestamp_last".to_string(), "flow_duration_us".to_string(), @@ -260,6 +271,13 @@ impl Flow for RustiFlow { iana_port_mapping(self.basic_flow.port_source).to_string(), iana_port_mapping(self.basic_flow.port_destination).to_string(), self.basic_flow.protocol.to_string(), + self.basic_flow.get_ip_version().to_string(), + self.basic_flow.get_source_ip_scope().as_str().to_string(), + self.basic_flow + .get_destination_ip_scope() + .as_str() + .to_string(), + self.basic_flow.get_path_locality().as_str().to_string(), 
duration_us.to_string(), self.basic_flow.flow_expire_cause.as_str().to_string(), u8::from(self.basic_flow.tcp_handshake_completed).to_string(), @@ -344,6 +362,10 @@ impl Flow for RustiFlow { "source_port_iana".to_string(), "destination_port_iana".to_string(), "protocol".to_string(), + "ip_version".to_string(), + "source_ip_scope".to_string(), + "destination_ip_scope".to_string(), + "path_locality".to_string(), "flow_duration_us".to_string(), "flow_expire_cause".to_string(), "tcp_handshake_completed".to_string(), diff --git a/rustiflow/src/flows/util.rs b/rustiflow/src/flows/util.rs index 7f4a25ce..af8bd911 100644 --- a/rustiflow/src/flows/util.rs +++ b/rustiflow/src/flows/util.rs @@ -1,3 +1,5 @@ +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + /// Use ports as IANA port categories ['well-known', 'registered', 'dynamic'] pub fn iana_port_mapping(port: u16) -> &'static str { match port { @@ -7,6 +9,141 @@ pub fn iana_port_mapping(port: u16) -> &'static str { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum IpScope { + Loopback, + LinkLocal, + Private, + Shared, + Multicast, + Broadcast, + Unspecified, + Global, +} + +impl IpScope { + pub fn as_str(&self) -> &'static str { + match self { + Self::Loopback => "loopback", + Self::LinkLocal => "link_local", + Self::Private => "private", + Self::Shared => "shared", + Self::Multicast => "multicast", + Self::Broadcast => "broadcast", + Self::Unspecified => "unspecified", + Self::Global => "global", + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum PathLocality { + Loopback, + LinkLocal, + Private, + Mixed, + Multicast, + Public, + Other, +} + +impl PathLocality { + pub fn as_str(&self) -> &'static str { + match self { + Self::Loopback => "loopback", + Self::LinkLocal => "link_local", + Self::Private => "private", + Self::Mixed => "mixed", + Self::Multicast => "multicast", + Self::Public => "public", + Self::Other => "other", + } + } +} + +pub fn classify_ip_scope(address: IpAddr) -> IpScope { + 
match address { + IpAddr::V4(address) => classify_ipv4_scope(address), + IpAddr::V6(address) => classify_ipv6_scope(address), + } +} + +pub fn classify_path_locality(source: IpAddr, destination: IpAddr) -> PathLocality { + let source_scope = classify_ip_scope(source); + let destination_scope = classify_ip_scope(destination); + + if matches!(source_scope, IpScope::Multicast) || matches!(destination_scope, IpScope::Multicast) + { + PathLocality::Multicast + } else if matches!(source_scope, IpScope::Loopback) + && matches!(destination_scope, IpScope::Loopback) + { + PathLocality::Loopback + } else if matches!(source_scope, IpScope::LinkLocal) + && matches!(destination_scope, IpScope::LinkLocal) + { + PathLocality::LinkLocal + } else if is_localish_scope(source_scope) && is_localish_scope(destination_scope) { + PathLocality::Private + } else if is_localish_scope(source_scope) != is_localish_scope(destination_scope) { + PathLocality::Mixed + } else if matches!(source_scope, IpScope::Global) + && matches!(destination_scope, IpScope::Global) + { + PathLocality::Public + } else { + PathLocality::Other + } +} + +fn classify_ipv4_scope(address: Ipv4Addr) -> IpScope { + if address == Ipv4Addr::BROADCAST { + IpScope::Broadcast + } else if address.is_unspecified() { + IpScope::Unspecified + } else if address.is_loopback() { + IpScope::Loopback + } else if address.is_link_local() { + IpScope::LinkLocal + } else if address.is_private() { + IpScope::Private + } else if is_shared_ipv4(address) { + IpScope::Shared + } else if address.is_multicast() { + IpScope::Multicast + } else { + IpScope::Global + } +} + +fn classify_ipv6_scope(address: Ipv6Addr) -> IpScope { + if address.is_unspecified() { + IpScope::Unspecified + } else if address.is_loopback() { + IpScope::Loopback + } else if address.is_unicast_link_local() { + IpScope::LinkLocal + } else if address.is_unique_local() { + IpScope::Private + } else if address.is_multicast() { + IpScope::Multicast + } else { + IpScope::Global + 
} +} + +fn is_shared_ipv4(address: Ipv4Addr) -> bool { + let [first_octet, second_octet, ..] = address.octets(); + first_octet == 100 && (second_octet & 0b1100_0000) == 0b0100_0000 +} + +fn is_localish_scope(scope: IpScope) -> bool { + matches!( + scope, + IpScope::Loopback | IpScope::LinkLocal | IpScope::Private | IpScope::Shared + ) +} + #[derive(Clone, Copy, Debug, PartialEq)] pub enum FlowExpireCause { TcpTermination, diff --git a/rustiflow/src/tests/flows/rusti_flow_test.rs b/rustiflow/src/tests/flows/rusti_flow_test.rs index 1d641589..fb5a735a 100644 --- a/rustiflow/src/tests/flows/rusti_flow_test.rs +++ b/rustiflow/src/tests/flows/rusti_flow_test.rs @@ -1,18 +1,30 @@ #[cfg(test)] mod tests { - use std::net::{IpAddr, Ipv4Addr}; + use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use crate::{ - flows::{basic_flow::TcpCloseStyle, flow::Flow, rusti_flow::RustiFlow}, + flows::{ + basic_flow::TcpCloseStyle, + flow::Flow, + rusti_flow::RustiFlow, + util::{IpScope, PathLocality}, + }, packet_features::{PacketFeatures, ACK_FLAG, SYN_FLAG}, }; fn setup_rusti_flow() -> RustiFlow { + setup_rusti_flow_with_endpoints( + IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1)), + IpAddr::V4(Ipv4Addr::new(172, 16, 0, 2)), + ) + } + + fn setup_rusti_flow_with_endpoints(source_ip: IpAddr, destination_ip: IpAddr) -> RustiFlow { RustiFlow::new( "rusti-flow".to_string(), - IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1)), + source_ip, 44444, - IpAddr::V4(Ipv4Addr::new(172, 16, 0, 2)), + destination_ip, 443, 6, 1_000_000, @@ -46,6 +58,7 @@ mod tests { fn dump_matches_feature_headers() { let flow = setup_rusti_flow(); + assert_eq!(count_csv_fields(&RustiFlow::get_features()), 203); assert_eq!( count_csv_fields(&flow.dump()), count_csv_fields(&RustiFlow::get_features()) @@ -56,6 +69,141 @@ mod tests { ); } + #[test] + fn rusti_flow_classifies_ip_scope_and_path_locality_across_endpoint_types() { + let cases = [ + ( + IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + IpAddr::V4(Ipv4Addr::new(192, 168, 1, 20)), + 4, + 
IpScope::Private, + IpScope::Private, + PathLocality::Private, + ), + ( + IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1)), + IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), + 4, + IpScope::Shared, + IpScope::Global, + PathLocality::Mixed, + ), + ( + IpAddr::V4(Ipv4Addr::LOCALHOST), + IpAddr::V4(Ipv4Addr::LOCALHOST), + 4, + IpScope::Loopback, + IpScope::Loopback, + PathLocality::Loopback, + ), + ( + IpAddr::V4(Ipv4Addr::new(169, 254, 1, 1)), + IpAddr::V4(Ipv4Addr::new(169, 254, 1, 2)), + 4, + IpScope::LinkLocal, + IpScope::LinkLocal, + PathLocality::LinkLocal, + ), + ( + IpAddr::V6(Ipv6Addr::LOCALHOST), + IpAddr::V6(Ipv6Addr::LOCALHOST), + 6, + IpScope::Loopback, + IpScope::Loopback, + PathLocality::Loopback, + ), + ( + IpAddr::V6(Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 1)), + IpAddr::V6(Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 2)), + 6, + IpScope::Private, + IpScope::Private, + PathLocality::Private, + ), + ( + IpAddr::V6(Ipv6Addr::new(0xfe80, 0, 0, 0, 0, 0, 0, 1)), + IpAddr::V6(Ipv6Addr::new(0xfe80, 0, 0, 0, 0, 0, 0, 2)), + 6, + IpScope::LinkLocal, + IpScope::LinkLocal, + PathLocality::LinkLocal, + ), + ( + IpAddr::V6(Ipv6Addr::new(0xff02, 0, 0, 0, 0, 0, 0, 1)), + IpAddr::V6(Ipv6Addr::new(0x2001, 0x4860, 0, 0, 0, 0, 0, 8888)), + 6, + IpScope::Multicast, + IpScope::Global, + PathLocality::Multicast, + ), + ]; + + for ( + source_ip, + destination_ip, + ip_version, + source_scope, + destination_scope, + path_locality, + ) in cases + { + let flow = setup_rusti_flow_with_endpoints(source_ip, destination_ip); + + assert_eq!( + ( + flow.basic_flow.get_ip_version(), + flow.basic_flow.get_source_ip_scope(), + flow.basic_flow.get_destination_ip_scope(), + flow.basic_flow.get_path_locality(), + ), + (ip_version, source_scope, destination_scope, path_locality) + ); + } + } + + #[test] + fn rusti_flow_exports_endpoint_scope_and_path_locality_in_both_schemas() { + let flow = setup_rusti_flow_with_endpoints( + IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1)), + IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), + 
); + + let full_fields: Vec<_> = flow.dump().split(',').map(str::to_string).collect(); + let clean_fields: Vec<_> = flow + .dump_without_contamination() + .split(',') + .map(str::to_string) + .collect(); + + assert_eq!( + &full_fields[..10], + &[ + "rusti-flow".to_string(), + "100.64.0.1".to_string(), + "44444".to_string(), + "8.8.8.8".to_string(), + "443".to_string(), + "6".to_string(), + "4".to_string(), + "shared".to_string(), + "global".to_string(), + "mixed".to_string(), + ] + ); + assert_eq!( + &clean_fields[..7], + &[ + "registered".to_string(), + "well-known".to_string(), + "6".to_string(), + "4".to_string(), + "shared".to_string(), + "global".to_string(), + "mixed".to_string(), + ] + ); + } + #[test] fn rusti_flow_updates_lifecycle_timing_and_retransmission_features_together() { let source_ip = IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1)); From eb9bdc0b2e9075c40c50ab0082b8ea7d2ac6c03b Mon Sep 17 00:00:00 2001 From: Laurens D'hooge <20303842+Str-Gen@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:21:20 +0100 Subject: [PATCH 34/34] perf: keep flow table updates in place --- docs/engineering-notes.md | 5 + docs/performance-roadmap.md | 11 +- rustiflow/src/flow_table.rs | 146 ++++++++++++------- rustiflow/src/tests/flows/flow_table_test.rs | 53 +++++++ 4 files changed, 162 insertions(+), 53 deletions(-) diff --git a/docs/engineering-notes.md b/docs/engineering-notes.md index b46c450f..277bc0bb 100644 --- a/docs/engineering-notes.md +++ b/docs/engineering-notes.md @@ -55,6 +55,11 @@ This file keeps short-lived design choices and execution notes that would make adversarial test matrix covers private/shared/link-local/loopback/multicast cases across IPv4 and IPv6 so these coarse path signals do not depend on extra kernel event fields. +- `FlowTable` now keeps the ordinary existing-flow update path in place instead + of removing and reinserting the map entry on every packet. 
Table-level tests + now lock down two semantics that matter for that optimization: replacing an + expired flow with a fresh flow on the same key, and early export that keeps + the live flow resident for later final export. - Current test-hardening focus is to add adversarial deterministic cases before more feature work: false handshake completion, teardown edge cases, parser rejection behavior, and tiny fixture assertions that prove exported diff --git a/docs/performance-roadmap.md b/docs/performance-roadmap.md index 39ad741f..b407abad 100644 --- a/docs/performance-roadmap.md +++ b/docs/performance-roadmap.md @@ -86,9 +86,9 @@ High. Improves correctness and removes per-event userspace time acquisition. ### 4. FlowTable Access Patterns -- [ ] Reduce repeated hashing and key rebuilding in flow lookup. -- [ ] Avoid `contains_key` plus `remove` plus `insert` churn where possible. -- [ ] Revisit direction resolution after typed keys are introduced. +- [x] Reduce repeated hashing and key rebuilding in flow lookup. +- [x] Avoid `contains_key` plus `remove` plus `insert` churn where possible. +- [x] Revisit direction resolution after typed keys are introduced. Primary file: @@ -268,3 +268,8 @@ What this does not fully prove: offline fixture, warm-cache `--release` runs moved from about `21.3 ms` to `20.5 ms` mean over 5 runs on this machine. Smaller win than typed keys, but still in the expected direction. +- 2026-03-26: Existing-flow updates in `FlowTable` now stay in place on the + hot path instead of removing and reinserting map entries for every packet. + On the same local amplified offline fixture, warm-cache `--release` runs + moved from about `18.84 ms` to `18.30 ms` mean over 5 runs. This is still an + offline directional signal, not a Linux realtime benchmark. 
diff --git a/rustiflow/src/flow_table.rs b/rustiflow/src/flow_table.rs index d32df90a..7d884736 100644 --- a/rustiflow/src/flow_table.rs +++ b/rustiflow/src/flow_table.rs @@ -6,6 +6,19 @@ use crate::{ use log::{debug, error}; use tokio::sync::mpsc; +enum ExistingFlowUpdate { + Updated, + EarlyExport(T), + Terminated(T), + Expired(FlowExpireCause), +} + +enum FlowUpdate { + Active, + EarlyExport(T), + Terminated(T), +} + pub struct FlowTable { flow_map: HashMap, // HashMap for fast flow access by key active_timeout: u64, @@ -47,35 +60,15 @@ where let flow_key = packet.flow_key_value(); let reverse_flow_key = flow_key.reverse(); - if let Some(mut flow) = self.flow_map.remove(&flow_key) { - let (is_expired, cause) = - flow.is_expired(packet.timestamp_us, self.active_timeout, self.idle_timeout); - if is_expired { - flow.close_flow(packet.timestamp_us, cause); - self.export_flow(flow).await; - self.create_and_insert_flow(packet).await; - } else { - let is_terminated = self.update_flow_with_packet(&mut flow, packet, true).await; - if !is_terminated { - self.flow_map.insert(flow_key, flow); - } - } - } else if let Some(mut flow) = self.flow_map.remove(&reverse_flow_key) { - let (is_expired, cause) = - flow.is_expired(packet.timestamp_us, self.active_timeout, self.idle_timeout); - if is_expired { - flow.close_flow(packet.timestamp_us, cause); - self.export_flow(flow).await; - self.create_and_insert_flow(packet).await; - } else { - let is_terminated = self.update_flow_with_packet(&mut flow, packet, false).await; - if !is_terminated { - self.flow_map.insert(reverse_flow_key, flow); - } - } - } else { - self.create_and_insert_flow(packet).await; + if self.process_existing_flow(packet, flow_key, true).await + || self + .process_existing_flow(packet, reverse_flow_key, false) + .await + { + return; } + + self.create_and_insert_flow(packet).await; } /// Create and insert a new flow for the given packet. 
@@ -90,37 +83,90 @@ where packet.protocol, packet.timestamp_us, ); - let is_terminated = self - .update_flow_with_packet(&mut new_flow, packet, true) - .await; - if !is_terminated { - self.flow_map.insert(flow_key, new_flow); + match Self::apply_packet_to_flow(&mut new_flow, packet, true, self.early_export) { + FlowUpdate::Active => { + self.flow_map.insert(flow_key, new_flow); + } + FlowUpdate::EarlyExport(flow) => { + self.export_flow(flow).await; + self.flow_map.insert(flow_key, new_flow); + } + FlowUpdate::Terminated(flow) => { + self.export_flow(flow).await; + } } } - /// Updates a flow with a packet and exports flow if terminated. - /// - /// Returns a boolean indicating if the flow is terminated. - async fn update_flow_with_packet( + async fn process_existing_flow( &mut self, - flow: &mut T, packet: &PacketFeatures, + flow_key: FlowKey, is_forward: bool, ) -> bool { - let flow_terminated = flow.update_flow(&packet, is_forward); - - if flow_terminated { - // If terminated, export the flow - self.export_flow(flow.clone()).await; - } else if let Some(early_export) = self.early_export { - // If flow duration is greater than early export, export the flow immediately (without deletion from the flow table) - if ((packet.timestamp_us - flow.get_first_timestamp_us()) / 1_000_000) as u64 - > early_export - { - self.export_flow(flow.clone()).await; + let Some(update) = self.inspect_existing_flow(&flow_key, packet, is_forward) else { + return false; + }; + + match update { + ExistingFlowUpdate::Updated => {} + ExistingFlowUpdate::EarlyExport(flow) => { + self.export_flow(flow).await; + } + ExistingFlowUpdate::Terminated(flow) => { + self.flow_map.remove(&flow_key); + self.export_flow(flow).await; + } + ExistingFlowUpdate::Expired(cause) => { + if let Some(mut flow) = self.flow_map.remove(&flow_key) { + flow.close_flow(packet.timestamp_us, cause); + self.export_flow(flow).await; + } + self.create_and_insert_flow(packet).await; } } - flow_terminated + + true + } + + fn 
inspect_existing_flow( + &mut self, + flow_key: &FlowKey, + packet: &PacketFeatures, + is_forward: bool, + ) -> Option> { + let flow = self.flow_map.get_mut(flow_key)?; + let (is_expired, cause) = + flow.is_expired(packet.timestamp_us, self.active_timeout, self.idle_timeout); + + if is_expired { + Some(ExistingFlowUpdate::Expired(cause)) + } else { + Some( + match Self::apply_packet_to_flow(flow, packet, is_forward, self.early_export) { + FlowUpdate::Active => ExistingFlowUpdate::Updated, + FlowUpdate::EarlyExport(flow) => ExistingFlowUpdate::EarlyExport(flow), + FlowUpdate::Terminated(flow) => ExistingFlowUpdate::Terminated(flow), + }, + ) + } + } + + fn apply_packet_to_flow( + flow: &mut T, + packet: &PacketFeatures, + is_forward: bool, + early_export: Option, + ) -> FlowUpdate { + if flow.update_flow(packet, is_forward) { + FlowUpdate::Terminated(flow.clone()) + } else if early_export.is_some_and(|early_export| { + ((packet.timestamp_us - flow.get_first_timestamp_us()) / 1_000_000) as u64 + > early_export + }) { + FlowUpdate::EarlyExport(flow.clone()) + } else { + FlowUpdate::Active + } } /// Export all flows in the flow map in order of first packet arrival. 
diff --git a/rustiflow/src/tests/flows/flow_table_test.rs b/rustiflow/src/tests/flows/flow_table_test.rs index 6497f243..9aff7908 100644 --- a/rustiflow/src/tests/flows/flow_table_test.rs +++ b/rustiflow/src/tests/flows/flow_table_test.rs @@ -111,4 +111,57 @@ mod tests { assert_eq!(exported_flow.packet_stats.bwd_packet_len.get_count(), 1); assert!(rx.try_recv().is_err()); } + + #[tokio::test] + async fn expired_flow_is_replaced_by_new_flow_for_the_same_key() { + let (tx, mut rx) = mpsc::channel::(4); + let mut flow_table = FlowTable::new(3600, 1, None, tx, 60); + + let first_packet = build_packet(1_000_000); + let replacement_packet = build_packet(3_000_000); + + flow_table.process_packet(&first_packet).await; + flow_table.process_packet(&replacement_packet).await; + flow_table.export_all_flows(4_000_000).await; + + let first_export = rx.recv().await.expect("expected expired flow export"); + let second_export = rx.recv().await.expect("expected replacement flow export"); + + assert_eq!(first_export.flow_expire_cause, FlowExpireCause::IdleTimeout); + assert_eq!(first_export.first_timestamp_us, 1_000_000); + assert_eq!(first_export.last_timestamp_us, 1_000_000); + + assert_eq!( + second_export.flow_expire_cause, + FlowExpireCause::ExporterShutdown + ); + assert_eq!(second_export.first_timestamp_us, 3_000_000); + assert_eq!(second_export.last_timestamp_us, 3_000_000); + assert!(rx.try_recv().is_err()); + } + + #[tokio::test] + async fn early_export_keeps_flow_active_for_later_final_export() { + let (tx, mut rx) = mpsc::channel::(4); + let mut flow_table = FlowTable::new(3600, 120, Some(1), tx, 60); + + flow_table.process_packet(&build_packet(1_000_000)).await; + flow_table.process_packet(&build_packet(3_000_001)).await; + + let early_export = rx.recv().await.expect("expected early export"); + assert_eq!(early_export.flow_expire_cause, FlowExpireCause::None); + assert_eq!(early_export.first_timestamp_us, 1_000_000); + assert_eq!(early_export.last_timestamp_us, 
3_000_001); + + flow_table.export_all_flows(4_000_000).await; + + let final_export = rx.recv().await.expect("expected final export"); + assert_eq!( + final_export.flow_expire_cause, + FlowExpireCause::ExporterShutdown + ); + assert_eq!(final_export.first_timestamp_us, 1_000_000); + assert_eq!(final_export.last_timestamp_us, 3_000_001); + assert!(rx.try_recv().is_err()); + } }