From bd3be7e647fece7d5ba8dadc5d1b6d7edfad6960 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 23:34:22 +0100 Subject: [PATCH 01/13] l2: add VXLAN interface type Introduce the VXLAN interface type for the L2 module. A VXLAN interface carries a VNI (VXLAN Network Identifier), a local VTEP address used as the outer IP source, an encapsulation VRF for underlay routing, and a configurable UDP destination port (default 4789). VXLAN interfaces are keyed by (VNI, encap_vrf_id) in a lockfree RCU-protected hash table so that the datapath can resolve incoming tunneled packets to the correct interface without locks. VXLAN interfaces are intended to be attached to a bridge domain. All L2 traffic entering the bridge is forwarded transparently over the VXLAN tunnel. The local VTEP address must already be configured in the encapsulation VRF. Signed-off-by: Robin Jarry --- modules/infra/api/gr_infra.h | 3 + modules/infra/control/ctlplane.c | 1 + modules/infra/control/iface.c | 1 + modules/l2/api/gr_l2.h | 16 ++ modules/l2/cli/meson.build | 1 + modules/l2/cli/vxlan.c | 208 ++++++++++++++++++++++++++ modules/l2/control/bridge.c | 1 + modules/l2/control/gr_l2_control.h | 4 + modules/l2/control/meson.build | 1 + modules/l2/control/vxlan.c | 227 +++++++++++++++++++++++++++++ 10 files changed, 463 insertions(+) create mode 100644 modules/l2/cli/vxlan.c create mode 100644 modules/l2/control/vxlan.c diff --git a/modules/infra/api/gr_infra.h b/modules/infra/api/gr_infra.h index f6f4bed0a..4d7d7f0e2 100644 --- a/modules/infra/api/gr_infra.h +++ b/modules/infra/api/gr_infra.h @@ -22,6 +22,7 @@ typedef enum : uint8_t { GR_IFACE_TYPE_IPIP, GR_IFACE_TYPE_BOND, GR_IFACE_TYPE_BRIDGE, + GR_IFACE_TYPE_VXLAN, GR_IFACE_TYPE_COUNT } gr_iface_type_t; @@ -448,6 +449,8 @@ static inline const char *gr_iface_type_name(gr_iface_type_t type) { return "bond"; case GR_IFACE_TYPE_BRIDGE: return "bridge"; + case GR_IFACE_TYPE_VXLAN: + return "vxlan"; case GR_IFACE_TYPE_UNDEF: case 
GR_IFACE_TYPE_COUNT: break; diff --git a/modules/infra/control/ctlplane.c b/modules/infra/control/ctlplane.c index 2d9bcf41c..8a17e42c5 100644 --- a/modules/infra/control/ctlplane.c +++ b/modules/infra/control/ctlplane.c @@ -398,6 +398,7 @@ static void iface_event(uint32_t event, const void *obj) { case GR_IFACE_TYPE_VLAN: case GR_IFACE_TYPE_BOND: case GR_IFACE_TYPE_BRIDGE: + case GR_IFACE_TYPE_VXLAN: break; default: return; diff --git a/modules/infra/control/iface.c b/modules/infra/control/iface.c index 206745463..d208eb7df 100644 --- a/modules/infra/control/iface.c +++ b/modules/infra/control/iface.c @@ -35,6 +35,7 @@ static bool iface_type_valid(gr_iface_type_t type) { case GR_IFACE_TYPE_IPIP: case GR_IFACE_TYPE_BOND: case GR_IFACE_TYPE_BRIDGE: + case GR_IFACE_TYPE_VXLAN: return true; case GR_IFACE_TYPE_UNDEF: case GR_IFACE_TYPE_COUNT: diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h index 84f0c6dba..29c756ba9 100644 --- a/modules/l2/api/gr_l2.h +++ b/modules/l2/api/gr_l2.h @@ -41,6 +41,22 @@ struct gr_iface_info_bridge { uint16_t members[GR_BRIDGE_MAX_MEMBERS]; // Interface IDs of bridge members. }; +// VXLAN reconfiguration attribute flags. +#define GR_VXLAN_SET_VNI GR_BIT64(32) +#define GR_VXLAN_SET_ENCAP_VRF GR_BIT64(33) +#define GR_VXLAN_SET_DST_PORT GR_BIT64(34) +#define GR_VXLAN_SET_LOCAL GR_BIT64(35) +#define GR_VXLAN_SET_MAC GR_BIT64(37) + +// Info structure for GR_IFACE_TYPE_VXLAN interfaces. +struct gr_iface_info_vxlan { + uint32_t vni; // VXLAN Network Identifier (24-bit). + uint16_t encap_vrf_id; // L3 domain for underlay routing. + uint16_t dst_port; // UDP destination port (default 4789). + ip4_addr_t local; // Local VTEP IP address (must be a configured address in encap_vrf_id). + struct rte_ether_addr mac; // Default to random address. +}; + // FDB (L2 Forwarding Database) management ///////////////////////////////////// // FDB entry flags. 
diff --git a/modules/l2/cli/meson.build b/modules/l2/cli/meson.build index 53b9e5699..ba8c86745 100644 --- a/modules/l2/cli/meson.build +++ b/modules/l2/cli/meson.build @@ -4,4 +4,5 @@ cli_src += files( 'bridge.c', 'fdb.c', + 'vxlan.c', ) diff --git a/modules/l2/cli/vxlan.c b/modules/l2/cli/vxlan.c new file mode 100644 index 000000000..92655fb35 --- /dev/null +++ b/modules/l2/cli/vxlan.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include + +#include + +#include + +static void vxlan_show(struct gr_api_client *c, const struct gr_iface *iface) { + const struct gr_iface_info_vxlan *vxlan = (const struct gr_iface_info_vxlan *)iface->info; + struct gr_iface *vrf = iface_from_id(c, vxlan->encap_vrf_id); + printf("vni: %u\n", vxlan->vni); + printf("local: " IP4_F "\n", &vxlan->local); + printf("encap_vrf: %s\n", vrf ? vrf->name : "[deleted]"); + printf("dst_port: %u\n", vxlan->dst_port); + printf("mac: " ETH_F "\n", &vxlan->mac); + free(vrf); +} + +static void +vxlan_list_info(struct gr_api_client *c, const struct gr_iface *iface, char *buf, size_t len) { + const struct gr_iface_info_vxlan *vxlan = (const struct gr_iface_info_vxlan *)iface->info; + struct gr_iface *vrf = iface_from_id(c, vxlan->encap_vrf_id); + snprintf( + buf, + len, + "vni=%u local=" IP4_F " encap_vrf=%s", + vxlan->vni, + &vxlan->local, + vrf ? 
vrf->name : "[deleted]" + ); + free(vrf); +} + +static struct cli_iface_type vxlan_type = { + .type_id = GR_IFACE_TYPE_VXLAN, + .show = vxlan_show, + .list_info = vxlan_list_info, +}; + +static uint64_t parse_vxlan_args( + struct gr_api_client *c, + const struct ec_pnode *p, + struct gr_iface *iface, + bool update +) { + struct gr_iface_info_vxlan *vxlan; + uint64_t set_attrs; + + set_attrs = parse_iface_args(c, p, iface, sizeof(*vxlan), update); + + vxlan = (struct gr_iface_info_vxlan *)iface->info; + + if (arg_u32(p, "VNI", &vxlan->vni) < 0) { + if (errno != ENOENT) + return 0; + } else { + set_attrs |= GR_VXLAN_SET_VNI; + } + + if (arg_ip4(p, "LOCAL", &vxlan->local) < 0) { + if (errno != ENOENT) + return 0; + } else { + set_attrs |= GR_VXLAN_SET_LOCAL; + } + + if (arg_str(p, "ENCAP_VRF") != NULL) { + if (arg_vrf(c, p, "ENCAP_VRF", &vxlan->encap_vrf_id) < 0) + return 0; + else + set_attrs |= GR_VXLAN_SET_ENCAP_VRF; + } + + if (arg_u16(p, "DST_PORT", &vxlan->dst_port) < 0) { + if (errno != ENOENT) + return 0; + } else { + set_attrs |= GR_VXLAN_SET_DST_PORT; + } + + if (arg_eth_addr(p, "MAC", &vxlan->mac) < 0) { + if (errno != ENOENT) + return 0; + } else { + set_attrs |= GR_VXLAN_SET_MAC; + } + + if (set_attrs == 0) + errno = EINVAL; + return set_attrs; +} + +static cmd_status_t vxlan_add(struct gr_api_client *c, const struct ec_pnode *p) { + const struct gr_infra_iface_add_resp *resp; + struct gr_infra_iface_add_req *req = NULL; + void *resp_ptr = NULL; + size_t len; + + len = sizeof(*req) + sizeof(struct gr_iface_info_vxlan); + if ((req = calloc(1, len)) == NULL) + goto err; + + req->iface.type = GR_IFACE_TYPE_VXLAN; + req->iface.flags = GR_IFACE_F_UP; + + if (parse_vxlan_args(c, p, &req->iface, false) == 0) + goto err; + + if (gr_api_client_send_recv(c, GR_INFRA_IFACE_ADD, len, req, &resp_ptr) < 0) + goto err; + + free(req); + resp = resp_ptr; + printf("Created interface %u\n", resp->iface_id); + free(resp_ptr); + return CMD_SUCCESS; +err: + free(req); + return 
CMD_ERROR; +} + +static cmd_status_t vxlan_set(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_infra_iface_set_req *req = NULL; + cmd_status_t ret = CMD_ERROR; + size_t len; + + len = sizeof(*req) + sizeof(struct gr_iface_info_vxlan); + if ((req = calloc(1, len)) == NULL) + goto out; + + if ((req->set_attrs = parse_vxlan_args(c, p, &req->iface, true)) == 0) + goto out; + + if (gr_api_client_send_recv(c, GR_INFRA_IFACE_SET, len, req, NULL) < 0) + goto out; + + ret = CMD_SUCCESS; +out: + free(req); + return ret; +} + +#define VXLAN_ATTRS_CMD "(encap_vrf ENCAP_VRF),(mac MAC),(dst_port DST_PORT)" + +#define VXLAN_ATTRS_ARGS \ + IFACE_ATTRS_ARGS, \ + with_help( \ + "VXLAN Network Identifier (1-16777215).", \ + ec_node_uint("VNI", 1, 16777215, 10) \ + ), \ + with_help("Local VTEP IP address.", ec_node_re("LOCAL", IPV4_RE)), \ + with_help( \ + "L3 routing domain name for encap addresses.", \ + ec_node_dyn("ENCAP_VRF", complete_vrf_names, NULL) \ + ), \ + with_help("Ethernet address (default random).", ec_node_re("MAC", ETH_ADDR_RE)), \ + with_help( \ + "UDP destination port (default 4789).", \ + ec_node_uint("DST_PORT", 1, 65535, 10) \ + ) + +static int ctx_init(struct ec_node *root) { + int ret; + + ret = CLI_COMMAND( + INTERFACE_ADD_CTX(root), + "vxlan NAME vni VNI local LOCAL [" VXLAN_ATTRS_CMD "," IFACE_ATTRS_CMD "]", + vxlan_add, + "Create a new VXLAN tunnel interface.", + with_help("Interface name.", ec_node("any", "NAME")), + VXLAN_ATTRS_ARGS + ); + if (ret < 0) + return ret; + ret = CLI_COMMAND( + INTERFACE_SET_CTX(root), + "vxlan NAME (name NEW_NAME),(vni VNI),(local LOCAL), " VXLAN_ATTRS_CMD + "," IFACE_ATTRS_CMD, + vxlan_set, + "Modify VXLAN parameters.", + with_help( + "Interface name.", + ec_node_dyn("NAME", complete_iface_names, INT2PTR(GR_IFACE_TYPE_VXLAN)) + ), + with_help("New interface name.", ec_node("any", "NEW_NAME")), + VXLAN_ATTRS_ARGS + ); + if (ret < 0) + return ret; + + return 0; +} + +static struct cli_context ctx = { + .name = 
"vxlan", + .init = ctx_init, +}; + +static void __attribute__((constructor, used)) init(void) { + cli_context_register(&ctx); + register_iface_type(&vxlan_type); +} diff --git a/modules/l2/control/bridge.c b/modules/l2/control/bridge.c index 208f37112..24074d6de 100644 --- a/modules/l2/control/bridge.c +++ b/modules/l2/control/bridge.c @@ -35,6 +35,7 @@ static int bridge_attach_member(struct iface *bridge, struct iface *member) { case GR_IFACE_TYPE_PORT: case GR_IFACE_TYPE_VLAN: case GR_IFACE_TYPE_BOND: + case GR_IFACE_TYPE_VXLAN: break; default: return errno_set(EMEDIUMTYPE); diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h index 89284ee87..242878f29 100644 --- a/modules/l2/control/gr_l2_control.h +++ b/modules/l2/control/gr_l2_control.h @@ -32,3 +32,7 @@ void fdb_purge_iface(uint16_t iface_id); // Delete all FDB entries referencing the provided bridge. void fdb_purge_bridge(uint16_t bridge_id); + +GR_IFACE_INFO(GR_IFACE_TYPE_VXLAN, iface_info_vxlan, { BASE(gr_iface_info_vxlan); }); + +struct iface *vxlan_get_iface(rte_be32_t vni, uint16_t encap_vrf_id); diff --git a/modules/l2/control/meson.build b/modules/l2/control/meson.build index e98d2892b..d3e339e5e 100644 --- a/modules/l2/control/meson.build +++ b/modules/l2/control/meson.build @@ -4,6 +4,7 @@ src += files( 'bridge.c', 'fdb.c', + 'vxlan.c', ) inc += include_directories('.') diff --git a/modules/l2/control/vxlan.c b/modules/l2/control/vxlan.c new file mode 100644 index 000000000..49e516d2a --- /dev/null +++ b/modules/l2/control/vxlan.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +struct vxlan_key { + rte_be32_t vni; + // Use uint32_t to avoid padding issues. See ipip_key in ipip/control.c. 
+ uint32_t vrf_id; +}; + +static struct rte_hash *vxlan_hash; + +struct iface *vxlan_get_iface(rte_be32_t vni, uint16_t encap_vrf_id) { + const struct vxlan_key key = {vni, encap_vrf_id}; + void *data; + + if (rte_hash_lookup_data(vxlan_hash, &key, &data) < 0) + return NULL; + + return data; +} + +static int iface_vxlan_reconfig( + struct iface *iface, + uint64_t set_attrs, + const struct gr_iface *, + const void *api_info +) { + struct iface_info_vxlan *cur = iface_info_vxlan(iface); + const struct vxlan_key cur_key = {rte_cpu_to_be_32(cur->vni), cur->encap_vrf_id}; + const struct gr_iface_info_vxlan *next = api_info; + int ret; + + if (set_attrs & GR_VXLAN_SET_ENCAP_VRF) { + uint16_t vrf = next->encap_vrf_id; + uint16_t old = cur->encap_vrf_id; + + if (vrf == GR_VRF_ID_UNDEF) + vrf = vrf_default_get_or_create(); + + if (vrf != old && vrf_incref(vrf) < 0) + return -errno; + + if (old != GR_VRF_ID_UNDEF) + vrf_decref(old); + + cur->encap_vrf_id = vrf; + } + + if (set_attrs & (GR_VXLAN_SET_VNI | GR_VXLAN_SET_ENCAP_VRF)) { + const struct vxlan_key next_key = {rte_cpu_to_be_32(next->vni), cur->encap_vrf_id}; + + if (rte_hash_lookup(vxlan_hash, &next_key) >= 0) + return errno_set(EADDRINUSE); + + if (next->vni == 0 || next->vni > 0xffffff) + return errno_set(ERANGE); + + rte_hash_del_key(vxlan_hash, &cur_key); + + ret = rte_hash_add_key_data(vxlan_hash, &next_key, iface); + if (ret < 0) + return errno_log(-ret, "rte_hash_add_key_data"); + + cur->vni = next->vni; + } + + if (set_attrs & GR_VXLAN_SET_DST_PORT) { + cur->dst_port = next->dst_port ?: RTE_VXLAN_DEFAULT_PORT; + } + + if (set_attrs & (GR_VXLAN_SET_LOCAL | GR_VXLAN_SET_ENCAP_VRF)) { + ip4_addr_t local = (set_attrs & GR_VXLAN_SET_LOCAL) ? 
next->local : cur->local; + const struct nexthop *nh = rib4_lookup(cur->encap_vrf_id, local); + if (nh == NULL) + return -errno; + if (nh->type != GR_NH_T_L3) + return errno_set(EPROTOTYPE); + + const struct nexthop_info_l3 *l3 = nexthop_info_l3(nh); + if (!(l3->flags & GR_NH_F_LOCAL)) + return errno_set(EPROTOTYPE); + + cur->local = local; + } + + if (set_attrs & GR_VXLAN_SET_MAC) { + if (iface_set_eth_addr(iface, &next->mac) < 0) + return -errno; + } + + return 0; +} + +static int iface_vxlan_fini(struct iface *iface) { + struct iface_info_vxlan *vxlan = iface_info_vxlan(iface); + + if (vxlan->encap_vrf_id != GR_VRF_ID_UNDEF) + vrf_decref(vxlan->encap_vrf_id); + + return 0; +} + +static int iface_vxlan_init(struct iface *iface, const void *api_info) { + struct gr_iface conf; + int ret; + + iface->speed = RTE_ETH_SPEED_NUM_10G; + if (iface->mtu == 0) + iface->mtu = 1450; + + conf.base = iface->base; + + ret = iface_vxlan_reconfig(iface, IFACE_SET_ALL, &conf, api_info); + if (ret < 0) { + iface_vxlan_fini(iface); + errno = -ret; + } + + return ret; +} + +static int iface_vxlan_get_eth_addr(const struct iface *iface, struct rte_ether_addr *mac) { + const struct iface_info_vxlan *vxlan = iface_info_vxlan(iface); + + *mac = vxlan->mac; + + return 0; +} + +static int iface_vxlan_set_eth_addr(struct iface *iface, const struct rte_ether_addr *mac) { + struct iface_info_vxlan *vxlan = iface_info_vxlan(iface); + + if (rte_is_zero_ether_addr(mac)) + rte_eth_random_addr(vxlan->mac.addr_bytes); + else + vxlan->mac = *mac; + + return 0; +} + +static void vxlan_to_api(void *info, const struct iface *iface) { + const struct iface_info_vxlan *vxlan = iface_info_vxlan(iface); + struct gr_iface_info_vxlan *api = info; + *api = vxlan->base; +} + +static const struct iface_type iface_type_vxlan = { + .id = GR_IFACE_TYPE_VXLAN, + .pub_size = sizeof(struct gr_iface_info_vxlan), + .priv_size = sizeof(struct iface_info_vxlan), + .init = iface_vxlan_init, + .reconfig = 
iface_vxlan_reconfig, + .fini = iface_vxlan_fini, + .get_eth_addr = iface_vxlan_get_eth_addr, + .set_eth_addr = iface_vxlan_set_eth_addr, + .to_api = vxlan_to_api, +}; + +static void vxlan_pre_remove_cb(uint32_t /*ev_type*/, const void *obj) { + const struct iface_info_vxlan *vxlan; + const struct iface *iface = obj; + + if (iface->type != GR_IFACE_TYPE_VXLAN) + return; + + vxlan = iface_info_vxlan(iface); + struct vxlan_key key = {rte_cpu_to_be_32(vxlan->vni), vxlan->encap_vrf_id}; + rte_hash_del_key(vxlan_hash, &key); +} + +static struct gr_event_subscription vxlan_subscription = { + .callback = vxlan_pre_remove_cb, + .ev_count = 1, + .ev_types = {GR_EVENT_IFACE_PRE_REMOVE}, +}; + +static void vxlan_init(struct event_base *) { + struct rte_hash_parameters params = { + .name = "vxlan", + .entries = GR_MAX_IFACES, + .key_len = sizeof(struct vxlan_key), + .socket_id = SOCKET_ID_ANY, + .extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF + | RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT, + }; + vxlan_hash = rte_hash_create(&params); + if (vxlan_hash == NULL) + ABORT("rte_hash_create(vxlan)"); + + struct rte_hash_rcu_config rcu_config = { + .v = gr_datapath_rcu(), .mode = RTE_HASH_QSBR_MODE_SYNC + }; + rte_hash_rcu_qsbr_add(vxlan_hash, &rcu_config); +} + +static void vxlan_fini(struct event_base *) { + rte_hash_free(vxlan_hash); + vxlan_hash = NULL; +} + +static struct gr_module vxlan_module = { + .name = "vxlan", + .depends_on = "rcu", + .init = vxlan_init, + .fini = vxlan_fini, +}; + +RTE_INIT(vxlan_constructor) { + gr_register_module(&vxlan_module); + iface_type_register(&iface_type_vxlan); + gr_event_subscribe(&vxlan_subscription); +} From 8385675c6f4c9f53ffd1bf57a15b2e6e31bc8637 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 08:39:41 +0100 Subject: [PATCH 02/13] l4: allow dynamic registration of UDP port aliases VXLAN uses UDP port 4789 by default but allows configuring a custom destination port per interface. 
Allow the control plane to register additional UDP ports at runtime as aliases for an already registered port, reusing the same datapath edge. Use reference counting so that multiple interfaces sharing the same non-default port do not interfere with each other during teardown. Signed-off-by: Robin Jarry --- modules/l4/gr_l4.h | 4 ++++ modules/l4/l4_input_local.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/modules/l4/gr_l4.h b/modules/l4/gr_l4.h index ff875adb9..99254bfaa 100644 --- a/modules/l4/gr_l4.h +++ b/modules/l4/gr_l4.h @@ -8,3 +8,7 @@ #include void l4_input_register_port(uint8_t proto, rte_be16_t port, const char *next_node); + +int l4_input_alias_port(uint8_t proto, rte_be16_t port, rte_be16_t alias); + +int l4_input_unalias_port(uint8_t proto, rte_be16_t alias); diff --git a/modules/l4/l4_input_local.c b/modules/l4/l4_input_local.c index e8fe50a3d..777e862e2 100644 --- a/modules/l4/l4_input_local.c +++ b/modules/l4/l4_input_local.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: BSD-3-Clause // Copyright (c) 2024 Christophe Fontaine -#include "gr_l4.h" - #include #include #include #include +#include #include #include #include @@ -20,6 +19,7 @@ enum edges { EDGE_COUNT, }; +static unsigned udp_refcounts[UINT_NUM_VALUES(rte_be16_t)] = {0}; static rte_edge_t udp_edges[UINT_NUM_VALUES(rte_be16_t)] = {MANAGEMENT}; void l4_input_register_port(uint8_t proto, rte_be16_t port, const char *next_node) { @@ -30,12 +30,40 @@ void l4_input_register_port(uint8_t proto, rte_be16_t port, const char *next_nod if (udp_edges[port] != MANAGEMENT) ABORT("next node already registered for udp port=%hu", p); udp_edges[port] = gr_node_attach_parent("l4_input_local", next_node); + udp_refcounts[port]++; break; default: ABORT("proto not supported %hhu", proto); } } +int l4_input_alias_port(uint8_t proto, rte_be16_t port, rte_be16_t alias) { + assert(proto == IPPROTO_UDP); + + if (udp_edges[port] == MANAGEMENT) + return 
errno_set(EADDRNOTAVAIL); + if (udp_edges[alias] != MANAGEMENT && udp_edges[alias] != udp_edges[port]) + return errno_set(EADDRINUSE); + + udp_edges[alias] = udp_edges[port]; + udp_refcounts[alias]++; + + return 0; +} + +int l4_input_unalias_port(uint8_t proto, rte_be16_t alias) { + assert(proto == IPPROTO_UDP); + + if (udp_edges[alias] == MANAGEMENT || udp_refcounts[alias] == 0) + return errno_set(EADDRNOTAVAIL); + + udp_refcounts[alias]--; + if (udp_refcounts[alias] == 0) + udp_edges[alias] = MANAGEMENT; + + return 0; +} + static uint16_t l4_input_local_process( struct rte_graph *graph, struct rte_node *node, From 636012818dd65ce6c4480761f494a0dcbbee2f18 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Sat, 14 Feb 2026 01:09:26 +0100 Subject: [PATCH 03/13] l2: register VXLAN UDP port in l4_input_local Wire up the VXLAN interface's configurable destination port to the L4 input node. When a non-default port is configured, register it as an alias for the standard VXLAN port (4789) so that the datapath delivers matching UDP packets to the vxlan_input node. Unregister the alias when the port changes or the interface is destroyed. 
Signed-off-by: Robin Jarry --- modules/l2/control/vxlan.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/modules/l2/control/vxlan.c b/modules/l2/control/vxlan.c index 49e516d2a..fde2876d6 100644 --- a/modules/l2/control/vxlan.c +++ b/modules/l2/control/vxlan.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,19 @@ static int iface_vxlan_reconfig( } if (set_attrs & GR_VXLAN_SET_DST_PORT) { - cur->dst_port = next->dst_port ?: RTE_VXLAN_DEFAULT_PORT; + uint16_t port = next->dst_port ?: RTE_VXLAN_DEFAULT_PORT; + if (cur->dst_port != 0 && cur->dst_port != RTE_VXLAN_DEFAULT_PORT + && port != cur->dst_port) { + l4_input_unalias_port(IPPROTO_UDP, rte_cpu_to_be_16(cur->dst_port)); + } + if (port != RTE_VXLAN_DEFAULT_PORT && port != cur->dst_port) { + l4_input_alias_port( + IPPROTO_UDP, + RTE_BE16(RTE_VXLAN_DEFAULT_PORT), + rte_cpu_to_be_16(port) + ); + } + cur->dst_port = port; } if (set_attrs & (GR_VXLAN_SET_LOCAL | GR_VXLAN_SET_ENCAP_VRF)) { @@ -112,6 +125,9 @@ static int iface_vxlan_fini(struct iface *iface) { if (vxlan->encap_vrf_id != GR_VRF_ID_UNDEF) vrf_decref(vxlan->encap_vrf_id); + if (vxlan->dst_port != RTE_VXLAN_DEFAULT_PORT) + l4_input_unalias_port(IPPROTO_UDP, rte_cpu_to_be_16(vxlan->dst_port)); + return 0; } From d1cd36b0e9b0a5c20899bd2e3ff02f0076632630 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Sat, 14 Feb 2026 01:12:27 +0100 Subject: [PATCH 04/13] l2: add flood list management Introduce a transport-agnostic flood list framework for BUM traffic (Broadcast, Unknown unicast, Multicast). In EVPN, each PE maintains a flooding list built from IMET routes (RFC 8365, RFC 9572). The entries in this list differ depending on the overlay encapsulation: VXLAN uses a remote VTEP IPv4 address and a VNI, while SRv6 would use a 128-bit SID. The API defines a gr_flood_entry structure with a type discriminant and a union, allowing future encapsulation types (e.g. 
SRv6 SIDs) to be added without changing the API request types. A dispatch layer in control/flood.c routes add/del/list operations to type-specific callbacks registered at init time. Implement the VXLAN VTEP flood type (GR_FLOOD_T_VTEP). Each VXLAN interface maintains a per-VNI array of remote VTEP addresses used by the vxlan_flood datapath node for ingress replication. The array is replaced atomically with an RCU synchronization barrier so that the datapath never sees a partially updated list. CLI commands are exposed under "flood vtep add/del/show". Add new generated grcli-flood(1) man page. Signed-off-by: Robin Jarry --- docs/meson.build | 4 +- modules/l2/api/gr_l2.h | 59 ++++++++++ modules/l2/cli/flood.c | 178 +++++++++++++++++++++++++++++ modules/l2/cli/meson.build | 1 + modules/l2/control/flood.c | 107 +++++++++++++++++ modules/l2/control/gr_l2_control.h | 18 ++- modules/l2/control/meson.build | 1 + modules/l2/control/vxlan.c | 114 ++++++++++++++++++ 8 files changed, 479 insertions(+), 3 deletions(-) create mode 100644 modules/l2/cli/flood.c create mode 100644 modules/l2/control/flood.c diff --git a/docs/meson.build b/docs/meson.build index 478f5ef41..1ec5975ed 100644 --- a/docs/meson.build +++ b/docs/meson.build @@ -74,8 +74,8 @@ custom_target( # Individual command man pages # The list is hardcoded since we can't run grcli during meson configuration. 
grcli_commands = [ - 'address', 'affinity', 'conntrack', 'dnat44', 'events', 'fdb', 'graph', - 'interface', 'logging', 'nexthop', 'ping', 'ping6', 'route', + 'address', 'affinity', 'conntrack', 'dnat44', 'events', 'fdb', 'flood', + 'graph', 'interface', 'logging', 'nexthop', 'ping', 'ping6', 'route', 'router-advert', 'snat44', 'stats', 'trace', 'traceroute', 'traceroute6', 'tunsrc', ] diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h index 29c756ba9..d23a64f5d 100644 --- a/modules/l2/api/gr_l2.h +++ b/modules/l2/api/gr_l2.h @@ -144,3 +144,62 @@ struct gr_fdb_config_set_req { }; // struct gr_fdb_config_set_resp { }; + +// Flood list management for BUM (Broadcast, Unknown unicast, Multicast) ////// + +typedef enum : uint8_t { + GR_FLOOD_T_VTEP = 1, // VXLAN remote VTEP +} gr_flood_type_t; + +static inline const char *gr_flood_type_name(gr_flood_type_t type) { + switch (type) { + case GR_FLOOD_T_VTEP: + return "vtep"; + } + return "?"; +} + +struct gr_flood_vtep { + uint32_t vni; + ip4_addr_t addr; +}; + +struct gr_flood_entry { + gr_flood_type_t type; + uint16_t vrf_id; + union { + struct gr_flood_vtep vtep; + }; +}; + +enum { + GR_EVENT_FLOOD_ADD = EVENT_TYPE(GR_L2_MODULE, 0x0011), + GR_EVENT_FLOOD_DEL = EVENT_TYPE(GR_L2_MODULE, 0x0012), +}; + +#define GR_FLOOD_ADD REQUEST_TYPE(GR_L2_MODULE, 0x0011) + +struct gr_flood_add_req { + struct gr_flood_entry entry; + bool exist_ok; +}; + +// struct gr_flood_add_resp { }; + +#define GR_FLOOD_DEL REQUEST_TYPE(GR_L2_MODULE, 0x0012) + +struct gr_flood_del_req { + struct gr_flood_entry entry; + bool missing_ok; +}; + +// struct gr_flood_del_resp { }; + +#define GR_FLOOD_LIST REQUEST_TYPE(GR_L2_MODULE, 0x0013) + +struct gr_flood_list_req { + gr_flood_type_t type; // 0 for all types + uint16_t vrf_id; // GR_VRF_ID_UNDEF for all +}; + +STREAM_RESP(struct gr_flood_entry); diff --git a/modules/l2/cli/flood.c b/modules/l2/cli/flood.c new file mode 100644 index 000000000..760665ac9 --- /dev/null +++ b/modules/l2/cli/flood.c 
@@ -0,0 +1,178 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static cmd_status_t vtep_add(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_flood_add_req req = { + .entry.type = GR_FLOOD_T_VTEP, + .exist_ok = true, + }; + + if (arg_ip4(p, "ADDR", &req.entry.vtep.addr) < 0) + return CMD_ERROR; + if (arg_u32(p, "VNI", &req.entry.vtep.vni) < 0) + return CMD_ERROR; + if (arg_vrf(c, p, "VRF", &req.entry.vrf_id) < 0) + return CMD_ERROR; + + if (gr_api_client_send_recv(c, GR_FLOOD_ADD, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static cmd_status_t vtep_del(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_flood_del_req req = { + .entry.type = GR_FLOOD_T_VTEP, + .missing_ok = true, + }; + + if (arg_ip4(p, "ADDR", &req.entry.vtep.addr) < 0) + return CMD_ERROR; + if (arg_u32(p, "VNI", &req.entry.vtep.vni) < 0) + return CMD_ERROR; + if (arg_vrf(c, p, "VRF", &req.entry.vrf_id) < 0) + return CMD_ERROR; + + if (gr_api_client_send_recv(c, GR_FLOOD_DEL, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +static cmd_status_t vtep_show(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_flood_list_req req = { + .type = GR_FLOOD_T_VTEP, + .vrf_id = GR_VRF_ID_UNDEF, + }; + const struct gr_flood_entry *entry; + int ret; + + if (arg_str(p, "VRF") != NULL && arg_vrf(c, p, "VRF", &req.vrf_id) < 0) + return CMD_ERROR; + + struct libscols_table *table = scols_new_table(); + scols_table_new_column(table, "VNI", 0, SCOLS_FL_RIGHT); + scols_table_new_column(table, "VRF", 0, 0); + scols_table_new_column(table, "ADDR", 0, 0); + scols_table_set_column_separator(table, " "); + + gr_api_client_stream_foreach (entry, ret, c, GR_FLOOD_LIST, sizeof(req), &req) { + struct libscols_line *line = scols_table_new_line(table, NULL); + + scols_line_sprintf(line, 0, "%u", 
entry->vtep.vni); + + struct gr_iface *vrf = iface_from_id(c, entry->vrf_id); + scols_line_sprintf(line, 1, "%s", vrf ? vrf->name : "[deleted]"); + free(vrf); + + scols_line_sprintf(line, 2, IP4_F, &entry->vtep.addr); + } + + scols_print_table(table); + scols_unref_table(table); + + return ret < 0 ? CMD_ERROR : CMD_SUCCESS; +} + +#define FLOOD_CTX(root) CLI_CONTEXT(root, CTX_ARG("flood", "Flood list management.")) +#define VTEP_CTX(root) CLI_CONTEXT(FLOOD_CTX(root), CTX_ARG("vtep", "VXLAN Tunnel End-Points.")) + +static int ctx_init(struct ec_node *root) { + int ret; + + ret = CLI_COMMAND( + VTEP_CTX(root), + "add ADDR vni VNI [vrf VRF]", + vtep_add, + "Add a VXLAN flood VTEP.", + with_help("Remote VTEP IP address.", ec_node_re("ADDR", IPV4_RE)), + with_help( + "VXLAN Network Identifier (1-16777215).", + ec_node_uint("VNI", 1, 16777215, 10) + ), + with_help("L3 routing domain name.", ec_node_dyn("VRF", complete_vrf_names, NULL)) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + VTEP_CTX(root), + "del ADDR vni VNI [vrf VRF]", + vtep_del, + "Delete a VXLAN flood VTEP.", + with_help("Remote VTEP IP address.", ec_node_re("ADDR", IPV4_RE)), + with_help( + "VXLAN Network Identifier (1-16777215).", + ec_node_uint("VNI", 1, 16777215, 10) + ), + with_help("L3 routing domain name.", ec_node_dyn("VRF", complete_vrf_names, NULL)) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + VTEP_CTX(root), + "[show] [vrf VRF]", + vtep_show, + "List VXLAN flood VTEPs.", + with_help("L3 routing domain name.", ec_node_dyn("VRF", complete_vrf_names, NULL)) + ); + if (ret < 0) + return ret; + + return 0; +} + +static struct cli_context ctx = { + .name = "flood", + .init = ctx_init, +}; + +static void flood_event_print(uint32_t event, const void *obj) { + const struct gr_flood_entry *entry = obj; + const char *action; + + switch (event) { + case GR_EVENT_FLOOD_ADD: + action = "add"; + break; + case GR_EVENT_FLOOD_DEL: + action = "del"; + break; + default: + action = "?"; + 
break; + } + + printf("flood %s: %s vrf=%u", action, gr_flood_type_name(entry->type), entry->vrf_id); + switch (entry->type) { + case GR_FLOOD_T_VTEP: + printf(" " IP4_F " vni=%u", &entry->vtep.addr, entry->vtep.vni); + } + printf("\n"); +} + +static struct cli_event_printer printer = { + .print = flood_event_print, + .ev_count = 2, + .ev_types = { + GR_EVENT_FLOOD_ADD, + GR_EVENT_FLOOD_DEL, + }, +}; + +static void __attribute__((constructor, used)) init(void) { + cli_context_register(&ctx); + cli_event_printer_register(&printer); +} diff --git a/modules/l2/cli/meson.build b/modules/l2/cli/meson.build index ba8c86745..01e82a5ce 100644 --- a/modules/l2/cli/meson.build +++ b/modules/l2/cli/meson.build @@ -3,6 +3,7 @@ cli_src += files( 'bridge.c', + 'flood.c', 'fdb.c', 'vxlan.c', ) diff --git a/modules/l2/control/flood.c b/modules/l2/control/flood.c new file mode 100644 index 000000000..9d04ac7bb --- /dev/null +++ b/modules/l2/control/flood.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include + +#include + +static const struct flood_type_ops *flood_types[UINT_NUM_VALUES(gr_flood_type_t)]; + +static bool flood_type_valid(gr_flood_type_t type) { + switch (type) { + case GR_FLOOD_T_VTEP: + return true; + } + return false; +} + +void flood_type_register(const struct flood_type_ops *ops) { + if (!flood_type_valid(ops->type)) + ABORT("invalid flood type %u", ops->type); + if (flood_types[ops->type] != NULL) + ABORT("flood type %u already registered", ops->type); + flood_types[ops->type] = ops; +} + +static struct api_out flood_add(const void *request, struct api_ctx *) { + const struct gr_flood_add_req *req = request; + const struct flood_type_ops *ops; + int ret; + + ops = flood_types[req->entry.type]; + if (ops == NULL || ops->add == NULL) + return api_out(EAFNOSUPPORT, 0, NULL); + + ret = ops->add(&req->entry, req->exist_ok); + + return api_out(-ret, 0, NULL); +} + +static 
struct gr_api_handler flood_add_handler = { + .name = "flood add", + .request_type = GR_FLOOD_ADD, + .callback = flood_add, +}; + +static struct api_out flood_del(const void *request, struct api_ctx *) { + const struct gr_flood_del_req *req = request; + const struct flood_type_ops *ops; + int ret; + + ops = flood_types[req->entry.type]; + if (ops == NULL || ops->del == NULL) + return api_out(EAFNOSUPPORT, 0, NULL); + + ret = ops->del(&req->entry, req->missing_ok); + + return api_out(-ret, 0, NULL); +} + +static struct gr_api_handler flood_del_handler = { + .name = "flood del", + .request_type = GR_FLOOD_DEL, + .callback = flood_del, +}; + +static struct api_out flood_list(const void *request, struct api_ctx *ctx) { + const struct gr_flood_list_req *req = request; + const struct flood_type_ops *ops; + + for (unsigned t = 0; t < ARRAY_DIM(flood_types); t++) { + if (req->type != 0 && req->type != t) + continue; + ops = flood_types[t]; + if (ops == NULL || ops->list == NULL) + continue; + if (ops->list(req->vrf_id, ctx) < 0) + return api_out(errno, 0, NULL); + } + + return api_out(0, 0, NULL); +} + +static struct gr_api_handler flood_list_handler = { + .name = "flood list", + .request_type = GR_FLOOD_LIST, + .callback = flood_list, +}; + +static struct gr_event_serializer serializer = { + .size = sizeof(struct gr_flood_entry), + .ev_count = 2, + .ev_types = { + GR_EVENT_FLOOD_ADD, + GR_EVENT_FLOOD_DEL, + }, +}; + +RTE_INIT(flood_init) { + gr_register_api_handler(&flood_add_handler); + gr_register_api_handler(&flood_del_handler); + gr_register_api_handler(&flood_list_handler); + gr_event_register_serializer(&serializer); +} diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h index 242878f29..2652314c6 100644 --- a/modules/l2/control/gr_l2_control.h +++ b/modules/l2/control/gr_l2_control.h @@ -5,6 +5,7 @@ #include #include +#include #include @@ -33,6 +34,21 @@ void fdb_purge_iface(uint16_t iface_id); // Delete all FDB entries referencing 
the provided bridge. void fdb_purge_bridge(uint16_t bridge_id); -GR_IFACE_INFO(GR_IFACE_TYPE_VXLAN, iface_info_vxlan, { BASE(gr_iface_info_vxlan); }); +GR_IFACE_INFO(GR_IFACE_TYPE_VXLAN, iface_info_vxlan, { + BASE(gr_iface_info_vxlan); + + uint16_t n_flood_vteps; + ip4_addr_t *flood_vteps; +}); struct iface *vxlan_get_iface(rte_be32_t vni, uint16_t encap_vrf_id); + +// Flood list type callbacks, registered per gr_flood_t. +struct flood_type_ops { + gr_flood_type_t type; + int (*add)(const struct gr_flood_entry *, bool exist_ok); + int (*del)(const struct gr_flood_entry *, bool missing_ok); + int (*list)(uint16_t vrf_id, struct api_ctx *); +}; + +void flood_type_register(const struct flood_type_ops *); diff --git a/modules/l2/control/meson.build b/modules/l2/control/meson.build index d3e339e5e..459546a08 100644 --- a/modules/l2/control/meson.build +++ b/modules/l2/control/meson.build @@ -4,6 +4,7 @@ src += files( 'bridge.c', 'fdb.c', + 'flood.c', 'vxlan.c', ) diff --git a/modules/l2/control/vxlan.c b/modules/l2/control/vxlan.c index fde2876d6..3098bc712 100644 --- a/modules/l2/control/vxlan.c +++ b/modules/l2/control/vxlan.c @@ -14,7 +14,9 @@ #include #include +#include +#include #include struct vxlan_key { @@ -121,6 +123,16 @@ static int iface_vxlan_reconfig( static int iface_vxlan_fini(struct iface *iface) { struct iface_info_vxlan *vxlan = iface_info_vxlan(iface); + struct gr_flood_entry entry = { + .type = GR_FLOOD_T_VTEP, + .vrf_id = vxlan->encap_vrf_id, + .vtep.vni = vxlan->vni, + }; + + for (uint16_t i = 0; i < vxlan->n_flood_vteps; i++) { + entry.vtep.addr = vxlan->flood_vteps[i]; + gr_event_push(GR_EVENT_FLOOD_DEL, &entry); + } if (vxlan->encap_vrf_id != GR_VRF_ID_UNDEF) vrf_decref(vxlan->encap_vrf_id); @@ -128,6 +140,8 @@ static int iface_vxlan_fini(struct iface *iface) { if (vxlan->dst_port != RTE_VXLAN_DEFAULT_PORT) l4_input_unalias_port(IPPROTO_UDP, rte_cpu_to_be_16(vxlan->dst_port)); + rte_free(vxlan->flood_vteps); + return 0; } @@ -205,6 +219,105 @@ 
static struct gr_event_subscription vxlan_subscription = { .ev_types = {GR_EVENT_IFACE_PRE_REMOVE}, }; +static int vtep_flood_add(const struct gr_flood_entry *entry, bool exist_ok) { + struct iface_info_vxlan *vxlan; + ip4_addr_t *vteps, *old_vteps; + struct iface *iface; + + iface = vxlan_get_iface(rte_cpu_to_be_32(entry->vtep.vni), entry->vrf_id); + if (iface == NULL) + return errno_set(ENODEV); + + vxlan = iface_info_vxlan(iface); + + for (uint16_t i = 0; i < vxlan->n_flood_vteps; i++) { + if (vxlan->flood_vteps[i] == entry->vtep.addr) { + if (exist_ok) + return 0; + return errno_set(EEXIST); + } + } + + vteps = rte_calloc(__func__, vxlan->n_flood_vteps + 1, sizeof(*vteps), 0); + if (vteps == NULL) + return errno_set(ENOMEM); + + memcpy(vteps, vxlan->flood_vteps, vxlan->n_flood_vteps * sizeof(*vteps)); + vteps[vxlan->n_flood_vteps] = entry->vtep.addr; + old_vteps = vxlan->flood_vteps; + vxlan->flood_vteps = vteps; + // ensure n_flood_vteps is incremented *after* flood_vteps is updated + atomic_thread_fence(memory_order_release); + vxlan->n_flood_vteps++; + + rte_rcu_qsbr_synchronize(gr_datapath_rcu(), rte_lcore_id()); + rte_free(old_vteps); + + gr_event_push(GR_EVENT_FLOOD_ADD, entry); + + return 0; +} + +static int vtep_flood_del(const struct gr_flood_entry *entry, bool missing_ok) { + struct iface_info_vxlan *vxlan; + struct iface *iface; + + iface = vxlan_get_iface(rte_cpu_to_be_32(entry->vtep.vni), entry->vrf_id); + if (iface == NULL) { + if (missing_ok) + return 0; + return errno_set(ENOENT); + } + + vxlan = iface_info_vxlan(iface); + + for (uint16_t i = 0; i < vxlan->n_flood_vteps; i++) { + if (vxlan->flood_vteps[i] == entry->vtep.addr) { + vxlan->flood_vteps[i] = vxlan->flood_vteps[vxlan->n_flood_vteps - 1]; + vxlan->n_flood_vteps--; + gr_event_push(GR_EVENT_FLOOD_DEL, entry); + return 0; + } + } + + if (missing_ok) + return 0; + + return errno_set(ENOENT); +} + +static int vtep_flood_list(uint16_t vrf_id, struct api_ctx *ctx) { + struct gr_flood_entry 
entry = {.type = GR_FLOOD_T_VTEP}; + const struct iface_info_vxlan *vxlan; + uint32_t next = 0; + const void *key; + void *data; + + while (rte_hash_iterate(vxlan_hash, &key, &data, &next) >= 0) { + struct iface *iface = data; + vxlan = iface_info_vxlan(iface); + + if (vrf_id != GR_VRF_ID_UNDEF && vxlan->encap_vrf_id != vrf_id) + continue; + + for (uint16_t i = 0; i < vxlan->n_flood_vteps; i++) { + entry.vrf_id = vxlan->encap_vrf_id; + entry.vtep.vni = vxlan->vni; + entry.vtep.addr = vxlan->flood_vteps[i]; + api_send(ctx, sizeof(entry), &entry); + } + } + + return 0; +} + +static const struct flood_type_ops vtep_flood_ops = { + .type = GR_FLOOD_T_VTEP, + .add = vtep_flood_add, + .del = vtep_flood_del, + .list = vtep_flood_list, +}; + static void vxlan_init(struct event_base *) { struct rte_hash_parameters params = { .name = "vxlan", @@ -240,4 +353,5 @@ RTE_INIT(vxlan_constructor) { gr_register_module(&vxlan_module); iface_type_register(&iface_type_vxlan); gr_event_subscribe(&vxlan_subscription); + flood_type_register(&vtep_flood_ops); } From 19b6eccb88932bc89cd736e27b07a6a48a2aff40 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 23:50:19 +0100 Subject: [PATCH 05/13] l2: associate remote VTEP addresses with FDB entries In a VXLAN overlay, the bridge needs to know which remote VTEP to use when sending unicast frames to a learned MAC address. Add a VTEP IPv4 address field to FDB entries so that known unicast traffic can be sent directly to the correct tunnel endpoint instead of being flooded to all VTEPs. When bridge_input learns a MAC address from a VXLAN member interface, it records the source VTEP from the decapsulated packet's outer IP header. When forwarding to a known destination, the stored VTEP address is passed to the output path via the mbuf private data so that vxlan_output can build the correct outer header. 
Only set the VTEP field when the source interface is actually a VXLAN type to avoid storing uninitialized data from other packet paths (control plane, local bridge traffic). Signed-off-by: Robin Jarry --- modules/infra/datapath/gr_rxtx.h | 6 +++++- modules/l2/api/gr_l2.h | 2 ++ modules/l2/cli/fdb.c | 25 +++++++++++++++++++------ modules/l2/control/fdb.c | 11 ++++++++--- modules/l2/control/gr_l2_control.h | 4 +++- modules/l2/datapath/bridge_input.c | 9 +++++++-- 6 files changed, 44 insertions(+), 13 deletions(-) diff --git a/modules/infra/datapath/gr_rxtx.h b/modules/infra/datapath/gr_rxtx.h index add6adf0e..3828c633b 100644 --- a/modules/infra/datapath/gr_rxtx.h +++ b/modules/infra/datapath/gr_rxtx.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,10 @@ struct port_output_edges { rte_edge_t edges[RTE_MAX_ETHPORTS]; }; -GR_MBUF_PRIV_DATA_TYPE(iface_mbuf_data, { uint16_t vlan_id; }); +GR_MBUF_PRIV_DATA_TYPE(iface_mbuf_data, { + uint16_t vlan_id; + ip4_addr_t vtep; +}); int rxtx_trace_format(char *buf, size_t len, const void *data, size_t /*data_len*/); diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h index d23a64f5d..54a9ef352 100644 --- a/modules/l2/api/gr_l2.h +++ b/modules/l2/api/gr_l2.h @@ -63,6 +63,7 @@ struct gr_iface_info_vxlan { typedef enum : uint8_t { GR_FDB_F_STATIC = GR_BIT8(0), // User-configured, never aged out. GR_FDB_F_LEARN = GR_BIT8(1), // Learned via local bridge. + GR_FDB_F_EXTERN = GR_BIT8(2), // Programmed by external control plane. } gr_fdb_flags_t; // Forwarding database entry associating a MAC+VLAN to a bridge member interface. @@ -71,6 +72,7 @@ struct gr_fdb_entry { struct rte_ether_addr mac; uint16_t vlan_id; uint16_t iface_id; // Updated automatically when a MAC moves between members. + ip4_addr_t vtep; // Remote VTEP for VXLAN-learned entries, 0 for local. gr_fdb_flags_t flags; clock_t last_seen; // Refreshed on each datapath hit for learned entries. 
}; diff --git a/modules/l2/cli/fdb.c b/modules/l2/cli/fdb.c index 4ab652086..2677a3741 100644 --- a/modules/l2/cli/fdb.c +++ b/modules/l2/cli/fdb.c @@ -93,7 +93,7 @@ static cmd_status_t fdb_flush(struct gr_api_client *c, const struct ec_pnode *p) return CMD_ERROR; if (arg_str(p, "all") != NULL) - req.flags |= GR_FDB_F_STATIC; + req.flags |= (GR_FDB_F_STATIC | GR_FDB_F_EXTERN); if (gr_api_client_send_recv(c, GR_FDB_FLUSH, sizeof(req), &req, NULL) < 0) return CMD_ERROR; @@ -108,6 +108,8 @@ static size_t fdb_format_flags(char *buf, size_t len, gr_fdb_flags_t flags) { SAFE_BUF(snprintf, len, "%slearn", n ? " " : ""); if (flags & GR_FDB_F_STATIC) SAFE_BUF(snprintf, len, "%sstatic", n ? " " : ""); + if (flags & GR_FDB_F_EXTERN) + SAFE_BUF(snprintf, len, "%sextern", n ? " " : ""); err: return n; } @@ -134,12 +136,15 @@ static cmd_status_t fdb_show(struct gr_api_client *c, const struct ec_pnode *p) req.flags |= GR_FDB_F_STATIC; if (arg_str(p, "learn") != NULL) req.flags |= GR_FDB_F_LEARN; + if (arg_str(p, "extern") != NULL) + req.flags |= GR_FDB_F_EXTERN; struct libscols_table *table = scols_new_table(); scols_table_new_column(table, "BRIDGE", 0, 0); scols_table_new_column(table, "MAC", 0, 0); scols_table_new_column(table, "VLAN", 0, 0); scols_table_new_column(table, "IFACE", 0, 0); + scols_table_new_column(table, "VTEP", 0, 0); scols_table_new_column(table, "FLAGS", 0, 0); scols_table_new_column(table, "AGE", 0, SCOLS_FL_RIGHT); scols_table_set_column_separator(table, " "); @@ -160,11 +165,14 @@ static cmd_status_t fdb_show(struct gr_api_client *c, const struct ec_pnode *p) scols_line_sprintf(line, 3, "%s", iface ? 
iface->name : "[deleted]"); free(iface); + if (fdb->vtep != 0) + scols_line_sprintf(line, 4, IP4_F, &fdb->vtep); + if (fdb_format_flags(flags, sizeof(flags), fdb->flags)) - scols_line_set_data(line, 4, flags); + scols_line_set_data(line, 5, flags); scols_line_sprintf( - line, 5, "%lds", (gr_clock_us() - fdb->last_seen) / CLOCKS_PER_SEC + line, 6, "%lds", (gr_clock_us() - fdb->last_seen) / CLOCKS_PER_SEC ); } @@ -256,7 +264,9 @@ static int ctx_init(struct ec_node *root) { "Flush only entries matching this MAC address.", ec_node_re("MAC", ETH_ADDR_RE) ), - with_help("Flush all entries including static.", ec_node_str("all", "all")) + with_help( + "Flush all entries including static and extern.", ec_node_str("all", "all") + ) ); if (ret < 0) return ret; @@ -282,7 +292,7 @@ static int ctx_init(struct ec_node *root) { ret = CLI_COMMAND( FDB_CTX(root), - "[show] [(bridge BRIDGE),(iface IFACE),(static|learn)]", + "[show] [(bridge BRIDGE),(iface IFACE),(static|learn|extern)]", fdb_show, "Show FDB entries.", with_help( @@ -294,7 +304,8 @@ static int ctx_init(struct ec_node *root) { ec_node_dyn("IFACE", complete_iface_names, INT2PTR(GR_IFACE_TYPE_UNDEF)) ), with_help("Show only static entries.", ec_node_str("static", "static")), - with_help("Show only learned entries.", ec_node_str("learn", "learn")) + with_help("Show only learned entries.", ec_node_str("learn", "learn")), + with_help("Show only extern entries.", ec_node_str("extern", "extern")) ); if (ret < 0) return ret; @@ -331,6 +342,8 @@ static void fdb_event_print(uint32_t event, const void *obj) { if (fdb->vlan_id != 0) printf(" vlan=%u", fdb->vlan_id); printf(" iface=%u", fdb->iface_id); + if (fdb->vtep != 0) + printf(" vtep=" IP4_F, &fdb->vtep); if (fdb_format_flags(flags, sizeof(flags), fdb->flags)) printf(" %s", flags); printf("\n"); diff --git a/modules/l2/control/fdb.c b/modules/l2/control/fdb.c index dfea4339c..0b10b0a77 100644 --- a/modules/l2/control/fdb.c +++ b/modules/l2/control/fdb.c @@ -105,7 +105,8 @@ 
void fdb_learn( uint16_t bridge_id, uint16_t iface_id, const struct rte_ether_addr *mac, - uint16_t vlan_id + uint16_t vlan_id, + ip4_addr_t vtep ) { const struct fdb_key key = {bridge_id, vlan_id, *mac}; struct gr_fdb_entry *fdb; @@ -121,6 +122,7 @@ void fdb_learn( fdb->mac = *mac; fdb->flags = GR_FDB_F_LEARN; fdb->iface_id = iface_id; + fdb->vtep = vtep; if (rte_hash_add_key_data(fdb_hash, &key, fdb) < 0) { // no space left in hash @@ -135,9 +137,10 @@ void fdb_learn( fdb->last_seen = gr_clock_us(); - if ((fdb->flags & GR_FDB_F_LEARN) && fdb->iface_id != iface_id) { + if ((fdb->flags & GR_FDB_F_LEARN) && (fdb->iface_id != iface_id || fdb->vtep != vtep)) { // update in case the mac address has moved fdb->iface_id = iface_id; + fdb->vtep = vtep; gr_event_push(GR_EVENT_FDB_UPDATE, fdb); } } @@ -177,7 +180,7 @@ static struct api_out fdb_add(const void *request, struct api_ctx *) { void *data; int ret; - if (req->fdb.flags & ~GR_FDB_F_STATIC) + if (req->fdb.flags & ~(GR_FDB_F_STATIC | GR_FDB_F_EXTERN)) return api_out(EINVAL, 0, NULL); iface = iface_from_id(req->fdb.iface_id); @@ -259,6 +262,8 @@ static inline bool fdb_match( return false; if ((flags & GR_FDB_F_LEARN) && !(e->flags & GR_FDB_F_LEARN)) return false; + if ((flags & GR_FDB_F_EXTERN) && !(e->flags & GR_FDB_F_EXTERN)) + return false; if (bridge_id != GR_IFACE_ID_UNDEF && e->bridge_id != bridge_id) return false; if (iface_id != GR_IFACE_ID_UNDEF && e->iface_id != iface_id) diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h index 2652314c6..8dc2308d4 100644 --- a/modules/l2/control/gr_l2_control.h +++ b/modules/l2/control/gr_l2_control.h @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -25,7 +26,8 @@ void fdb_learn( uint16_t bridge_id, uint16_t iface_id, const struct rte_ether_addr *, - uint16_t vlan_id + uint16_t vlan_id, + ip4_addr_t vtep ); // Delete all FDB entries referencing the provided interface. 
diff --git a/modules/l2/datapath/bridge_input.c b/modules/l2/datapath/bridge_input.c index a54f7116c..ccb9a1f28 100644 --- a/modules/l2/datapath/bridge_input.c +++ b/modules/l2/datapath/bridge_input.c @@ -39,6 +39,7 @@ static uint16_t bridge_input_process( struct iface_mbuf_data *d; struct rte_ether_hdr *eth; struct rte_mbuf *m; + ip4_addr_t vtep; rte_edge_t edge; for (uint16_t i = 0; i < nb_objs; i++) { @@ -61,8 +62,10 @@ static uint16_t bridge_input_process( br = iface_info_bridge(bridge); if (rte_is_unicast_ether_addr(&eth->src_addr) - && !(br->flags & GR_BRIDGE_F_NO_LEARN)) - fdb_learn(bridge->id, d->iface->id, &eth->src_addr, d->vlan_id); + && !(br->flags & GR_BRIDGE_F_NO_LEARN)) { + vtep = (d->iface->type == GR_IFACE_TYPE_VXLAN) ? d->vtep : 0; + fdb_learn(bridge->id, d->iface->id, &eth->src_addr, d->vlan_id, vtep); + } if (rte_is_unicast_ether_addr(&eth->dst_addr)) { fdb = fdb_lookup(bridge->id, &eth->dst_addr, d->vlan_id); @@ -83,6 +86,8 @@ static uint16_t bridge_input_process( } // Direct output to learned interface d->iface = iface; + d->vtep = fdb->vtep; + if (iface->type == GR_IFACE_TYPE_BRIDGE) { edge = INPUT; } else { From d1cd40b18e0dc8c1095d3ee37307a37d3e135325 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 23:52:08 +0100 Subject: [PATCH 06/13] l2: implement VXLAN datapath nodes Add three datapath nodes for VXLAN packet processing. vxlan_input decapsulates incoming UDP/4789 packets. It strips the outer UDP and VXLAN headers, resolves the inner VNI to a VXLAN interface via the RCU-protected hash table, records the source VTEP from the outer IP header into the mbuf private data, and forwards the inner Ethernet frame to iface_input for bridge processing. vxlan_output encapsulates outgoing frames for a known destination VTEP.
It prepends a pre-built IP/UDP/VXLAN header template initialized by the control plane, fills in the per-packet fields (destination VTEP, UDP length, IP length, checksum), and hashes the inner flow to select an ephemeral source port for underlay ECMP (RFC 7348 Section 5). The FIB lookup for the outer IP uses the encapsulation VRF, not the bridge domain. vxlan_flood handles BUM traffic by replicating the frame to every VTEP in the flood list via ingress replication. The original mbuf is sent to the first VTEP and clones are created for the rest. The bridge_flood node is updated to steer VXLAN member traffic through vxlan_flood instead of direct iface_output. Signed-off-by: Robin Jarry --- docs/graph.svg | 872 +++++++++++++++-------------- modules/l2/control/gr_l2_control.h | 31 + modules/l2/control/vxlan.c | 10 + modules/l2/datapath/bridge_flood.c | 8 +- modules/l2/datapath/l2_datapath.h | 15 + modules/l2/datapath/meson.build | 3 + modules/l2/datapath/vxlan_flood.c | 73 +++ modules/l2/datapath/vxlan_input.c | 124 ++++ modules/l2/datapath/vxlan_output.c | 118 ++++ 9 files changed, 844 insertions(+), 410 deletions(-) create mode 100644 modules/l2/datapath/l2_datapath.h create mode 100644 modules/l2/datapath/vxlan_flood.c create mode 100644 modules/l2/datapath/vxlan_input.c create mode 100644 modules/l2/datapath/vxlan_output.c diff --git a/docs/graph.svg b/docs/graph.svg index 59d85c29c..81a0301f8 100644 --- a/docs/graph.svg +++ b/docs/graph.svg @@ -4,885 +4,939 @@ - - - + + + bond_output - -bond_output + +bond_output port_output - -port_output + +port_output bond_output->port_output - - + + iface_input - -iface_input + +iface_input xconnect - -xconnect + +xconnect iface_input->xconnect - - + + eth_input - -eth_input + +eth_input iface_input->eth_input - - + + bridge_input - -bridge_input + +bridge_input iface_input->bridge_input - - + + iface_output - -iface_output + +iface_output iface_output->bond_output - - + + iface_output->port_output - - + + 
iface_output->bridge_input - - + + + + + +vxlan_output + +vxlan_output + + + +iface_output->vxlan_output + + port_tx - -port_tx + +port_tx - + port_output->port_tx - - + + port_rx - -port_rx + +port_rx - + port_rx->iface_input - - + + - + xconnect->port_output - - + + lacp_input - -lacp_input + +lacp_input eth_input->lacp_input - - + + snap_input - -snap_input + +snap_input eth_input->snap_input - - + + arp_input - -arp_input + +arp_input eth_input->arp_input - - + + - + ip_input - -ip_input + +ip_input eth_input->ip_input - - + + - + ip6_input - -ip6_input + +ip6_input eth_input->ip6_input - - + + eth_output - -eth_output + +eth_output eth_output->iface_output - - + + l2_redirect - -l2_redirect + +l2_redirect lacp_output - -lacp_output + +lacp_output - + lacp_output->eth_output - - + + - + snap_input->l2_redirect - - + + arp_input_reply - -arp_input_reply + +arp_input_reply - + arp_input->arp_input_reply - - + + arp_input_request - -arp_input_request + +arp_input_request - + arp_input->arp_input_request - - + + arp_output_reply - -arp_output_reply + +arp_output_reply - + arp_output_reply->eth_output - - + + arp_output_request - -arp_output_request + +arp_output_request - + arp_output_request->eth_output - - + + bridge_flood - -bridge_flood + +bridge_flood - + bridge_flood->iface_input - - + + - + bridge_flood->iface_output - - + + - + + +vxlan_flood + +vxlan_flood + + +bridge_flood->vxlan_flood + + + + + bridge_input->iface_input - - + + - + bridge_input->iface_output - - + + - + bridge_input->bridge_flood - - + + + + + +vxlan_flood->iface_output + + - + ospf_redirect - -ospf_redirect + +ospf_redirect - + ospf_redirect->l2_redirect - - + + - + loopback_input - -loopback_input + +loopback_input - + loopback_input->ip_input - - + + - + loopback_input->ip6_input - - + + - + loopback_output - -loopback_output + +loopback_output - + xvrf - -xvrf + +xvrf - + xvrf->ip_input - - + + - + xvrf->ip6_input - - + + - + ip_forward - -ip_forward + +ip_forward - + ip_output - 
-ip_output + +ip_output - + ip_forward->ip_output - - + + - + ip_fragment - -ip_fragment + +ip_fragment - + ip_fragment->ip_output - - + + - + ip_hold - -ip_hold + +ip_hold - + ip_input->ip_forward - - + + - + ip_input_local - -ip_input_local + +ip_input_local - + ip_input->ip_input_local - - + + - + ip_input->ip_output - - + + - + dnat44_dynamic - -dnat44_dynamic + +dnat44_dynamic - + ip_input->dnat44_dynamic - - + + - + dnat44_static - -dnat44_static + +dnat44_static - + ip_input->dnat44_static - - + + - + ip_loadbalance - -ip_loadbalance + +ip_loadbalance - + ip_loadbalance->ip_output - - + + - + ip_input_local->ospf_redirect - - + + - + ipip_input - -ipip_input + +ipip_input - + ip_input_local->ipip_input - - + + - + icmp_input - -icmp_input + +icmp_input - + ip_input_local->icmp_input - - + + - + l4_input_local - -l4_input_local + +l4_input_local - + ip_input_local->l4_input_local - - + + - + ip_output->eth_output - - + + - + ip_output->xvrf - - + + - + ip_output->ip_fragment - - + + - + ip_output->ip_hold - - + + - + ip_output->ip_loadbalance - - + + - + ipip_output - -ipip_output + +ipip_output - + ip_output->ipip_output - - + + - + sr6_output - -sr6_output + +sr6_output - + ip_output->sr6_output - - + + - + ip6_forward - -ip6_forward + +ip6_forward - + ip6_output - -ip6_output + +ip6_output - + ip6_forward->ip6_output - - + + - + ip6_hold - -ip6_hold + +ip6_hold - + ip6_input->ip6_forward - - + + - + ip6_input_local - -ip6_input_local + +ip6_input_local - + ip6_input->ip6_input_local - - + + - + ip6_input->ip6_output - - + + - + sr6_local - -sr6_local + +sr6_local - + ip6_input->sr6_local - - + + - + ip6_loadbalance - -ip6_loadbalance + +ip6_loadbalance - + ip6_loadbalance->ip6_output - - + + - + ip6_input_local->ospf_redirect - - + + - + icmp6_input - -icmp6_input + +icmp6_input - + ip6_input_local->icmp6_input - - + + - + ip6_input_local->l4_input_local - - + + - + ip6_output->eth_output - - + + - + ip6_output->xvrf - - + + - + ip6_output->ip6_hold - - + 
+ - + ip6_output->ip6_loadbalance - - + + - + ip6_output->sr6_output - - + + - + ipip_input->ip_input - - + + - + ipip_output->ip_output - - + + - + + +vxlan_input + +vxlan_input + + + +vxlan_input->iface_input + + + + +vxlan_output->ip_output + + + + + dnat44_dynamic->ip_forward - - + + - + dnat44_dynamic->ip_input_local - - + + - + dnat44_static->ip_forward - - + + - + dnat44_static->ip_input_local - - + + - + sr6_local->ip_input - - + + - + sr6_local->ip6_input - - + + - + sr6_local->ip6_input_local - - + + - + sr6_output->ip6_output - - + + - + icmp_output - -icmp_output + +icmp_output - + icmp_input->icmp_output - - + + - + icmp_local_send - -icmp_local_send + +icmp_local_send - + icmp_local_send->icmp_output - - + + - + icmp_output->ip_output - - + + - + icmp6_output - -icmp6_output + +icmp6_output - + icmp6_input->icmp6_output - - + + - + ndp_na_input - -ndp_na_input + +ndp_na_input - + icmp6_input->ndp_na_input - - + + - + ndp_ns_input - -ndp_ns_input + +ndp_ns_input - + icmp6_input->ndp_ns_input - - + + - + ndp_rs_input - -ndp_rs_input + +ndp_rs_input - + icmp6_input->ndp_rs_input - - + + - + icmp6_local_send - -icmp6_local_send + +icmp6_local_send - + icmp6_local_send->icmp6_output - - + + - + icmp6_output->ip6_output - - + + - + ndp_na_output - -ndp_na_output + +ndp_na_output - + ndp_na_output->icmp6_output - - + + - + ndp_ns_output - -ndp_ns_output + +ndp_ns_output - + ndp_ns_output->icmp6_output - - + + + + + +l4_input_local->vxlan_input + + - + l4_loopback_output - -l4_loopback_output + +l4_loopback_output - + l4_input_local->l4_loopback_output - - + + - + dhcp_input - -dhcp_input + +dhcp_input - + l4_input_local->dhcp_input - - + + - + l4_loopback_output->loopback_output - - + + diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h index 8dc2308d4..20fbf0d70 100644 --- a/modules/l2/control/gr_l2_control.h +++ b/modules/l2/control/gr_l2_control.h @@ -8,6 +8,11 @@ #include #include +#include +#include +#include +#include 
+ #include // Internal bridge info structure. @@ -36,9 +41,17 @@ void fdb_purge_iface(uint16_t iface_id); // Delete all FDB entries referencing the provided bridge. void fdb_purge_bridge(uint16_t bridge_id); +struct vxlan_template { + struct rte_ipv4_hdr ip; + struct rte_udp_hdr udp; + struct rte_vxlan_hdr vxlan; +}; + GR_IFACE_INFO(GR_IFACE_TYPE_VXLAN, iface_info_vxlan, { BASE(gr_iface_info_vxlan); + struct vxlan_template template; + uint16_t n_flood_vteps; ip4_addr_t *flood_vteps; }); @@ -54,3 +67,21 @@ struct flood_type_ops { }; void flood_type_register(const struct flood_type_ops *); + +#define VXLAN_FLAGS_VNI RTE_BE32(GR_BIT32(27)) + +static inline rte_be32_t vxlan_decode_vni(rte_be32_t vx_vni) { +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + return (rte_be32_t)((uint32_t)vx_vni >> 8); +#else + return (rte_be32_t)((uint32_t)(vx_vni & RTE_BE32(0xffffff00)) << 8); +#endif +} + +static inline rte_be32_t vxlan_encode_vni(uint32_t vni) { +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + return (rte_be32_t)((uint32_t)vni << 8); +#else + return (rte_be32_t)((uint32_t)rte_cpu_to_be_32(vni) >> 8); +#endif +} diff --git a/modules/l2/control/vxlan.c b/modules/l2/control/vxlan.c index 3098bc712..40b8f42ab 100644 --- a/modules/l2/control/vxlan.c +++ b/modules/l2/control/vxlan.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -118,6 +119,15 @@ static int iface_vxlan_reconfig( return -errno; } + // Update the datapath template from the current config. 
+ cur->template.ip.version_ihl = IPV4_VERSION_IHL; + cur->template.ip.time_to_live = IPV4_DEFAULT_TTL; + cur->template.ip.next_proto_id = IPPROTO_UDP; + cur->template.ip.src_addr = cur->local; + cur->template.udp.dst_port = rte_cpu_to_be_16(cur->dst_port); + cur->template.vxlan.vx_flags = VXLAN_FLAGS_VNI; + cur->template.vxlan.vx_vni = vxlan_encode_vni(cur->vni); + return 0; } diff --git a/modules/l2/datapath/bridge_flood.c b/modules/l2/datapath/bridge_flood.c index 50ae8370b..01762b498 100644 --- a/modules/l2/datapath/bridge_flood.c +++ b/modules/l2/datapath/bridge_flood.c @@ -18,6 +18,7 @@ enum edges { OUTPUT = 0, INPUT, + VXLAN_FLOOD, DROP, EDGE_COUNT }; @@ -83,7 +84,11 @@ static uint16_t bridge_flood_process( if (clone == NULL) continue; - rte_node_enqueue_x1(graph, node, OUTPUT, clone); + if (member->type == GR_IFACE_TYPE_VXLAN) + rte_node_enqueue_x1(graph, node, VXLAN_FLOOD, clone); + else + rte_node_enqueue_x1(graph, node, OUTPUT, clone); + flood_count++; } if (iface != br && (br->flags & GR_IFACE_F_UP)) { @@ -112,6 +117,7 @@ static struct rte_node_register node = { .next_nodes = { [OUTPUT] = "iface_output", [INPUT] = "iface_input", + [VXLAN_FLOOD] = "vxlan_flood", [DROP] = "bridge_flood_drop", }, }; diff --git a/modules/l2/datapath/l2_datapath.h b/modules/l2/datapath/l2_datapath.h new file mode 100644 index 000000000..7c9ce2150 --- /dev/null +++ b/modules/l2/datapath/l2_datapath.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#pragma once + +#include + +#include + +struct trace_vxlan_data { + rte_be32_t vni; + ip4_addr_t vtep; +}; + +int trace_vxlan_format(char *buf, size_t len, const void *data, size_t data_len); diff --git a/modules/l2/datapath/meson.build b/modules/l2/datapath/meson.build index d61132060..b6dc45fc5 100644 --- a/modules/l2/datapath/meson.build +++ b/modules/l2/datapath/meson.build @@ -4,4 +4,7 @@ src += files( 'bridge_flood.c', 'bridge_input.c', + 'vxlan_flood.c', + 'vxlan_input.c', + 
'vxlan_output.c', ) diff --git a/modules/l2/datapath/vxlan_flood.c b/modules/l2/datapath/vxlan_flood.c new file mode 100644 index 000000000..68d91c338 --- /dev/null +++ b/modules/l2/datapath/vxlan_flood.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include +#include +#include +#include +#include + +enum edges { + OUTPUT = 0, + DROP, + EDGE_COUNT +}; + +static uint16_t +vxlan_flood_process(struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs) { + const struct iface_info_vxlan *vxlan; + struct rte_mbuf *m, *clone; + uint16_t flood_count; + uint16_t sent = 0; + + for (uint16_t i = 0; i < nb_objs; i++) { + m = objs[i]; + flood_count = 0; + + if (gr_mbuf_is_traced(m)) + gr_mbuf_trace_add(m, node, 0); + + vxlan = iface_info_vxlan(mbuf_data(m)->iface); + + for (uint16_t j = 0; j < vxlan->n_flood_vteps; j++) { + if (flood_count == 0) { + clone = m; + } else { + clone = gr_mbuf_copy(m, UINT32_MAX, sizeof(struct mbuf_data)); + if (clone == NULL) + continue; + } + + iface_mbuf_data(clone)->vtep = vxlan->flood_vteps[j]; + + rte_node_enqueue_x1(graph, node, OUTPUT, clone); + + flood_count++; + } + + if (flood_count == 0) + rte_node_enqueue_x1(graph, node, DROP, m); + sent += flood_count; + } + + return sent; +} + +static struct rte_node_register node = { + .name = "vxlan_flood", + .process = vxlan_flood_process, + .nb_edges = EDGE_COUNT, + .next_nodes = { + [OUTPUT] = "iface_output", + [DROP] = "vxlan_flood_drop", + }, +}; + +static struct gr_node_info info = { + .node = &node, + .type = GR_NODE_T_L2, +}; + +GR_NODE_REGISTER(info); + +GR_DROP_REGISTER(vxlan_flood_drop); diff --git a/modules/l2/datapath/vxlan_input.c b/modules/l2/datapath/vxlan_input.c new file mode 100644 index 000000000..7edc8d949 --- /dev/null +++ b/modules/l2/datapath/vxlan_input.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include "l2_datapath.h" + +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum { + IFACE_INPUT = 0, + NO_TUNNEL, + BAD_FLAGS, + EDGE_COUNT, +}; + +int trace_vxlan_format(char *buf, size_t len, const void *data, size_t /*data_len*/) { + const struct trace_vxlan_data *t = data; + int n = snprintf(buf, len, "vni=%u", rte_be_to_cpu_32(t->vni)); + if (t->vtep != 0) + n += snprintf(buf + n, len - n, " vtep=" IP4_F, &t->vtep); + return n; +} + +static uint16_t +vxlan_input_process(struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs) { + uint16_t last_vrf_id, vrf_id; + struct ip_local_mbuf_data *l; + struct iface_mbuf_data *d; + struct rte_vxlan_hdr *vh; + rte_be32_t vni, last_vni; + ip4_addr_t src_vtep; + struct iface *iface; + struct rte_mbuf *m; + rte_edge_t edge; + + last_vrf_id = GR_VRF_ID_UNDEF; + last_vni = 0; + iface = NULL; + vni = 0; + + for (uint16_t i = 0; i < nb_objs; i++) { + m = objs[i]; + l = ip_local_mbuf_data(m); + vrf_id = l->vrf_id; + src_vtep = l->src; + + vh = rte_pktmbuf_mtod_offset(m, struct rte_vxlan_hdr *, sizeof(struct rte_udp_hdr)); + if (!(vh->vx_flags & VXLAN_FLAGS_VNI)) { + edge = BAD_FLAGS; + goto next; + } + + vni = vxlan_decode_vni(vh->vx_vni); + if (vni != last_vni || vrf_id != last_vrf_id) { + iface = vxlan_get_iface(vni, vrf_id); + last_vrf_id = vrf_id; + last_vni = vni; + } + if (iface == NULL) { + edge = NO_TUNNEL; + goto next; + } + + rte_pktmbuf_adj(m, sizeof(struct rte_udp_hdr) + sizeof(*vh)); + + d = iface_mbuf_data(m); + d->iface = iface; + d->vlan_id = 0; + d->vtep = src_vtep; + edge = IFACE_INPUT; +next: + if (gr_mbuf_is_traced(m) || (iface && iface->flags & GR_IFACE_F_PACKET_TRACE)) { + struct trace_vxlan_data *t = gr_mbuf_trace_add(m, node, sizeof(*t)); + t->vni = vni; + t->vtep = src_vtep; + } + rte_node_enqueue_x1(graph, node, edge, m); + } + + return nb_objs; +} + +static void vxlan_input_register(void) { + 
l4_input_register_port(IPPROTO_UDP, RTE_BE16(RTE_VXLAN_DEFAULT_PORT), "vxlan_input"); +} + +static struct rte_node_register vxlan_input_node = { + .name = "vxlan_input", + + .process = vxlan_input_process, + + .nb_edges = EDGE_COUNT, + .next_nodes = { + [IFACE_INPUT] = "iface_input", + [NO_TUNNEL] = "vxlan_input_no_tunnel", + [BAD_FLAGS] = "vxlan_input_bad_flags", + }, +}; + +static struct gr_node_info vxlan_input_info = { + .node = &vxlan_input_node, + .type = GR_NODE_T_L3, + .register_callback = vxlan_input_register, + .trace_format = trace_vxlan_format, +}; + +GR_NODE_REGISTER(vxlan_input_info); + +GR_DROP_REGISTER(vxlan_input_no_tunnel); +GR_DROP_REGISTER(vxlan_input_bad_flags); diff --git a/modules/l2/datapath/vxlan_output.c b/modules/l2/datapath/vxlan_output.c new file mode 100644 index 000000000..3eb706b00 --- /dev/null +++ b/modules/l2/datapath/vxlan_output.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Robin Jarry + +#include "l2_datapath.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum { + IP_OUTPUT = 0, + NO_ROUTE, + NO_HEADROOM, + EDGE_COUNT, +}; + +#define EPHEMERAL_PORT_START 49152 +#define EPHEMERAL_PORT_MASK (UINT16_MAX - EPHEMERAL_PORT_START) + +static inline rte_be16_t vxlan_src_port(uint32_t hash) { + // RFC 7348 Section 5, recommends using source port hashing to enable + // ECMP load balancing in the underlay network. 
+ return rte_cpu_to_be_16(EPHEMERAL_PORT_START + (hash & EPHEMERAL_PORT_MASK)); +} + +static uint16_t vxlan_output_process( + struct rte_graph *graph, + struct rte_node *node, + void **objs, + uint16_t nb_objs +) { + const struct iface_info_vxlan *vxlan; + struct iface_mbuf_data *d; + struct vxlan_template *vh; + const struct nexthop *nh; + struct rte_mbuf *m; + rte_edge_t edge; + uint16_t len; + + for (uint16_t i = 0; i < nb_objs; i++) { + m = objs[i]; + d = iface_mbuf_data(m); + vxlan = iface_info_vxlan(d->iface); + + if (gr_mbuf_is_traced(m)) { + struct trace_vxlan_data *t = gr_mbuf_trace_add(m, node, sizeof(*t)); + t->vni = rte_cpu_to_be_32(vxlan->vni); + t->vtep = d->vtep; + } + + nh = fib4_lookup(vxlan->encap_vrf_id, d->vtep); + if (nh == NULL) { + edge = NO_ROUTE; + goto next; + } + + len = rte_pktmbuf_pkt_len(m); + + vh = gr_mbuf_prepend(m, vh); + if (unlikely(vh == NULL)) { + edge = NO_HEADROOM; + goto next; + } + + *vh = vxlan->template; + vh->udp.src_port = vxlan_src_port(m->hash.rss); + vh->udp.dgram_len = rte_cpu_to_be_16(len + sizeof(vh->udp) + sizeof(vh->vxlan)); + vh->ip.dst_addr = d->vtep; + vh->ip.total_length = rte_cpu_to_be_16(len + sizeof(*vh)); + vh->ip.hdr_checksum = rte_ipv4_cksum(&vh->ip); + + ip_output_mbuf_data(m)->nh = nh; + + edge = IP_OUTPUT; +next: + rte_node_enqueue_x1(graph, node, edge, m); + } + + return nb_objs; +} + +static void vxlan_output_register(void) { + iface_output_type_register(GR_IFACE_TYPE_VXLAN, "vxlan_output"); +} + +static struct rte_node_register vxlan_output_node = { + .name = "vxlan_output", + + .process = vxlan_output_process, + + .nb_edges = EDGE_COUNT, + .next_nodes = { + [IP_OUTPUT] = "ip_output", + [NO_ROUTE] = "vxlan_output_no_route", + [NO_HEADROOM] = "error_no_headroom", + }, +}; + +static struct gr_node_info vxlan_output_info = { + .node = &vxlan_output_node, + .type = GR_NODE_T_L3, + .register_callback = vxlan_output_register, + .trace_format = trace_vxlan_format, +}; + 
+GR_NODE_REGISTER(vxlan_output_info); + +GR_DROP_REGISTER(vxlan_output_no_route); From 584befeab1a517013eea3bf0ab13ce540883d90e Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 23:52:54 +0100 Subject: [PATCH 07/13] smoke: add VXLAN tunnel test Set up a VXLAN overlay between grout and a Linux netns peer. Grout runs a bridge with a VXLAN member (VNI 100) and the Linux side mirrors the topology with a kernel VXLAN device enslaved to a Linux bridge. Both sides have flood lists configured with each other's VTEP address for BUM traffic replication. The test verifies L3 connectivity over the tunnel by having the Linux side ping the bridge address. This exercises the full path: ARP resolution over VXLAN, FDB learning from decapsulated traffic, and ICMP echo reply via the VXLAN output encapsulation. Signed-off-by: Robin Jarry --- smoke/vxlan_test.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 smoke/vxlan_test.sh diff --git a/smoke/vxlan_test.sh b/smoke/vxlan_test.sh new file mode 100755 index 000000000..1af723393 --- /dev/null +++ b/smoke/vxlan_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +. 
$(dirname $0)/_init.sh + +port_add p0 + +grcli address add 10.0.0.1/24 iface p0 +grcli interface add bridge br100 +grcli interface add vxlan vxlan100 vni 100 local 10.0.0.1 domain br100 +grcli flood vtep add 10.0.0.2 vni 100 + +grcli address add 192.168.100.1/24 iface br100 + +netns_add n1 +move_to_netns x-p0 n1 +ip -n n1 addr add 10.0.0.2/24 dev x-p0 +ip -n n1 link add br100 type bridge +ip -n n1 link set br100 up +ip -n n1 link add vxlan100 type vxlan id 100 local 10.0.0.2 dstport 4789 dev x-p0 +ip -n n1 link set vxlan100 master br100 +ip -n n1 link set vxlan100 up +ip -n n1 addr add 192.168.100.2/24 dev br100 +bridge -n n1 fdb add 00:00:00:00:00:00 dev vxlan100 self vni 100 dst 10.0.0.1 + +# Test L3 connectivity over VXLAN tunnel +# The Linux side initiates the ping which will cause grout to learn the MAC +ip netns exec n1 ping -i0.01 -c3 -W1 192.168.100.1 + +grcli fdb show From 2e3dec434c2cc3b3f47995a51fc21be3351936a7 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 01:01:44 +0100 Subject: [PATCH 08/13] frr: sync bridge interfaces Report bridge interfaces to FRR as ZEBRA_IF_BRIDGE with their MAC address. Tag members with ZEBRA_IF_SLAVE_BRIDGE and propagate the bridge ifindex so that FRR can associate them with the correct master. 
Signed-off-by: Robin Jarry --- frr/if_grout.c | 16 ++++++++++++++-- frr/zebra_dplane_grout.c | 5 +++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/frr/if_grout.c b/frr/if_grout.c index fb3a6ca3e..16ec9735c 100644 --- a/frr/if_grout.c +++ b/frr/if_grout.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -43,9 +44,11 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { enum zebra_slave_iftype slave_type = ZEBRA_IF_SLAVE_NONE; enum zebra_link_type link_type = ZEBRA_LLT_UNKNOWN; enum zebra_iftype zif_type = ZEBRA_IF_OTHER; + const struct gr_iface_info_bridge *gr_bridge = NULL; const struct gr_iface_info_vlan *gr_vlan = NULL; const struct gr_iface_info_port *gr_port = NULL; const struct gr_iface_info_bond *gr_bond = NULL; + ifindex_t bridge_ifindex = IFINDEX_INTERNAL; ifindex_t link_ifindex = IFINDEX_INTERNAL; ifindex_t bond_ifindex = IFINDEX_INTERNAL; const struct rte_ether_addr *mac = NULL; @@ -83,6 +86,12 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { link_type = ZEBRA_LLT_ETHER; zif_type = ZEBRA_IF_VRF; break; + case GR_IFACE_TYPE_BRIDGE: + gr_bridge = (const struct gr_iface_info_bridge *)&gr_if->info; + link_type = ZEBRA_LLT_ETHER; + zif_type = ZEBRA_IF_BRIDGE; + mac = &gr_bridge->mac; + break; case GR_IFACE_TYPE_UNDEF: default: gr_log_err( @@ -123,12 +132,15 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { bond_ifindex = ifindex_grout_to_frr(gr_if->domain_id); slave_type = ZEBRA_IF_SLAVE_BOND; break; + case GR_IFACE_MODE_BRIDGE: + bridge_ifindex = ifindex_grout_to_frr(gr_if->domain_id); + slave_type = ZEBRA_IF_SLAVE_BRIDGE; + break; default: break; } - // no bridge support in grout - dplane_ctx_set_ifp_bridge_ifindex(ctx, IFINDEX_INTERNAL); + dplane_ctx_set_ifp_bridge_ifindex(ctx, bridge_ifindex); dplane_ctx_set_ifp_master_ifindex(ctx, IFINDEX_INTERNAL); dplane_ctx_set_ifp_bond_ifindex(ctx, bond_ifindex); dplane_ctx_set_ifp_zif_slave_type(ctx, 
slave_type); diff --git a/frr/zebra_dplane_grout.c b/frr/zebra_dplane_grout.c index 9ecbb01df..2ee2049b9 100644 --- a/frr/zebra_dplane_grout.c +++ b/frr/zebra_dplane_grout.c @@ -198,11 +198,12 @@ static void grout_sync_ifaces_addresses(struct event *e) { } static void grout_sync_ifaces(struct event *) { - // Sync interfaces in dependency order: VRF first (no deps), then bond - // and ipip (need VRF only), port (needs VRF, may be bond member), vlan + // Sync interfaces in dependency order: VRF first (no deps), then bridge, bond + // and ipip (need VRF only), port (needs VRF, may be bond or bridge member), vlan // (needs parent port or bond). static const gr_iface_type_t types[] = { GR_IFACE_TYPE_VRF, + GR_IFACE_TYPE_BRIDGE, GR_IFACE_TYPE_BOND, GR_IFACE_TYPE_IPIP, GR_IFACE_TYPE_PORT, From f10e94b0f7cc037722404770c9061633c205b57a Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Sat, 14 Feb 2026 00:26:42 +0100 Subject: [PATCH 09/13] frr: sync VXLAN interfaces Report VXLAN interfaces to FRR's zebra as ZEBRA_IF_VXLAN with the associated L2 VNI information. This allows FRR's EVPN control plane to discover which VNIs are locally configured and advertise them via BGP IMET routes to remote PEs. The VXLAN L2 info includes the VNI, the local VTEP address, and the underlay interface index so that zebra can correlate the tunnel with the correct underlay routing context. 
Signed-off-by: Robin Jarry --- frr/if_grout.c | 16 ++++++++++++++++ frr/zebra_dplane_grout.c | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/frr/if_grout.c b/frr/if_grout.c index 16ec9735c..a19e5c00b 100644 --- a/frr/if_grout.c +++ b/frr/if_grout.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #define GROUT_NS NS_DEFAULT @@ -45,6 +46,7 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { enum zebra_link_type link_type = ZEBRA_LLT_UNKNOWN; enum zebra_iftype zif_type = ZEBRA_IF_OTHER; const struct gr_iface_info_bridge *gr_bridge = NULL; + const struct gr_iface_info_vxlan *gr_vxlan = NULL; const struct gr_iface_info_vlan *gr_vlan = NULL; const struct gr_iface_info_port *gr_port = NULL; const struct gr_iface_info_bond *gr_bond = NULL; @@ -92,6 +94,12 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { zif_type = ZEBRA_IF_BRIDGE; mac = &gr_bridge->mac; break; + case GR_IFACE_TYPE_VXLAN: + gr_vxlan = (const struct gr_iface_info_vxlan *)&gr_if->info; + link_type = ZEBRA_LLT_ETHER; + zif_type = ZEBRA_IF_VXLAN; + mac = &gr_vxlan->mac; + break; case GR_IFACE_TYPE_UNDEF: default: gr_log_err( @@ -162,6 +170,14 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { vlan_info.vid = gr_vlan->vlan_id; dplane_ctx_set_ifp_vlan_info(ctx, &vlan_info); } + if (gr_vxlan) { + struct zebra_l2info_vxlan vi = {0}; + vi.vni_info.iftype = ZEBRA_VXLAN_IF_VNI; + vi.vni_info.vni.vni = gr_vxlan->vni; + vi.ifindex_link = ifindex_grout_to_frr(gr_vxlan->encap_vrf_id); + vi.vtep_ip.s_addr = gr_vxlan->local; + dplane_ctx_set_ifp_vxlan_info(ctx, &vi); + } } else { dplane_ctx_set_op(ctx, DPLANE_OP_INTF_DELETE); dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_QUEUED); diff --git a/frr/zebra_dplane_grout.c b/frr/zebra_dplane_grout.c index 2ee2049b9..3e2465dfa 100644 --- a/frr/zebra_dplane_grout.c +++ b/frr/zebra_dplane_grout.c @@ -200,7 +200,7 @@ static void grout_sync_ifaces_addresses(struct event 
*e) { static void grout_sync_ifaces(struct event *) { // Sync interfaces in dependency order: VRF first (no deps), then bridge, bond // and ipip (need VRF only), port (needs VRF, may be bond or bridge member), vlan - // (needs parent port or bond). + // (needs parent port or bond) and vxlan (needs VRF and bridge). static const gr_iface_type_t types[] = { GR_IFACE_TYPE_VRF, GR_IFACE_TYPE_BRIDGE, @@ -208,6 +208,7 @@ static void grout_sync_ifaces(struct event *) { GR_IFACE_TYPE_IPIP, GR_IFACE_TYPE_PORT, GR_IFACE_TYPE_VLAN, + GR_IFACE_TYPE_VXLAN, }; struct gr_infra_iface_list_req if_req; bool sync_vrf[GR_MAX_IFACES] = {false}; From 96078c56d35943000590f2b24a3002e047b7ebba Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Sat, 14 Feb 2026 00:29:35 +0100 Subject: [PATCH 10/13] frr: sync FDB entries Synchronize bridge FDB entries bidirectionally between grout and FRR. This is required for EVPN to advertise locally learned MAC addresses via BGP type-2 routes and to install remotely learned MACs into the bridge forwarding table. Zebra's dplane API is asymmetric for MAC/FDB entries. In the downward direction (zebra to dplane provider), zebra uses DPLANE_OP_MAC_INSTALL and DPLANE_OP_MAC_DELETE to push MACs into the dataplane. In the upward direction (dplane provider notifying zebra of learned MACs), DPLANE_OP_NEIGH_INSTALL and DPLANE_OP_NEIGH_DELETE must be used instead. These go through zebra_neigh_macfdb_update() which calls zebra_vxlan_local_mac_add_update() and ultimately triggers BGP EVPN type-2 route advertisement. By contrast, the DPLANE_OP_MAC_* result handler (zebra_vxlan_handle_result) is a no-op. Despite the NEIGH op name, the context payload uses the macinfo union member and is populated with dplane_ctx_mac_set_*() accessors, exactly like zebra's own netlink provider does in netlink_macfdb_change(). 
Unlike routes and nexthops which use higher-level zebra APIs that resolve the namespace from the VRF ID, the FDB notification path looks up interfaces via if_lookup_by_index_per_ns(ns_id, ifindex). GROUT_NS must therefore be set on the dplane context for the interface lookup to succeed. Function names follow zebra's rt_netlink.c naming conventions: grout_macfdb_change() for the upward notification path (like netlink_macfdb_change) and grout_macfdb_update_ctx() for the downward install path (like netlink_macfdb_update_ctx). Self-event suppression is enabled on the FDB event subscriptions to prevent feedback loops when FRR installs a MAC that was originally learned by grout. Signed-off-by: Robin Jarry --- frr/if_grout.c | 2 - frr/rt_grout.c | 98 ++++++++++++++++++++++++++++++++++++++++ frr/rt_grout.h | 4 ++ frr/zebra_dplane_grout.c | 53 ++++++++++++++++++++++ frr/zebra_dplane_grout.h | 3 ++ 5 files changed, 158 insertions(+), 2 deletions(-) diff --git a/frr/if_grout.c b/frr/if_grout.c index a19e5c00b..12c0b3bff 100644 --- a/frr/if_grout.c +++ b/frr/if_grout.c @@ -16,8 +16,6 @@ #include #include -#define GROUT_NS NS_DEFAULT - static uint64_t gr_if_flags_to_netlink(struct gr_iface *gr_if, enum zebra_link_type link_type) { uint64_t frr_if_flags = 0; diff --git a/frr/rt_grout.c b/frr/rt_grout.c index 6ae2b637e..aa2ae8661 100644 --- a/frr/rt_grout.c +++ b/frr/rt_grout.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -844,3 +845,100 @@ void grout_nexthop_change(bool new, struct gr_nexthop *gr_nh, bool startup) { // nexthop_free() must *NOT* be used to preserve the nh_srv6 context. free(nh); } + +void grout_macfdb_change(const struct gr_fdb_entry *fdb, bool new) { + struct zebra_dplane_ctx *ctx = dplane_ctx_alloc(); + struct ethaddr mac; + + gr_log_debug( + "%s bridge=%u iface=%u mac=%pEA vlan=%u vtep=%pI4", + new ? 
"add" : "del", + fdb->bridge_id, + fdb->iface_id, + &fdb->mac, + fdb->vlan_id, + &fdb->vtep + ); + + memcpy(&mac, &fdb->mac, sizeof(mac)); + + // Zebra's dplane API is asymmetric for FDB entries: + // + // - DPLANE_OP_MAC_INSTALL/DELETE is the downward path (zebra pushing + // MACs to dplane providers). The result handler is a no-op. + // - DPLANE_OP_NEIGH_INSTALL/DELETE is the upward path (dplane providers + // notifying zebra of learned MACs). This goes through + // zebra_neigh_macfdb_update() which triggers EVPN type-2 routes. + // + // It is NOT a bug to use dplane_ctx_mac_set_*() with DPLANE_OP_NEIGH_* + // ops. The macinfo and neigh fields are separate union members in the + // dplane context, and zebra's own netlink provider does the same thing + // (see rt_netlink.c netlink_macfdb_change()). + dplane_ctx_set_ns_id(ctx, GROUT_NS); + dplane_ctx_set_ifindex(ctx, ifindex_grout_to_frr(fdb->iface_id)); + dplane_ctx_mac_set_addr(ctx, &mac); + dplane_ctx_mac_set_nhg_id(ctx, 0); + dplane_ctx_mac_set_ndm_state(ctx, NUD_REACHABLE); + dplane_ctx_mac_set_ndm_flags(ctx, NTF_MASTER); + dplane_ctx_mac_set_dst_present(ctx, fdb->vtep != 0); + dplane_ctx_mac_set_vtep_ip(ctx, &(struct in_addr) {fdb->vtep}); + dplane_ctx_mac_set_vid(ctx, fdb->vlan_id); + dplane_ctx_mac_set_dp_static(ctx, fdb->flags & GR_FDB_F_STATIC); + dplane_ctx_mac_set_local_inactive(ctx, false); + dplane_ctx_mac_set_is_sticky(ctx, false); + dplane_ctx_set_op(ctx, new ? DPLANE_OP_NEIGH_INSTALL : DPLANE_OP_NEIGH_DELETE); + + dplane_provider_enqueue_to_zebra(ctx); +} + +enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx) { + bool add = dplane_ctx_get_op(ctx) == DPLANE_OP_MAC_INSTALL; + uint32_t req_type; + size_t len; + void *req; + int ret; + + gr_log_debug( + "%s bridge=%u iface=%u mac=%pEA vlan=%u vtep=%pI4", + add ? 
"add" : "del", + ifindex_frr_to_grout(dplane_ctx_mac_get_br_ifindex(ctx)), + ifindex_frr_to_grout(dplane_ctx_get_ifindex(ctx)), + dplane_ctx_mac_get_addr(ctx), + dplane_ctx_mac_get_vlan(ctx), + dplane_ctx_mac_get_vtep_ip(ctx) + ); + + len = add ? sizeof(struct gr_fdb_add_req) : sizeof(struct gr_fdb_del_req); + req = calloc(1, len); + if (req == NULL) { + gr_log_err("failed to allocate memory"); + return ZEBRA_DPLANE_REQUEST_FAILURE; + } + + if (add) { + struct gr_fdb_add_req *a = req; + a->exist_ok = true; + a->fdb.iface_id = ifindex_frr_to_grout(dplane_ctx_get_ifindex(ctx)); + a->fdb.bridge_id = ifindex_frr_to_grout(dplane_ctx_mac_get_br_ifindex(ctx)); + a->fdb.vlan_id = dplane_ctx_mac_get_vlan(ctx); + a->fdb.flags = GR_FDB_F_EXTERN; + if (dplane_ctx_mac_get_dp_static(ctx)) + a->fdb.flags |= GR_FDB_F_STATIC; + memcpy(&a->fdb.mac, dplane_ctx_mac_get_addr(ctx), sizeof(a->fdb.mac)); + a->fdb.vtep = dplane_ctx_mac_get_vtep_ip(ctx)->s_addr; + req_type = GR_FDB_ADD; + } else { + struct gr_fdb_del_req *del = req; + del->missing_ok = true; + del->bridge_id = ifindex_frr_to_grout(dplane_ctx_mac_get_br_ifindex(ctx)); + del->vlan_id = dplane_ctx_mac_get_vlan(ctx); + memcpy(&del->mac, dplane_ctx_mac_get_addr(ctx), sizeof(del->mac)); + req_type = GR_FDB_DEL; + } + + ret = grout_client_send_recv(req_type, len, req, NULL); + + free(req); + + return ret == 0 ? 
ZEBRA_DPLANE_REQUEST_SUCCESS : ZEBRA_DPLANE_REQUEST_FAILURE; +} diff --git a/frr/rt_grout.h b/frr/rt_grout.h index 3da557e52..56b6391a9 100644 --- a/frr/rt_grout.h +++ b/frr/rt_grout.h @@ -5,6 +5,7 @@ #include #include +#include #include @@ -13,3 +14,6 @@ void grout_route6_change(bool new, struct gr_ip6_route *gr_r6); enum zebra_dplane_result grout_add_del_route(struct zebra_dplane_ctx *ctx); enum zebra_dplane_result grout_add_del_nexthop(struct zebra_dplane_ctx *ctx); void grout_nexthop_change(bool new, struct gr_nexthop *gr_nh, bool startup); + +void grout_macfdb_change(const struct gr_fdb_entry *fdb, bool new); +enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx); diff --git a/frr/zebra_dplane_grout.c b/frr/zebra_dplane_grout.c index 3e2465dfa..a763e3c29 100644 --- a/frr/zebra_dplane_grout.c +++ b/frr/zebra_dplane_grout.c @@ -8,6 +8,7 @@ #include "rt_grout.h" #include +#include #include #include @@ -85,6 +86,29 @@ static int grout_client_ensure_connect(void) { return 0; } +static void grout_sync_fdb(struct event *) { + struct gr_fdb_list_req req = {.bridge_id = GR_IFACE_ID_UNDEF}; + struct gr_fdb_entry *fdb; + int ret; + + gr_log_debug("sync FDB entries"); + + if (grout_client_ensure_connect() < 0) + return; + + gr_api_client_stream_foreach (fdb, ret, grout_ctx.client, GR_FDB_LIST, sizeof(req), &req) { + gr_log_debug( + "sync fdb bridge %u iface %u mac %pEA", + fdb->bridge_id, + fdb->iface_id, + &fdb->mac + ); + grout_macfdb_change(fdb, true); + } + if (ret < 0) + gr_log_err("GR_FDB_LIST: %s", strerror(errno)); +} + static void grout_sync_routes(struct event *e) { struct gr_ip4_route_list_req r4_req = {.vrf_id = EVENT_VAL(e)}; struct gr_ip4_route *r4; @@ -241,6 +265,8 @@ static void grout_sync_ifaces(struct event *) { if (sync_vrf[i]) event_add_event(zrouter.master, grout_sync_ifaces_addresses, NULL, i, NULL); } + + event_add_event(zrouter.master, grout_sync_fdb, NULL, 0, NULL); } static void dplane_grout_connect(struct event *) { @@ 
-256,6 +282,9 @@ static void dplane_grout_connect(struct event *) { {.type = GR_EVENT_IP6_ADDR_ADD, .suppress_self_events = false}, {.type = GR_EVENT_IP_ADDR_DEL, .suppress_self_events = false}, {.type = GR_EVENT_IP6_ADDR_DEL, .suppress_self_events = false}, + {.type = GR_EVENT_FDB_ADD, .suppress_self_events = true}, + {.type = GR_EVENT_FDB_DEL, .suppress_self_events = true}, + {.type = GR_EVENT_FDB_UPDATE, .suppress_self_events = true}, }; if (grout_notif_subscribe(&grout_ctx.dplane_notifs, gr_evts, ARRAY_DIM(gr_evts)) < 0) @@ -345,6 +374,12 @@ static const char *gr_req_type_to_str(uint32_t e) { return TOSTRING(GR_IP6_ROUTE_LIST); case GR_SRV6_TUNSRC_SET: return TOSTRING(GR_SRV6_TUNSRC_SET); + case GR_FDB_ADD: + return TOSTRING(GR_FDB_ADD); + case GR_FDB_DEL: + return TOSTRING(GR_FDB_DEL); + case GR_FDB_LIST: + return TOSTRING(GR_FDB_LIST); default: snprintf(buf, sizeof(buf), "0x%x", e); return buf; @@ -423,6 +458,12 @@ static const char *gr_evt_to_str(uint32_t e) { return TOSTRING(GR_EVENT_NEXTHOP_UPDATE); case GR_EVENT_NEXTHOP_DELETE: return TOSTRING(GR_EVENT_NEXTHOP_DELETE); + case GR_EVENT_FDB_ADD: + return TOSTRING(GR_EVENT_FDB_ADD); + case GR_EVENT_FDB_UPDATE: + return TOSTRING(GR_EVENT_FDB_UPDATE); + case GR_EVENT_FDB_DEL: + return TOSTRING(GR_EVENT_FDB_DEL); default: snprintf(buf, sizeof(buf), "event 0x%x", e); return buf; @@ -469,6 +510,14 @@ static void dplane_read_notifications(struct event *event) { case GR_EVENT_IP6_ADDR_DEL: grout_interface_addr6_change(new, PAYLOAD(gr_e)); break; + + case GR_EVENT_FDB_ADD: + case GR_EVENT_FDB_UPDATE: + new = true; + // fallthrough + case GR_EVENT_FDB_DEL: + grout_macfdb_change(PAYLOAD(gr_e), new); + break; } free(gr_e); @@ -548,6 +597,10 @@ static enum zebra_dplane_result zd_grout_process_update(struct zebra_dplane_ctx case DPLANE_OP_NH_DELETE: return grout_add_del_nexthop(ctx); + case DPLANE_OP_MAC_INSTALL: + case DPLANE_OP_MAC_DELETE: + return grout_macfdb_update_ctx(ctx); + case DPLANE_OP_SRV6_ENCAP_SRCADDR_SET: 
return grout_set_sr_tunsrc(ctx); diff --git a/frr/zebra_dplane_grout.h b/frr/zebra_dplane_grout.h index 9b03f3640..f8af1ed79 100644 --- a/frr/zebra_dplane_grout.h +++ b/frr/zebra_dplane_grout.h @@ -3,7 +3,10 @@ #pragma once +#include #include #include +#define GROUT_NS NS_DEFAULT + int grout_client_send_recv(uint32_t req_type, size_t tx_len, const void *tx_data, void **rx_data); From c868bf73429d873533f3c38eddf8c1d58c31e1a4 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Sat, 14 Feb 2026 00:30:48 +0100 Subject: [PATCH 11/13] frr: sync VTEP flood lists Handle DPLANE_OP_VTEP_ADD and DPLANE_OP_VTEP_DELETE operations from FRR's EVPN control plane. When BGP learns a remote VTEP via an IMET route (EVPN type-3), zebra pushes the VTEP to the dataplane provider. The grout_vxlan_flood_update_ctx() function (named after zebra's netlink_vxlan_flood_update_ctx() in rt_netlink.c) translates these operations into GR_FLOOD_ADD/DEL requests with GR_FLOOD_T_VTEP type. This is a downward-only path: zebra pushes flood list entries to the dplane provider. There is no upward notification for VTEP flood list changes since grout does not learn VTEPs on its own, they are always provided by FRR's BGP EVPN control plane. This allows BGP EVPN to dynamically manage the per-VNI flood lists used for BUM traffic ingress replication, replacing the need for static flood list configuration via the CLI. Signed-off-by: Robin Jarry --- frr/rt_grout.c | 55 ++++++++++++++++++++++++++++++++++++++++ frr/rt_grout.h | 1 + frr/zebra_dplane_grout.c | 12 +++++++++ 3 files changed, 68 insertions(+) diff --git a/frr/rt_grout.c b/frr/rt_grout.c index aa2ae8661..57b16975a 100644 --- a/frr/rt_grout.c +++ b/frr/rt_grout.c @@ -5,6 +5,7 @@ #include "log_grout.h" #include "rt_grout.h" +#include #include #include @@ -942,3 +943,57 @@ enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx) { return ret == 0 ? 
ZEBRA_DPLANE_REQUEST_SUCCESS : ZEBRA_DPLANE_REQUEST_FAILURE; } + +enum zebra_dplane_result grout_vxlan_flood_update_ctx(struct zebra_dplane_ctx *ctx) { + const struct ipaddr *addr = dplane_ctx_neigh_get_ipaddr(ctx); + bool add = dplane_ctx_get_op(ctx) == DPLANE_OP_VTEP_ADD; + struct gr_flood_entry *entry; + uint32_t req_type; + size_t len; + void *req; + int ret; + + gr_log_debug( + "%s %pIA vni=%u vrf=%u", + add ? "add" : "del", + addr, + dplane_ctx_neigh_get_vni(ctx), + vrf_frr_to_grout(dplane_ctx_get_vrf(ctx)) + ); + + if (addr->ipa_type != IPADDR_V4) { + gr_log_err("IPv6 flood list entries are not supported"); + return ZEBRA_DPLANE_REQUEST_FAILURE; + } + + len = add ? sizeof(struct gr_flood_add_req) : sizeof(struct gr_flood_del_req); + + req = calloc(1, len); + if (req == NULL) { + gr_log_err("failed to allocate memory"); + return ZEBRA_DPLANE_REQUEST_FAILURE; + } + + if (add) { + struct gr_flood_add_req *a = req; + entry = &a->entry; + a->exist_ok = true; + req_type = GR_FLOOD_ADD; + } else { + struct gr_flood_del_req *d = req; + entry = &d->entry; + d->missing_ok = true; + req_type = GR_FLOOD_DEL; + } + + entry->type = GR_FLOOD_T_VTEP; + entry->vrf_id = vrf_frr_to_grout(dplane_ctx_get_vrf(ctx)); + entry->vtep.vni = dplane_ctx_neigh_get_vni(ctx); + entry->vtep.addr = addr->ipaddr_v4.s_addr; + + ret = grout_client_send_recv(req_type, len, req, NULL); + + free(req); + + return ret == 0 ? 
ZEBRA_DPLANE_REQUEST_SUCCESS : ZEBRA_DPLANE_REQUEST_FAILURE; +} diff --git a/frr/rt_grout.h b/frr/rt_grout.h index 56b6391a9..942c2f06d 100644 --- a/frr/rt_grout.h +++ b/frr/rt_grout.h @@ -17,3 +17,4 @@ void grout_nexthop_change(bool new, struct gr_nexthop *gr_nh, bool startup); void grout_macfdb_change(const struct gr_fdb_entry *fdb, bool new); enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx); +enum zebra_dplane_result grout_vxlan_flood_update_ctx(struct zebra_dplane_ctx *ctx); diff --git a/frr/zebra_dplane_grout.c b/frr/zebra_dplane_grout.c index a763e3c29..4a0a6e949 100644 --- a/frr/zebra_dplane_grout.c +++ b/frr/zebra_dplane_grout.c @@ -380,6 +380,10 @@ static const char *gr_req_type_to_str(uint32_t e) { return TOSTRING(GR_FDB_DEL); case GR_FDB_LIST: return TOSTRING(GR_FDB_LIST); + case GR_FLOOD_ADD: + return TOSTRING(GR_FLOOD_ADD); + case GR_FLOOD_DEL: + return TOSTRING(GR_FLOOD_DEL); default: snprintf(buf, sizeof(buf), "0x%x", e); return buf; @@ -464,6 +468,10 @@ static const char *gr_evt_to_str(uint32_t e) { return TOSTRING(GR_EVENT_FDB_UPDATE); case GR_EVENT_FDB_DEL: return TOSTRING(GR_EVENT_FDB_DEL); + case GR_EVENT_FLOOD_ADD: + return TOSTRING(GR_EVENT_FLOOD_ADD); + case GR_EVENT_FLOOD_DEL: + return TOSTRING(GR_EVENT_FLOOD_DEL); default: snprintf(buf, sizeof(buf), "event 0x%x", e); return buf; @@ -601,6 +609,10 @@ static enum zebra_dplane_result zd_grout_process_update(struct zebra_dplane_ctx case DPLANE_OP_MAC_DELETE: return grout_macfdb_update_ctx(ctx); + case DPLANE_OP_VTEP_ADD: + case DPLANE_OP_VTEP_DELETE: + return grout_vxlan_flood_update_ctx(ctx); + case DPLANE_OP_SRV6_ENCAP_SRCADDR_SET: return grout_set_sr_tunsrc(ctx); From 9398ec0a86f8c9d2635c6bcafe99eadfec2c3143 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Fri, 13 Feb 2026 23:53:46 +0100 Subject: [PATCH 12/13] smoke: add EVPN VXLAN integration test with FRR Set up a full EVPN/VXLAN topology between FRR+grout and a standalone FRR+Linux peer. 
Each side runs a bridge with a VXLAN member (VNI 100) and a host namespace. Both peers run iBGP with the l2vpn evpn address-family and advertise-all-vni. The test verifies that EVPN type-3 (IMET) routes are exchanged so that both sides install each other's VTEP in their flood lists. It then verifies end-to-end L2 connectivity by pinging between the two host namespaces through the VXLAN overlay, which exercises type-2 (MAC/IP) route advertisement and FDB synchronization. Signed-off-by: Robin Jarry --- smoke/evpn_vxlan_frr_test.sh | 189 +++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100755 smoke/evpn_vxlan_frr_test.sh diff --git a/smoke/evpn_vxlan_frr_test.sh b/smoke/evpn_vxlan_frr_test.sh new file mode 100755 index 000000000..d94433249 --- /dev/null +++ b/smoke/evpn_vxlan_frr_test.sh @@ -0,0 +1,189 @@ +#!/bin/bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +# This test verifies EVPN/VXLAN type-2 (MAC/IP) and type-3 (flood VTEP) route +# exchange between FRR+Grout and a standalone FRR+Linux peer. Each side has +# a bridge with a VXLAN member (VNI 100) and a host connected to a local port. +# BGP EVPN advertises locally learned MACs and flood VTEPs to the remote peer. +# +# Success criteria: +# - Both sides exchange EVPN type-3 routes (flood VTEPs installed). +# - Host-A and Host-B can ping each other through the VXLAN overlay. +# - Both sides learn the remote MAC via EVPN type-2 routes. +# +# - - - - - - - - - - - - - - - - - - - - - - - - - - +# | evpn-peer | | grout | +# +# | +----------+ | | +----------+ | +# | vxlan100 | | vxlan100 | +# | +----+-----+ | | +-----+----+ | +# | | +# | +---+---+ | | +---+---+ | +# | br100 | | br100 | +# | +---+---+ | | +---+---+ | +# | .1 .2 | +# | +---+---+ +-------+ | | +------+ +---+---+ | +# | p1 | | x-p0 | | p0 | | p1 | +# | +---+---+ +---+---+ | | +---+--+ +---+---+ | +# - - - |- - - - - - |- - - - - -| - - - - - -| - - - +# | | | | +# - - - |- - - - . 
| <----- BGP -----> | - - - -| - - - +# | | | | | | | | +# +---+----+ `----------------------' +---+----+ +# | | x-p1 | | underlay | | x-p1 | | +# +--------+ 172.16.0.0/24 +--------+ +# | .2 | | .3 | +# <= = = = = = = = = = = = = => +# | host-a | overlay | host-b | +# - - - - - - - - 10.0.0.0/24 - - - - - - - ' + +. $(dirname $0)/_init_frr.sh + +# right side ------------------------------------------------------------------- +create_interface p0 +set_ip_address p0 172.16.0.2/24 + +grcli interface add bridge br100 +create_interface p1 domain br100 +grcli interface add vxlan vxlan100 vni 100 local 172.16.0.2 domain br100 + +netns_add host-b +move_to_netns x-p1 host-b +ip -n host-b addr add 10.0.0.3/24 dev x-p1 + +# left side -------------------------------------------------------------------- +start_frr evpn-peer 0 + +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.all.forwarding=1 +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.all.rp_filter=0 +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.default.rp_filter=0 + +move_to_netns x-p0 evpn-peer +ip -n evpn-peer addr add 172.16.0.1/24 dev x-p0 + +ip -n evpn-peer link add br100 type bridge +ip -n evpn-peer link set br100 up + +ip -n evpn-peer link add vxlan100 type vxlan id 100 local 172.16.0.1 dstport 4789 nolearning +ip -n evpn-peer link set vxlan100 master br100 +ip -n evpn-peer link set vxlan100 up + +# Host-A: veth pair, one end in host-a, other end in evpn-peer bridge +ip -n evpn-peer link add p1 type veth peer name x-p1 +ip -n evpn-peer link set p1 master br100 +ip -n evpn-peer link set p1 up + +netns_add host-a +ip -n evpn-peer link set x-p1 netns host-a +ip -n host-a link set x-p1 up +ip -n host-a addr add 10.0.0.2/24 dev x-p1 + +# BGP EVPN on peer +vtysh -N evpn-peer <<-EOF +configure terminal + +router bgp 65000 + bgp router-id 172.16.0.1 + no bgp default ipv4-unicast + + neighbor 172.16.0.2 remote-as 65000 + + address-family l2vpn evpn + neighbor 172.16.0.2 activate + advertise-all-vni + 
exit-address-family +exit +EOF + +# BGP EVPN on Grout +vtysh <<-EOF +configure terminal + +router bgp 65000 + bgp router-id 172.16.0.2 + no bgp default ipv4-unicast + + neighbor 172.16.0.1 remote-as 65000 + + address-family l2vpn evpn + neighbor 172.16.0.1 activate + advertise-all-vni + exit-address-family +exit +EOF + +# -- Wait for EVPN type-3 (flood VTEP) exchange ------------------------------- +attempts=0 +while ! bridge -n evpn-peer fdb show dev vxlan100 | grep -qF 172.16.0.2; do + if [ "$attempts" -ge 10 ]; then + vtysh -N evpn-peer -c "show evpn vni 100" + fail "Linux peer did not learn remote VTEP 172.16.0.2" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +attempts=0 +while ! grcli flood vtep show | grep -qF 172.16.0.1; do + if [ "$attempts" -ge 10 ]; then + grcli flood vtep show + fail "Grout did not learn remote VTEP 172.16.0.1" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +bridge -n evpn-peer fdb show dev vxlan100 +grcli fdb show +grcli flood vtep show + +# -- Verify L2 connectivity through VXLAN overlay ----------------------------- + +# Ping triggers ARP which triggers MAC learning + EVPN type-2 advertisement. +ip netns exec host-a ping -i0.1 -c3 -W1 10.0.0.3 +ip netns exec host-b ping -i0.1 -c3 -W1 10.0.0.2 + +grcli fdb show iface vxlan100 +bridge -n evpn-peer fdb show dev vxlan100 + +# -- Verify EVPN type-2 (MAC/IP) learned on both sides +mac_a=$(ip netns exec host-a cat /sys/class/net/x-p1/address) +attempts=0 +while ! vtysh -c "show bgp l2vpn evpn route type 2" | grep -qF "$mac_a"; do + if [ "$attempts" -ge 10 ]; then + vtysh -c "show bgp l2vpn evpn route type 2" + fail "FRR did not learn type 2 route" + fi + sleep 1 + attempts=$((attempts + 1)) +done +attempts=0 +while ! 
grcli fdb show iface vxlan100 extern | grep -qF "$mac_a"; do + if [ "$attempts" -ge 10 ]; then + grcli fdb show iface vxlan100 + fail "FRR did not program FDB entry" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +mac_b=$(ip netns exec host-b cat /sys/class/net/x-p1/address) +attempts=0 +while ! vtysh -N evpn-peer -c "show bgp l2vpn evpn route type 2" | grep -qF "$mac_b"; do + if [ "$attempts" -ge 10 ]; then + vtysh -N evpn-peer -c "show bgp l2vpn evpn route type 2" + fail "EVPN peer did not learn type 2 route" + fi + sleep 1 + attempts=$((attempts + 1)) +done +attempts=0 +while ! bridge -n evpn-peer fdb show dev vxlan100 | grep -q "$mac_b.*extern"; do + if [ "$attempts" -ge 10 ]; then + bridge -n evpn-peer fdb show dev vxlan100 + fail "EVPN peer did not program FDB entry in bridge" + fi + sleep 1 + attempts=$((attempts + 1)) +done From 1efb8f88a9730751021f76a8d2068efac6e4b7b4 Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Thu, 26 Feb 2026 10:07:16 +0100 Subject: [PATCH 13/13] l2: add per-bridge FDB forwarding statistics Track FDB forwarding decisions in per-bridge, per-lcore counters that complement the generic per-interface iface_stats and drop node software stats: - hit: unicast forwarded via FDB lookup - miss: unknown unicast, sent to flood - flood: broadcast/multicast, sent to flood Counters are incremented in the bridge_input datapath node using per-lcore arrays indexed by bridge ID to avoid cache contention. The control plane aggregates across all lcores on demand. Exposed under grcli stats fdb show/reset with per-bridge granularity. Stats are cleared when a bridge is destroyed. 
Signed-off-by: Fabien Dupont --- modules/l2/api/gr_l2.h | 21 ++++++ modules/l2/cli/l2_stats.c | 102 +++++++++++++++++++++++++++++ modules/l2/cli/meson.build | 1 + modules/l2/control/bridge.c | 4 ++ modules/l2/control/gr_l2_control.h | 20 ++++++ modules/l2/control/l2_stats.c | 71 ++++++++++++++++++++ modules/l2/control/meson.build | 1 + modules/l2/datapath/bridge_input.c | 10 +++ smoke/bridge_test.sh | 5 ++ 9 files changed, 235 insertions(+) create mode 100644 modules/l2/cli/l2_stats.c create mode 100644 modules/l2/control/l2_stats.c diff --git a/modules/l2/api/gr_l2.h b/modules/l2/api/gr_l2.h index 54a9ef352..6fbe7052a 100644 --- a/modules/l2/api/gr_l2.h +++ b/modules/l2/api/gr_l2.h @@ -205,3 +205,24 @@ struct gr_flood_list_req { }; STREAM_RESP(struct gr_flood_entry); + +// FDB statistics /////////////////////////////////////////////////////////////// + +#define GR_L2_FDB_STATS_GET REQUEST_TYPE(GR_L2_MODULE, 0x0020) + +struct gr_l2_fdb_stats_get_req { + uint16_t bridge_id; +}; + +struct gr_l2_fdb_stats { + uint16_t bridge_id; + uint64_t hit; // unicast forwarded via FDB lookup + uint64_t miss; // unknown unicast, sent to flood + uint64_t flood; // broadcast/multicast, sent to flood +}; + +#define GR_L2_FDB_STATS_RESET REQUEST_TYPE(GR_L2_MODULE, 0x0021) + +struct gr_l2_fdb_stats_reset_req { + uint16_t bridge_id; +}; diff --git a/modules/l2/cli/l2_stats.c b/modules/l2/cli/l2_stats.c new file mode 100644 index 000000000..939985f6f --- /dev/null +++ b/modules/l2/cli/l2_stats.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Fabien Dupont + +#include +#include +#include +#include + +#include + +#include + +static cmd_status_t fdb_stats_show(struct gr_api_client *c, const struct ec_pnode *p) { + const struct gr_l2_fdb_stats *stats; + struct gr_l2_fdb_stats_get_req req; + void *resp_ptr = NULL; + struct gr_iface *iface; + + iface = iface_from_name(c, arg_str(p, "NAME")); + if (iface == NULL) + return CMD_ERROR; + + req.bridge_id = 
iface->id; + free(iface); + + if (gr_api_client_send_recv(c, GR_L2_FDB_STATS_GET, sizeof(req), &req, &resp_ptr) < 0) + return CMD_ERROR; + + stats = resp_ptr; + + printf("fdb_hit: %lu\n", stats->hit); + printf("fdb_miss: %lu\n", stats->miss); + printf("bcast: %lu\n", stats->flood); + + free(resp_ptr); + return CMD_SUCCESS; +} + +static cmd_status_t fdb_stats_reset(struct gr_api_client *c, const struct ec_pnode *p) { + struct gr_l2_fdb_stats_reset_req req; + struct gr_iface *iface; + + iface = iface_from_name(c, arg_str(p, "NAME")); + if (iface == NULL) + return CMD_ERROR; + + req.bridge_id = iface->id; + free(iface); + + if (gr_api_client_send_recv(c, GR_L2_FDB_STATS_RESET, sizeof(req), &req, NULL) < 0) + return CMD_ERROR; + + return CMD_SUCCESS; +} + +#define FDB_STATS_CTX(root) \ + CLI_CONTEXT( \ + root, \ + CTX_ARG("stats", "Statistics."), \ + CTX_ARG("fdb", "FDB forwarding statistics.") \ + ) + +static int ctx_init(struct ec_node *root) { + int ret; + + ret = CLI_COMMAND( + FDB_STATS_CTX(root), + "show NAME", + fdb_stats_show, + "Show FDB forwarding statistics for a bridge.", + with_help( + "Bridge interface name.", + ec_node_dyn("NAME", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ) + ); + if (ret < 0) + return ret; + + ret = CLI_COMMAND( + FDB_STATS_CTX(root), + "reset NAME", + fdb_stats_reset, + "Reset FDB forwarding statistics for a bridge.", + with_help( + "Bridge interface name.", + ec_node_dyn("NAME", complete_iface_names, INT2PTR(GR_IFACE_TYPE_BRIDGE)) + ) + ); + if (ret < 0) + return ret; + + return 0; +} + +static struct cli_context ctx = { + .name = "fdb stats", + .init = ctx_init, +}; + +static void __attribute__((constructor, used)) init(void) { + cli_context_register(&ctx); +} diff --git a/modules/l2/cli/meson.build b/modules/l2/cli/meson.build index 01e82a5ce..0d5b39879 100644 --- a/modules/l2/cli/meson.build +++ b/modules/l2/cli/meson.build @@ -5,5 +5,6 @@ cli_src += files( 'bridge.c', 'flood.c', 'fdb.c', + 'l2_stats.c', 'vxlan.c', ) 
diff --git a/modules/l2/control/bridge.c b/modules/l2/control/bridge.c index 24074d6de..d4edfec51 100644 --- a/modules/l2/control/bridge.c +++ b/modules/l2/control/bridge.c @@ -89,6 +89,10 @@ static int bridge_fini(struct iface *iface) { gr_event_push(GR_EVENT_IFACE_POST_RECONFIG, member); } + // Clear FDB forwarding stats. + if (iface->id < L2_MAX_BRIDGES) + memset(l2_fdb_stats[iface->id], 0, sizeof(l2_fdb_stats[0])); + fdb_purge_bridge(iface->id); return 0; diff --git a/modules/l2/control/gr_l2_control.h b/modules/l2/control/gr_l2_control.h index 20fbf0d70..6aefb4eaa 100644 --- a/modules/l2/control/gr_l2_control.h +++ b/modules/l2/control/gr_l2_control.h @@ -9,12 +9,32 @@ #include #include +#include #include #include #include #include +// Per-core FDB forwarding statistics, indexed by [bridge_id][lcore_id]. +// Track forwarding decisions that generic per-interface iface_stats and +// drop node software stats cannot distinguish. +struct fdb_stats { + uint64_t hit; // unicast forwarded via FDB lookup + uint64_t miss; // unknown unicast, sent to flood + uint64_t flood; // broadcast/multicast, sent to flood +} __rte_cache_aligned; + +#define L2_MAX_BRIDGES 256 + +extern struct fdb_stats l2_fdb_stats[L2_MAX_BRIDGES][RTE_MAX_LCORE]; + +static inline struct fdb_stats *fdb_get_stats(uint16_t bridge_id, unsigned lcore_id) { + if (bridge_id >= L2_MAX_BRIDGES) + return NULL; + return &l2_fdb_stats[bridge_id][lcore_id]; +} + // Internal bridge info structure.
GR_IFACE_INFO(GR_IFACE_TYPE_BRIDGE, iface_info_bridge, { BASE(__gr_iface_info_bridge_base); diff --git a/modules/l2/control/l2_stats.c b/modules/l2/control/l2_stats.c new file mode 100644 index 000000000..cef93c172 --- /dev/null +++ b/modules/l2/control/l2_stats.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026 Fabien Dupont + +#include +#include +#include +#include +#include + +#include + +#include +#include + +struct fdb_stats l2_fdb_stats[L2_MAX_BRIDGES][RTE_MAX_LCORE]; + +static struct api_out fdb_stats_get(const void *request, struct api_ctx *) { + const struct gr_l2_fdb_stats_get_req *req = request; + struct gr_l2_fdb_stats *resp; + + const struct iface *iface = iface_from_id(req->bridge_id); + if (iface == NULL || iface->type != GR_IFACE_TYPE_BRIDGE) + return api_out(ENOENT, 0, NULL); + + resp = calloc(1, sizeof(*resp)); + if (resp == NULL) + return api_out(ENOMEM, 0, NULL); + + resp->bridge_id = req->bridge_id; + + for (unsigned lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + struct fdb_stats *fs = fdb_get_stats(req->bridge_id, lcore_id); + if (fs == NULL) + continue; + resp->hit += fs->hit; + resp->miss += fs->miss; + resp->flood += fs->flood; + } + + return api_out(0, sizeof(*resp), resp); +} + +static struct api_out fdb_stats_reset(const void *request, struct api_ctx *) { + const struct gr_l2_fdb_stats_reset_req *req = request; + + const struct iface *iface = iface_from_id(req->bridge_id); + if (iface == NULL || iface->type != GR_IFACE_TYPE_BRIDGE) + return api_out(ENOENT, 0, NULL); + + if (req->bridge_id < L2_MAX_BRIDGES) + memset(l2_fdb_stats[req->bridge_id], 0, sizeof(l2_fdb_stats[0])); + + return api_out(0, 0, NULL); +} + +static struct gr_api_handler fdb_stats_get_handler = { + .name = "fdb stats get", + .request_type = GR_L2_FDB_STATS_GET, + .callback = fdb_stats_get, +}; + +static struct gr_api_handler fdb_stats_reset_handler = { + .name = "fdb stats reset", + .request_type = GR_L2_FDB_STATS_RESET, + 
.callback = fdb_stats_reset, +}; + +RTE_INIT(fdb_stats_constructor) { + gr_register_api_handler(&fdb_stats_get_handler); + gr_register_api_handler(&fdb_stats_reset_handler); +} diff --git a/modules/l2/control/meson.build b/modules/l2/control/meson.build index 459546a08..a6801dab7 100644 --- a/modules/l2/control/meson.build +++ b/modules/l2/control/meson.build @@ -5,6 +5,7 @@ src += files( 'bridge.c', 'fdb.c', 'flood.c', + 'l2_stats.c', 'vxlan.c', ) diff --git a/modules/l2/datapath/bridge_input.c b/modules/l2/datapath/bridge_input.c index ccb9a1f28..af90731be 100644 --- a/modules/l2/datapath/bridge_input.c +++ b/modules/l2/datapath/bridge_input.c @@ -10,6 +10,7 @@ #include #include +#include enum edges { OUTPUT = 0, @@ -38,9 +39,11 @@ static uint16_t bridge_input_process( const struct gr_fdb_entry *fdb; struct iface_mbuf_data *d; struct rte_ether_hdr *eth; + struct fdb_stats *stats; struct rte_mbuf *m; ip4_addr_t vtep; rte_edge_t edge; + unsigned lcore_id = rte_lcore_id(); for (uint16_t i = 0; i < nb_objs; i++) { m = objs[i]; @@ -60,6 +63,7 @@ static uint16_t bridge_input_process( goto next; } br = iface_info_bridge(bridge); + stats = fdb_get_stats(bridge->id, lcore_id); if (rte_is_unicast_ether_addr(&eth->src_addr) && !(br->flags & GR_BRIDGE_F_NO_LEARN)) { @@ -71,6 +75,8 @@ static uint16_t bridge_input_process( fdb = fdb_lookup(bridge->id, &eth->dst_addr, d->vlan_id); if (fdb == NULL) { // Unknown unicast + if (stats) + stats->miss++; edge = FLOOD; goto next; } @@ -85,6 +91,8 @@ static uint16_t bridge_input_process( goto next; } // Direct output to learned interface + if (stats) + stats->hit++; d->iface = iface; d->vtep = fdb->vtep; @@ -95,6 +103,8 @@ static uint16_t bridge_input_process( } } else { // Broadcast, multicast + if (stats) + stats->flood++; edge = FLOOD; } next: diff --git a/smoke/bridge_test.sh b/smoke/bridge_test.sh index 31089fdab..6be375d09 100755 --- a/smoke/bridge_test.sh +++ b/smoke/bridge_test.sh @@ -48,6 +48,11 @@ if grcli fdb show iface p1 | grep
.; then fail "fdb still contains entries for removed interface" fi +# verify FDB stats show forwarding counters +grcli stats fdb show br0 | grep -q 'fdb_hit:' || fail "stats missing fdb_hit field" +grcli stats fdb reset br0 +grcli stats fdb show br0 | grep -q 'fdb_hit: 0' || fail "stats not reset" + grcli interface del br0 if grcli fdb show | grep .; then fail "fdb still contains entries"