From b8cf4866b2f8a7f59583693d5c6b00be43cc2739 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 14 Apr 2026 10:41:36 +0800 Subject: [PATCH 1/3] feat: add bridge TAP network backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New network/bridge package: creates TAP devices and adds them directly to an existing Linux bridge as ports. An external DHCP server on the bridge serves VM IPs. Data path: VM → TAP → bridge → eth0 (3 hops, vs CNI's 5 hops). No veth, no TC redirect, no netns. TAP is a direct bridge port. Designed to work with cocoon-net's cni0 bridge or any pre-existing bridge with DHCP + routing already configured. Usage: cocoon vm run --bridge cni0 --name vm1 IMAGE The --bridge flag is mutually exclusive with --network (CNI). CH and FC launchProcess skip EnterNetns when NetnsPath is empty. --- cmd/core/helpers.go | 10 ++ cmd/vm/commands.go | 3 +- cmd/vm/run.go | 19 ++- hypervisor/cloudhypervisor/start.go | 6 +- hypervisor/firecracker/start.go | 2 +- network/bridge/bridge_linux.go | 206 ++++++++++++++++++++++++++++ network/bridge/bridge_other.go | 48 +++++++ 7 files changed, 285 insertions(+), 9 deletions(-) create mode 100644 network/bridge/bridge_linux.go create mode 100644 network/bridge/bridge_other.go diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index d853f389..c09fb854 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -21,6 +21,7 @@ import ( "github.com/cocoonstack/cocoon/images/cloudimg" "github.com/cocoonstack/cocoon/images/oci" "github.com/cocoonstack/cocoon/network" + bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/network/cni" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/snapshot/localfile" @@ -172,6 +173,15 @@ func InitNetwork(conf *config.Config) (network.Network, error) { return p, nil } +// InitBridgeNetwork creates a TAP-on-bridge network provider. 
+func InitBridgeNetwork(conf *config.Config, bridgeDev string) (network.Network, error) { + p, err := bridgenet.New(conf, bridgeDev) + if err != nil { + return nil, fmt.Errorf("init bridge network: %w", err) + } + return p, nil +} + // InitSnapshot initializes the snapshot backend. func InitSnapshot(conf *config.Config) (snapshot.Snapshot, error) { s, err := localfile.New(conf) diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index 29dee3bf..972e5d8a 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -155,7 +155,8 @@ func addVMFlags(cmd *cobra.Command) { cmd.Flags().String("memory", "1G", "memory size") //nolint:mnd cmd.Flags().String("storage", "10G", "COW disk size") //nolint:mnd cmd.Flags().Int("nics", 1, "number of network interfaces (0 = no network); multiple NICs with auto IP config only works for cloudimg; OCI images auto-configure only the last NIC, others require manual setup inside the guest") - cmd.Flags().String("network", "", "CNI conflist name (empty = default)") + cmd.Flags().String("network", "", "CNI conflist name (empty = default); mutually exclusive with --bridge") + cmd.Flags().String("bridge", "", "use TAP-on-bridge instead of CNI (value is bridge device, e.g. 
cni0); VM gets IP via DHCP from the bridge") cmd.Flags().Bool("windows", false, "Windows guest (UEFI boot, kvm_hyperv=on, no cidata)") } diff --git a/cmd/vm/run.go b/cmd/vm/run.go index 045c6d42..73566a21 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -235,7 +235,8 @@ func (h Handler) prepareClone(ctx context.Context, cmd *cobra.Command, conf *con return nil, "", nil, nil, fmt.Errorf("--nics %d below snapshot minimum %d", nics, cfg.NICs) } - netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker)) + bridgeDev, _ := cmd.Flags().GetString("bridge") + netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker), bridgeDev) if err != nil { return nil, "", nil, nil, err } @@ -288,6 +289,10 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t if conf.UseFirecracker && vmCfg.Windows { return nil, nil, nil, fmt.Errorf("--fc and --windows are mutually exclusive: Firecracker does not support Windows guests") } + bridgeDev, _ := cmd.Flags().GetString("bridge") + if bridgeDev != "" && vmCfg.Network != "" { + return nil, nil, nil, fmt.Errorf("--bridge and --network are mutually exclusive") + } backends, hyper, err := cmdcore.InitBackends(ctx, conf) if err != nil { @@ -312,7 +317,7 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t } nics, _ := cmd.Flags().GetInt("nics") - netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker)) + netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker), bridgeDev) if err != nil { return nil, nil, nil, err } @@ -334,11 +339,17 @@ func tapQueues(cpu int, useFC bool) int { return cpu } -func initNetwork(ctx context.Context, conf *config.Config, vmID string, nics int, vmCfg *types.VMConfig, queues int) (network.Network, 
[]*types.NetworkConfig, error) { +func initNetwork(ctx context.Context, conf *config.Config, vmID string, nics int, vmCfg *types.VMConfig, queues int, bridgeDev string) (network.Network, []*types.NetworkConfig, error) { if nics <= 0 { return nil, nil, nil } - netProvider, err := cmdcore.InitNetwork(conf) + var netProvider network.Network + var err error + if bridgeDev != "" { + netProvider, err = cmdcore.InitBridgeNetwork(conf, bridgeDev) + } else { + netProvider, err = cmdcore.InitNetwork(conf) + } if err != nil { return nil, nil, fmt.Errorf("init network: %w", err) } diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index e1a80c91..970bd37f 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -90,9 +90,9 @@ func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VM cmd.Stderr = logFile } - // If the VM has network, CH must be launched inside the VM's netns - // so it can access the tap device. We setns before fork and restore after. - if withNetwork { + // CNI mode: TAP is inside a per-VM netns, switch before fork. + // Bridge mode: TAP is in host netns, no EnterNetns needed. 
+ if withNetwork && rec.NetworkConfigs[0].NetnsPath != "" { restore, enterErr := hypervisor.EnterNetns(rec.NetworkConfigs[0].NetnsPath) if enterErr != nil { return 0, fmt.Errorf("enter netns: %w", enterErr) diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 319c6a4c..b8bff36e 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -177,7 +177,7 @@ func (fc *Firecracker) launchProcess(ctx context.Context, rec *hypervisor.VMReco fcCmd.Stdin = slave fcCmd.Stdout = slave - if withNetwork { + if withNetwork && rec.NetworkConfigs[0].NetnsPath != "" { restore, enterErr := hypervisor.EnterNetns(rec.NetworkConfigs[0].NetnsPath) if enterErr != nil { _ = master.Close() diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go new file mode 100644 index 00000000..b6ad0697 --- /dev/null +++ b/network/bridge/bridge_linux.go @@ -0,0 +1,206 @@ +//go:build linux + +package bridge + +import ( + "context" + "crypto/rand" + "fmt" + "net" + + "github.com/projecteru2/core/log" + "github.com/vishvananda/netlink" + + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/types" +) + +// compile-time interface check. +var _ network.Network = (*Bridge)(nil) + +const ( + typ = "bridge" + defaultQueueSize = 256 +) + +// Bridge implements network.Network by creating TAP devices and adding +// them directly to an existing Linux bridge. An external DHCP server +// on the bridge (e.g. dnsmasq) serves VM IPs. No veth, no TC, no +// netns — just TAP-on-bridge, the simplest possible VM networking. +// +// This backend is designed to work with cocoon-net's cni0 bridge or +// any pre-existing bridge that has DHCP + routing already set up. +type Bridge struct { + conf *config.Config + bridgeDev string + bridgeIdx int +} + +// New creates a Bridge network provider. The bridge device must exist. 
+func New(conf *config.Config, bridgeDev string) (*Bridge, error) { + if conf == nil { + return nil, fmt.Errorf("config is nil") + } + if bridgeDev == "" { + return nil, fmt.Errorf("bridge device name is required") + } + br, err := netlink.LinkByName(bridgeDev) + if err != nil { + return nil, fmt.Errorf("bridge %s: %w", bridgeDev, err) + } + if br.Type() != "bridge" { + return nil, fmt.Errorf("%s is not a bridge (type: %s)", bridgeDev, br.Type()) + } + return &Bridge{ + conf: conf, + bridgeDev: bridgeDev, + bridgeIdx: br.Attrs().Index, + }, nil +} + +// Type returns the provider identifier. +func (b *Bridge) Type() string { return typ } + +// Verify checks whether the TAP for a VM exists. +func (b *Bridge) Verify(_ context.Context, vmID string) error { + if _, err := netlink.LinkByName(tapName(vmID, 0)); err != nil { + return fmt.Errorf("tap %s: %w", tapName(vmID, 0), err) + } + return nil +} + +// Config creates TAP devices and adds them to the bridge. +func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig, existing ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { + logger := log.WithFunc("bridge.Config") + + br, err := netlink.LinkByIndex(b.bridgeIdx) + if err != nil { + return nil, fmt.Errorf("find bridge: %w", err) + } + + var configs []*types.NetworkConfig + for i := range numNICs { + name := tapName(vmID, i) + + var mac string + if i < len(existing) && existing[i] != nil { + mac = existing[i].Mac + } else { + mac = generateMAC() + } + + queues := netNumQueues(vmCfg.CPU) + if err := createTAP(name, queues); err != nil { + return nil, fmt.Errorf("create tap %s: %w", name, err) + } + + tap, err := netlink.LinkByName(name) + if err != nil { + return nil, fmt.Errorf("find tap %s: %w", name, err) + } + + // Add TAP to bridge — this is the only wiring needed. + if err := netlink.LinkSetMaster(tap, br); err != nil { + return nil, fmt.Errorf("add %s to %s: %w", name, b.bridgeDev, err) + } + + // Sync MTU from bridge. 
+ if mtu := br.Attrs().MTU; mtu > 0 { + _ = netlink.LinkSetMTU(tap, mtu) + } + + if err := netlink.LinkSetUp(tap); err != nil { + return nil, fmt.Errorf("set %s up: %w", name, err) + } + + configs = append(configs, &types.NetworkConfig{ + Tap: name, + Mac: mac, + NumQueues: queues, + QueueSize: defaultQueueSize, + // NetnsPath: empty — TAP is in host netns. + // Network: nil — IP comes from DHCP on the bridge. + }) + logger.Debugf(ctx, "NIC %d: tap=%s mac=%s bridge=%s", i, name, mac, b.bridgeDev) + } + return configs, nil +} + +// Delete removes TAP devices for the given VMs. +func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { + var succeeded []string + for _, vmID := range vmIDs { + for i := range 8 { // max 8 NICs per VM + name := tapName(vmID, i) + l, err := netlink.LinkByName(name) + if err != nil { + break // no more TAPs for this VM + } + _ = netlink.LinkDel(l) + } + succeeded = append(succeeded, vmID) + } + return succeeded, nil +} + +// Inspect is not supported — bridge mode has no persistent records. +func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { + return nil, nil +} + +// List is not supported — bridge mode has no persistent records. +func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { + return nil, nil +} + +// RegisterGC is a no-op. 
+func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} + +// --- helpers --- + +func createTAP(name string, queues int) error { + flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI + if queues <= 1 { + queues = 1 + flags |= netlink.TUNTAP_ONE_QUEUE + } else { + flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS + } + tap := &netlink.Tuntap{ + LinkAttrs: netlink.LinkAttrs{Name: name}, + Mode: netlink.TUNTAP_MODE_TAP, + Queues: queues, + Flags: flags, + } + if err := netlink.LinkAdd(tap); err != nil { + return err + } + for _, fd := range tap.Fds { + _ = fd.Close() + } + return nil +} + +func tapName(vmID string, nic int) string { + const pfxLen = 8 + if len(vmID) > pfxLen { + vmID = vmID[:pfxLen] + } + return fmt.Sprintf("bt%s-%d", vmID, nic) +} + +func netNumQueues(cpu int) int { + if cpu <= 1 { + return 2 //nolint:mnd + } + return cpu * 2 //nolint:mnd +} + +func generateMAC() string { + buf := make([]byte, 6) //nolint:mnd + _, _ = rand.Read(buf) + buf[0] = (buf[0] | 0x02) & 0xfe + return net.HardwareAddr(buf).String() +} diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go new file mode 100644 index 00000000..2a787864 --- /dev/null +++ b/network/bridge/bridge_other.go @@ -0,0 +1,48 @@ +//go:build !linux + +package bridge + +import ( + "context" + "fmt" + "runtime" + + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/types" +) + +// Bridge is a placeholder for non-Linux. +type Bridge struct{} + +// New returns an error on non-Linux. +func New(_ *config.Config, _ string) (*Bridge, error) { + return nil, fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) +} + +// Type returns the provider identifier. +func (b *Bridge) Type() string { return "bridge" } + +// Verify is not supported. +func (b *Bridge) Verify(_ context.Context, _ string) error { return errUnsupported } + +// Config is not supported. 
+func (b *Bridge) Config(_ context.Context, _ string, _ int, _ *types.VMConfig, _ ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { + return nil, errUnsupported +} + +// Delete is not supported. +func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } + +// Inspect is not supported. +func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { + return nil, errUnsupported +} + +// List is not supported. +func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { return nil, errUnsupported } + +// RegisterGC is a no-op. +func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} + +var errUnsupported = fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) From a84fdda76f6c7519d5af3fd2da02392ef0aa70d9 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 14 Apr 2026 12:30:40 +0800 Subject: [PATCH 2/3] bridge networking impl with DHCP --- cmd/vm/commands.go | 1 + cmd/vm/lifecycle.go | 3 +++ network/bridge/bridge_linux.go | 26 +++++++++++++++++++------- network/bridge/bridge_other.go | 3 +++ network/cni/create.go | 2 +- network/cni/create_linux.go | 10 ++++++---- 6 files changed, 33 insertions(+), 12 deletions(-) diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index 972e5d8a..e5b7112b 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -167,4 +167,5 @@ func addCloneFlags(cmd *cobra.Command) { cmd.Flags().String("storage", "", "COW disk size (empty = inherit from snapshot)") cmd.Flags().Int("nics", 0, "number of NICs (0 = inherit from snapshot)") cmd.Flags().String("network", "", "CNI conflist name (empty = inherit from source VM)") + cmd.Flags().String("bridge", "", "use TAP-on-bridge instead of CNI (value is bridge device, e.g. 
cni0)") } diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 8e495252..7b092dd0 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -13,6 +13,7 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/console" "github.com/cocoonstack/cocoon/hypervisor" + bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/network" "github.com/cocoonstack/cocoon/types" ) @@ -191,6 +192,8 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return fmt.Errorf("vm(s) deleted but network cleanup failed: %w", delErr) } } + // Also clean up bridge TAPs (no-op if none exist). + bridgenet.CleanupTAPs(allDeleted) } if lastErr != nil { diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go index b6ad0697..aca13d5c 100644 --- a/network/bridge/bridge_linux.go +++ b/network/bridge/bridge_linux.go @@ -130,7 +130,14 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty // Delete removes TAP devices for the given VMs. func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { - var succeeded []string + return CleanupTAPs(vmIDs), nil +} + +// CleanupTAPs removes bridge TAP devices for the given VM IDs. +// It does not require a Bridge instance and is safe to call +// even when no bridge TAPs exist (no-op per VM). +func CleanupTAPs(vmIDs []string) []string { + var cleaned []string for _, vmID := range vmIDs { for i := range 8 { // max 8 NICs per VM name := tapName(vmID, i) @@ -140,9 +147,9 @@ func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { } _ = netlink.LinkDel(l) } - succeeded = append(succeeded, vmID) + cleaned = append(cleaned, vmID) } - return succeeded, nil + return cleaned } // Inspect is not supported — bridge mode has no persistent records. 
@@ -160,18 +167,23 @@ func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} // --- helpers --- -func createTAP(name string, queues int) error { +func createTAP(name string, numQueues int) error { + // CH uses queue pairs (TX+RX): queue_pairs = num_queues / 2. + // Multi-queue requires queue_pairs > 1, i.e. num_queues > 2. + // The TAP's IFF_MULTI_QUEUE flag must match CH's expectation, + // otherwise CH's sysfs pre-flight check rejects the device. + queuePairs := max(1, numQueues/2) //nolint:mnd flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI - if queues <= 1 { - queues = 1 + if queuePairs <= 1 { flags |= netlink.TUNTAP_ONE_QUEUE + queuePairs = 1 } else { flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS } tap := &netlink.Tuntap{ LinkAttrs: netlink.LinkAttrs{Name: name}, Mode: netlink.TUNTAP_MODE_TAP, - Queues: queues, + Queues: queuePairs, Flags: flags, } if err := netlink.LinkAdd(tap); err != nil { diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go index 2a787864..dc9bbdfb 100644 --- a/network/bridge/bridge_other.go +++ b/network/bridge/bridge_other.go @@ -34,6 +34,9 @@ func (b *Bridge) Config(_ context.Context, _ string, _ int, _ *types.VMConfig, _ // Delete is not supported. func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } +// CleanupTAPs is a no-op on non-Linux. +func CleanupTAPs(_ []string) []string { return nil } + // Inspect is not supported. 
func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { return nil, errUnsupported diff --git a/network/cni/create.go b/network/cni/create.go index 8c60af93..e7e1baf3 100644 --- a/network/cni/create.go +++ b/network/cni/create.go @@ -91,7 +91,7 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types if i < len(existing) && existing[i] != nil { overrideMAC = existing[i].Mac } - mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, vmCfg.CPU, overrideMAC) + mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, netNumQueues(vmCfg.CPU), overrideMAC) if setupErr != nil { return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr) } diff --git a/network/cni/create_linux.go b/network/cni/create_linux.go index 5c880672..c7f19dde 100644 --- a/network/cni/create_linux.go +++ b/network/cni/create_linux.go @@ -95,18 +95,20 @@ func tcRedirectInNS(ifName, tapName string, queues int, overrideMAC string) (str } } - // Match hypervisor attach flags and queue layout. + // CH uses queue pairs (TX+RX): queue_pairs = num_queues / 2. + // Multi-queue requires queue_pairs > 1, i.e. num_queues > 2. 
+ queuePairs := max(1, queues/2) //nolint:mnd flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI - if queues <= 1 { - queues = 1 + if queuePairs <= 1 { flags |= netlink.TUNTAP_ONE_QUEUE + queuePairs = 1 } else { flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS } tap := &netlink.Tuntap{ LinkAttrs: netlink.LinkAttrs{Name: tapName}, Mode: netlink.TUNTAP_MODE_TAP, - Queues: queues, + Queues: queuePairs, Flags: flags, } if addErr := netlink.LinkAdd(tap); addErr != nil { From 64049112ce920b6148aacff8094799761bbbd249 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 14 Apr 2026 14:28:40 +0800 Subject: [PATCH 3/3] refactor bridge codebase --- cmd/others/handler.go | 2 + cmd/vm/lifecycle.go | 61 +++++++++++++++--- network/bridge/bridge_linux.go | 50 +++++++-------- network/bridge/bridge_other.go | 6 +- network/bridge/gc_linux.go | 112 +++++++++++++++++++++++++++++++++ network/bridge/gc_other.go | 32 ++++++++++ network/cni/create.go | 19 ++---- network/cni/create_linux.go | 1 - network/utils.go | 21 +++++++ types/network.go | 9 +++ 10 files changed, 256 insertions(+), 57 deletions(-) create mode 100644 network/bridge/gc_linux.go create mode 100644 network/bridge/gc_other.go create mode 100644 network/utils.go diff --git a/cmd/others/handler.go b/cmd/others/handler.go index d687be26..6d0e570a 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -8,6 +8,7 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/version" ) @@ -48,6 +49,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { hyper.RegisterGC(o) } netProvider.RegisterGC(o) + gc.Register(o, bridge.GCModule(conf.RootDir)) snapBackend.RegisterGC(o) if err := o.Run(ctx); err != nil { return err diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 7b092dd0..a48bc580 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -11,10 +11,11 @@ import ( 
"github.com/spf13/cobra" cmdcore "github.com/cocoonstack/cocoon/cmd/core" + "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/console" "github.com/cocoonstack/cocoon/hypervisor" - bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/network" + bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/types" ) @@ -35,10 +36,8 @@ func (h Handler) Start(cmd *cobra.Command, args []string) error { } // Recover network for all backends before starting. - if netProvider, netErr := cmdcore.InitNetwork(conf); netErr == nil { - for hyper, refs := range routed { - h.recoverNetwork(ctx, hyper, netProvider, refs) - } + for hyper, refs := range routed { + h.recoverNetwork(ctx, conf, hyper, refs) } return batchRoutedCmd(ctx, "start", "started", routed, func(hyper hypervisor.Hypervisor, refs []string) ([]string, error) { @@ -205,23 +204,67 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return nil } -func (h Handler) recoverNetwork(ctx context.Context, hyper hypervisor.Hypervisor, net network.Network, refs []string) { +func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper hypervisor.Hypervisor, refs []string) { logger := log.WithFunc("cmd.recoverNetwork") + + // Lazy-init CNI provider (may fail if not configured — OK for bridge-only setups). + var cniProvider network.Network + if p, err := cmdcore.InitNetwork(conf); err == nil { + cniProvider = p + } + + // Cache bridge providers by device name to avoid redundant netlink lookups. 
+ bridgeProviders := map[string]network.Network{} + for _, ref := range refs { vm, err := hyper.Inspect(ctx, ref) if err != nil || vm == nil || len(vm.NetworkConfigs) == 0 { continue } - if net.Verify(ctx, vm.ID) == nil { + + netProvider, provErr := providerForVM(conf, cniProvider, bridgeProviders, vm.NetworkConfigs) + if provErr != nil { + logger.Warnf(ctx, "skip recovery for VM %s: %v", vm.ID, provErr) + continue + } + if netProvider.Verify(ctx, vm.ID) == nil { continue } - logger.Warnf(ctx, "netns missing for VM %s, recovering network", vm.ID) - if _, recoverErr := net.Config(ctx, vm.ID, len(vm.NetworkConfigs), &vm.Config, vm.NetworkConfigs...); recoverErr != nil { + logger.Warnf(ctx, "network missing for VM %s, recovering", vm.ID) + if _, recoverErr := netProvider.Config(ctx, vm.ID, len(vm.NetworkConfigs), &vm.Config, vm.NetworkConfigs...); recoverErr != nil { logger.Warnf(ctx, "recover network for VM %s: %v (start will fail)", vm.ID, recoverErr) } } } +// providerForVM selects the correct network provider based on persisted NetworkConfig. +func providerForVM(conf *config.Config, cniProvider network.Network, bridgeCache map[string]network.Network, configs []*types.NetworkConfig) (network.Network, error) { + if len(configs) == 0 { + return nil, fmt.Errorf("no network configs") + } + // All NICs on a VM share the same backend. + cfg := configs[0] + if cfg.Backend == "bridge" { + if cfg.BridgeDev == "" { + return nil, fmt.Errorf("bridge backend but no bridge device persisted") + } + if cached, ok := bridgeCache[cfg.BridgeDev]; ok { + return cached, nil + } + p, err := cmdcore.InitBridgeNetwork(conf, cfg.BridgeDev) + if err != nil { + return nil, err + } + bridgeCache[cfg.BridgeDev] = p + return p, nil + } + // "cni" or empty (backward compat). + if cniProvider == nil { + return nil, fmt.Errorf("CNI provider not available") + } + return cniProvider, nil +} + // batchRoutedCmd runs a batch operation across multiple backends. 
func batchRoutedCmd(ctx context.Context, name, pastTense string, routed map[hypervisor.Hypervisor][]string, fn func(hypervisor.Hypervisor, []string) ([]string, error)) error { logger := log.WithFunc("cmd." + name) diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go index aca13d5c..5d4e6a01 100644 --- a/network/bridge/bridge_linux.go +++ b/network/bridge/bridge_linux.go @@ -91,7 +91,7 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty mac = generateMAC() } - queues := netNumQueues(vmCfg.CPU) + queues := network.NetNumQueues(vmCfg.CPU) if err := createTAP(name, queues); err != nil { return nil, fmt.Errorf("create tap %s: %w", name, err) } @@ -103,6 +103,7 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty // Add TAP to bridge — this is the only wiring needed. if err := netlink.LinkSetMaster(tap, br); err != nil { + _ = netlink.LinkDel(tap) return nil, fmt.Errorf("add %s to %s: %w", name, b.bridgeDev, err) } @@ -112,6 +113,7 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty } if err := netlink.LinkSetUp(tap); err != nil { + _ = netlink.LinkDel(tap) return nil, fmt.Errorf("set %s up: %w", name, err) } @@ -120,6 +122,8 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty Mac: mac, NumQueues: queues, QueueSize: defaultQueueSize, + Backend: typ, + BridgeDev: b.bridgeDev, // NetnsPath: empty — TAP is in host netns. // Network: nil — IP comes from DHCP on the bridge. }) @@ -133,6 +137,21 @@ func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { return CleanupTAPs(vmIDs), nil } +// Inspect is not supported — bridge mode has no persistent records. +func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { + return nil, nil +} + +// List is not supported — bridge mode has no persistent records. 
+func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { + return nil, nil +} + +// RegisterGC registers the bridge GC module that reclaims orphan bt* TAP devices. +func (b *Bridge) RegisterGC(orch *gc.Orchestrator) { + gc.Register(orch, GCModule(b.conf.RootDir)) +} + // CleanupTAPs removes bridge TAP devices for the given VM IDs. // It does not require a Bridge instance and is safe to call // even when no bridge TAPs exist (no-op per VM). @@ -152,21 +171,6 @@ func CleanupTAPs(vmIDs []string) []string { return cleaned } -// Inspect is not supported — bridge mode has no persistent records. -func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { - return nil, nil -} - -// List is not supported — bridge mode has no persistent records. -func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { - return nil, nil -} - -// RegisterGC is a no-op. -func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} - -// --- helpers --- - func createTAP(name string, numQueues int) error { // CH uses queue pairs (TX+RX): queue_pairs = num_queues / 2. // Multi-queue requires queue_pairs > 1, i.e. num_queues > 2. 
@@ -176,7 +180,6 @@ func createTAP(name string, numQueues int) error { flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI if queuePairs <= 1 { flags |= netlink.TUNTAP_ONE_QUEUE - queuePairs = 1 } else { flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS } @@ -196,18 +199,7 @@ func createTAP(name string, numQueues int) error { } func tapName(vmID string, nic int) string { - const pfxLen = 8 - if len(vmID) > pfxLen { - vmID = vmID[:pfxLen] - } - return fmt.Sprintf("bt%s-%d", vmID, nic) -} - -func netNumQueues(cpu int) int { - if cpu <= 1 { - return 2 //nolint:mnd - } - return cpu * 2 //nolint:mnd + return fmt.Sprintf("%s%s-%d", tapPrefix, network.VMIDPrefix(vmID), nic) } func generateMAC() string { diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go index dc9bbdfb..79f2d214 100644 --- a/network/bridge/bridge_other.go +++ b/network/bridge/bridge_other.go @@ -34,9 +34,6 @@ func (b *Bridge) Config(_ context.Context, _ string, _ int, _ *types.VMConfig, _ // Delete is not supported. func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } -// CleanupTAPs is a no-op on non-Linux. -func CleanupTAPs(_ []string) []string { return nil } - // Inspect is not supported. func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { return nil, errUnsupported @@ -48,4 +45,7 @@ func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { return nil, // RegisterGC is a no-op. func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} +// CleanupTAPs is a no-op on non-Linux. 
+func CleanupTAPs(_ []string) []string { return nil } + var errUnsupported = fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) diff --git a/network/bridge/gc_linux.go b/network/bridge/gc_linux.go new file mode 100644 index 00000000..f095c9a4 --- /dev/null +++ b/network/bridge/gc_linux.go @@ -0,0 +1,112 @@ +//go:build linux + +package bridge + +import ( + "context" + "path/filepath" + "slices" + "strings" + + "github.com/projecteru2/core/log" + "github.com/vishvananda/netlink" + + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/utils" +) + +const tapPrefix = "bt" + +// bridgeSnapshot holds the set of VM ID prefixes that own bt* TAP devices. +type bridgeSnapshot struct { + prefixes map[string]struct{} +} + +// GCModule returns a GC module that reclaims orphan bt* TAP devices. +// It does not require a Bridge instance — only rootDir for the lock file. +func GCModule(rootDir string) gc.Module[bridgeSnapshot] { + lockPath := filepath.Join(rootDir, "bridge", "gc.lock") + _ = utils.EnsureDirs(filepath.Dir(lockPath)) + + return gc.Module[bridgeSnapshot]{ + Name: typ, + Locker: flock.New(lockPath), + ReadDB: func(_ context.Context) (bridgeSnapshot, error) { + snap := bridgeSnapshot{prefixes: make(map[string]struct{})} + + links, err := netlink.LinkList() + if err != nil { + return snap, err + } + for _, l := range links { + if prefix, ok := parseTAPName(l.Attrs().Name); ok { + snap.prefixes[prefix] = struct{}{} + } + } + return snap, nil + }, + Resolve: func(snap bridgeSnapshot, others map[string]any) []string { + active := gc.Collect(others, gc.VMIDs) + + // Build set of 8-char prefixes from active VM IDs. 
+ activePrefixes := make(map[string]struct{}, len(active)) + for id := range active { + activePrefixes[network.VMIDPrefix(id)] = struct{}{} + } + + var orphans []string + for prefix := range snap.prefixes { + if _, ok := activePrefixes[prefix]; !ok { + orphans = append(orphans, prefix) + } + } + slices.Sort(orphans) + return orphans + }, + Collect: func(ctx context.Context, prefixes []string) error { + logger := log.WithFunc("bridge.gc.Collect") + + orphanSet := make(map[string]struct{}, len(prefixes)) + for _, p := range prefixes { + orphanSet[p] = struct{}{} + } + + links, err := netlink.LinkList() + if err != nil { + return err + } + for _, l := range links { + name := l.Attrs().Name + prefix, ok := parseTAPName(name) + if !ok { + continue + } + if _, orphan := orphanSet[prefix]; !orphan { + continue + } + if err := netlink.LinkDel(l); err != nil { + logger.Warnf(ctx, "delete orphan TAP %s: %v", name, err) + } else { + logger.Infof(ctx, "collected orphan TAP %s", name) + } + } + return nil + }, + } +} + +// parseTAPName extracts the vmID prefix from a bridge TAP name like "bt<prefix>-<nic>". +// Returns the prefix and true, or ("", false) if the name doesn't match. +func parseTAPName(name string) (string, bool) { + rest, ok := strings.CutPrefix(name, tapPrefix) + if !ok { + return "", false + } + idx := strings.LastIndex(rest, "-") + if idx <= 0 { + return "", false + } + return rest[:idx], true +} diff --git a/network/bridge/gc_other.go b/network/bridge/gc_other.go new file mode 100644 index 00000000..eb9ea134 --- /dev/null +++ b/network/bridge/gc_other.go @@ -0,0 +1,32 @@ +//go:build !linux + +package bridge + +import ( + "context" + + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/lock/flock" +) + +// bridgeSnapshot is a placeholder for non-Linux. +type bridgeSnapshot struct{} + +// GCModule returns a no-op GC module on non-Linux — bridge TAPs don't exist. 
+func GCModule(rootDir string) gc.Module[bridgeSnapshot] { + return gc.Module[bridgeSnapshot]{ + Name: "bridge", + // /dev/null is world-writable and supports flock on all Unix platforms, + // so TryLock always succeeds without creating a real lock file. + Locker: flock.New("/dev/null"), + ReadDB: func(_ context.Context) (bridgeSnapshot, error) { + return bridgeSnapshot{}, nil + }, + Resolve: func(_ bridgeSnapshot, _ map[string]any) []string { + return nil + }, + Collect: func(_ context.Context, _ []string) error { + return nil + }, + } +} diff --git a/network/cni/create.go b/network/cni/create.go index e7e1baf3..42e2460e 100644 --- a/network/cni/create.go +++ b/network/cni/create.go @@ -91,7 +91,7 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types if i < len(existing) && existing[i] != nil { overrideMAC = existing[i].Mac } - mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, netNumQueues(vmCfg.CPU), overrideMAC) + mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, network.NetNumQueues(vmCfg.CPU), overrideMAC) if setupErr != nil { return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr) } @@ -99,8 +99,9 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types configs = append(configs, &types.NetworkConfig{ Tap: tapName, Mac: mac, - NumQueues: netNumQueues(vmCfg.CPU), + NumQueues: network.NetNumQueues(vmCfg.CPU), QueueSize: defaultQueueSize, + Backend: typ, NetnsPath: nsPath, Network: netInfo, }) @@ -142,19 +143,7 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types } func tapNameForVM(vmID string, nic int) string { - const vmIDPrefixLen = 8 - if len(vmID) > vmIDPrefixLen { - vmID = vmID[:vmIDPrefixLen] - } - return fmt.Sprintf("tap%s-%d", vmID, nic) -} - -// netNumQueues returns the virtio-net queue count for cpu. 
-func netNumQueues(cpu int) int { - if cpu <= 1 { - return 2 //nolint:mnd - } - return cpu * 2 //nolint:mnd + return fmt.Sprintf("tap%s-%d", network.VMIDPrefix(vmID), nic) } // extractNetworkInfo converts a CNI ADD result into types.Network. diff --git a/network/cni/create_linux.go b/network/cni/create_linux.go index c7f19dde..06a5b30f 100644 --- a/network/cni/create_linux.go +++ b/network/cni/create_linux.go @@ -101,7 +101,6 @@ func tcRedirectInNS(ifName, tapName string, queues int, overrideMAC string) (str flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI if queuePairs <= 1 { flags |= netlink.TUNTAP_ONE_QUEUE - queuePairs = 1 } else { flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS } diff --git a/network/utils.go b/network/utils.go new file mode 100644 index 00000000..0bf16bd3 --- /dev/null +++ b/network/utils.go @@ -0,0 +1,21 @@ +package network + +const vmIDPrefixLen = 8 + +// NetNumQueues returns the virtio-net queue count for the given CPU count. +// CH uses queue pairs (TX+RX), so the result is always even (≥ 2). +func NetNumQueues(cpu int) int { + if cpu <= 1 { + return 2 //nolint:mnd + } + return cpu * 2 //nolint:mnd +} + +// VMIDPrefix returns the first 8 characters of a VM ID, matching the +// truncation used by both bridge and CNI TAP device naming. +func VMIDPrefix(vmID string) string { + if len(vmID) > vmIDPrefixLen { + return vmID[:vmIDPrefixLen] + } + return vmID +} diff --git a/types/network.go b/types/network.go index bc497511..9fc5a9c5 100644 --- a/types/network.go +++ b/types/network.go @@ -7,6 +7,15 @@ type NetworkConfig struct { NumQueues int `json:"num_queues"` // Virtio queue count (= CPU * 2 for multi-queue). QueueSize int `json:"queue_size"` + // Backend identifies the network provider that created this NIC ("cni" or "bridge"). + // Used to select the correct provider during network recovery on VM start. + // Empty means "cni" for backward compatibility with existing VM records. 
+ Backend string `json:"backend,omitempty"` + + // BridgeDev is the Linux bridge device name (e.g. "cni0", "br0"). + // Set only when Backend is "bridge"; required for recovery and GC. + BridgeDev string `json:"bridge_dev,omitempty"` + // NetnsPath is the network namespace path where the tap device lives. // Set by the network plugin at Config time; read by the hypervisor at Start time. // Empty when the network backend does not use network namespaces (e.g. macOS vmnet).