diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index d853f389..c09fb854 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -21,6 +21,7 @@ import ( "github.com/cocoonstack/cocoon/images/cloudimg" "github.com/cocoonstack/cocoon/images/oci" "github.com/cocoonstack/cocoon/network" + bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/network/cni" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/snapshot/localfile" @@ -172,6 +173,15 @@ func InitNetwork(conf *config.Config) (network.Network, error) { return p, nil } +// InitBridgeNetwork creates a TAP-on-bridge network provider. +func InitBridgeNetwork(conf *config.Config, bridgeDev string) (network.Network, error) { + p, err := bridgenet.New(conf, bridgeDev) + if err != nil { + return nil, fmt.Errorf("init bridge network: %w", err) + } + return p, nil +} + // InitSnapshot initializes the snapshot backend. func InitSnapshot(conf *config.Config) (snapshot.Snapshot, error) { s, err := localfile.New(conf) diff --git a/cmd/others/handler.go b/cmd/others/handler.go index d687be26..6d0e570a 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -8,6 +8,7 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/version" ) @@ -48,6 +49,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { hyper.RegisterGC(o) } netProvider.RegisterGC(o) + gc.Register(o, bridge.GCModule(conf.RootDir)) snapBackend.RegisterGC(o) if err := o.Run(ctx); err != nil { return err diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index 29dee3bf..e5b7112b 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -155,7 +155,8 @@ func addVMFlags(cmd *cobra.Command) { cmd.Flags().String("memory", "1G", "memory size") //nolint:mnd cmd.Flags().String("storage", "10G", "COW disk size") //nolint:mnd cmd.Flags().Int("nics", 1, "number of 
network interfaces (0 = no network); multiple NICs with auto IP config only works for cloudimg; OCI images auto-configure only the last NIC, others require manual setup inside the guest") - cmd.Flags().String("network", "", "CNI conflist name (empty = default)") + cmd.Flags().String("network", "", "CNI conflist name (empty = default); mutually exclusive with --bridge") + cmd.Flags().String("bridge", "", "use TAP-on-bridge instead of CNI (value is bridge device, e.g. cni0); VM gets IP via DHCP from the bridge") cmd.Flags().Bool("windows", false, "Windows guest (UEFI boot, kvm_hyperv=on, no cidata)") } @@ -166,4 +167,5 @@ func addCloneFlags(cmd *cobra.Command) { cmd.Flags().String("storage", "", "COW disk size (empty = inherit from snapshot)") cmd.Flags().Int("nics", 0, "number of NICs (0 = inherit from snapshot)") cmd.Flags().String("network", "", "CNI conflist name (empty = inherit from source VM)") + cmd.Flags().String("bridge", "", "use TAP-on-bridge instead of CNI (value is bridge device, e.g. cni0)") } diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 8e495252..a48bc580 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -11,9 +11,11 @@ import ( "github.com/spf13/cobra" cmdcore "github.com/cocoonstack/cocoon/cmd/core" + "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/console" "github.com/cocoonstack/cocoon/hypervisor" "github.com/cocoonstack/cocoon/network" + bridgenet "github.com/cocoonstack/cocoon/network/bridge" "github.com/cocoonstack/cocoon/types" ) @@ -34,10 +36,8 @@ func (h Handler) Start(cmd *cobra.Command, args []string) error { } // Recover network for all backends before starting. 
- if netProvider, netErr := cmdcore.InitNetwork(conf); netErr == nil { - for hyper, refs := range routed { - h.recoverNetwork(ctx, hyper, netProvider, refs) - } + for hyper, refs := range routed { + h.recoverNetwork(ctx, conf, hyper, refs) } return batchRoutedCmd(ctx, "start", "started", routed, func(hyper hypervisor.Hypervisor, refs []string) ([]string, error) { @@ -191,6 +191,8 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return fmt.Errorf("vm(s) deleted but network cleanup failed: %w", delErr) } } + // Also clean up bridge TAPs (no-op if none exist). + bridgenet.CleanupTAPs(allDeleted) } if lastErr != nil { @@ -202,23 +204,67 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return nil } -func (h Handler) recoverNetwork(ctx context.Context, hyper hypervisor.Hypervisor, net network.Network, refs []string) { +func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper hypervisor.Hypervisor, refs []string) { logger := log.WithFunc("cmd.recoverNetwork") + + // Lazy-init CNI provider (may fail if not configured — OK for bridge-only setups). + var cniProvider network.Network + if p, err := cmdcore.InitNetwork(conf); err == nil { + cniProvider = p + } + + // Cache bridge providers by device name to avoid redundant netlink lookups. 
+ bridgeProviders := map[string]network.Network{} + for _, ref := range refs { vm, err := hyper.Inspect(ctx, ref) if err != nil || vm == nil || len(vm.NetworkConfigs) == 0 { continue } - if net.Verify(ctx, vm.ID) == nil { + + netProvider, provErr := providerForVM(conf, cniProvider, bridgeProviders, vm.NetworkConfigs) + if provErr != nil { + logger.Warnf(ctx, "skip recovery for VM %s: %v", vm.ID, provErr) + continue + } + if netProvider.Verify(ctx, vm.ID) == nil { continue } - logger.Warnf(ctx, "netns missing for VM %s, recovering network", vm.ID) - if _, recoverErr := net.Config(ctx, vm.ID, len(vm.NetworkConfigs), &vm.Config, vm.NetworkConfigs...); recoverErr != nil { + logger.Warnf(ctx, "network missing for VM %s, recovering", vm.ID) + if _, recoverErr := netProvider.Config(ctx, vm.ID, len(vm.NetworkConfigs), &vm.Config, vm.NetworkConfigs...); recoverErr != nil { logger.Warnf(ctx, "recover network for VM %s: %v (start will fail)", vm.ID, recoverErr) } } } +// providerForVM selects the correct network provider based on persisted NetworkConfig. +func providerForVM(conf *config.Config, cniProvider network.Network, bridgeCache map[string]network.Network, configs []*types.NetworkConfig) (network.Network, error) { + if len(configs) == 0 { + return nil, fmt.Errorf("no network configs") + } + // All NICs on a VM share the same backend. + cfg := configs[0] + if cfg.Backend == "bridge" { + if cfg.BridgeDev == "" { + return nil, fmt.Errorf("bridge backend but no bridge device persisted") + } + if cached, ok := bridgeCache[cfg.BridgeDev]; ok { + return cached, nil + } + p, err := cmdcore.InitBridgeNetwork(conf, cfg.BridgeDev) + if err != nil { + return nil, err + } + bridgeCache[cfg.BridgeDev] = p + return p, nil + } + // "cni" or empty (backward compat). + if cniProvider == nil { + return nil, fmt.Errorf("CNI provider not available") + } + return cniProvider, nil +} + // batchRoutedCmd runs a batch operation across multiple backends. 
func batchRoutedCmd(ctx context.Context, name, pastTense string, routed map[hypervisor.Hypervisor][]string, fn func(hypervisor.Hypervisor, []string) ([]string, error)) error { logger := log.WithFunc("cmd." + name) diff --git a/cmd/vm/run.go b/cmd/vm/run.go index 045c6d42..73566a21 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -235,7 +235,8 @@ func (h Handler) prepareClone(ctx context.Context, cmd *cobra.Command, conf *con return nil, "", nil, nil, fmt.Errorf("--nics %d below snapshot minimum %d", nics, cfg.NICs) } - netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker)) + bridgeDev, _ := cmd.Flags().GetString("bridge") + netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker), bridgeDev) if err != nil { return nil, "", nil, nil, err } @@ -288,6 +289,10 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t if conf.UseFirecracker && vmCfg.Windows { return nil, nil, nil, fmt.Errorf("--fc and --windows are mutually exclusive: Firecracker does not support Windows guests") } + bridgeDev, _ := cmd.Flags().GetString("bridge") + if bridgeDev != "" && vmCfg.Network != "" { + return nil, nil, nil, fmt.Errorf("--bridge and --network are mutually exclusive") + } backends, hyper, err := cmdcore.InitBackends(ctx, conf) if err != nil { @@ -312,7 +317,7 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t } nics, _ := cmd.Flags().GetInt("nics") - netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker)) + netProvider, networkConfigs, err := initNetwork(ctx, conf, vmID, nics, vmCfg, tapQueues(vmCfg.CPU, conf.UseFirecracker), bridgeDev) if err != nil { return nil, nil, nil, err } @@ -334,11 +339,17 @@ func tapQueues(cpu int, useFC bool) int { return cpu } -func initNetwork(ctx context.Context, conf *config.Config, vmID string, 
nics int, vmCfg *types.VMConfig, queues int) (network.Network, []*types.NetworkConfig, error) { +func initNetwork(ctx context.Context, conf *config.Config, vmID string, nics int, vmCfg *types.VMConfig, queues int, bridgeDev string) (network.Network, []*types.NetworkConfig, error) { if nics <= 0 { return nil, nil, nil } - netProvider, err := cmdcore.InitNetwork(conf) + var netProvider network.Network + var err error + if bridgeDev != "" { + netProvider, err = cmdcore.InitBridgeNetwork(conf, bridgeDev) + } else { + netProvider, err = cmdcore.InitNetwork(conf) + } if err != nil { return nil, nil, fmt.Errorf("init network: %w", err) } diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index e1a80c91..970bd37f 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -90,9 +90,9 @@ func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VM cmd.Stderr = logFile } - // If the VM has network, CH must be launched inside the VM's netns - // so it can access the tap device. We setns before fork and restore after. - if withNetwork { + // CNI mode: TAP is inside a per-VM netns, switch before fork. + // Bridge mode: TAP is in host netns, no EnterNetns needed. 
+ if withNetwork && rec.NetworkConfigs[0].NetnsPath != "" { restore, enterErr := hypervisor.EnterNetns(rec.NetworkConfigs[0].NetnsPath) if enterErr != nil { return 0, fmt.Errorf("enter netns: %w", enterErr) diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 319c6a4c..b8bff36e 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -177,7 +177,7 @@ func (fc *Firecracker) launchProcess(ctx context.Context, rec *hypervisor.VMReco fcCmd.Stdin = slave fcCmd.Stdout = slave - if withNetwork { + if withNetwork && rec.NetworkConfigs[0].NetnsPath != "" { restore, enterErr := hypervisor.EnterNetns(rec.NetworkConfigs[0].NetnsPath) if enterErr != nil { _ = master.Close() diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go new file mode 100644 index 00000000..5d4e6a01 --- /dev/null +++ b/network/bridge/bridge_linux.go @@ -0,0 +1,210 @@ +//go:build linux + +package bridge + +import ( + "context" + "crypto/rand" + "fmt" + "net" + + "github.com/projecteru2/core/log" + "github.com/vishvananda/netlink" + + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/types" +) + +// compile-time interface check. +var _ network.Network = (*Bridge)(nil) + +const ( + typ = "bridge" + defaultQueueSize = 256 +) + +// Bridge implements network.Network by creating TAP devices and adding +// them directly to an existing Linux bridge. An external DHCP server +// on the bridge (e.g. dnsmasq) serves VM IPs. No veth, no TC, no +// netns — just TAP-on-bridge, the simplest possible VM networking. +// +// This backend is designed to work with cocoon-net's cni0 bridge or +// any pre-existing bridge that has DHCP + routing already set up. +type Bridge struct { + conf *config.Config + bridgeDev string + bridgeIdx int +} + +// New creates a Bridge network provider. The bridge device must exist. 
+func New(conf *config.Config, bridgeDev string) (*Bridge, error) { + if conf == nil { + return nil, fmt.Errorf("config is nil") + } + if bridgeDev == "" { + return nil, fmt.Errorf("bridge device name is required") + } + br, err := netlink.LinkByName(bridgeDev) + if err != nil { + return nil, fmt.Errorf("bridge %s: %w", bridgeDev, err) + } + if br.Type() != "bridge" { + return nil, fmt.Errorf("%s is not a bridge (type: %s)", bridgeDev, br.Type()) + } + return &Bridge{ + conf: conf, + bridgeDev: bridgeDev, + bridgeIdx: br.Attrs().Index, + }, nil +} + +// Type returns the provider identifier. +func (b *Bridge) Type() string { return typ } + +// Verify checks whether the TAP for a VM exists. +func (b *Bridge) Verify(_ context.Context, vmID string) error { + if _, err := netlink.LinkByName(tapName(vmID, 0)); err != nil { + return fmt.Errorf("tap %s: %w", tapName(vmID, 0), err) + } + return nil +} + +// Config creates TAP devices and adds them to the bridge. +func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig, existing ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { + logger := log.WithFunc("bridge.Config") + + br, err := netlink.LinkByIndex(b.bridgeIdx) + if err != nil { + return nil, fmt.Errorf("find bridge: %w", err) + } + + var configs []*types.NetworkConfig + for i := range numNICs { + name := tapName(vmID, i) + + var mac string + if i < len(existing) && existing[i] != nil { + mac = existing[i].Mac + } else { + mac = generateMAC() + } + + queues := network.NetNumQueues(vmCfg.CPU) + if err := createTAP(name, queues); err != nil { + return nil, fmt.Errorf("create tap %s: %w", name, err) + } + + tap, err := netlink.LinkByName(name) + if err != nil { + return nil, fmt.Errorf("find tap %s: %w", name, err) + } + + // Add TAP to bridge — this is the only wiring needed. 
+ if err := netlink.LinkSetMaster(tap, br); err != nil { + _ = netlink.LinkDel(tap) + return nil, fmt.Errorf("add %s to %s: %w", name, b.bridgeDev, err) + } + + // Sync MTU from bridge. + if mtu := br.Attrs().MTU; mtu > 0 { + _ = netlink.LinkSetMTU(tap, mtu) + } + + if err := netlink.LinkSetUp(tap); err != nil { + _ = netlink.LinkDel(tap) + return nil, fmt.Errorf("set %s up: %w", name, err) + } + + configs = append(configs, &types.NetworkConfig{ + Tap: name, + Mac: mac, + NumQueues: queues, + QueueSize: defaultQueueSize, + Backend: typ, + BridgeDev: b.bridgeDev, + // NetnsPath: empty — TAP is in host netns. + // Network: nil — IP comes from DHCP on the bridge. + }) + logger.Debugf(ctx, "NIC %d: tap=%s mac=%s bridge=%s", i, name, mac, b.bridgeDev) + } + return configs, nil +} + +// Delete removes TAP devices for the given VMs. +func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { + return CleanupTAPs(vmIDs), nil +} + +// Inspect is not supported — bridge mode has no persistent records. +func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { + return nil, nil +} + +// List is not supported — bridge mode has no persistent records. +func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { + return nil, nil +} + +// RegisterGC registers the bridge GC module that reclaims orphan bt* TAP devices. +func (b *Bridge) RegisterGC(orch *gc.Orchestrator) { + gc.Register(orch, GCModule(b.conf.RootDir)) +} + +// CleanupTAPs removes bridge TAP devices for the given VM IDs. +// It does not require a Bridge instance and is safe to call +// even when no bridge TAPs exist (no-op per VM). 
+func CleanupTAPs(vmIDs []string) []string { + var cleaned []string + for _, vmID := range vmIDs { + for i := range 8 { // max 8 NICs per VM + name := tapName(vmID, i) + l, err := netlink.LinkByName(name) + if err != nil { + break // no more TAPs for this VM + } + _ = netlink.LinkDel(l) + } + cleaned = append(cleaned, vmID) + } + return cleaned +} + +func createTAP(name string, numQueues int) error { + // CH uses queue pairs (TX+RX): queue_pairs = num_queues / 2. + // Multi-queue requires queue_pairs > 1, i.e. num_queues > 2. + // The TAP's IFF_MULTI_QUEUE flag must match CH's expectation, + // otherwise CH's sysfs pre-flight check rejects the device. + queuePairs := max(1, numQueues/2) //nolint:mnd + flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI + if queuePairs <= 1 { + flags |= netlink.TUNTAP_ONE_QUEUE + } else { + flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS + } + tap := &netlink.Tuntap{ + LinkAttrs: netlink.LinkAttrs{Name: name}, + Mode: netlink.TUNTAP_MODE_TAP, + Queues: queuePairs, + Flags: flags, + } + if err := netlink.LinkAdd(tap); err != nil { + return err + } + for _, fd := range tap.Fds { + _ = fd.Close() + } + return nil +} + +func tapName(vmID string, nic int) string { + return fmt.Sprintf("%s%s-%d", tapPrefix, network.VMIDPrefix(vmID), nic) +} + +func generateMAC() string { + buf := make([]byte, 6) //nolint:mnd + _, _ = rand.Read(buf) + buf[0] = (buf[0] | 0x02) & 0xfe + return net.HardwareAddr(buf).String() +} diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go new file mode 100644 index 00000000..79f2d214 --- /dev/null +++ b/network/bridge/bridge_other.go @@ -0,0 +1,51 @@ +//go:build !linux + +package bridge + +import ( + "context" + "fmt" + "runtime" + + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/types" +) + +// Bridge is a placeholder for non-Linux. +type Bridge struct{} + +// New returns an error on non-Linux. 
+func New(_ *config.Config, _ string) (*Bridge, error) { + return nil, fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) +} + +// Type returns the provider identifier. +func (b *Bridge) Type() string { return "bridge" } + +// Verify is not supported. +func (b *Bridge) Verify(_ context.Context, _ string) error { return errUnsupported } + +// Config is not supported. +func (b *Bridge) Config(_ context.Context, _ string, _ int, _ *types.VMConfig, _ ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { + return nil, errUnsupported +} + +// Delete is not supported. +func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } + +// Inspect is not supported. +func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { + return nil, errUnsupported +} + +// List is not supported. +func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { return nil, errUnsupported } + +// RegisterGC is a no-op. +func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} + +// CleanupTAPs is a no-op on non-Linux. +func CleanupTAPs(_ []string) []string { return nil } + +var errUnsupported = fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) diff --git a/network/bridge/gc_linux.go b/network/bridge/gc_linux.go new file mode 100644 index 00000000..f095c9a4 --- /dev/null +++ b/network/bridge/gc_linux.go @@ -0,0 +1,112 @@ +//go:build linux + +package bridge + +import ( + "context" + "path/filepath" + "slices" + "strings" + + "github.com/projecteru2/core/log" + "github.com/vishvananda/netlink" + + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/utils" +) + +const tapPrefix = "bt" + +// bridgeSnapshot holds the set of VM ID prefixes that own bt* TAP devices. 
+type bridgeSnapshot struct { + prefixes map[string]struct{} +} + +// GCModule returns a GC module that reclaims orphan bt* TAP devices. +// It does not require a Bridge instance — only rootDir for the lock file. +func GCModule(rootDir string) gc.Module[bridgeSnapshot] { + lockPath := filepath.Join(rootDir, "bridge", "gc.lock") + _ = utils.EnsureDirs(filepath.Dir(lockPath)) + + return gc.Module[bridgeSnapshot]{ + Name: typ, + Locker: flock.New(lockPath), + ReadDB: func(_ context.Context) (bridgeSnapshot, error) { + snap := bridgeSnapshot{prefixes: make(map[string]struct{})} + + links, err := netlink.LinkList() + if err != nil { + return snap, err + } + for _, l := range links { + if prefix, ok := parseTAPName(l.Attrs().Name); ok { + snap.prefixes[prefix] = struct{}{} + } + } + return snap, nil + }, + Resolve: func(snap bridgeSnapshot, others map[string]any) []string { + active := gc.Collect(others, gc.VMIDs) + + // Build set of 8-char prefixes from active VM IDs. + activePrefixes := make(map[string]struct{}, len(active)) + for id := range active { + activePrefixes[network.VMIDPrefix(id)] = struct{}{} + } + + var orphans []string + for prefix := range snap.prefixes { + if _, ok := activePrefixes[prefix]; !ok { + orphans = append(orphans, prefix) + } + } + slices.Sort(orphans) + return orphans + }, + Collect: func(ctx context.Context, prefixes []string) error { + logger := log.WithFunc("bridge.gc.Collect") + + orphanSet := make(map[string]struct{}, len(prefixes)) + for _, p := range prefixes { + orphanSet[p] = struct{}{} + } + + links, err := netlink.LinkList() + if err != nil { + return err + } + for _, l := range links { + name := l.Attrs().Name + prefix, ok := parseTAPName(name) + if !ok { + continue + } + if _, orphan := orphanSet[prefix]; !orphan { + continue + } + if err := netlink.LinkDel(l); err != nil { + logger.Warnf(ctx, "delete orphan TAP %s: %v", name, err) + } else { + logger.Infof(ctx, "collected orphan TAP %s", name) + } + } + return nil + }, + } +} + 
+// parseTAPName extracts the vmID prefix from a bridge TAP name like "bt<vmid-prefix>-<nic>" (e.g. "bt1a2b3c4d-0"). +// Returns the prefix and true, or ("", false) if the name doesn't match. +func parseTAPName(name string) (string, bool) { + rest, ok := strings.CutPrefix(name, tapPrefix) + if !ok { + return "", false + } + idx := strings.LastIndex(rest, "-") + if idx <= 0 { + return "", false + } + return rest[:idx], true +} diff --git a/network/bridge/gc_other.go b/network/bridge/gc_other.go new file mode 100644 index 00000000..eb9ea134 --- /dev/null +++ b/network/bridge/gc_other.go @@ -0,0 +1,32 @@ +//go:build !linux + +package bridge + +import ( + "context" + + "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/lock/flock" +) + +// bridgeSnapshot is a placeholder for non-Linux. +type bridgeSnapshot struct{} + +// GCModule returns a no-op GC module on non-Linux — bridge TAPs don't exist. +func GCModule(rootDir string) gc.Module[bridgeSnapshot] { + return gc.Module[bridgeSnapshot]{ + Name: "bridge", + // /dev/null is world-writable and supports flock on all Unix platforms, + // so TryLock always succeeds without creating a real lock file.
+ Locker: flock.New("/dev/null"), + ReadDB: func(_ context.Context) (bridgeSnapshot, error) { + return bridgeSnapshot{}, nil + }, + Resolve: func(_ bridgeSnapshot, _ map[string]any) []string { + return nil + }, + Collect: func(_ context.Context, _ []string) error { + return nil + }, + } +} diff --git a/network/cni/create.go b/network/cni/create.go index 8c60af93..42e2460e 100644 --- a/network/cni/create.go +++ b/network/cni/create.go @@ -91,7 +91,7 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types if i < len(existing) && existing[i] != nil { overrideMAC = existing[i].Mac } - mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, vmCfg.CPU, overrideMAC) + mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, network.NetNumQueues(vmCfg.CPU), overrideMAC) if setupErr != nil { return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr) } @@ -99,8 +99,9 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types configs = append(configs, &types.NetworkConfig{ Tap: tapName, Mac: mac, - NumQueues: netNumQueues(vmCfg.CPU), + NumQueues: network.NetNumQueues(vmCfg.CPU), QueueSize: defaultQueueSize, + Backend: typ, NetnsPath: nsPath, Network: netInfo, }) @@ -142,19 +143,7 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types } func tapNameForVM(vmID string, nic int) string { - const vmIDPrefixLen = 8 - if len(vmID) > vmIDPrefixLen { - vmID = vmID[:vmIDPrefixLen] - } - return fmt.Sprintf("tap%s-%d", vmID, nic) -} - -// netNumQueues returns the virtio-net queue count for cpu. -func netNumQueues(cpu int) int { - if cpu <= 1 { - return 2 //nolint:mnd - } - return cpu * 2 //nolint:mnd + return fmt.Sprintf("tap%s-%d", network.VMIDPrefix(vmID), nic) } // extractNetworkInfo converts a CNI ADD result into types.Network. 
diff --git a/network/cni/create_linux.go b/network/cni/create_linux.go index 5c880672..06a5b30f 100644 --- a/network/cni/create_linux.go +++ b/network/cni/create_linux.go @@ -95,10 +95,11 @@ func tcRedirectInNS(ifName, tapName string, queues int, overrideMAC string) (str } } - // Match hypervisor attach flags and queue layout. + // CH uses queue pairs (TX+RX): queue_pairs = num_queues / 2. + // Multi-queue requires queue_pairs > 1, i.e. num_queues > 2. + queuePairs := max(1, queues/2) //nolint:mnd flags := netlink.TUNTAP_VNET_HDR | netlink.TUNTAP_NO_PI - if queues <= 1 { - queues = 1 + if queuePairs <= 1 { flags |= netlink.TUNTAP_ONE_QUEUE } else { flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS @@ -106,7 +107,7 @@ func tcRedirectInNS(ifName, tapName string, queues int, overrideMAC string) (str tap := &netlink.Tuntap{ LinkAttrs: netlink.LinkAttrs{Name: tapName}, Mode: netlink.TUNTAP_MODE_TAP, - Queues: queues, + Queues: queuePairs, Flags: flags, } if addErr := netlink.LinkAdd(tap); addErr != nil { diff --git a/network/utils.go b/network/utils.go new file mode 100644 index 00000000..0bf16bd3 --- /dev/null +++ b/network/utils.go @@ -0,0 +1,21 @@ +package network + +const vmIDPrefixLen = 8 + +// NetNumQueues returns the virtio-net queue count for the given CPU count. +// CH uses queue pairs (TX+RX), so the result is always even (≥ 2). +func NetNumQueues(cpu int) int { + if cpu <= 1 { + return 2 //nolint:mnd + } + return cpu * 2 //nolint:mnd +} + +// VMIDPrefix returns the first 8 characters of a VM ID, matching the +// truncation used by both bridge and CNI TAP device naming. +func VMIDPrefix(vmID string) string { + if len(vmID) > vmIDPrefixLen { + return vmID[:vmIDPrefixLen] + } + return vmID +} diff --git a/types/network.go b/types/network.go index bc497511..9fc5a9c5 100644 --- a/types/network.go +++ b/types/network.go @@ -7,6 +7,15 @@ type NetworkConfig struct { NumQueues int `json:"num_queues"` // Virtio queue count (= CPU * 2 for multi-queue). 
QueueSize int `json:"queue_size"` + // Backend identifies the network provider that created this NIC ("cni" or "bridge"). + // Used to select the correct provider during network recovery on VM start. + // Empty means "cni" for backward compatibility with existing VM records. + Backend string `json:"backend,omitempty"` + + // BridgeDev is the Linux bridge device name (e.g. "cni0", "br0"). + // Set only when Backend is "bridge"; required for recovery and GC. + BridgeDev string `json:"bridge_dev,omitempty"` + // NetnsPath is the network namespace path where the tap device lives. // Set by the network plugin at Config time; read by the hypervisor at Start time. // Empty when the network backend does not use network namespaces (e.g. macOS vmnet).