// Copyright 2015 flannel authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !windows

package vxlan

// Some design notes and history:
// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
//   an L2 miss (i.e. an ARP lookup)
// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
//   This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
//   the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
//   an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
//   is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
//
// In this scheme the scaling of table entries (per host) is:
//  - 1 route (for the configured network out the vxlan device)
//  - One arp entry for each remote container that this host has recently contacted
//  - One FDB entry for each remote host
//
// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
//
//
// The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
// flannel crashes or restarts) and simplifies upgrades.
//
// How it works:
// Create the vxlan device but don't register for any L2MISS or L3MISS messages
// Then, as each remote host is discovered (either on startup or when they are added), do the following
// 1) Create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
// // In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host // // In this newest scheme, there is also the option of skipping the use of vxlan for hosts that are on the same subnet, // this is called "directRouting" import ( "encoding/json" "fmt" log "github.com/golang/glog" "net" "sync" "golang.org/x/net/context" "github.com/coreos/flannel/backend" "github.com/coreos/flannel/pkg/ip" "github.com/coreos/flannel/subnet" ) func init() { backend.Register("vxlan", New) } const ( defaultVNI = 1 ) type VXLANBackend struct { subnetMgr subnet.Manager extIface *backend.ExternalInterface } func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) { backend := &VXLANBackend{ subnetMgr: sm, extIface: extIface, } return backend, nil } func newSubnetAttrs(publicIP net.IP, mac net.HardwareAddr) (*subnet.LeaseAttrs, error) { data, err := json.Marshal(&vxlanLeaseAttrs{hardwareAddr(mac)}) if err != nil { return nil, err } return &subnet.LeaseAttrs{ PublicIP: ip.FromIP(publicIP), BackendType: "vxlan", BackendData: json.RawMessage(data), }, nil } func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg sync.WaitGroup, config *subnet.Config) (backend.Network, error) { // Parse our configuration cfg := struct { VNI int Port int GBP bool Learning bool DirectRouting bool }{ VNI: defaultVNI, } if len(config.Backend) > 0 { if err := json.Unmarshal(config.Backend, &cfg); err != nil { return nil, fmt.Errorf("error decoding VXLAN backend config: %v", err) } } log.Infof("VXLAN config: VNI=%d Port=%d GBP=%v Learning=%v DirectRouting=%v", cfg.VNI, cfg.Port, cfg.GBP, cfg.Learning, cfg.DirectRouting) devAttrs := vxlanDeviceAttrs{ vni: uint32(cfg.VNI), name: fmt.Sprintf("flannel.%v", cfg.VNI), vtepIndex: be.extIface.Iface.Index, vtepAddr: be.extIface.IfaceAddr, vtepPort: cfg.Port, gbp: cfg.GBP, learning: cfg.Learning, } dev, err := newVXLANDevice(&devAttrs) if err != nil { return nil, 
err } dev.directRouting = cfg.DirectRouting subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr, dev.MACAddr()) if err != nil { return nil, err } lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs) switch err { case nil: case context.Canceled, context.DeadlineExceeded: return nil, err default: return nil, fmt.Errorf("failed to acquire lease: %v", err) } // Ensure that the device has a /32 address so that no broadcast routes are created. // This IP is just used as a source address for host to workload traffic (so // the return path for the traffic has an address on the flannel network to use as the destination) if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil { return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err) } return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease) } // So we can make it JSON (un)marshalable type hardwareAddr net.HardwareAddr func (hw hardwareAddr) MarshalJSON() ([]byte, error) { return []byte(fmt.Sprintf("%q", net.HardwareAddr(hw))), nil } func (hw *hardwareAddr) UnmarshalJSON(bytes []byte) error { if len(bytes) < 2 || bytes[0] != '"' || bytes[len(bytes)-1] != '"' { return fmt.Errorf("error parsing hardware addr") } bytes = bytes[1 : len(bytes)-1] mac, err := net.ParseMAC(string(bytes)) if err != nil { return err } *hw = hardwareAddr(mac) return nil }