kilo/pkg/mesh/routes.go
Lucas Servén Marín acfd0bbaec
pkg/iptables: reduce calls to iptables
Currently, every time the iptables controller syncs rules, it spawns an
an iptables process for every rule it checks. This causes two problems:
1. it creates unnecessary load on the system; and
2. it causes contention on the xtables lock file.

This commit creates a lazy cache for iptables rules and chains that
avoids spawning iptables processes. This means that each time the
iptables rules are reconciled, if no rules need to be changed then at
most one iptables process should be spawned to check all of the rules in
a chain and at most one process should be spawned to check all of the
chains in a table.

Note: the success of this reduction in calls to iptables depends on a
somewhat fragile comparison of iptables rule text. The text of any rule
must match exactly, including the order of the flags. An improvement to
come would be to implement an iptables rule parser than can be used to
check semantic equivalence betweem iptables rules.

Signed-off-by: Lucas Servén Marín <lserven@gmail.com>
2021-02-20 19:24:06 +01:00

257 lines
9.6 KiB
Go

// Copyright 2019 the Kilo authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package mesh
import (
"net"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/squat/kilo/pkg/encapsulation"
"github.com/squat/kilo/pkg/iptables"
)
const kiloTableIndex = 1107
// Routes generates a slice of routes for a given Topology.
func (t *Topology) Routes(kiloIfaceName string, kiloIface, privIface, tunlIface int, local bool, enc encapsulation.Encapsulator) ([]*netlink.Route, []*netlink.Rule) {
var routes []*netlink.Route
var rules []*netlink.Rule
if !t.leader {
// Find the GW for this segment.
// This will be the an IP of the leader.
// In an IPIP encapsulated mesh it is the leader's private IP.
var gw net.IP
for _, segment := range t.segments {
if segment.location == t.location {
gw = enc.Gw(segment.endpoint.IP, segment.privateIPs[segment.leader], segment.cidrs[segment.leader])
break
}
}
for _, segment := range t.segments {
// First, add a route to the WireGuard IP of the segment.
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: oneAddressCIDR(segment.wireGuardIP),
Flags: int(netlink.FLAG_ONLINK),
Gw: gw,
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
// Add routes for the current segment if local is true.
if segment.location == t.location {
if local {
for i := range segment.cidrs {
// Don't add routes for the local node.
if segment.privateIPs[i].Equal(t.privateIP.IP) {
continue
}
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: segment.cidrs[i],
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.privateIPs[i],
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
// Encapsulate packets from the host's Pod subnet headed
// to private IPs.
if enc.Strategy() == encapsulation.Always || (enc.Strategy() == encapsulation.CrossSubnet && !t.privateIP.Contains(segment.privateIPs[i])) {
routes = append(routes, &netlink.Route{
Dst: oneAddressCIDR(segment.privateIPs[i]),
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.privateIPs[i],
LinkIndex: tunlIface,
Protocol: unix.RTPROT_STATIC,
Table: kiloTableIndex,
})
rules = append(rules, defaultRule(&netlink.Rule{
Src: t.subnet,
Dst: oneAddressCIDR(segment.privateIPs[i]),
Table: kiloTableIndex,
}))
}
}
}
continue
}
for i := range segment.cidrs {
// Add routes to the Pod CIDRs of nodes in other segments.
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: segment.cidrs[i],
Flags: int(netlink.FLAG_ONLINK),
Gw: gw,
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
}
for i := range segment.privateIPs {
// Add routes to the private IPs of nodes in other segments.
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: oneAddressCIDR(segment.privateIPs[i]),
Flags: int(netlink.FLAG_ONLINK),
Gw: gw,
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
}
}
// Add routes for the allowed IPs of peers.
for _, peer := range t.peers {
for i := range peer.AllowedIPs {
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: peer.AllowedIPs[i],
Flags: int(netlink.FLAG_ONLINK),
Gw: gw,
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
}
}
return routes, rules
}
for _, segment := range t.segments {
// Add routes for the current segment if local is true.
if segment.location == t.location {
// If the local node does not have a private IP address,
// then skip adding routes, because the node is in its own location.
if local && t.privateIP != nil {
for i := range segment.cidrs {
// Don't add routes for the local node.
if segment.privateIPs[i].Equal(t.privateIP.IP) {
continue
}
routes = append(routes, encapsulateRoute(&netlink.Route{
Dst: segment.cidrs[i],
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.privateIPs[i],
LinkIndex: privIface,
Protocol: unix.RTPROT_STATIC,
}, enc.Strategy(), t.privateIP, tunlIface))
// Encapsulate packets from the host's Pod subnet headed
// to private IPs.
if enc.Strategy() == encapsulation.Always || (enc.Strategy() == encapsulation.CrossSubnet && !t.privateIP.Contains(segment.privateIPs[i])) {
routes = append(routes, &netlink.Route{
Dst: oneAddressCIDR(segment.privateIPs[i]),
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.privateIPs[i],
LinkIndex: tunlIface,
Protocol: unix.RTPROT_STATIC,
Table: kiloTableIndex,
})
rules = append(rules, defaultRule(&netlink.Rule{
Src: t.subnet,
Dst: oneAddressCIDR(segment.privateIPs[i]),
Table: kiloTableIndex,
}))
// Also encapsulate packets from the Kilo interface
// headed to private IPs.
rules = append(rules, defaultRule(&netlink.Rule{
Dst: oneAddressCIDR(segment.privateIPs[i]),
Table: kiloTableIndex,
IifName: kiloIfaceName,
}))
}
}
}
// Continuing here prevents leaders form adding routes via WireGuard to
// nodes in their own location.
continue
}
for i := range segment.cidrs {
// Add routes to the Pod CIDRs of nodes in other segments.
routes = append(routes, &netlink.Route{
Dst: segment.cidrs[i],
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.wireGuardIP,
LinkIndex: kiloIface,
Protocol: unix.RTPROT_STATIC,
})
// Don't add routes through Kilo if the private IP
// equals the external IP. This means that the node
// is only accessible through an external IP and we
// cannot encapsulate traffic to an IP through the IP.
if segment.privateIPs == nil || segment.privateIPs[i].Equal(segment.endpoint.IP) {
continue
}
// Add routes to the private IPs of nodes in other segments.
// Number of CIDRs and private IPs always match so
// we can reuse the loop.
routes = append(routes, &netlink.Route{
Dst: oneAddressCIDR(segment.privateIPs[i]),
Flags: int(netlink.FLAG_ONLINK),
Gw: segment.wireGuardIP,
LinkIndex: kiloIface,
Protocol: unix.RTPROT_STATIC,
})
}
}
// Add routes for the allowed IPs of peers.
for _, peer := range t.peers {
for i := range peer.AllowedIPs {
routes = append(routes, &netlink.Route{
Dst: peer.AllowedIPs[i],
LinkIndex: kiloIface,
Protocol: unix.RTPROT_STATIC,
})
}
}
return routes, rules
}
func encapsulateRoute(route *netlink.Route, encapsulate encapsulation.Strategy, subnet *net.IPNet, tunlIface int) *netlink.Route {
if encapsulate == encapsulation.Always || (encapsulate == encapsulation.CrossSubnet && !subnet.Contains(route.Gw)) {
route.LinkIndex = tunlIface
}
return route
}
// Rules returns the iptables rules required by the local node.
func (t *Topology) Rules(cni bool) []iptables.Rule {
var rules []iptables.Rule
rules = append(rules, iptables.NewIPv4Chain("nat", "KILO-NAT"))
rules = append(rules, iptables.NewIPv6Chain("nat", "KILO-NAT"))
if cni {
rules = append(rules, iptables.NewRule(iptables.GetProtocol(len(t.subnet.IP)), "nat", "POSTROUTING", "-s", t.subnet.String(), "-m", "comment", "--comment", "Kilo: jump to KILO-NAT chain", "-j", "KILO-NAT"))
}
for _, s := range t.segments {
rules = append(rules, iptables.NewRule(iptables.GetProtocol(len(s.wireGuardIP)), "nat", "KILO-NAT", "-d", oneAddressCIDR(s.wireGuardIP).String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for WireGuared IPs", "-j", "RETURN"))
for _, aip := range s.allowedIPs {
rules = append(rules, iptables.NewRule(iptables.GetProtocol(len(aip.IP)), "nat", "KILO-NAT", "-d", aip.String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for known IPs", "-j", "RETURN"))
}
}
for _, p := range t.peers {
for _, aip := range p.AllowedIPs {
rules = append(rules,
iptables.NewRule(iptables.GetProtocol(len(aip.IP)), "nat", "POSTROUTING", "-s", aip.String(), "-m", "comment", "--comment", "Kilo: jump to NAT chain", "-j", "KILO-NAT"),
iptables.NewRule(iptables.GetProtocol(len(aip.IP)), "nat", "KILO-NAT", "-d", aip.String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for peers", "-j", "RETURN"),
)
}
}
rules = append(rules, iptables.NewIPv4Rule("nat", "KILO-NAT", "-m", "comment", "--comment", "Kilo: NAT remaining packets", "-j", "MASQUERADE"))
rules = append(rules, iptables.NewIPv6Rule("nat", "KILO-NAT", "-m", "comment", "--comment", "Kilo: NAT remaining packets", "-j", "MASQUERADE"))
return rules
}
func defaultRule(rule *netlink.Rule) *netlink.Rule {
base := netlink.NewRule()
base.Src = rule.Src
base.Dst = rule.Dst
base.IifName = rule.IifName
base.Table = rule.Table
return base
}