linux/include/net/l3mdev.h

300 lines
6.3 KiB
C
Raw Normal View History

/*
* include/net/l3mdev.h - L3 master device API
* Copyright (c) 2015 Cumulus Networks
* Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#ifndef _NET_L3MDEV_H_
#define _NET_L3MDEV_H_
#include <net/dst.h>
net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-09 01:55:39 +08:00
#include <net/fib_rules.h>
/**
* struct l3mdev_ops - l3mdev operations
*
* @l3mdev_fib_table: Get FIB table id to use for lookups
*
* @l3mdev_l3_rcv: Hook in L3 receive path
*
* @l3mdev_l3_out: Hook in L3 output path
*
* @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations
*/
struct l3mdev_ops {
u32 (*l3mdev_fib_table)(const struct net_device *dev);
struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
struct sk_buff *skb, u16 proto);
struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev,
struct sock *sk, struct sk_buff *skb,
u16 proto);
/* IPv6 ops */
struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev,
struct flowi6 *fl6);
};
#ifdef CONFIG_NET_L3_MASTER_DEV
net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-09 01:55:39 +08:00
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
struct fib_lookup_arg *arg);
void l3mdev_update_flow(struct net *net, struct flowi *fl);
net: l3mdev: address selection should only consider devices in L3 domain David Lamparter noted a use case where the source address selection fails to pick an address from a VRF interface - unnumbered interfaces. Relevant commands from his script: ip addr add 9.9.9.9/32 dev lo ip link set lo up ip link add name vrf0 type vrf table 101 ip rule add oif vrf0 table 101 ip rule add iif vrf0 table 101 ip link set vrf0 up ip addr add 10.0.0.3/32 dev vrf0 ip link add name dummy2 type dummy ip link set dummy2 master vrf0 up --> note dummy2 has no address - unnumbered device ip route add 10.2.2.2/32 dev dummy2 table 101 ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02 tcpdump -ni dummy2 & And using ping instead of his socat example: $ ping -I vrf0 -c1 10.2.2.2 ping: Warning: source address might be selected on device other than vrf0. PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data. >From tcpdump: 12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64 Note the source address is from lo and is not a VRF local address. With this patch: $ ping -I vrf0 -c1 10.2.2.2 PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data. >From tcpdump: 12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64 Now the source address comes from vrf0. The ipv4 function for selecting source address takes a const argument. Removing the const requires touching a lot of places, so instead l3mdev_master_ifindex_rcu is changed to take a const argument and then do the typecast to non-const as required by netdev_master_upper_dev_get_rcu. This is similar to what l3mdev_fib_table_rcu does. IPv6 for unnumbered interfaces appears to be selecting the addresses properly. Cc: David Lamparter <david@opensourcerouting.org> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-25 03:47:02 +08:00
int l3mdev_master_ifindex_rcu(const struct net_device *dev);
static inline int l3mdev_master_ifindex(struct net_device *dev)
{
int ifindex;
rcu_read_lock();
ifindex = l3mdev_master_ifindex_rcu(dev);
rcu_read_unlock();
return ifindex;
}
static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
int rc = 0;
if (likely(ifindex)) {
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
rc = l3mdev_master_ifindex_rcu(dev);
rcu_read_unlock();
}
return rc;
}
static inline
struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
{
/* netdev_master_upper_dev_get_rcu calls
* list_first_or_null_rcu to walk the upper dev list.
* list_first_or_null_rcu does not handle a const arg. We aren't
* making changes, just want the master device from that list so
* typecast to remove the const
*/
struct net_device *dev = (struct net_device *)_dev;
struct net_device *master;
if (!dev)
return NULL;
if (netif_is_l3_master(dev))
master = dev;
else if (netif_is_l3_slave(dev))
master = netdev_master_upper_dev_get_rcu(dev);
else
master = NULL;
return master;
}
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
rcu_read_lock();
ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
rcu_read_unlock();
return ifindex;
}
u32 l3mdev_fib_table_rcu(const struct net_device *dev);
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
static inline u32 l3mdev_fib_table(const struct net_device *dev)
{
u32 tb_id;
rcu_read_lock();
tb_id = l3mdev_fib_table_rcu(dev);
rcu_read_unlock();
return tb_id;
}
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
struct net_device *dev;
bool rc = false;
if (ifindex == 0)
return false;
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
rc = netif_is_l3_master(dev);
rcu_read_unlock();
return rc;
}
struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6);
static inline
struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
{
struct net_device *master = NULL;
if (netif_is_l3_slave(skb->dev))
master = netdev_master_upper_dev_get_rcu(skb->dev);
ipvlan, l3mdev: fix broken l3s mode wrt local routes While implementing ipvlan l3 and l3s mode for kubernetes CNI plugin, I ran into the issue that while l3 mode is working fine, l3s mode does not have any connectivity to kube-apiserver and hence all pods end up in Error state as well. The ipvlan master device sits on top of a bond device and hostns traffic to kube-apiserver (also running in hostns) is DNATed from 10.152.183.1:443 to 139.178.29.207:37573 where the latter is the address of the bond0. While in l3 mode, a curl to https://10.152.183.1:443 or to https://139.178.29.207:37573 works fine from hostns, neither of them do in case of l3s. In the latter only a curl to https://127.0.0.1:37573 appeared to work where for local addresses of bond0 I saw kernel suddenly starting to emit ARP requests to query HW address of bond0 which remained unanswered and neighbor entries in INCOMPLETE state. These ARP requests only happen while in l3s. Debugging this further, I found the issue is that l3s mode is piggy- backing on l3 master device, and in this case local routes are using l3mdev_master_dev_rcu(dev) instead of net->loopback_dev as per commit f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant") and 5f02ce24c269 ("net: l3mdev: Allow the l3mdev to be a loopback"). I found that reverting them back into using the net->loopback_dev fixed ipvlan l3s connectivity and got everything working for the CNI. Now judging from 4fbae7d83c98 ("ipvlan: Introduce l3s mode") and the l3mdev paper in [0] the only sole reason why ipvlan l3s is relying on l3 master device is to get the l3mdev_ip_rcv() receive hook for setting the dst entry of the input route without adding its own ipvlan specific hacks into the receive path, however, any l3 domain semantics beyond just that are breaking l3s operation. Note that ipvlan also has the ability to dynamically switch its internal operation from l3 to l3s for all ports via ipvlan_set_port_mode() at runtime. In any case, l3 vs l3s soley distinguishes itself by 'de-confusing' netfilter through switching skb->dev to ipvlan slave device late in NF_INET_LOCAL_IN before handing the skb to L4. Minimal fix taken here is to add a IFF_L3MDEV_RX_HANDLER flag which, if set from ipvlan setup, gets us only the wanted l3mdev_l3_rcv() hook without any additional l3mdev semantics on top. This should also have minimal impact since dev->priv_flags is already hot in cache. With this set, l3s mode is working fine and I also get things like masquerading pod traffic on the ipvlan master properly working. [0] https://netdevconf.org/1.2/papers/ahern-what-is-l3mdev-paper.pdf Fixes: f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant") Fixes: 5f02ce24c269 ("net: l3mdev: Allow the l3mdev to be a loopback") Fixes: 4fbae7d83c98 ("ipvlan: Introduce l3s mode") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Mahesh Bandewar <maheshb@google.com> Cc: David Ahern <dsa@cumulusnetworks.com> Cc: Florian Westphal <fw@strlen.de> Cc: Martynas Pumputis <m@lambda.lt> Acked-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-01-30 19:49:48 +08:00
else if (netif_is_l3_master(skb->dev) ||
netif_has_l3_rx_handler(skb->dev))
master = skb->dev;
if (master && master->l3mdev_ops->l3mdev_l3_rcv)
skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
return skb;
}
static inline
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
{
return l3mdev_l3_rcv(skb, AF_INET);
}
static inline
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
{
return l3mdev_l3_rcv(skb, AF_INET6);
}
static inline
struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto)
{
struct net_device *dev = skb_dst(skb)->dev;
if (netif_is_l3_slave(dev)) {
struct net_device *master;
master = netdev_master_upper_dev_get_rcu(dev);
if (master && master->l3mdev_ops->l3mdev_l3_out)
skb = master->l3mdev_ops->l3mdev_l3_out(master, sk,
skb, proto);
}
return skb;
}
static inline
struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
{
return l3mdev_l3_out(sk, skb, AF_INET);
}
static inline
struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
{
return l3mdev_l3_out(sk, skb, AF_INET6);
}
#else
net: l3mdev: address selection should only consider devices in L3 domain David Lamparter noted a use case where the source address selection fails to pick an address from a VRF interface - unnumbered interfaces. Relevant commands from his script: ip addr add 9.9.9.9/32 dev lo ip link set lo up ip link add name vrf0 type vrf table 101 ip rule add oif vrf0 table 101 ip rule add iif vrf0 table 101 ip link set vrf0 up ip addr add 10.0.0.3/32 dev vrf0 ip link add name dummy2 type dummy ip link set dummy2 master vrf0 up --> note dummy2 has no address - unnumbered device ip route add 10.2.2.2/32 dev dummy2 table 101 ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02 tcpdump -ni dummy2 & And using ping instead of his socat example: $ ping -I vrf0 -c1 10.2.2.2 ping: Warning: source address might be selected on device other than vrf0. PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data. >From tcpdump: 12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64 Note the source address is from lo and is not a VRF local address. With this patch: $ ping -I vrf0 -c1 10.2.2.2 PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data. >From tcpdump: 12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64 Now the source address comes from vrf0. The ipv4 function for selecting source address takes a const argument. Removing the const requires touching a lot of places, so instead l3mdev_master_ifindex_rcu is changed to take a const argument and then do the typecast to non-const as required by netdev_master_upper_dev_get_rcu. This is similar to what l3mdev_fib_table_rcu does. IPv6 for unnumbered interfaces appears to be selecting the addresses properly. Cc: David Lamparter <david@opensourcerouting.org> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-25 03:47:02 +08:00
static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
{
return 0;
}
static inline int l3mdev_master_ifindex(struct net_device *dev)
{
return 0;
}
static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
{
return 0;
}
static inline
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
{
return 0;
}
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
return 0;
}
static inline
struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
{
return NULL;
}
static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev)
{
return 0;
}
static inline u32 l3mdev_fib_table(const struct net_device *dev)
{
return 0;
}
static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
{
return 0;
}
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
return false;
}
static inline
struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6)
{
return NULL;
}
static inline
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
{
return skb;
}
static inline
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
{
return skb;
}
net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-09 01:55:39 +08:00
static inline
struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
{
return skb;
}
static inline
struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
{
return skb;
}
net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-09 01:55:39 +08:00
static inline
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
struct fib_lookup_arg *arg)
{
return 1;
}
static inline
void l3mdev_update_flow(struct net *net, struct flowi *fl)
{
}
#endif
#endif /* _NET_L3MDEV_H_ */