Merge branch 'ip_tunnel-collect_md'

Alexei Starovoitov says:

====================
ip_tunnel: add collect_md mode to IPv4/IPv6 tunnels

Similar to the geneve, vxlan and gre tunnels, implement 'collect metadata' mode
in the ipip, ipip6 and ip6ip6 tunnels.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2016-09-17 10:13:16 -04:00
commit fd9527f404
9 changed files with 662 additions and 63 deletions

View File

@ -23,6 +23,7 @@ struct __ip6_tnl_parm {
__u8 proto; /* tunnel protocol */
__u8 encap_limit; /* encapsulation limit for tunnel */
__u8 hop_limit; /* hop limit for tunnel */
bool collect_md;
__be32 flowinfo; /* traffic class and flowlabel for tunnel */
__u32 flags; /* tunnel flags */
struct in6_addr laddr; /* local tunnel end-point address */

View File

@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, const u8 protocol);
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const u8 proto);
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);

View File

@ -73,6 +73,7 @@ enum {
IFLA_IPTUN_ENCAP_FLAGS,
IFLA_IPTUN_ENCAP_SPORT,
IFLA_IPTUN_ENCAP_DPORT,
IFLA_IPTUN_COLLECT_METADATA,
__IFLA_IPTUN_MAX,
};
#define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1)

View File

@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
/**
 * ip_md_tunnel_xmit - transmit a packet through a collect_md IPv4 tunnel
 * @skb: packet to encapsulate
 * @dev: tunnel device the packet is being transmitted on
 * @proto: protocol number to place in the outer IPv4 header
 *
 * The outer addresses, TOS, TTL and DF bit are taken from the per-packet
 * tunnel metadata attached to @skb instead of from the device configuration.
 * A key TOS of 1 copies the TOS/dsfield of the inner header; a key TTL of 0
 * copies the inner TTL/hop limit, or falls back to the route's hop limit for
 * non-IP payloads.
 *
 * The packet is freed here and accounted in @dev's statistics when it carries
 * no IPv4 TX metadata, when the device has an encap type configured, when no
 * route is found, when the route points back at @dev, or when headroom cannot
 * be made available. Otherwise it is handed to iptunnel_xmit().
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	/* tos == 1 requests inheriting the inner header's TOS / dsfield. */
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}

	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link);

	/* Additional encapsulation is rejected in metadata mode. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* A route back onto the tunnel device itself would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);

	/* ttl == 0 requests inheriting the inner TTL / hop limit. */
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	/* Grow the device's headroom hint to fit this route's requirements. */
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	/*
	 * Pass the computed tos/ttl, not the raw key values: the raw values
	 * would discard the inherit handling and ECN encapsulation done above.
	 */
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos,
		      ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	return;

tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{

View File

@ -115,6 +115,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/dst_metadata.h>
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
if (tunnel->collect_md) {
tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
return 0;
}
return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
return -1;
@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
ip_tunnel_xmit(skb, dev, tiph, ipproto);
if (tunnel->collect_md)
ip_md_tunnel_xmit(skb, dev, ipproto);
else
ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void ipip_netlink_parms(struct nlattr *data[],
struct ip_tunnel_parm *parms)
struct ip_tunnel_parm *parms, bool *collect_md)
{
memset(parms, 0, sizeof(*parms));
parms->iph.version = 4;
parms->iph.protocol = IPPROTO_IPIP;
parms->iph.ihl = 5;
*collect_md = false;
if (!data)
return;
@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
if (data[IFLA_IPTUN_COLLECT_METADATA])
*collect_md = true;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
if (ipip_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
ipip_netlink_parms(data, &p);
ipip_netlink_parms(data, &p, &t->collect_md);
return ip_tunnel_newlink(dev, tb, &p);
}
@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
bool collect_md;
if (ipip_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
ipip_netlink_parms(data, &p);
ipip_netlink_parms(data, &p, &collect_md);
if (collect_md)
return -EINVAL;
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
/* IFLA_IPTUN_COLLECT_METADATA */
nla_total_size(0) +
0;
}
@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
tunnel->encap.flags))
goto nla_put_failure;
if (tunnel->collect_md)
if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
goto nla_put_failure;
return 0;
nla_put_failure:
@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {

View File

@ -57,6 +57,7 @@
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/dst_metadata.h>
MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
@ -90,6 +91,7 @@ struct ip6_tnl_net {
struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
struct ip6_tnl __rcu *tnls_wc[1];
struct ip6_tnl __rcu **tnls[2];
struct ip6_tnl __rcu *collect_md_tun;
};
static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
return t;
}
t = rcu_dereference(ip6n->collect_md_tun);
if (t)
return t;
t = rcu_dereference(ip6n->tnls_wc[0]);
if (t && (t->dev->flags & IFF_UP))
return t;
@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
if (t->parms.collect_md)
rcu_assign_pointer(ip6n->collect_md_tun, t);
rcu_assign_pointer(t->next , rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter;
if (t->parms.collect_md)
rcu_assign_pointer(ip6n->collect_md_tun, NULL);
for (tp = ip6_tnl_bucket(ip6n, &t->parms);
(iter = rtnl_dereference(*tp)) != NULL;
tp = &iter->next) {
@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
if (tun_dst)
skb_dst_set(skb, (struct dst_entry *)tun_dst);
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
{
struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct metadata_dst *tun_dst = NULL;
int ret = -1;
rcu_read_lock();
@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
if (t->parms.collect_md) {
tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
return 0;
}
ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_error);
}
@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
int mtu;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
u8 hop_limit;
int err = -1;
if (t->parms.collect_md) {
hop_limit = skb_tunnel_info(skb)->key.ttl;
goto route_lookup;
} else {
hop_limit = t->parms.hop_limit;
}
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
struct in6_addr *addr6;
@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
goto tx_err_link_failure;
if (!dst) {
route_lookup:
dst = ip6_route_output(net, NULL, fl6);
if (dst->error)
@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
dst = NULL;
goto tx_err_link_failure;
}
if (t->parms.collect_md &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
&fl6->daddr, 0, &fl6->saddr))
goto tx_err_link_failure;
ndst = dst;
}
@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
}
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
if (skb_dst(skb))
if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
skb = new_skb;
}
if (!fl6->flowi6_mark && ndst)
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
if (t->parms.collect_md) {
if (t->encap.type != TUNNEL_ENCAP_NONE)
goto tx_err_dst_release;
} else {
if (!fl6->flowi6_mark && ndst)
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
}
skb_dst_set(skb, dst);
if (encap_limit >= 0) {
@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
dsfield = ipv4_get_dsfield(iph);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
& IPV6_TCLASS_MASK;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
if (t->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
& IPV6_TCLASS_MASK;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
}
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
ip6_tnl_addr_conflict(t, ipv6h))
return -1;
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
encap_limit = tel->encap_limit - 1;
} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
if (t->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (void *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
encap_limit = tel->encap_limit - 1;
} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
encap_limit = t->parms.encap_limit;
}
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
}
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err)
return err;
ip6_tnl_link_config(t);
if (t->parms.collect_md) {
dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
return 0;
}
@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_PROTO])
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
if (data[IFLA_IPTUN_COLLECT_METADATA])
parms->collect_md = true;
}
static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net *net = dev_net(dev);
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct ip6_tnl *nt, *t;
struct ip_tunnel_encap ipencap;
@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
ip6_tnl_netlink_parms(data, &nt->parms);
t = ip6_tnl_locate(net, &nt->parms, 0);
if (!IS_ERR(t))
return -EEXIST;
if (nt->parms.collect_md) {
if (rtnl_dereference(ip6n->collect_md_tun))
return -EEXIST;
} else {
t = ip6_tnl_locate(net, &nt->parms, 0);
if (!IS_ERR(t))
return -EEXIST;
}
return ip6_tnl_create2(dev);
}
@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
ip6_tnl_netlink_parms(data, &p);
if (p.collect_md)
return -EINVAL;
t = ip6_tnl_locate(net, &p, 0);
if (!IS_ERR(t)) {
@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
/* IFLA_IPTUN_COLLECT_METADATA */
nla_total_size(0) +
0;
}
@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure;
if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
tunnel->encap.type) ||
nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
tunnel->encap.sport) ||
nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
tunnel->encap.dport) ||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
tunnel->encap.flags))
if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
goto nla_put_failure;
if (parm->collect_md)
if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
goto nla_put_failure;
return 0;
nla_put_failure:
@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {

View File

@ -1,4 +1,5 @@
/* Copyright (c) 2016 VMware
* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@ -8,12 +9,15 @@
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/pkt_cls.h>
#include <net/ipv6.h>
#include "bpf_helpers.h"
#define _htonl __builtin_bswap32
#define ERROR(ret) do {\
char fmt[] = "ERROR line:%d ret:%d\n";\
bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
@ -188,4 +192,190 @@ int _geneve_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}
/*
 * Egress program for the metadata-mode ipip device: choose the outer IPv4
 * destination per packet. ICMP and TCP port 5200 go to 172.16.1.100, TCP
 * port 5201 goes to 172.16.1.101; everything else is dropped.
 */
SEC("ipip_set_tunnel")
int _ipip_set_tunnel(struct __sk_buff *skb)
{
	void *pkt_end = (void *)(long)skb->data_end;
	void *pkt = (void *)(long)skb->data;
	struct bpf_tunnel_key key = {};
	struct iphdr *ip4 = pkt;
	struct tcphdr *th = pkt + sizeof(*ip4);
	int err;

	/* One bounds check that covers both the IPv4 and the TCP header. */
	if (pkt + sizeof(*ip4) + sizeof(*th) > pkt_end) {
		ERROR(1);
		return TC_ACT_SHOT;
	}

	key.tunnel_ttl = 64;

	switch (ip4->protocol) {
	case IPPROTO_ICMP:
		key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
		break;
	case IPPROTO_TCP:
		/* The TCP header offset above assumes no IPv4 options. */
		if (ip4->ihl != 5)
			return TC_ACT_SHOT;

		if (th->dest == htons(5200))
			key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
		else if (th->dest == htons(5201))
			key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
		else
			return TC_ACT_SHOT;
		break;
	default:
		return TC_ACT_SHOT;
	}

	err = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/*
 * Ingress program for the metadata-mode ipip device: read the tunnel key
 * attached to the packet and trace the remote IPv4 address.
 */
SEC("ipip_get_tunnel")
int _ipip_get_tunnel(struct __sk_buff *skb)
{
	char msg[] = "remote ip 0x%x\n";
	struct bpf_tunnel_key key;
	int err;

	err = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	bpf_trace_printk(msg, sizeof(msg), key.remote_ipv4);
	return TC_ACT_OK;
}
/*
 * Egress program for the metadata-mode IPv4-over-IPv6 device: choose the
 * outer IPv6 destination per packet. ICMP and TCP port 5200 go to
 * 2401:db00::1, TCP port 5201 goes to 2401:db00::2; everything else is
 * traced and dropped.
 */
SEC("ipip6_set_tunnel")
int _ipip6_set_tunnel(struct __sk_buff *skb)
{
	void *pkt_end = (void *)(long)skb->data_end;
	void *pkt = (void *)(long)skb->data;
	struct bpf_tunnel_key key = {};
	struct iphdr *ip4 = pkt;
	struct tcphdr *th = pkt + sizeof(*ip4);
	int is_icmp;
	int err;

	/* One bounds check that covers both the IPv4 and the TCP header. */
	if (pkt + sizeof(*ip4) + sizeof(*th) > pkt_end) {
		ERROR(1);
		return TC_ACT_SHOT;
	}

	key.tunnel_ttl = 64;
	key.remote_ipv6[0] = _htonl(0x2401db00);

	/* Anything that is neither ICMP nor option-less TCP is rejected. */
	is_icmp = ip4->protocol == IPPROTO_ICMP;
	if (!is_icmp && (ip4->protocol != IPPROTO_TCP || ip4->ihl != 5)) {
		ERROR(ip4->protocol);
		return TC_ACT_SHOT;
	}

	if (is_icmp || th->dest == htons(5200)) {
		key.remote_ipv6[3] = _htonl(1);
	} else if (th->dest == htons(5201)) {
		key.remote_ipv6[3] = _htonl(2);
	} else {
		ERROR(th->dest);
		return TC_ACT_SHOT;
	}

	err = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/*
 * Ingress program for the metadata-mode IPv4-over-IPv6 device: read the
 * tunnel key and trace the first and last 32-bit words of the remote IPv6
 * address.
 */
SEC("ipip6_get_tunnel")
int _ipip6_get_tunnel(struct __sk_buff *skb)
{
	char msg[] = "remote ip6 %x::%x\n";
	struct bpf_tunnel_key key;
	int err;

	err = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	bpf_trace_printk(msg, sizeof(msg), _htonl(key.remote_ipv6[0]),
			 _htonl(key.remote_ipv6[3]));
	return TC_ACT_OK;
}
/*
 * Egress program for the metadata-mode IPv6-over-IPv6 device: choose the
 * outer IPv6 destination per packet. NEXTHDR_ICMP and TCP port 5200 go to
 * 2401:db00::1, TCP port 5201 goes to 2401:db00::2; everything else is
 * traced and dropped.
 */
SEC("ip6ip6_set_tunnel")
int _ip6ip6_set_tunnel(struct __sk_buff *skb)
{
	void *pkt_end = (void *)(long)skb->data_end;
	void *pkt = (void *)(long)skb->data;
	struct bpf_tunnel_key key = {};
	struct ipv6hdr *ip6 = pkt;
	struct tcphdr *th = pkt + sizeof(*ip6);
	int err;

	/* One bounds check that covers both the IPv6 and the TCP header. */
	if (pkt + sizeof(*ip6) + sizeof(*th) > pkt_end) {
		ERROR(1);
		return TC_ACT_SHOT;
	}

	key.tunnel_ttl = 64;
	key.remote_ipv6[0] = _htonl(0x2401db00);

	switch (ip6->nexthdr) {
	case NEXTHDR_ICMP:
		key.remote_ipv6[3] = _htonl(1);
		break;
	case NEXTHDR_TCP:
		if (th->dest == htons(5200)) {
			key.remote_ipv6[3] = _htonl(1);
		} else if (th->dest == htons(5201)) {
			key.remote_ipv6[3] = _htonl(2);
		} else {
			ERROR(th->dest);
			return TC_ACT_SHOT;
		}
		break;
	default:
		ERROR(ip6->nexthdr);
		return TC_ACT_SHOT;
	}

	err = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/*
 * Ingress program for the metadata-mode IPv6-over-IPv6 device: read the
 * tunnel key and trace the first and last 32-bit words of the remote IPv6
 * address.
 */
SEC("ip6ip6_get_tunnel")
int _ip6ip6_get_tunnel(struct __sk_buff *skb)
{
	char msg[] = "remote ip6 %x::%x\n";
	struct bpf_tunnel_key key;
	int err;

	err = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
	if (err < 0) {
		ERROR(err);
		return TC_ACT_SHOT;
	}

	bpf_trace_printk(msg, sizeof(msg), _htonl(key.remote_ipv6[0]),
			 _htonl(key.remote_ipv6[3]));
	return TC_ACT_OK;
}
char _license[] SEC("license") = "GPL";

178
samples/bpf/test_ipip.sh Executable file
View File

@ -0,0 +1,178 @@
#!/bin/bash
# Build the test topology: three namespaces (at_ns0..at_ns2), each holding one
# end of a veth pair, with the peer ends (veth0b..veth2b) enslaved to bridge
# br0 in the current namespace.
config_device() {
	local idx

	for idx in 0 1 2; do
		ip netns add at_ns$idx
	done
	for idx in 0 1 2; do
		ip link add veth$idx type veth peer name veth${idx}b
	done
	for idx in 0 1 2; do
		ip link set veth${idx}b up
	done
	for idx in 0 1 2; do
		ip link set dev veth${idx}b mtu 1500
	done
	for idx in 0 1 2; do
		ip link set veth$idx netns at_ns$idx
	done

	# Underlay addresses: one IPv4 and one IPv6 address per namespace.
	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
	ip netns exec at_ns0 ip link set dev veth0 up

	ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1
	ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad
	ip netns exec at_ns1 ip link set dev veth1 up

	ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2
	ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad
	ip netns exec at_ns2 ip link set dev veth2 up

	# Bridge the three peer ends together.
	ip link add br0 type bridge
	ip link set br0 up
	ip link set dev br0 mtu 1500
	for idx in 0 1 2; do
		ip link set veth${idx}b master br0
	done
}
# Create classic ipip tunnels named $DEV_NS in at_ns0 and at_ns1, both pointing
# at 172.16.1.200, and a metadata-mode ("external") ipip device $DEV in at_ns2.
add_ipip_tunnel() {
	local idx

	# at_ns0 and at_ns1 deliberately share the same inner IP address.
	for idx in 0 1; do
		ip netns exec at_ns$idx \
			ip link add dev $DEV_NS type ipip local 172.16.1.10$idx remote 172.16.1.200
		ip netns exec at_ns$idx ip link set dev $DEV_NS up
		ip netns exec at_ns$idx ip addr add dev $DEV_NS 10.1.1.100/24
	done

	ip netns exec at_ns2 ip link add dev $DEV type ipip external
	ip netns exec at_ns2 ip link set dev $DEV up
	ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
}
# Create classic IPv4-over-IPv6 tunnels named $DEV_NS in at_ns0 and at_ns1,
# both pointing at 2401:db00::3, and a metadata-mode ("external") ipip6
# device $DEV in at_ns2.
add_ipip6_tunnel() {
	local idx

	# at_ns0 and at_ns1 deliberately share the same inner IP address.
	for idx in 0 1; do
		ip netns exec at_ns$idx \
			ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::$((idx + 1))/64 remote 2401:db00::3/64
		ip netns exec at_ns$idx ip link set dev $DEV_NS up
		ip netns exec at_ns$idx ip addr add dev $DEV_NS 10.1.1.100/24
	done

	ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external
	ip netns exec at_ns2 ip link set dev $DEV up
	ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
}
# Create classic IPv6-over-IPv6 tunnels named $DEV_NS in at_ns0 and at_ns1,
# both pointing at 2401:db00::3, and a metadata-mode ("external") ip6ip6
# device $DEV in at_ns2.
add_ip6ip6_tunnel() {
	local idx

	# at_ns0 and at_ns1 deliberately share the same inner IP address.
	for idx in 0 1; do
		ip netns exec at_ns$idx \
			ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::$((idx + 1))/64 remote 2401:db00::3/64
		ip netns exec at_ns$idx ip link set dev $DEV_NS up
		ip netns exec at_ns$idx ip addr add dev $DEV_NS 2601:646::1/64
	done

	ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external
	ip netns exec at_ns2 ip link set dev $DEV up
	ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64
}
# Attach the BPF programs from tcbpf2_kern.o to a device inside at_ns2.
#   $1 - device name (stored in the global DEV)
#   $2 - ELF section of the egress program (stored in SET_TUNNEL)
#   $3 - ELF section of the ingress program (stored in GET_TUNNEL)
attach_bpf() {
	local tc_ns2="ip netns exec at_ns2 tc"

	DEV=$1
	SET_TUNNEL=$2
	GET_TUNNEL=$3

	$tc_ns2 qdisc add dev $DEV clsact
	$tc_ns2 filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
	$tc_ns2 filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
}
# IPv4 over IPv4 tunnel
test_ipip() {
	local idx port

	DEV_NS=ipip_std
	DEV=ipip_bpf
	config_device
	# Debugging aid, disabled by default:
	# tcpdump -nei br0 &
	cat /sys/kernel/debug/tracing/trace_pipe &

	add_ipip_tunnel
	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel

	# Ping across the tunnel in both directions.
	ip netns exec at_ns0 ping -c 1 10.1.1.200
	ip netns exec at_ns2 ping -c 1 10.1.1.100

	# One iperf server per classic endpoint: port 5200 in at_ns0, 5201 in at_ns1.
	for idx in 0 1; do
		ip netns exec at_ns$idx iperf -sD -p 520$idx > /dev/null
	done
	sleep 0.2

	# tcp check _same_ IP over different tunnels
	for port in 5200 5201; do
		ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p $port
	done
	cleanup
}
# IPv4 over IPv6 tunnel
test_ipip6() {
	local idx port

	DEV_NS=ipip_std
	DEV=ipip_bpf
	config_device
	# Debugging aid, disabled by default:
	# tcpdump -nei br0 &
	cat /sys/kernel/debug/tracing/trace_pipe &

	add_ipip6_tunnel
	attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel

	# Ping across the tunnel in both directions.
	ip netns exec at_ns0 ping -c 1 10.1.1.200
	ip netns exec at_ns2 ping -c 1 10.1.1.100

	# One iperf server per classic endpoint: port 5200 in at_ns0, 5201 in at_ns1.
	for idx in 0 1; do
		ip netns exec at_ns$idx iperf -sD -p 520$idx > /dev/null
	done
	sleep 0.2

	# tcp check _same_ IP over different tunnels
	for port in 5200 5201; do
		ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p $port
	done
	cleanup
}
# IPv6 over IPv6 tunnel
test_ip6ip6() {
	local idx port

	DEV_NS=ipip_std
	DEV=ipip_bpf
	config_device
	# Debugging aid, disabled by default:
	# tcpdump -nei br0 &
	cat /sys/kernel/debug/tracing/trace_pipe &

	add_ip6ip6_tunnel
	attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel

	# Ping across the tunnel in both directions.
	ip netns exec at_ns0 ping -6 -c 1 2601:646::2
	ip netns exec at_ns2 ping -6 -c 1 2601:646::1

	# One iperf server per classic endpoint: port 5200 in at_ns0, 5201 in at_ns1.
	for idx in 0 1; do
		ip netns exec at_ns$idx iperf -6sD -p 520$idx > /dev/null
	done
	sleep 0.2

	# tcp check _same_ IP over different tunnels
	for port in 5200 5201; do
		ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p $port
	done
	cleanup
}
# Tear down everything a test may have created. Errors are tolerated while
# tearing down (errexit and xtrace are off); both options are switched on
# again before returning.
cleanup() {
	local idx

	set +ex
	pkill iperf
	for idx in 0 1 2; do
		ip netns delete at_ns$idx
	done
	for idx in 0 1 2; do
		ip link del veth$idx
	done
	ip link del br0
	pkill tcpdump
	pkill cat
	set -ex
}
# Remove leftovers from an earlier run, then run every tunnel test in order.
cleanup
echo "Testing IP tunnels..."
for tunnel_test in test_ipip test_ipip6 test_ip6ip6; do
	$tunnel_test
done
echo "*** PASS ***"

View File

@ -9,15 +9,13 @@
# local 172.16.1.200 remote 172.16.1.100
# veth1 IP: 172.16.1.200, tunnel dev <type>11
set -e
function config_device {
ip netns add at_ns0
ip link add veth0 type veth peer name veth1
ip link set veth0 netns at_ns0
ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
ip netns exec at_ns0 ip link set dev veth0 up
ip link set dev veth1 up
ip link set dev veth1 up mtu 1500
ip addr add dev veth1 172.16.1.200/24
}
@ -67,6 +65,19 @@ function add_geneve_tunnel {
ip addr add dev $DEV 10.1.1.200/24
}
# Create a classic $TYPE tunnel $DEV_NS inside at_ns0 and a metadata-mode
# ("external") $TYPE device $DEV in the current namespace.
add_ipip_tunnel() {
	local in_ns0="ip netns exec at_ns0"

	# in namespace
	$in_ns0 ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
	$in_ns0 ip link set dev $DEV_NS up
	$in_ns0 ip addr add dev $DEV_NS 10.1.1.100/24

	# out of namespace
	ip link add dev $DEV type $TYPE external
	ip link set dev $DEV up
	ip addr add dev $DEV 10.1.1.200/24
}
function attach_bpf {
DEV=$1
SET_TUNNEL=$2
@ -85,6 +96,7 @@ function test_gre {
attach_bpf $DEV gre_set_tunnel gre_get_tunnel
ping -c 1 10.1.1.100
ip netns exec at_ns0 ping -c 1 10.1.1.200
cleanup
}
function test_vxlan {
@ -96,6 +108,7 @@ function test_vxlan {
attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
ping -c 1 10.1.1.100
ip netns exec at_ns0 ping -c 1 10.1.1.200
cleanup
}
function test_geneve {
@ -107,21 +120,48 @@ function test_geneve {
attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
ping -c 1 10.1.1.100
ip netns exec at_ns0 ping -c 1 10.1.1.200
cleanup
}
# End-to-end check of the metadata-mode ipip device: ping in both directions,
# then a short iperf TCP transfer to the endpoint inside at_ns0.
test_ipip() {
	local in_ns0="ip netns exec at_ns0"

	TYPE=ipip
	DEV_NS=ipip00
	DEV=ipip11

	config_device
	tcpdump -nei veth1 &
	cat /sys/kernel/debug/tracing/trace_pipe &

	add_ipip_tunnel
	ethtool -K veth1 gso off gro off rx off tx off
	ip link set dev veth1 mtu 1500
	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel

	ping -c 1 10.1.1.100
	$in_ns0 ping -c 1 10.1.1.200

	$in_ns0 iperf -sD -p 5200 > /dev/null
	sleep 0.2
	iperf -c 10.1.1.100 -n 5k -p 5200
	cleanup
}
function cleanup {
set +ex
pkill iperf
ip netns delete at_ns0
ip link del veth1
ip link del $DEV
ip link del ipip11
ip link del gretap11
ip link del geneve11
pkill tcpdump
pkill cat
set -ex
}
cleanup
echo "Testing GRE tunnel..."
test_gre
cleanup
echo "Testing VXLAN tunnel..."
test_vxlan
cleanup
echo "Testing GENEVE tunnel..."
test_geneve
cleanup
echo "Success"
echo "Testing IPIP tunnel..."
test_ipip
echo "*** PASS ***"