From ac1cf3990c99802eae3aa735b35c94a2131eb9fe Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:20 +0200 Subject: [PATCH 01/13] ip_tunnels: remove custom alignment and packing The custom alignment of struct ip_tunnel_key is unnecessary. In struct sw_flow_key, it starts at offset 256, in struct ip_tunnel_info it's the first field. The structure is also packed even without the __packed keyword. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 984dbfa15e13..81cf11c931e4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -36,7 +36,7 @@ struct ip_tunnel_key { __u8 ipv4_ttl; __be16 tp_src; __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ +}; /* Indicates whether the tunnel info structure represents receive * or transmit tunnel parameters. From 6b8847c5a2bafbbf92f4b779f87165093457ea68 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:21 +0200 Subject: [PATCH 02/13] ip_tunnels: use u8/u16/u32 The ip_tunnels.h include file uses mixture of __u16 and u16 (etc.) types. Unify it to the non-underscore variants. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 81cf11c931e4..ca173f22f07f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -32,8 +32,8 @@ struct ip_tunnel_key { __be32 ipv4_src; __be32 ipv4_dst; __be16 tun_flags; - __u8 ipv4_tos; - __u8 ipv4_ttl; + u8 ipv4_tos; + u8 ipv4_ttl; __be16 tp_src; __be16 tp_dst; }; @@ -64,8 +64,8 @@ struct ip_tunnel_6rd_parm { #endif struct ip_tunnel_encap { - __u16 type; - __u16 flags; + u16 type; + u16 flags; __be16 sport; __be16 dport; }; @@ -95,8 +95,8 @@ struct ip_tunnel { * arrived */ /* These four fields used only by GRE */ - __u32 i_seqno; /* The last seen seqno */ - __u32 o_seqno; /* The last output seqno */ + u32 i_seqno; /* The last seen seqno */ + u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ int mlink; @@ -273,8 +273,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet); + __be32 src, __be32 dst, u8 proto, + u8 tos, u8 ttl, __be16 df, bool xnet); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); From 376534a3d17002d608985bd67c3b0880eacadd14 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:22 +0200 Subject: [PATCH 03/13] ip_tunnels: use offsetofend Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ca173f22f07f..cc3b39e9010b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -23,9 +23,7 @@ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ -#define IP_TUNNEL_KEY_SIZE \ - (offsetof(struct ip_tunnel_key, tp_dst) + \ - FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) +#define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) struct ip_tunnel_key { __be64 tun_id; From c1ea5d672aaff08da337dee735dbb548e3415585 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:23 +0200 Subject: [PATCH 04/13] ip_tunnels: add IPv6 addresses to ip_tunnel_key Add the IPv6 addresses as an union with IPv4 ones. When using IPv4, the newly introduced padding after the IPv4 addresses needs to be zeroed out. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 +++--- include/net/ip_tunnels.h | 24 ++++++++++++++++++++---- net/core/filter.c | 4 ++-- net/ipv4/ip_gre.c | 10 +++++----- net/ipv4/ip_tunnel_core.c | 8 ++++---- net/openvswitch/flow_netlink.c | 18 +++++++++--------- net/openvswitch/flow_table.c | 2 +- net/openvswitch/vport-geneve.c | 2 +- net/openvswitch/vport.c | 2 +- net/openvswitch/vport.h | 4 ++-- 10 files changed, 48 insertions(+), 32 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ad51dac88d19..30a7abcf2c09 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1276,8 +1276,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto drop; info = &tun_dst->u.tun_info; - info->key.ipv4_src = iph->saddr; - info->key.ipv4_dst = iph->daddr; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; info->key.ipv4_tos = iph->tos; info->key.ipv4_ttl = iph->ttl; info->key.tp_src = udp_hdr(skb)->source; @@ -1925,7 +1925,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); remote_ip.sin.sin_family = AF_INET; - remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; dst = &remote_ip; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cc3b39e9010b..6a51371dad00 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -25,10 +25,24 @@ /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) +/* Used to memset ipv4 address padding. */ +#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) +#define IP_TUNNEL_KEY_IPV4_PAD_LEN \ + (FIELD_SIZEOF(struct ip_tunnel_key, u) - \ + FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4)) + struct ip_tunnel_key { __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; + union { + struct { + __be32 src; + __be32 dst; + } ipv4; + struct { + struct in6_addr src; + struct in6_addr dst; + } ipv6; + } u; __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; @@ -177,8 +191,10 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, const void *opts, u8 opts_len) { tun_info->key.tun_id = tun_id; - tun_info->key.ipv4_src = saddr; - tun_info->key.ipv4_dst = daddr; + tun_info->key.u.ipv4.src = saddr; + tun_info->key.u.ipv4.dst = daddr; + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, + 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); tun_info->key.ipv4_tos = tos; tun_info->key.ipv4_ttl = ttl; tun_info->key.tun_flags = tun_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 83f08cefeab7..379568562ffb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1495,7 +1495,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src); + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); return 0; } @@ -1529,7 +1529,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fb44d693796e..b7bb7d6aa7a8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -407,8 +407,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; info = &tun_dst->u.tun_info; - info->key.ipv4_src = iph->saddr; - info->key.ipv4_dst = iph->daddr; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; info->key.ipv4_tos = iph->tos; info->key.ipv4_ttl = iph->ttl; @@ -527,8 +527,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; memset(&fl, 0, sizeof(fl)); - fl.daddr = key->ipv4_dst; - fl.saddr = key->ipv4_src; + fl.daddr = key->u.ipv4.dst; + fl.saddr = key->u.ipv4.src; fl.flowi4_tos = RT_TOS(key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -564,7 +564,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - key->ipv4_dst, IPPROTO_GRE, + key->u.ipv4.dst, IPPROTO_GRE, key->ipv4_tos, key->ipv4_ttl, df, false); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1c2389d582a6..93907d71cda6 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -227,10 +227,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) - tun_info->key.ipv4_dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) - tun_info->key.ipv4_src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); @@ -262,8 +262,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || - nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.ipv4_dst) || - nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.ipv4_src) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a6eb77ab1a64..a7f866374817 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -534,11 +534,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: @@ -609,7 +609,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, } if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { + if (!match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } @@ -647,13 +647,13 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->ipv4_src && + if (output->u.ipv4.src && nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->ipv4_src)) + output->u.ipv4.src)) return -EMSGSIZE; - if (output->ipv4_dst && + if (output->u.ipv4.dst && nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->ipv4_dst)) + output->u.ipv4.dst)) return -EMSGSIZE; if (output->ipv4_tos && nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) @@ -1116,7 +1116,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.ipv4_dst) + if (match->key->tun_key.u.ipv4.dst) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1287,7 +1287,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask)) { + if ((swkey->tun_key.u.ipv4.dst || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 3a9d1dde76ed..d22d8e948d0f 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -426,7 +426,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.ipv4_dst) + if (key->tun_key.u.ipv4.dst) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 1da3a14d1010..023813d05f88 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -203,7 +203,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) } err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->u.ipv4.dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, sport, dport, tun_key->tun_flags, vni, opts_len, opts, !!(tun_key->tun_flags & TUNNEL_CSUM), false); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d14f59403c5e..a06adc72a58d 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -603,7 +603,7 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, * saddr, tp_src and tp_dst */ __ip_tunnel_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, + fl.saddr, tun_key->u.ipv4.dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, tp_src, tp_dst, diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1a689c28b5a6..43d8f5a835cb 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -254,8 +254,8 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, struct rtable *rt; memset(fl, 0, sizeof(*fl)); - fl->daddr = key->ipv4_dst; - fl->saddr = key->ipv4_src; + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; fl->flowi4_tos = RT_TOS(key->ipv4_tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; From 7c383fb2254c44e096427470da6a36380169b548 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:24 +0200 Subject: [PATCH 05/13] ip_tunnels: use tos and ttl fields also for IPv6 Rename the ipv4_tos and ipv4_ttl fields to just 'tos' and 'ttl', as they'll be used with IPv6 tunnels, too. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- include/net/ip_tunnels.h | 8 ++++---- net/ipv4/ip_gre.c | 8 ++++---- net/ipv4/ip_tunnel_core.c | 8 ++++---- net/openvswitch/flow_netlink.c | 10 +++++----- net/openvswitch/vport-geneve.c | 4 ++-- net/openvswitch/vport.c | 4 ++-- net/openvswitch/vport.h | 2 +- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30a7abcf2c09..ebeb3def06c5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1278,8 +1278,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) info = &tun_dst->u.tun_info; info->key.u.ipv4.src = iph->saddr; info->key.u.ipv4.dst = iph->daddr; - info->key.ipv4_tos = iph->tos; - info->key.ipv4_ttl = iph->ttl; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; @@ -1960,8 +1960,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, else flags &= ~VXLAN_F_UDP_CSUM; - ttl = info->key.ipv4_ttl; - tos = info->key.ipv4_tos; + ttl = info->key.ttl; + tos = info->key.tos; if (info->options_len) md = ip_tunnel_info_opts(info, sizeof(*md)); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6a51371dad00..224e4ecec91b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -44,8 +44,8 @@ struct ip_tunnel_key { } ipv6; } u; __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -195,8 +195,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_info->key.u.ipv4.dst = daddr; memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - tun_info->key.ipv4_tos = tos; - tun_info->key.ipv4_ttl = ttl; + tun_info->key.tos = tos; + tun_info->key.ttl = ttl; tun_info->key.tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b7bb7d6aa7a8..5193618b2600 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -409,8 +409,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) info = &tun_dst->u.tun_info; info->key.u.ipv4.src = iph->saddr; info->key.u.ipv4.dst = iph->daddr; - info->key.ipv4_tos = iph->tos; - info->key.ipv4_ttl = iph->ttl; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; info->mode = IP_TUNNEL_INFO_RX; info->key.tun_flags = tpi->flags & @@ -529,7 +529,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); fl.daddr = key->u.ipv4.dst; fl.saddr = key->u.ipv4.src; - fl.flowi4_tos = RT_TOS(key->ipv4_tos); + fl.flowi4_tos = RT_TOS(key->tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -565,7 +565,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->ipv4_tos, key->ipv4_ttl, df, false); + key->tos, key->ttl, df, false); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 93907d71cda6..f0514e39e57c 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -233,10 +233,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) - tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); if (tb[LWTUNNEL_IP_TOS]) - tun_info->key.ipv4_tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_SPORT]) tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); @@ -264,8 +264,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || - nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || - nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a7f866374817..4e7a3f7facc2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -542,11 +542,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + SW_FLOW_KEY_PUT(match, tun_key.tos, nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + SW_FLOW_KEY_PUT(match, tun_key.ttl, nla_get_u8(a), is_mask); ttl = true; break; @@ -655,10 +655,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->u.ipv4.dst)) return -EMSGSIZE; - if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 023813d05f88..d01bd6360970 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -203,8 +203,8 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) } err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->u.ipv4.dst, tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, sport, dport, + tun_key->u.ipv4.dst, tun_key->tos, + tun_key->ttl, df, sport, dport, tun_key->tun_flags, vni, opts_len, opts, !!(tun_key->tun_flags & TUNNEL_CSUM), false); if (err < 0) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index a06adc72a58d..d73e5a16e7ca 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -604,8 +604,8 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, */ __ip_tunnel_info_init(egress_tun_info, fl.saddr, tun_key->u.ipv4.dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, + tun_key->tos, + tun_key->ttl, tp_src, tp_dst, tun_key->tun_id, tun_key->tun_flags, diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 43d8f5a835cb..b88b3ee86f07 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -256,7 +256,7 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, memset(fl, 0, sizeof(*fl)); fl->daddr = key->u.ipv4.dst; fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; From 61adedf3e3f1d3f032c5a6a299978d91eff6d555 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:25 +0200 Subject: [PATCH 06/13] route: move lwtunnel state to dst_entry Currently, the lwtunnel state resides in per-protocol data. This is a problem if we encapsulate ipv6 traffic in an ipv4 tunnel (or vice versa). The xmit function of the tunnel does not know whether the packet has been routed to it by ipv4 or ipv6, yet it needs the lwtstate data. Moving the lwtstate data to dst_entry makes such inter-protocol tunneling possible. As a bonus, this brings a nice diffstat. Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 - drivers/net/vxlan.c | 4 +- include/net/dst.h | 3 +- include/net/dst_metadata.h | 15 +++----- include/net/ip6_fib.h | 1 - include/net/lwtunnel.h | 12 ------ include/net/route.h | 1 - net/core/dst.c | 3 ++ net/core/filter.c | 2 +- net/core/lwtunnel.c | 70 +++++----------------------------- net/ipv4/ip_gre.c | 2 +- net/ipv4/route.c | 20 ++++------ net/ipv6/ila.c | 14 ++----- net/ipv6/ip6_fib.c | 1 - net/ipv6/route.c | 20 +++++----- net/mpls/mpls_iptunnel.c | 7 +--- net/openvswitch/vport-netdev.c | 2 +- 17 files changed, 48 insertions(+), 130 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index dbeffe789185..b3d9c5546c79 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -295,7 +295,6 @@ static struct rtable *vrf_rtable_create(struct net_device *dev) rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->rt_uncached_list = NULL; - rth->rt_lwtstate = NULL; } return rth; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ebeb3def06c5..93613ffd8d7e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1909,7 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags = vxlan->flags; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); if (rdst) { dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; @@ -2105,7 +2105,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_fdb *f; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst.h b/include/net/dst.h index 2578811cef51..0a9a723f6c19 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -44,6 +44,7 @@ struct dst_entry { #else void *__pad1; #endif + struct lwtunnel_state *lwtstate; int (*input)(struct sk_buff *); int (*output)(struct sock *sk, struct sk_buff *skb); @@ -89,7 +90,7 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[1]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 075f523ff23f..2cb52d562272 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,22 +23,17 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, - int family) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); - struct rtable *rt; + struct dst_entry *dst; if (md_dst) return &md_dst->u.tun_info; - switch (family) { - case AF_INET: - rt = (struct rtable *)skb_dst(skb); - if (rt && rt->rt_lwtstate) - return lwt_tun_info(rt->rt_lwtstate); - break; - } + dst = skb_dst(skb); + if (dst && dst->lwtstate) + return lwt_tun_info(dst->lwtstate); return NULL; } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 276328e3daa6..063d30474cf6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,7 +133,6 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; - struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index cfee53916ba5..843489884448 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -87,9 +87,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); -int lwtunnel_input6(struct sk_buff *skb); #else @@ -164,21 +162,11 @@ static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } -static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } -static inline int lwtunnel_input6(struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/include/net/route.h b/include/net/route.h index 6dda2c1bf8c6..395d79bb556c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,7 +66,6 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; - struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/core/dst.c b/net/core/dst.c index f8694d1b8702..50dcdbb0ee46 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif + dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; @@ -264,6 +266,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) kfree(dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); + lwtstate_put(dst->lwtstate); dst = child; if (dst) { diff --git a/net/core/filter.c b/net/core/filter.c index 379568562ffb..b4adc961413f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1489,7 +1489,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET); + struct ip_tunnel_info *info = skb_tunnel_info(skb); if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) return -EINVAL; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 3331585174d9..e924c2e08554 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -179,14 +179,16 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) } EXPORT_SYMBOL(lwtunnel_cmp_encap); -int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -209,47 +211,18 @@ int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, return ret; } - -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt6i_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IPV6); - - return __lwtunnel_output(sk, skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_output6); - -int lwtunnel_output(struct sock *sk, struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IP); - - return __lwtunnel_output(sk, skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_output); -int __lwtunnel_input(struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_input(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -272,27 +245,4 @@ int __lwtunnel_input(struct sk_buff *skb, return ret; } - -int lwtunnel_input6(struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt6i_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_input6); - -int lwtunnel_input(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5193618b2600..1bf328182697 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -521,7 +521,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df, flags; int err; - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) goto err_free_skb; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2403e85107f0..f3087aaa6dd8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1359,7 +1359,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } - lwtstate_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1408,7 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1494,7 +1493,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1624,19 +1622,18 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_output = rth->dst.output; + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; rth->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_input = rth->dst.input; + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } skb_dst_set(skb, &rth->dst); @@ -1695,7 +1692,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else @@ -1815,7 +1812,6 @@ out: return err; rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -2006,7 +2002,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2029,7 +2024,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) + if (lwtunnel_output_redirect(rth->dst.lwtstate)) rth->dst.output = lwtunnel_output; return rth; @@ -2293,7 +2288,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - rt->rt_lwtstate = NULL; dst_free(new); } diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index 2540ab4b76d1..f011c3d5ca40 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -89,16 +89,13 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) static int ila_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); - - return rt6->rt6i_lwtstate->orig_output(sk, skb); + return dst->lwtstate->orig_output(sk, skb); drop: kfree_skb(skb); @@ -108,16 +105,13 @@ static int ila_output(struct sock *sk, struct sk_buff *skb) static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); - - return rt6->rt6i_lwtstate->orig_input(skb); + return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5693b5eb8482..865e777ae20c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -178,7 +178,6 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - lwtstate_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3733049715e..e6bbcdee7707 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1784,14 +1784,14 @@ int ip6_route_add(struct fib6_config *cfg) cfg->fc_encap, &lwtstate); if (err) goto out; - rt->rt6i_lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output6; + rt->dst.lwtstate = lwtstate_get(lwtstate); + if (lwtunnel_output_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_output = rt->dst.output; + rt->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input6; + if (lwtunnel_input_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input; } } @@ -2174,7 +2174,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2838,7 +2838,7 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->rt6i_lwtstate); + + lwtunnel_get_encap_size(rt->dst.lwtstate); } static int rt6_fill_node(struct net *net, @@ -2991,7 +2991,7 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + lwtunnel_fill_encap(skb, rt->dst.lwtstate); nlmsg_end(skb, nlh); return 0; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 276f8c992218..3da5ca3ba563 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,7 +48,6 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; - struct lwtunnel_state *lwtstate = NULL; int err = 0; bool bos; int i; @@ -58,11 +57,9 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { ttl = ip_hdr(skb)->ttl; rt = (struct rtable *)dst; - lwtstate = rt->rt_lwtstate; } else if (skb->protocol == htons(ETH_P_IPV6)) { ttl = ipv6_hdr(skb)->hop_limit; rt6 = (struct rt6_info *)dst; - lwtstate = rt6->rt6i_lwtstate; } else { goto drop; } @@ -72,12 +69,12 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; if (!mpls_output_possible(out_dev) || - !lwtstate || skb_warn_if_lro(skb)) + !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); - tun_encap_info = mpls_lwtunnel_encap(lwtstate); + tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 4b70aaa4a746..a75011505039 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -57,7 +57,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET)); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: From 06e9d040ba08b0f645783ff958384d5837b3fa3a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:26 +0200 Subject: [PATCH 07/13] ipv6: drop metadata dst in ip6_route_input The fix in commit 48fb6b554501 is incomplete, as now ip6_route_input can be called with non-NULL dst if it's a metadata dst and the reference is leaked. Drop the reference. Fixes: 48fb6b554501 ("ipv6: fix crash over flow-based vxlan device") Fixes: ee122c79d422 ("vxlan: Flow based tunneling") CC: Wei-Chun Chao CC: Thomas Graf Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6bbcdee7707..0947ad0b3de8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1140,6 +1140,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } From ab450605b35caa768ca33e86db9403229bf42be4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:27 +0200 Subject: [PATCH 08/13] ipv6: ndisc: inherit metadata dst when creating ndisc requests If output device wants to see the dst, inherit the dst of the original skb in the ndisc request. This is an IPv6 counterpart of commit 0accfc268f4d ("arp: Inherit metadata dst when creating ARP requests"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ndisc.c | 10 +++++++--- net/ipv6/route.c | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b3a7751251b4..aba5695fadb0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -182,7 +182,8 @@ int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 59242399b0b5..0f08d3b9e238 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3656,7 +3656,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL); out: in6_ifa_put(ifp); rtnl_unlock(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3054611f88a..13d3c2beb93e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -553,7 +553,8 @@ static void ndisc_send_unsol_na(struct net_device *dev) void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr) + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -589,6 +590,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) + skb_dst_copy(skb, oskb); + ndisc_send_skb(skb, daddr, saddr); } @@ -675,12 +679,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, neigh, target, target, saddr); + ndisc_send_ns(dev, neigh, target, target, saddr, skb); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr, skb); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0947ad0b3de8..c4f3b9fcca9d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -538,7 +538,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL); dev_put(work->dev); kfree(work); } From 705cc62f6728c5a23e3c82465aa94e652e0b50e4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:28 +0200 Subject: [PATCH 09/13] vxlan: provide access function for vxlan socket address family Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- include/net/vxlan.h | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 93613ffd8d7e..070149f77072 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -236,7 +236,7 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && - inet_sk(vs->sock->sk)->sk.sk_family == family && + vxlan_get_sk_family(vs) == family && vs->flags == flags) return vs; } @@ -625,7 +625,7 @@ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = vxlan_get_sk_family(vs); __be16 port = inet_sk(sk)->inet_sport; int err; @@ -650,7 +650,7 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = vxlan_get_sk_family(vs); __be16 port = inet_sk(sk)->inet_sport; rcu_read_lock(); @@ -2390,7 +2390,7 @@ void vxlan_get_rx_port(struct net_device *dev) for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { port = inet_sk(vs->sock->sk)->inet_sport; - sa_family = vs->sock->sk->sk_family; + sa_family = vxlan_get_sk_family(vs); dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, port); } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e4534f1b2d8c..43677e6b9c43 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -241,3 +241,8 @@ static inline void vxlan_get_rx_port(struct net_device *netdev) } #endif #endif + +static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) +{ + return vs->sock->sk->sk_family; +} From 6f264e47d4dbbe590ac1131587933bb87ded296d Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:29 +0200 Subject: [PATCH 10/13] vxlan: do not shadow flags variable The 'flags' variable is already defined in the outer scope. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 070149f77072..2c1abf95c17d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2025,7 +2025,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } else { struct dst_entry *ndst; struct flowi6 fl6; - u32 flags; + u32 rt6i_flags; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; @@ -2050,9 +2050,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } /* Bypass encapsulation if the destination is local */ - flags = ((struct rt6_info *)ndst)->rt6i_flags; - if (flags & RTF_LOCAL && - !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (rt6i_flags & RTF_LOCAL && + !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; dst_release(ndst); From a725e514dbb444f2a39c2bc5de72eb5efbeb7d5e Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:30 +0200 Subject: [PATCH 11/13] vxlan: metadata based tunneling for IPv6 Support metadata based (formerly flow based) tunneling also for IPv6. This complements commit ee122c79d422 ("vxlan: Flow based tunneling"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 69 ++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2c1abf95c17d..54615bb9d916 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1269,17 +1269,27 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } if (vxlan_collect_metadata(vs)) { - const struct iphdr *iph = ip_hdr(skb); - tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); if (!tun_dst) goto drop; info = &tun_dst->u.tun_info; - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; + if (vxlan_get_sk_family(vs) == AF_INET) { + const struct iphdr *iph = ip_hdr(skb); + + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; + } else { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + info->key.u.ipv6.src = ip6h->saddr; + info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); + info->key.ttl = ip6h->hop_limit; + } + info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; @@ -1894,6 +1904,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; + unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; @@ -1908,7 +1919,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, int err; u32 flags = vxlan->flags; - /* FIXME: Support IPv6 */ info = skb_tunnel_info(skb); if (rdst) { @@ -1924,8 +1934,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); - remote_ip.sin.sin_family = AF_INET; - remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; + remote_ip.sa.sa_family = family; + if (family == AF_INET) + remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; + else + remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; dst = &remote_ip; } @@ -1951,23 +1964,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); + if (info) { + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ttl; + tos = info->key.tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info, sizeof(*md)); + } else { + md->gbp = skb->mark; + } + if (dst->sa.sa_family == AF_INET) { - if (info) { - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - if (info->key.tun_flags & TUNNEL_CSUM) - flags |= VXLAN_F_UDP_CSUM; - else - flags &= ~VXLAN_F_UDP_CSUM; - - ttl = info->key.ttl; - tos = info->key.tos; - - if (info->options_len) - md = ip_tunnel_info_opts(info, sizeof(*md)); - } else { - md->gbp = skb->mark; - } + if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)) + df = htons(IP_DF); memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; @@ -2066,12 +2080,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md->gbp = skb->mark; - err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, 0, ttl, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); #endif } @@ -2104,7 +2116,6 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; - /* FIXME: Support IPv6 */ info = skb_tunnel_info(skb); skb_reset_mac_header(skb); From 904af04d30f303d96902584206457128c3051d8d Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:31 +0200 Subject: [PATCH 12/13] ipv6: route: extend flow representation with tunnel key Use flowi_tunnel in flowi6 similarly to what is done with IPv4. This complements commit 1b7179d3adff ("route: Extend flow representation with tunnel key"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/net/flow.h b/include/net/flow.h index f305588fc162..9e0297c4c11d 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -130,6 +130,7 @@ struct flowi6 { #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid +#define flowi6_tun_key __fl_common.flowic_tun_key struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4f3b9fcca9d..6c0fe4c7ce8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -54,11 +54,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -1131,6 +1133,7 @@ void ip6_route_input(struct sk_buff *skb) const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, @@ -1140,6 +1143,9 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + tun_info = skb_tunnel_info(skb); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } From 32a2b002ce615eadd3bfaddabde290f70a1dd17b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:32 +0200 Subject: [PATCH 13/13] ipv6: route: per route IP tunnel metadata via lightweight tunnel Allow specification of per route IP tunnel instructions also for IPv6. This complements commit 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel"). Signed-off-by: Jiri Benc CC: YOSHIFUJI Hideaki Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 16 ++++++ net/ipv4/ip_tunnel_core.c | 102 ++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa84ca396bcb..34141a5dfe74 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -8,6 +8,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, LWTUNNEL_ENCAP_ILA, + LWTUNNEL_ENCAP_IP6, __LWTUNNEL_ENCAP_MAX, }; @@ -28,4 +29,19 @@ enum lwtunnel_ip_t { #define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) +enum lwtunnel_ip6_t { + LWTUNNEL_IP6_UNSPEC, + LWTUNNEL_IP6_ID, + LWTUNNEL_IP6_DST, + LWTUNNEL_IP6_SRC, + LWTUNNEL_IP6_HOPLIMIT, + LWTUNNEL_IP6_TC, + LWTUNNEL_IP6_SPORT, + LWTUNNEL_IP6_DPORT, + LWTUNNEL_IP6_FLAGS, + __LWTUNNEL_IP6_MAX, +}; + +#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f0514e39e57c..289b6c26ce37 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -299,9 +299,111 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .cmp_encap = ip_tun_cmp_encap, }; +static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP6; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP6_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + + if (tb[LWTUNNEL_IP6_DST]) + tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); + + if (tb[LWTUNNEL_IP6_SRC]) + tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); + + if (tb[LWTUNNEL_IP6_TC]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); + + if (tb[LWTUNNEL_IP6_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); + + if (tb[LWTUNNEL_IP6_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); + + if (tb[LWTUNNEL_IP6_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip6_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ + + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { + .build_state = ip6_tun_build_state, + .fill_encap = ip6_tun_fill_encap_info, + .get_encap_size = ip6_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + void __init ip_tunnel_core_init(void) { lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;