net/ipv6: Add support for path selection using hash of 5-tuple
Some operators prefer IPv6 path selection to use a standard 5-tuple
hash rather than just an L3 hash with the flow label. To that end
add support to IPv6 for multipath hash policy similar to bf4e0a3db9
("net: ipv4: add support for ECMP hash policy choice"). The default
is still L3 which covers source and destination addresses along with
flow label and IPv6 protocol.
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
b75cc8f90f
commit
b4bac172e9
|
@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN
|
||||||
FALSE: disabled
|
FALSE: disabled
|
||||||
Default: FALSE
|
Default: FALSE
|
||||||
|
|
||||||
|
fib_multipath_hash_policy - INTEGER
|
||||||
|
Controls which hash policy to use for multipath routes.
|
||||||
|
Default: 0 (Layer 3)
|
||||||
|
Possible values:
|
||||||
|
0 - Layer 3 (source and destination addresses plus flow label)
|
||||||
|
1 - Layer 4 (standard 5-tuple)
|
||||||
|
|
||||||
anycast_src_echo_reply - BOOLEAN
|
anycast_src_echo_reply - BOOLEAN
|
||||||
Controls the use of anycast addresses as source addresses for ICMPv6
|
Controls the use of anycast addresses as source addresses for ICMPv6
|
||||||
echo reply
|
echo reply
|
||||||
|
|
|
@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
|
||||||
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
|
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
|
||||||
const struct in6_addr *saddr, int oif,
|
const struct in6_addr *saddr, int oif,
|
||||||
const struct sk_buff *skb, int flags);
|
const struct sk_buff *skb, int flags);
|
||||||
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
|
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
|
||||||
struct flow_keys *hkeys);
|
const struct sk_buff *skb, struct flow_keys *hkeys);
|
||||||
|
|
||||||
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
|
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ enum netevent_notif_type {
|
||||||
NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */
|
NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */
|
||||||
NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
|
NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
|
||||||
NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
|
NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
|
||||||
|
NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
|
||||||
};
|
};
|
||||||
|
|
||||||
int register_netevent_notifier(struct notifier_block *nb);
|
int register_netevent_notifier(struct notifier_block *nb);
|
||||||
|
|
|
@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 {
|
||||||
int ip6_rt_gc_elasticity;
|
int ip6_rt_gc_elasticity;
|
||||||
int ip6_rt_mtu_expires;
|
int ip6_rt_mtu_expires;
|
||||||
int ip6_rt_min_advmss;
|
int ip6_rt_min_advmss;
|
||||||
|
int multipath_hash_policy;
|
||||||
int flowlabel_consistency;
|
int flowlabel_consistency;
|
||||||
int auto_flowlabels;
|
int auto_flowlabels;
|
||||||
int icmpv6_time;
|
int icmpv6_time;
|
||||||
|
|
|
@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
|
||||||
fl6.fl6_icmp_type = type;
|
fl6.fl6_icmp_type = type;
|
||||||
fl6.fl6_icmp_code = code;
|
fl6.fl6_icmp_code = code;
|
||||||
fl6.flowi6_uid = sock_net_uid(net, NULL);
|
fl6.flowi6_uid = sock_net_uid(net, NULL);
|
||||||
fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL);
|
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
|
||||||
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
|
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
|
||||||
|
|
||||||
sk = icmpv6_xmit_lock(net);
|
sk = icmpv6_xmit_lock(net);
|
||||||
|
|
|
@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
|
static struct rt6_info *rt6_multipath_select(const struct net *net,
|
||||||
|
struct rt6_info *match,
|
||||||
struct flowi6 *fl6, int oif,
|
struct flowi6 *fl6, int oif,
|
||||||
const struct sk_buff *skb,
|
const struct sk_buff *skb,
|
||||||
int strict)
|
int strict)
|
||||||
|
@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
|
||||||
* case it will always be non-zero. Otherwise now is the time to do it.
|
* case it will always be non-zero. Otherwise now is the time to do it.
|
||||||
*/
|
*/
|
||||||
if (!fl6->mp_hash)
|
if (!fl6->mp_hash)
|
||||||
fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL);
|
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
|
||||||
|
|
||||||
if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
|
if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
|
||||||
return match;
|
return match;
|
||||||
|
@ -932,7 +933,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
|
||||||
rt = rt6_device_match(net, rt, &fl6->saddr,
|
rt = rt6_device_match(net, rt, &fl6->saddr,
|
||||||
fl6->flowi6_oif, flags);
|
fl6->flowi6_oif, flags);
|
||||||
if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
|
if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
|
||||||
rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif,
|
rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
|
||||||
skb, flags);
|
skb, flags);
|
||||||
}
|
}
|
||||||
if (rt == net->ipv6.ip6_null_entry) {
|
if (rt == net->ipv6.ip6_null_entry) {
|
||||||
|
@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
|
||||||
redo_rt6_select:
|
redo_rt6_select:
|
||||||
rt = rt6_select(net, fn, oif, strict);
|
rt = rt6_select(net, fn, oif, strict);
|
||||||
if (rt->rt6i_nsiblings)
|
if (rt->rt6i_nsiblings)
|
||||||
rt = rt6_multipath_select(rt, fl6, oif, skb, strict);
|
rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
|
||||||
if (rt == net->ipv6.ip6_null_entry) {
|
if (rt == net->ipv6.ip6_null_entry) {
|
||||||
fn = fib6_backtrack(fn, &fl6->saddr);
|
fn = fib6_backtrack(fn, &fl6->saddr);
|
||||||
if (fn)
|
if (fn)
|
||||||
|
@ -1839,21 +1840,56 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if skb is set it will be used and fl6 can be NULL */
|
/* if skb is set it will be used and fl6 can be NULL */
|
||||||
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
|
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
|
||||||
struct flow_keys *flkeys)
|
const struct sk_buff *skb, struct flow_keys *flkeys)
|
||||||
{
|
{
|
||||||
struct flow_keys hash_keys;
|
struct flow_keys hash_keys;
|
||||||
u32 mhash;
|
u32 mhash;
|
||||||
|
|
||||||
memset(&hash_keys, 0, sizeof(hash_keys));
|
switch (net->ipv6.sysctl.multipath_hash_policy) {
|
||||||
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
case 0:
|
||||||
if (skb) {
|
memset(&hash_keys, 0, sizeof(hash_keys));
|
||||||
ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
|
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
||||||
} else {
|
if (skb) {
|
||||||
hash_keys.addrs.v6addrs.src = fl6->saddr;
|
ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
|
||||||
hash_keys.addrs.v6addrs.dst = fl6->daddr;
|
} else {
|
||||||
hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
|
hash_keys.addrs.v6addrs.src = fl6->saddr;
|
||||||
hash_keys.basic.ip_proto = fl6->flowi6_proto;
|
hash_keys.addrs.v6addrs.dst = fl6->daddr;
|
||||||
|
hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
|
||||||
|
hash_keys.basic.ip_proto = fl6->flowi6_proto;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
if (skb) {
|
||||||
|
unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
|
||||||
|
struct flow_keys keys;
|
||||||
|
|
||||||
|
/* short-circuit if we already have L4 hash present */
|
||||||
|
if (skb->l4_hash)
|
||||||
|
return skb_get_hash_raw(skb) >> 1;
|
||||||
|
|
||||||
|
memset(&hash_keys, 0, sizeof(hash_keys));
|
||||||
|
|
||||||
|
if (!flkeys) {
|
||||||
|
skb_flow_dissect_flow_keys(skb, &keys, flag);
|
||||||
|
flkeys = &keys;
|
||||||
|
}
|
||||||
|
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
||||||
|
hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
|
||||||
|
hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
|
||||||
|
hash_keys.ports.src = flkeys->ports.src;
|
||||||
|
hash_keys.ports.dst = flkeys->ports.dst;
|
||||||
|
hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
|
||||||
|
} else {
|
||||||
|
memset(&hash_keys, 0, sizeof(hash_keys));
|
||||||
|
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
||||||
|
hash_keys.addrs.v6addrs.src = fl6->saddr;
|
||||||
|
hash_keys.addrs.v6addrs.dst = fl6->daddr;
|
||||||
|
hash_keys.ports.src = fl6->fl6_sport;
|
||||||
|
hash_keys.ports.dst = fl6->fl6_dport;
|
||||||
|
hash_keys.basic.ip_proto = fl6->flowi6_proto;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
mhash = flow_hash_from_keys(&hash_keys);
|
mhash = flow_hash_from_keys(&hash_keys);
|
||||||
|
|
||||||
|
@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb)
|
||||||
flkeys = &_flkeys;
|
flkeys = &_flkeys;
|
||||||
|
|
||||||
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
|
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
|
||||||
fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys);
|
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
|
||||||
skb_dst_drop(skb);
|
skb_dst_drop(skb);
|
||||||
skb_dst_set(skb,
|
skb_dst_set(skb,
|
||||||
ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
|
ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
|
||||||
|
|
|
@ -16,14 +16,31 @@
|
||||||
#include <net/ipv6.h>
|
#include <net/ipv6.h>
|
||||||
#include <net/addrconf.h>
|
#include <net/addrconf.h>
|
||||||
#include <net/inet_frag.h>
|
#include <net/inet_frag.h>
|
||||||
|
#include <net/netevent.h>
|
||||||
#ifdef CONFIG_NETLABEL
|
#ifdef CONFIG_NETLABEL
|
||||||
#include <net/calipso.h>
|
#include <net/calipso.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static int zero;
|
||||||
static int one = 1;
|
static int one = 1;
|
||||||
static int auto_flowlabels_min;
|
static int auto_flowlabels_min;
|
||||||
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
|
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
|
||||||
|
|
||||||
|
static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
|
||||||
|
void __user *buffer, size_t *lenp,
|
||||||
|
loff_t *ppos)
|
||||||
|
{
|
||||||
|
struct net *net;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
net = container_of(table->data, struct net,
|
||||||
|
ipv6.sysctl.multipath_hash_policy);
|
||||||
|
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||||
|
if (write && ret == 0)
|
||||||
|
call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static struct ctl_table ipv6_table_template[] = {
|
static struct ctl_table ipv6_table_template[] = {
|
||||||
{
|
{
|
||||||
|
@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = {
|
||||||
.mode = 0644,
|
.mode = 0644,
|
||||||
.proc_handler = proc_dointvec
|
.proc_handler = proc_dointvec
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "fib_multipath_hash_policy",
|
||||||
|
.data = &init_net.ipv6.sysctl.multipath_hash_policy,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_rt6_multipath_hash_policy,
|
||||||
|
.extra1 = &zero,
|
||||||
|
.extra2 = &one,
|
||||||
|
},
|
||||||
{ }
|
{ }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
|
||||||
ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
|
ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
|
||||||
ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
|
ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
|
||||||
ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
|
ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
|
||||||
|
ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
|
||||||
|
|
||||||
ipv6_route_table = ipv6_route_sysctl_init(net);
|
ipv6_route_table = ipv6_route_sysctl_init(net);
|
||||||
if (!ipv6_route_table)
|
if (!ipv6_route_table)
|
||||||
|
|
Loading…
Reference in New Issue