mirror of https://gitee.com/openkylin/linux.git
bpf: Add redirect_peer helper
Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via the CPU backlog queue [0]. For local containers this can also be utilized, and the path via the CPU backlog queue then only needs to be taken once, not twice. On a high level this borrows from ipvlan, which does a similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for the mentioned use cases.

Pod to remote pod with redirect(), TCP_RR [1]:

  # percpu_netperf 10.217.1.33
  RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 )
  MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 )
  STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 )
  MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 )
  P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 )
  P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 )
  P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 )
  TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 )

Pod to remote pod with redirect_peer(), TCP_RR:

  # percpu_netperf 10.217.1.33
  RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 )
  MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 )
  STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 )
  MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 )
  P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 )
  P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 )
  P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 )
  TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 )

[0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf
[1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net
This commit is contained in:
parent
dd2ce6a537
commit
9aa1206e8f
|
@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
|
||||||
return smp_processor_id() % dev->real_num_rx_queues;
|
return smp_processor_id() % dev->real_num_rx_queues;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct net_device *veth_peer_dev(struct net_device *dev)
|
||||||
|
{
|
||||||
|
struct veth_priv *priv = netdev_priv(dev);
|
||||||
|
|
||||||
|
/* Callers must be under RCU read side. */
|
||||||
|
return rcu_dereference(priv->peer);
|
||||||
|
}
|
||||||
|
|
||||||
static int veth_xdp_xmit(struct net_device *dev, int n,
|
static int veth_xdp_xmit(struct net_device *dev, int n,
|
||||||
struct xdp_frame **frames,
|
struct xdp_frame **frames,
|
||||||
u32 flags, bool ndo_xmit)
|
u32 flags, bool ndo_xmit)
|
||||||
|
@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
|
||||||
.ndo_set_rx_headroom = veth_set_rx_headroom,
|
.ndo_set_rx_headroom = veth_set_rx_headroom,
|
||||||
.ndo_bpf = veth_xdp,
|
.ndo_bpf = veth_xdp,
|
||||||
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
|
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
|
||||||
|
.ndo_get_peer_dev = veth_peer_dev,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
|
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
|
||||||
|
|
|
@ -1277,6 +1277,9 @@ struct netdev_net_notifier {
|
||||||
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
|
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
|
||||||
* int cmd);
|
* int cmd);
|
||||||
* Add, change, delete or get information on an IPv4 tunnel.
|
* Add, change, delete or get information on an IPv4 tunnel.
|
||||||
|
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
|
||||||
|
* If a device is paired with a peer device, return the peer instance.
|
||||||
|
* The caller must be under RCU read context.
|
||||||
*/
|
*/
|
||||||
struct net_device_ops {
|
struct net_device_ops {
|
||||||
int (*ndo_init)(struct net_device *dev);
|
int (*ndo_init)(struct net_device *dev);
|
||||||
|
@ -1484,6 +1487,7 @@ struct net_device_ops {
|
||||||
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
|
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
|
||||||
int (*ndo_tunnel_ctl)(struct net_device *dev,
|
int (*ndo_tunnel_ctl)(struct net_device *dev,
|
||||||
struct ip_tunnel_parm *p, int cmd);
|
struct ip_tunnel_parm *p, int cmd);
|
||||||
|
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -3719,6 +3719,22 @@ union bpf_attr {
|
||||||
* never return NULL.
|
* never return NULL.
|
||||||
* Return
|
* Return
|
||||||
* A pointer pointing to the kernel percpu variable on this cpu.
|
* A pointer pointing to the kernel percpu variable on this cpu.
|
||||||
|
*
|
||||||
|
* long bpf_redirect_peer(u32 ifindex, u64 flags)
|
||||||
|
* Description
|
||||||
|
* Redirect the packet to another net device of index *ifindex*.
|
||||||
|
* This helper is somewhat similar to **bpf_redirect**\ (), except
|
||||||
|
* that the redirection happens to the *ifindex*'s peer device and
|
||||||
|
* the netns switch takes place from ingress to ingress without
|
||||||
|
* going through the CPU's backlog queue.
|
||||||
|
*
|
||||||
|
* The *flags* argument is reserved and must be 0. The helper is
|
||||||
|
* currently only supported for tc BPF program types at the ingress
|
||||||
|
* hook and for veth device types. The peer device must reside in a
|
||||||
|
* different network namespace.
|
||||||
|
* Return
|
||||||
|
* The helper returns **TC_ACT_REDIRECT** on success or
|
||||||
|
* **TC_ACT_SHOT** on error.
|
||||||
*/
|
*/
|
||||||
#define __BPF_FUNC_MAPPER(FN) \
|
#define __BPF_FUNC_MAPPER(FN) \
|
||||||
FN(unspec), \
|
FN(unspec), \
|
||||||
|
@ -3876,6 +3892,7 @@ union bpf_attr {
|
||||||
FN(redirect_neigh), \
|
FN(redirect_neigh), \
|
||||||
FN(bpf_per_cpu_ptr), \
|
FN(bpf_per_cpu_ptr), \
|
||||||
FN(bpf_this_cpu_ptr), \
|
FN(bpf_this_cpu_ptr), \
|
||||||
|
FN(redirect_peer), \
|
||||||
/* */
|
/* */
|
||||||
|
|
||||||
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
|
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
|
||||||
|
|
|
@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
|
||||||
|
|
||||||
static inline struct sk_buff *
|
static inline struct sk_buff *
|
||||||
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
|
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
|
||||||
struct net_device *orig_dev)
|
struct net_device *orig_dev, bool *another)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_NET_CLS_ACT
|
#ifdef CONFIG_NET_CLS_ACT
|
||||||
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
|
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
|
||||||
|
@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
|
||||||
* redirecting to another netdev
|
* redirecting to another netdev
|
||||||
*/
|
*/
|
||||||
__skb_push(skb, skb->mac_len);
|
__skb_push(skb, skb->mac_len);
|
||||||
skb_do_redirect(skb);
|
if (skb_do_redirect(skb) == -EAGAIN) {
|
||||||
|
__skb_pull(skb, skb->mac_len);
|
||||||
|
*another = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
case TC_ACT_CONSUMED:
|
case TC_ACT_CONSUMED:
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
|
||||||
skip_taps:
|
skip_taps:
|
||||||
#ifdef CONFIG_NET_INGRESS
|
#ifdef CONFIG_NET_INGRESS
|
||||||
if (static_branch_unlikely(&ingress_needed_key)) {
|
if (static_branch_unlikely(&ingress_needed_key)) {
|
||||||
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
|
bool another = false;
|
||||||
|
|
||||||
|
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
|
||||||
|
&another);
|
||||||
|
if (another)
|
||||||
|
goto another_round;
|
||||||
if (!skb)
|
if (!skb)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
|
|
@ -2380,8 +2380,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
|
||||||
|
|
||||||
/* Internal, non-exposed redirect flags. */
|
/* Internal, non-exposed redirect flags. */
|
||||||
enum {
|
enum {
|
||||||
BPF_F_NEIGH = (1ULL << 1),
|
BPF_F_NEIGH = (1ULL << 1),
|
||||||
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
|
BPF_F_PEER = (1ULL << 2),
|
||||||
|
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER)
|
||||||
};
|
};
|
||||||
|
|
||||||
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
|
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
|
||||||
|
@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
|
||||||
int skb_do_redirect(struct sk_buff *skb)
|
int skb_do_redirect(struct sk_buff *skb)
|
||||||
{
|
{
|
||||||
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
|
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
|
||||||
|
struct net *net = dev_net(skb->dev);
|
||||||
struct net_device *dev;
|
struct net_device *dev;
|
||||||
u32 flags = ri->flags;
|
u32 flags = ri->flags;
|
||||||
|
|
||||||
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
|
dev = dev_get_by_index_rcu(net, ri->tgt_index);
|
||||||
ri->tgt_index = 0;
|
ri->tgt_index = 0;
|
||||||
if (unlikely(!dev)) {
|
ri->flags = 0;
|
||||||
kfree_skb(skb);
|
if (unlikely(!dev))
|
||||||
return -EINVAL;
|
goto out_drop;
|
||||||
}
|
if (flags & BPF_F_PEER) {
|
||||||
|
const struct net_device_ops *ops = dev->netdev_ops;
|
||||||
|
|
||||||
|
if (unlikely(!ops->ndo_get_peer_dev ||
|
||||||
|
!skb_at_tc_ingress(skb)))
|
||||||
|
goto out_drop;
|
||||||
|
dev = ops->ndo_get_peer_dev(dev);
|
||||||
|
if (unlikely(!dev ||
|
||||||
|
!is_skb_forwardable(dev, skb) ||
|
||||||
|
net_eq(net, dev_net(dev))))
|
||||||
|
goto out_drop;
|
||||||
|
skb->dev = dev;
|
||||||
|
return -EAGAIN;
|
||||||
|
}
|
||||||
return flags & BPF_F_NEIGH ?
|
return flags & BPF_F_NEIGH ?
|
||||||
__bpf_redirect_neigh(skb, dev) :
|
__bpf_redirect_neigh(skb, dev) :
|
||||||
__bpf_redirect(skb, dev, flags);
|
__bpf_redirect(skb, dev, flags);
|
||||||
|
out_drop:
|
||||||
|
kfree_skb(skb);
|
||||||
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
|
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
|
||||||
|
@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
|
||||||
.arg2_type = ARG_ANYTHING,
|
.arg2_type = ARG_ANYTHING,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
|
||||||
|
{
|
||||||
|
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
|
||||||
|
|
||||||
|
if (unlikely(flags))
|
||||||
|
return TC_ACT_SHOT;
|
||||||
|
|
||||||
|
ri->flags = BPF_F_PEER;
|
||||||
|
ri->tgt_index = ifindex;
|
||||||
|
|
||||||
|
return TC_ACT_REDIRECT;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct bpf_func_proto bpf_redirect_peer_proto = {
|
||||||
|
.func = bpf_redirect_peer,
|
||||||
|
.gpl_only = false,
|
||||||
|
.ret_type = RET_INTEGER,
|
||||||
|
.arg1_type = ARG_ANYTHING,
|
||||||
|
.arg2_type = ARG_ANYTHING,
|
||||||
|
};
|
||||||
|
|
||||||
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
|
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
|
||||||
{
|
{
|
||||||
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
|
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
|
||||||
|
@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||||
return &bpf_redirect_proto;
|
return &bpf_redirect_proto;
|
||||||
case BPF_FUNC_redirect_neigh:
|
case BPF_FUNC_redirect_neigh:
|
||||||
return &bpf_redirect_neigh_proto;
|
return &bpf_redirect_neigh_proto;
|
||||||
|
case BPF_FUNC_redirect_peer:
|
||||||
|
return &bpf_redirect_peer_proto;
|
||||||
case BPF_FUNC_get_route_realm:
|
case BPF_FUNC_get_route_realm:
|
||||||
return &bpf_get_route_realm_proto;
|
return &bpf_get_route_realm_proto;
|
||||||
case BPF_FUNC_get_hash_recalc:
|
case BPF_FUNC_get_hash_recalc:
|
||||||
|
|
|
@ -3719,6 +3719,22 @@ union bpf_attr {
|
||||||
* never return NULL.
|
* never return NULL.
|
||||||
* Return
|
* Return
|
||||||
* A pointer pointing to the kernel percpu variable on this cpu.
|
* A pointer pointing to the kernel percpu variable on this cpu.
|
||||||
|
*
|
||||||
|
* long bpf_redirect_peer(u32 ifindex, u64 flags)
|
||||||
|
* Description
|
||||||
|
* Redirect the packet to another net device of index *ifindex*.
|
||||||
|
* This helper is somewhat similar to **bpf_redirect**\ (), except
|
||||||
|
* that the redirection happens to the *ifindex*'s peer device and
|
||||||
|
* the netns switch takes place from ingress to ingress without
|
||||||
|
* going through the CPU's backlog queue.
|
||||||
|
*
|
||||||
|
* The *flags* argument is reserved and must be 0. The helper is
|
||||||
|
* currently only supported for tc BPF program types at the ingress
|
||||||
|
* hook and for veth device types. The peer device must reside in a
|
||||||
|
* different network namespace.
|
||||||
|
* Return
|
||||||
|
* The helper returns **TC_ACT_REDIRECT** on success or
|
||||||
|
* **TC_ACT_SHOT** on error.
|
||||||
*/
|
*/
|
||||||
#define __BPF_FUNC_MAPPER(FN) \
|
#define __BPF_FUNC_MAPPER(FN) \
|
||||||
FN(unspec), \
|
FN(unspec), \
|
||||||
|
@ -3876,6 +3892,7 @@ union bpf_attr {
|
||||||
FN(redirect_neigh), \
|
FN(redirect_neigh), \
|
||||||
FN(bpf_per_cpu_ptr), \
|
FN(bpf_per_cpu_ptr), \
|
||||||
FN(bpf_this_cpu_ptr), \
|
FN(bpf_this_cpu_ptr), \
|
||||||
|
FN(redirect_peer), \
|
||||||
/* */
|
/* */
|
||||||
|
|
||||||
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
|
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
|
||||||
|
|
Loading…
Reference in New Issue