mirror of https://gitee.com/openkylin/linux.git
netfilter: ipt_CLUSTERIP: do not hold dev
It's a terrible thing to hold a dev in an iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free, and dmesg will keep logging the error: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until the iptables rules with this target are removed manually. Even worse, when a netns is deleted, a virtual nic is deleted instead of being reset to init_net in default_device_ops exit/exit_batch. As this happens earlier than the flushing of the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block waiting for the nic to become free. unregister_netdevice is thus actually waiting for the iptables rules to be flushed, while the iptables rules can only be flushed after unregister_netdevice has finished. This 'deadlock' causes unregister_netdevice to block there forever. As the netns is not available to operate on at that moment, the iptables rules cannot even be flushed manually. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by any virtual nic used with ipt_CLUSTERIP. This patch fixes it by not holding the dev in ipt_CLUSTERIP, and saving dev->ifindex instead of the dev. As suggested by Pablo Neira Ayuso, it refreshes c->ifindex and the dev's mc address by registering a netdevice notifier, just as xt_TEE does. So it removes the old code that updated the dev's mc address, and there is also no need to initialize c->ifindex with dev->ifindex. However, one config can be shared by more than one target, and the netdev notifier is per config, not per target, so the notifier handler cannot get e->ip.iniface. Therefore e->ip.iniface has to be saved into the config. 
Note that, for backwards compatibility, this patch doesn't remove the code that checks whether the dev exists before creating a config. v1->v2: - As suggested by Pablo Neira Ayuso, register a netdevice notifier to manage c->ifindex and the dev's mc address. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
This commit is contained in:
parent
34158151d2
commit
202f59afd4
|
@ -47,7 +47,7 @@ struct clusterip_config {
|
||||||
|
|
||||||
__be32 clusterip; /* the IP address */
|
__be32 clusterip; /* the IP address */
|
||||||
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
|
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
|
||||||
struct net_device *dev; /* device */
|
int ifindex; /* device ifindex */
|
||||||
u_int16_t num_total_nodes; /* total number of nodes */
|
u_int16_t num_total_nodes; /* total number of nodes */
|
||||||
unsigned long local_nodes; /* node number array */
|
unsigned long local_nodes; /* node number array */
|
||||||
|
|
||||||
|
@ -57,6 +57,9 @@ struct clusterip_config {
|
||||||
enum clusterip_hashmode hash_mode; /* which hashing mode */
|
enum clusterip_hashmode hash_mode; /* which hashing mode */
|
||||||
u_int32_t hash_initval; /* hash initialization */
|
u_int32_t hash_initval; /* hash initialization */
|
||||||
struct rcu_head rcu;
|
struct rcu_head rcu;
|
||||||
|
|
||||||
|
char ifname[IFNAMSIZ]; /* device ifname */
|
||||||
|
struct notifier_block notifier; /* refresh c->ifindex in it */
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_PROC_FS
|
#ifdef CONFIG_PROC_FS
|
||||||
|
@ -98,9 +101,8 @@ clusterip_config_put(struct clusterip_config *c)
|
||||||
* entry(rule) is removed, remove the config from lists, but don't free it
|
* entry(rule) is removed, remove the config from lists, but don't free it
|
||||||
* yet, since proc-files could still be holding references */
|
* yet, since proc-files could still be holding references */
|
||||||
static inline void
|
static inline void
|
||||||
clusterip_config_entry_put(struct clusterip_config *c)
|
clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
|
||||||
{
|
{
|
||||||
struct net *net = dev_net(c->dev);
|
|
||||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||||
|
|
||||||
local_bh_disable();
|
local_bh_disable();
|
||||||
|
@ -109,8 +111,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
|
||||||
spin_unlock(&cn->lock);
|
spin_unlock(&cn->lock);
|
||||||
local_bh_enable();
|
local_bh_enable();
|
||||||
|
|
||||||
dev_mc_del(c->dev, c->clustermac);
|
unregister_netdevice_notifier(&c->notifier);
|
||||||
dev_put(c->dev);
|
|
||||||
|
|
||||||
/* In case anyone still accesses the file, the open/close
|
/* In case anyone still accesses the file, the open/close
|
||||||
* functions are also incrementing the refcount on their own,
|
* functions are also incrementing the refcount on their own,
|
||||||
|
@ -170,19 +171,55 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
|
||||||
set_bit(i->local_nodes[n] - 1, &c->local_nodes);
|
set_bit(i->local_nodes[n] - 1, &c->local_nodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct clusterip_config *
|
static int
|
||||||
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
|
clusterip_netdev_event(struct notifier_block *this, unsigned long event,
|
||||||
struct net_device *dev)
|
void *ptr)
|
||||||
{
|
{
|
||||||
struct net *net = dev_net(dev);
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||||
struct clusterip_config *c;
|
struct clusterip_config *c;
|
||||||
|
|
||||||
|
c = container_of(this, struct clusterip_config, notifier);
|
||||||
|
switch (event) {
|
||||||
|
case NETDEV_REGISTER:
|
||||||
|
if (!strcmp(dev->name, c->ifname)) {
|
||||||
|
c->ifindex = dev->ifindex;
|
||||||
|
dev_mc_add(dev, c->clustermac);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NETDEV_UNREGISTER:
|
||||||
|
if (dev->ifindex == c->ifindex) {
|
||||||
|
dev_mc_del(dev, c->clustermac);
|
||||||
|
c->ifindex = -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NETDEV_CHANGENAME:
|
||||||
|
if (!strcmp(dev->name, c->ifname)) {
|
||||||
|
c->ifindex = dev->ifindex;
|
||||||
|
dev_mc_add(dev, c->clustermac);
|
||||||
|
} else if (dev->ifindex == c->ifindex) {
|
||||||
|
dev_mc_del(dev, c->clustermac);
|
||||||
|
c->ifindex = -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NOTIFY_DONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct clusterip_config *
|
||||||
|
clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
|
||||||
|
__be32 ip, const char *iniface)
|
||||||
|
{
|
||||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||||
|
struct clusterip_config *c;
|
||||||
|
int err;
|
||||||
|
|
||||||
c = kzalloc(sizeof(*c), GFP_ATOMIC);
|
c = kzalloc(sizeof(*c), GFP_ATOMIC);
|
||||||
if (!c)
|
if (!c)
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
|
|
||||||
c->dev = dev;
|
strcpy(c->ifname, iniface);
|
||||||
|
c->ifindex = -1;
|
||||||
c->clusterip = ip;
|
c->clusterip = ip;
|
||||||
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
|
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
|
||||||
c->num_total_nodes = i->num_total_nodes;
|
c->num_total_nodes = i->num_total_nodes;
|
||||||
|
@ -213,17 +250,27 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
|
||||||
cn->procdir,
|
cn->procdir,
|
||||||
&clusterip_proc_fops, c);
|
&clusterip_proc_fops, c);
|
||||||
if (!c->pde) {
|
if (!c->pde) {
|
||||||
|
err = -ENOMEM;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
c->notifier.notifier_call = clusterip_netdev_event;
|
||||||
|
err = register_netdevice_notifier(&c->notifier);
|
||||||
|
if (!err)
|
||||||
|
return c;
|
||||||
|
|
||||||
|
#ifdef CONFIG_PROC_FS
|
||||||
|
proc_remove(c->pde);
|
||||||
|
err:
|
||||||
|
#endif
|
||||||
spin_lock_bh(&cn->lock);
|
spin_lock_bh(&cn->lock);
|
||||||
list_del_rcu(&c->list);
|
list_del_rcu(&c->list);
|
||||||
spin_unlock_bh(&cn->lock);
|
spin_unlock_bh(&cn->lock);
|
||||||
kfree(c);
|
kfree(c);
|
||||||
|
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(err);
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return c;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_PROC_FS
|
#ifdef CONFIG_PROC_FS
|
||||||
|
@ -425,15 +472,14 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
|
||||||
e->ip.iniface);
|
e->ip.iniface);
|
||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
config = clusterip_config_init(cipinfo,
|
|
||||||
e->ip.dst.s_addr, dev);
|
|
||||||
if (IS_ERR(config)) {
|
|
||||||
dev_put(dev);
|
dev_put(dev);
|
||||||
|
|
||||||
|
config = clusterip_config_init(par->net, cipinfo,
|
||||||
|
e->ip.dst.s_addr,
|
||||||
|
e->ip.iniface);
|
||||||
|
if (IS_ERR(config))
|
||||||
return PTR_ERR(config);
|
return PTR_ERR(config);
|
||||||
}
|
}
|
||||||
dev_mc_add(config->dev, config->clustermac);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
cipinfo->config = config;
|
cipinfo->config = config;
|
||||||
|
|
||||||
|
@ -458,7 +504,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
|
||||||
|
|
||||||
/* if no more entries are referencing the config, remove it
|
/* if no more entries are referencing the config, remove it
|
||||||
* from the list and destroy the proc entry */
|
* from the list and destroy the proc entry */
|
||||||
clusterip_config_entry_put(cipinfo->config);
|
clusterip_config_entry_put(par->net, cipinfo->config);
|
||||||
|
|
||||||
clusterip_config_put(cipinfo->config);
|
clusterip_config_put(cipinfo->config);
|
||||||
|
|
||||||
|
@ -558,10 +604,9 @@ arp_mangle(void *priv,
|
||||||
* addresses on different interfacs. However, in the CLUSTERIP case
|
* addresses on different interfacs. However, in the CLUSTERIP case
|
||||||
* this wouldn't work, since we didn't subscribe the mcast group on
|
* this wouldn't work, since we didn't subscribe the mcast group on
|
||||||
* other interfaces */
|
* other interfaces */
|
||||||
if (c->dev != state->out) {
|
if (c->ifindex != state->out->ifindex) {
|
||||||
pr_debug("not mangling arp reply on different "
|
pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n",
|
||||||
"interface: cip'%s'-skb'%s'\n",
|
c->ifindex, state->out->ifindex);
|
||||||
c->dev->name, state->out->name);
|
|
||||||
clusterip_config_put(c);
|
clusterip_config_put(c);
|
||||||
return NF_ACCEPT;
|
return NF_ACCEPT;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue