From e5fc9e7a666e5964b60e05903b90aa832354b68c Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 12 Nov 2010 17:33:17 +0100 Subject: [PATCH 01/80] netfilter: nf_conntrack: don't always initialize ct->proto ct->proto is big(60 bytes) due to structure ip_ct_tcp, and we don't need to initialize the whole for all the other protocols. This patch moves proto to the end of structure nf_conn, and pushes the initialization down to the individual protocols. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 6 +++--- net/netfilter/nf_conntrack_core.c | 3 ++- net/netfilter/nf_conntrack_netlink.c | 1 + net/netfilter/nf_conntrack_proto_dccp.c | 3 +++ net/netfilter/nf_conntrack_proto_sctp.c | 1 + net/netfilter/nf_conntrack_proto_tcp.c | 14 +++----------- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index caf17db87dbc..abfff1e8e0d0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -116,14 +116,14 @@ struct nf_conn { u_int32_t secmark; #endif - /* Storage reserved for other modules: */ - union nf_conntrack_proto proto; - /* Extensions */ struct nf_ct_ext *ext; #ifdef CONFIG_NET_NS struct net *ct_net; #endif + + /* Storage reserved for other modules, must be the last member */ + union nf_conntrack_proto proto; }; static inline struct nf_conn * diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 27a5ea6b6a0f..0ba7d4801daf 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -651,7 +651,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone, * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. */ memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, - sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + offsetof(struct nf_conn, proto) - + offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b729ace1dcc1..7f59be82449f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1375,6 +1375,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } #endif + memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5292560d6d4a..9ae57c57c50e 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.state = CT_DCCP_NONE; + ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; + ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; + ct->proto.dccp.handshake_seq = 0; return true; out_invalid: diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index c6049c2d5ea8..6f4ee70f460b 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, test_bit(SCTP_CID_COOKIE_ACK, map)) return false; + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Don't need lock here: this conntrack not in circulation yet */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3fb2b73b24dc..6f38d0e2ea4a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, BUG_ON(th == NULL); /* Don't need lock here: this conntrack not in circulation yet */ - new_state - = tcp_conntracks[0][get_conntrack_index(th)] - [TCP_CONNTRACK_NONE]; + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { @@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, } if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, @@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); - ct->proto.tcp.seen[1].flags = 0; } else if (nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. @@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; - ct->proto.tcp.seen[0].td_scale = 0; /* We assume SACK and liberal window checking to handle * window scaling */ @@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, IP_CT_TCP_FLAG_BE_LIBERAL; } - ct->proto.tcp.seen[1].td_end = 0; - ct->proto.tcp.seen[1].td_maxend = 0; - ct->proto.tcp.seen[1].td_maxwin = 0; - ct->proto.tcp.seen[1].td_scale = 0; - /* tcp_packet will set them */ - ct->proto.tcp.state = TCP_CONNTRACK_NONE; ct->proto.tcp.last_index = TCP_NONE_SET; pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " From ca36181050a523f6c0af3ef7cb509bbbc4ede276 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 12 Nov 2010 17:34:17 +0100 Subject: [PATCH 02/80] netfilter: xt_NFQUEUE: remove modulo operations Signed-off-by: Changli Gao Acked-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/netfilter/xt_NFQUEUE.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 039cce1bde3d..39627706aac6 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -72,10 +72,12 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) if (info->queues_total > 1) { if (par->family == NFPROTO_IPV4) - queue = hash_v4(skb) % info->queues_total + queue; + queue = (((u64) hash_v4(skb) * info->queues_total) >> + 32) + queue; #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) else if (par->family == NFPROTO_IPV6) - queue = hash_v6(skb) % info->queues_total + queue; + queue = (((u64) hash_v6(skb) * info->queues_total) >> + 32) + queue; #endif } return NF_QUEUE_NR(queue); From b468645d72c2b4a15512f0a18e77670ea058b861 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 15 Nov 2010 11:23:06 +0100 Subject: [PATCH 03/80] netfilter: xt_LOG: do print MAC header on FORWARD I am observing consistent behavior even with bridges, so let's unlock this. xt_mac is already usable in FORWARD, too. Section 9 of http://ebtables.sourceforge.net/br_fw_ia/br_fw_ia.html#section9 says the MAC source address is changed, but my observation does not match that claim -- the MAC header is retained. Signed-off-by: Jan Engelhardt [Patrick; code inspection seems to confirm this] Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ipt_LOG.c | 3 +-- net/ipv6/netfilter/ip6t_LOG.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8fda2e9..d76d6c9ed946 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 09c88891a753..05027b753721 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf, in ? in->name : "", out ? out->name : ""); - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, skb_network_offset(skb), 1); From 3b2368806915e1e69ac3bcc0d6a7cfde64307655 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 15 Nov 2010 11:47:52 +0100 Subject: [PATCH 04/80] netfilter: ct_extend: fix the wrong alloc_size In function update_alloc_size(), sizeof(struct nf_ct_ext) is added twice wrongly. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_extend.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index bd82450c193f..920f9244388b 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -144,9 +144,8 @@ static void update_alloc_size(struct nf_ct_ext_type *type) if (!t1) continue; - t1->alloc_size = sizeof(struct nf_ct_ext) - + ALIGN(sizeof(struct nf_ct_ext), t1->align) - + t1->len; + t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + + t1->len; for (j = 0; j < NF_CT_EXT_NUM; j++) { t2 = nf_ct_ext_types[j]; if (t2 == NULL || t2 == t1 || From 0f8e80044b26b4b30213a3fdffebd325cdc21362 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 15 Nov 2010 11:51:06 +0100 Subject: [PATCH 05/80] netfilter: nf_conntrack: define ct_*_info as needed Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index abfff1e8e0d0..8a58901c96d3 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -50,11 +50,24 @@ union nf_conntrack_expect_proto { /* per conntrack: application helper private data */ union nf_conntrack_help { /* insert conntrack helper private data (master) here */ +#if defined(CONFIG_NF_CONNTRACK_FTP) || defined(CONFIG_NF_CONNTRACK_FTP_MODULE) struct nf_ct_ftp_master ct_ftp_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_PPTP) || \ + defined(CONFIG_NF_CONNTRACK_PPTP_MODULE) struct nf_ct_pptp_master ct_pptp_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_H323) || \ + defined(CONFIG_NF_CONNTRACK_H323_MODULE) struct nf_ct_h323_master ct_h323_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_SANE) || \ + defined(CONFIG_NF_CONNTRACK_SANE_MODULE) struct nf_ct_sane_master ct_sane_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_SIP) || defined(CONFIG_NF_CONNTRACK_SIP_MODULE) struct nf_ct_sip_master ct_sip_info; +#endif }; #include From 76a2d3bcfcc86e2a8044258515b86492a37631a3 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 15 Nov 2010 11:59:03 +0100 Subject: [PATCH 06/80] netfilter: nf_nat: don't use atomic bit operation As we own the conntrack and the others can't see it until we confirm it, we don't need to use atomic bit operation on ct->status. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_nat_core.h | 4 ++-- net/ipv4/netfilter/nf_nat_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 33602ab66190..5aec85c29979 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -21,9 +21,9 @@ static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { if (manip == IP_NAT_MANIP_SRC) - return test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + return ct->status & IPS_SRC_NAT_DONE_BIT; else - return test_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + return ct->status & IPS_DST_NAT_DONE_BIT; } struct nlattr; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787ce1a71..ab877acb22a1 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -323,9 +323,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE_BIT; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE_BIT; return NF_ACCEPT; } From e0e76c83becc7536e8371e560504d836d34fcf7d Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 15 Nov 2010 12:23:24 +0100 Subject: [PATCH 07/80] netfilter: ct_extend: define NF_CT_EXT_* as needed Less IDs make nf_ct_ext smaller. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_ecache.h | 8 ++++++++ include/net/netfilter/nf_conntrack_extend.h | 6 ++++++ include/net/netfilter/nf_nat.h | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 96ba5f7dcab6..f596b60d6d75 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -23,12 +23,17 @@ struct nf_conntrack_ecache { static inline struct nf_conntrack_ecache * nf_ct_ecache_find(const struct nf_conn *ct) { +#ifdef CONFIG_NF_CONNTRACK_EVENTS return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE); +#else + return NULL; +#endif } static inline struct nf_conntrack_ecache * nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { +#ifdef CONFIG_NF_CONNTRACK_EVENTS struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; @@ -45,6 +50,9 @@ nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) e->expmask = expmask; } return e; +#else + return NULL; +#endif }; #ifdef CONFIG_NF_CONNTRACK_EVENTS diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 0772d296dfdb..1a9f96db3798 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -7,10 +7,16 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, +#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, +#endif NF_CT_EXT_ACCT, +#ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES NF_CT_EXT_ZONE, +#endif NF_CT_EXT_NUM, }; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index f5f09f032a90..e966092a36f1 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -84,7 +84,11 @@ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { +#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) return nf_ct_ext_find(ct, NF_CT_EXT_NAT); +#else + return NULL; +#endif } #else /* !__KERNEL__: iptables wants this to compile. */ From 03c0e5bb34c9755ae4d955c97fba40b24e9c7fe7 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 15 Nov 2010 12:27:27 +0100 Subject: [PATCH 08/80] netfilter: nf_nat: define nat_pptp_info as needed Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_nat.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e966092a36f1..aff80b190c12 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -56,7 +56,9 @@ struct nf_nat_multi_range_compat { /* per conntrack: nat application helper private data */ union nf_conntrack_nat_help { /* insert nat helper private data here */ +#if defined(CONFIG_NF_NAT_PPTP) || defined(CONFIG_NF_NAT_PPTP_MODULE) struct nf_nat_pptp nat_pptp_info; +#endif }; struct nf_conn; From 9811600f7c1f18152430c6b93b0a76fdd88a59ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20Leroy?= Date: Mon, 15 Nov 2010 13:57:56 +0100 Subject: [PATCH 09/80] netfilter: xt_CLASSIFY: add ARP support, allow CLASSIFY target on any table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Frédéric Leroy Signed-off-by: Patrick McHardy --- net/netfilter/xt_CLASSIFY.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index c2c0e4abeb99..af9c4dadf816 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -19,12 +19,14 @@ #include #include #include +#include MODULE_AUTHOR("Patrick McHardy "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: Qdisc classification"); MODULE_ALIAS("ipt_CLASSIFY"); MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY"); static unsigned int classify_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target classify_tg_reg __read_mostly = { - .name = "CLASSIFY", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "mangle", - .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), - .target = classify_tg, - .targetsize = sizeof(struct xt_classify_target_info), - .me = THIS_MODULE, +static struct xt_target classify_tg_reg[] __read_mostly = { + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_ARP, + .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, }; static int __init classify_tg_init(void) { - return xt_register_target(&classify_tg_reg); + return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } static void __exit classify_tg_exit(void) { - xt_unregister_target(&classify_tg_reg); + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } module_init(classify_tg_init); From 0e60ebe04c51807db972d03665651ae6b5c26d7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 18:17:21 +0100 Subject: [PATCH 10/80] netfilter: add __rcu annotations Add some __rcu annotations and use helpers to reduce number of sparse warnings (CONFIG_SPARSE_RCU_POINTER=y) Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 6 +++--- include/net/netfilter/nf_conntrack_ecache.h | 4 ++-- include/net/netfilter/nf_conntrack_l3proto.h | 2 +- net/netfilter/core.c | 4 ++-- net/netfilter/nf_conntrack_expect.c | 6 +++--- net/netfilter/nf_conntrack_proto.c | 20 +++++++++++++++----- net/netfilter/nf_conntrack_standalone.c | 9 ++++++--- net/netfilter/nf_log.c | 6 ++++-- net/netfilter/nf_queue.c | 18 ++++++++++++++---- net/netfilter/nfnetlink_log.c | 6 +++--- 10 files changed, 53 insertions(+), 28 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 89341c32631a..928a35ec21c7 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -265,7 +265,7 @@ struct nf_afinfo { int route_key_size; }; -extern const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO]; +extern const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO]; static inline const struct nf_afinfo *nf_get_afinfo(unsigned short family) { return rcu_dereference(nf_afinfo[family]); @@ -355,9 +355,9 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu; extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); -extern void (*nf_ct_destroy)(struct nf_conntrack *); +extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index f596b60d6d75..8fdb04b8cce0 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -67,7 +67,7 @@ struct nf_ct_event_notifier { int (*fcn)(unsigned int events, struct nf_ct_event *item); }; -extern struct nf_ct_event_notifier *nf_conntrack_event_cb; +extern struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb); extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb); @@ -167,7 +167,7 @@ struct nf_exp_event_notifier { int (*fcn)(unsigned int events, struct nf_exp_event *item); }; -extern struct nf_exp_event_notifier *nf_expect_event_cb; +extern struct nf_exp_event_notifier __rcu *nf_expect_event_cb; extern int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *nb); extern void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *nb); diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index a7547611e8f1..e8010f445ae1 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -73,7 +73,7 @@ struct nf_conntrack_l3proto { struct module *me; }; -extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; +extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX]; /* Protocol registration. */ extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 85dabb86be6f..5faec4fd8193 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -212,7 +212,7 @@ EXPORT_SYMBOL(skb_make_writable); /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) @@ -229,7 +229,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) } EXPORT_SYMBOL(nf_ct_attach); -void (*nf_ct_destroy)(struct nf_conntrack *); +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; EXPORT_SYMBOL(nf_ct_destroy); void nf_conntrack_destroy(struct nf_conntrack *nfct) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 46e8966912b1..cab196cf428c 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -482,7 +482,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -495,11 +495,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index dc7bb74110df..03b56a0fff30 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; + struct nf_conntrack_l3proto *old; if (proto->l3proto >= AF_MAX) return -EBUSY; @@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) return -EINVAL; mutex_lock(&nf_ct_proto_mutex); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex)); + if (old != &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); + BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != proto); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); nf_ct_l3proto_unregister_sysctl(proto); @@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) smp_wmb(); nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != - &nf_conntrack_l4proto_generic) { + } else if (rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); + BUG_ON(rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); nf_ct_l4proto_unregister_sysctl(l4proto); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0fb65705b44b..328f1d2a51f8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -29,6 +29,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); @@ -56,7 +57,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -69,13 +70,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu( + &net->ct.hash[st->bucket])); } return head; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index b07393eab88e..20c775cff2a8 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -161,7 +161,8 @@ static int seq_show(struct seq_file *s, void *v) struct nf_logger *t; int ret; - logger = nf_loggers[*pos]; + logger = rcu_dereference_protected(nf_loggers[*pos], + lockdep_is_held(&nf_log_mutex)); if (!logger) ret = seq_printf(s, "%2lld NONE (", *pos); @@ -249,7 +250,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write, mutex_unlock(&nf_log_mutex); } else { mutex_lock(&nf_log_mutex); - logger = nf_loggers[tindex]; + logger = rcu_dereference_protected(nf_loggers[tindex], + lockdep_is_held(&nf_log_mutex)); if (!logger) table->data = "NONE"; else diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 74aebed5bd28..1876f7411561 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex); int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { int ret; + const struct nf_queue_handler *old; if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); - if (queue_handler[pf] == qh) + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old == qh) ret = -EEXIST; - else if (queue_handler[pf]) + else if (old) ret = -EBUSY; else { rcu_assign_pointer(queue_handler[pf], qh); @@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { + const struct nf_queue_handler *old; + if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); - if (queue_handler[pf] && queue_handler[pf] != qh) { + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old && old != qh) { mutex_unlock(&queue_handler_mutex); return -EINVAL; } @@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) mutex_lock(&queue_handler_mutex); for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { - if (queue_handler[pf] == qh) + if (rcu_dereference_protected( + queue_handler[pf], + lockdep_is_held(&queue_handler_mutex) + ) == qh) rcu_assign_pointer(queue_handler[pf], NULL); } mutex_unlock(&queue_handler_mutex); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6a1572b0ab41..91592da504b9 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -874,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st) for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&instance_table[st->bucket])) - return rcu_dereference_bh(instance_table[st->bucket].first); + return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); } return NULL; } static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) { - h = rcu_dereference_bh(h->next); + h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = rcu_dereference_bh(instance_table[st->bucket].first); + h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); } return h; } From be9e9163afcfc3137e7c6377cb0c7b406318fde0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 18:18:29 +0100 Subject: [PATCH 11/80] netfilter: nf_ct_frag6_sysctl_table is static Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3a3f129a44cb..eb9f1c03de3f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -73,7 +73,7 @@ static struct inet_frags nf_frags; static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_frag6_sysctl_table[] = { +static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .data = &nf_init_frags.timeout, From eb733162ae4f69b93f7c08012e6e239f31796de8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 18:43:59 +0100 Subject: [PATCH 12/80] netfilter: add __rcu annotations Use helpers to reduce number of sparse warnings (CONFIG_SPARSE_RCU_POINTER=y) Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- .../nf_conntrack_l3proto_ipv4_compat.c | 17 +++++++++++------ net/ipv4/netfilter/nf_nat_core.c | 5 ++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 37f8adb68c79..ab9c05c9734e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include #include #include +#include struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index ab877acb22a1..eb55835a02c3 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -502,7 +502,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } From ab0cba25128e1435a59b1ec4ae0c7505548fed87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 18:45:12 +0100 Subject: [PATCH 13/80] netfilter: nf_nat_amanda: rename a variable Avoid a sparse warning about 'ret' variable shadowing Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_amanda.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f06df0..703f366fd235 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } From c5d277d29ad1ae9add8d6984025ccd2e835971ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 19:45:13 +0100 Subject: [PATCH 14/80] netfilter: rcu sparse cleanups Use RCU helpers to reduce number of sparse warnings (CONFIG_SPARSE_RCU_POINTER=y), and adds lockdep checks. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 15 ++++++++++++--- net/netfilter/nf_conntrack_extend.c | 6 ++++-- net/netfilter/nf_conntrack_helper.c | 10 ++++++++-- net/netfilter/nf_conntrack_proto.c | 4 ++-- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index cab196cf428c..bbb21402596d 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -337,7 +337,10 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); if (master_help) { - p = &master_help->helper->expect_policy[exp->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[exp->class]; exp->timeout.expires = jiffies + p->timeout * HZ; } add_timer(&exp->timeout); @@ -373,7 +376,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i) if (!del_timer(&i->timeout)) return 0; - p = &master_help->helper->expect_policy[i->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[i->class]; i->timeout.expires = jiffies + p->timeout * HZ; add_timer(&i->timeout); return 1; @@ -411,7 +417,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } /* Will be over limit? */ if (master_help) { - p = &master_help->helper->expect_policy[expect->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 920f9244388b..80a23ed62bb0 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -140,14 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type) /* This assumes that extended areas in conntrack for the types whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ for (i = min; i <= max; i++) { - t1 = nf_ct_ext_types[i]; + t1 = rcu_dereference_protected(nf_ct_ext_types[i], + lockdep_is_held(&nf_ct_ext_type_mutex)); if (!t1) continue; t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + t1->len; for (j = 0; j < NF_CT_EXT_NUM; j++) { - t2 = nf_ct_ext_types[j]; + t2 = rcu_dereference_protected(nf_ct_ext_types[j], + lockdep_is_held(&nf_ct_ext_type_mutex)); if (t2 == NULL || t2 == t1 || (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) continue; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 59e1a4cd4e8b..767bbe98a0f0 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -158,7 +158,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i, struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && help->helper == me) { + if (help && rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me) { nf_conntrack_event(IPCT_HELPER, ct); rcu_assign_pointer(help->helper, NULL); } @@ -210,7 +213,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, hlist_for_each_entry_safe(exp, n, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); - if ((help->helper == me || exp->helper == me) && + if ((rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me || exp->helper == me) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 03b56a0fff30..5701c8dd783c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -284,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) mutex_lock(&nf_ct_proto_mutex); if (!nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded latter. */ - struct nf_conntrack_l4proto **proto_array; + struct nf_conntrack_l4proto __rcu **proto_array; int i; proto_array = kmalloc(MAX_NF_CT_PROTO * @@ -296,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; + RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); /* Before making proto_array visible to lockless readers, * we must make sure its content is committed to memory. From e9e5eee8733739f13a204132b502494b3f494f3b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Nov 2010 20:05:57 +0900 Subject: [PATCH 15/80] IPVS: Add persistence engine to connection entry The dest of a connection may not exist if it has been created as the result of connection synchronisation. But in order for connection entries for templates with persistence engine data created through connection synchronisation to be valid access to the persistence engine pointer is required. So add the persistence engine to the connection itself. Signed-off-by: Simon Horman --- include/net/ip_vs.h | 16 ++++++++++++++-- net/netfilter/ipvs/ip_vs_conn.c | 19 ++++++++++--------- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- net/netfilter/ipvs/ip_vs_pe.c | 14 ++++---------- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b7bbd6c28cfa..be2b5690f892 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -422,6 +422,7 @@ struct ip_vs_conn { struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ + const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; }; @@ -814,8 +815,19 @@ void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe); void ip_vs_unbind_pe(struct ip_vs_service *svc); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); -extern struct ip_vs_pe *ip_vs_pe_get(const char *name); -extern void ip_vs_pe_put(struct ip_vs_pe *pe); +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); + +static inline void ip_vs_pe_get(const struct ip_vs_pe *pe) +{ + if (pe && pe->module) + __module_get(pe->module); +} + +static inline void ip_vs_pe_put(const struct ip_vs_pe *pe) +{ + if (pe && pe->module) + module_put(pe->module); +} /* * IPVS protocol functions (from ip_vs_proto.c) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index e9adecdc8ca4..64a9ca314100 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -176,8 +176,8 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, NULL, 0, &p); - if (cp->dest && cp->dest->svc->pe) { - p.pe = cp->dest->svc->pe; + if (cp->pe) { + p.pe = cp->pe; p.pe_data = cp->pe_data; p.pe_data_len = cp->pe_data_len; } @@ -765,6 +765,7 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->flags & IP_VS_CONN_F_NFCT) ip_vs_conn_drop_conntrack(cp); + ip_vs_pe_put(cp->pe); kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); @@ -826,7 +827,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; - if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; } @@ -958,15 +961,13 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; - if (cp->dest && cp->pe_data && - cp->dest->svc->pe->show_pe_data) { + if (cp->pe_data) { pe_data[0] = ' '; - len = strlen(cp->dest->svc->pe->name); - memcpy(pe_data + 1, cp->dest->svc->pe->name, len); + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); pe_data[len + 1] = ' '; len += 2; - len += cp->dest->svc->pe->show_pe_data(cp, - pe_data + len); + len += cp->pe->show_pe_data(cp, pe_data + len); } pe_data[len] = '\0'; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5f5daa30b0af..3e92558dfcc2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1139,7 +1139,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } if (u->pe_name && *u->pe_name) { - pe = ip_vs_pe_get(u->pe_name); + pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); @@ -1250,7 +1250,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = sched; if (u->pe_name && *u->pe_name) { - pe = ip_vs_pe_get(u->pe_name); + pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 3414af70ee12..e99f920b93d1 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -30,7 +30,7 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc) /* Get pe in the pe list by name */ static struct ip_vs_pe * -ip_vs_pe_getbyname(const char *pe_name) +__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; @@ -60,28 +60,22 @@ ip_vs_pe_getbyname(const char *pe_name) } /* Lookup pe and try to load it if it doesn't exist */ -struct ip_vs_pe *ip_vs_pe_get(const char *name) +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) { struct ip_vs_pe *pe; /* Search for the pe by name */ - pe = ip_vs_pe_getbyname(name); + pe = __ip_vs_pe_getbyname(name); /* If pe not found, load the module and search again */ if (!pe) { request_module("ip_vs_pe_%s", name); - pe = ip_vs_pe_getbyname(name); + pe = __ip_vs_pe_getbyname(name); } return pe; } -void ip_vs_pe_put(struct ip_vs_pe *pe) -{ - if (pe && pe->module) - module_put(pe->module); -} - /* Register a pe in the pe list */ int register_ip_vs_pe(struct ip_vs_pe *pe) { From ea2c73afc23db3084fd857b027446c38fc7ff2c9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Nov 2010 20:06:30 +0900 Subject: [PATCH 16/80] IPVS: Only match pe_data created by the same pe Only match persistence engine data if it was created by the same persistence engine. Reported-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 64a9ca314100..261db1a17633 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -354,7 +354,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (p->pe_data && p->pe->ct_match) { - if (p->pe->ct_match(p, cp)) + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) goto out; continue; } From d494262b8a0f3507b62104a565849124abe29827 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 9 Nov 2010 09:33:15 +0900 Subject: [PATCH 17/80] IPVS: Make the cp argument to ip_vs_sync_conn() static Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index be2b5690f892..d5a32e47f9d9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -916,7 +916,7 @@ extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); extern int stop_sync_thread(int state); -extern void ip_vs_sync_conn(struct ip_vs_conn *cp); +extern void ip_vs_sync_conn(const struct ip_vs_conn *cp); /* diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index ab85aedea17e..a4dccbc4f1bb 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -236,7 +236,7 @@ get_curr_sync_buff(unsigned long time) * Add an ip_vs_conn information into the current sync_buff. * Called by ip_vs_in. */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) +void ip_vs_sync_conn(const struct ip_vs_conn *cp) { struct ip_vs_sync_mesg *m; struct ip_vs_sync_conn *s; From 7ae246a15a5c9d26cfb572d36794325db0400b18 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 9 Nov 2010 09:33:25 +0900 Subject: [PATCH 18/80] IPVS: Remove useless { } block from ip_vs_process_message() Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a4dccbc4f1bb..72b3d88d35f4 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -381,20 +381,18 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } } - { - if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, ¶m)) { - pr_err("ip_vs_conn_fill_param_sync failed"); - return; - } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(¶m); - else - cp = ip_vs_ct_in_get(¶m); + if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, ¶m)) { + pr_err("ip_vs_conn_fill_param_sync failed"); + return; } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(¶m); + else + cp = ip_vs_ct_in_get(¶m); if (!cp) { /* * Find the appropriate destination for the connection. From 8aadf93c9c1ff1a53aafd18d038be0d709b5ebc0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 9 Nov 2010 09:33:28 +0900 Subject: [PATCH 19/80] IPVS: buffer argument to ip_vs_process_message() should not be const It is assigned to a non-const variable and its contents are modified. Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 72b3d88d35f4..3897d6bf3b29 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -303,7 +303,7 @@ ip_vs_conn_fill_param_sync(int af, int protocol, * Process received multicast message and create the corresponding * ip_vs_conn entries. */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) +static void ip_vs_process_message(char *buffer, const size_t buflen) { struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; struct ip_vs_sync_conn *s; From 4ecd29447e6b9c12190e21c3e44ed5b12693c467 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 18:38:52 +0100 Subject: [PATCH 20/80] ipvs: add static and read_mostly attributes ip_vs_conn_tab_bits & ip_vs_conn_tab_mask are static to ipvs/ip_vs_conn.c ip_vs_conn_tab_size, ip_vs_conn_tab_mask, ip_vs_conn_tab [the pointer], ip_vs_conn_rnd are mostly read. Signed-off-by: Eric Dumazet Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 261db1a17633..7615f9e3d955 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -48,18 +48,18 @@ /* * Connection hash size. Default is what was selected at compile time. */ -int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); /* size and mask values */ -int ip_vs_conn_tab_size; -int ip_vs_conn_tab_mask; +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; /* * Connection hash table: for input and output packets lookups of IPVS */ -static struct list_head *ip_vs_conn_tab; +static struct list_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; @@ -71,7 +71,7 @@ static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); /* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; +static unsigned int ip_vs_conn_rnd __read_mostly; /* * Fine locking granularity for big connection hash table From a333e2ec05791bb866086274ac9749315900a0a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2010 19:46:33 +0100 Subject: [PATCH 21/80] ipvs: remove shadow rt variable Remove a sparse warning about rt variable. Signed-off-by: Eric Dumazet Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 10bd39c0ae2d..50b131c28b85 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -188,7 +188,6 @@ __ip_vs_reroute_locally(struct sk_buff *skb) }, .mark = skb->mark, }; - struct rtable *rt; if (ip_route_output_key(net, &rt, &fl)) return 0; From 8f1b03a4c18e8f3f0801447b62330faa8ed3bb37 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 9 Nov 2010 10:08:49 +0900 Subject: [PATCH 22/80] ipvs: allow transmit of GRO aggregated skbs Attempt at allowing LVS to transmit skbs of greater than MTU length that have been aggregated by GRO and can thus be deaggregated by GSO. Cc: Julian Anastasov Cc: Herbert Xu Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 50b131c28b85..fb2a445ddc59 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -407,7 +407,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -460,7 +461,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -560,7 +561,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); @@ -675,7 +677,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -790,8 +792,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, df |= (old_iph->frag_off & htons(IP_DF)); - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { + if ((old_iph->frag_off & htons(IP_DF) && + mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -903,7 +905,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && + !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -1008,7 +1011,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -1175,7 +1179,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -1289,7 +1294,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); From 3bfd45f93c8bca7a5dc955235ff083602d95aa43 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2010 10:19:18 +0100 Subject: [PATCH 23/80] netfilter: nf_conntrack: one less atomic op in nf_ct_expect_insert() Instead of doing atomic_inc(&exp->use) twice, call atomic_add(2, &exp->use); Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index bbb21402596d..774f32ba2ac9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -323,7 +323,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) const struct nf_conntrack_expect_policy *p; unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); - atomic_inc(&exp->use); + /* two references : one for hash insert, one for the timer */ + atomic_add(2, &exp->use); if (master_help) { hlist_add_head(&exp->lnode, &master_help->expectations); @@ -345,7 +346,6 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) } add_timer(&exp->timeout); - atomic_inc(&exp->use); NF_CT_STAT_INC(net, expect_create); } From 0e051e683ba4acb4e67c272c6a89707d974099d1 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:07 +0100 Subject: [PATCH 24/80] IPVS: Backup, Prepare for transferring firewall marks (fwmark) to the backup daemon. One struct will have fwmark added: * ip_vs_conn ip_vs_conn_new() and ip_vs_find_dest() will have an extra param - fwmark The effects of that, is in this patch. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 6 ++++-- net/netfilter/ipvs/ip_vs_conn.c | 5 +++-- net/netfilter/ipvs/ip_vs_core.c | 8 ++++---- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- net/netfilter/ipvs/ip_vs_ftp.c | 5 +++-- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d5a32e47f9d9..890f01c215e9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -382,6 +382,7 @@ struct ip_vs_conn { union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ + __u32 fwmark; /* Fire wall mark from skb */ __be16 cport; __be16 vport; __be16 dport; @@ -720,7 +721,7 @@ extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest); + struct ip_vs_dest *dest, __u32 fwmark); extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); extern const char * ip_vs_state_name(__u16 proto, int state); @@ -901,7 +902,8 @@ extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); extern struct ip_vs_dest * ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport, - const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol); + const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, + __u32 fwmark); extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 7615f9e3d955..66e4662925d5 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -613,7 +613,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) if ((cp) && (!cp->dest)) { dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, - cp->protocol); + cp->protocol, cp->fwmark); ip_vs_bind_dest(cp, dest); return dest; } else @@ -803,7 +803,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) + struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); @@ -827,6 +827,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; + cp->fwmark = fwmark; if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { ip_vs_pe_get(p->pe); cp->pe = p->pe; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b4e51e9c5a04..e2bb3cd41c07 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -293,7 +293,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * and thus param.pe_data will be destroyed * when the template expires */ ct = ip_vs_conn_new(¶m, &dest->addr, dport, - IP_VS_CONN_F_TEMPLATE, dest); + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); return NULL; @@ -319,7 +319,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], &iph.daddr, ports[1], ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest); + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); return NULL; @@ -423,7 +423,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, pptr[0], &iph.daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], - flags, dest); + flags, dest, skb->mark); if (!cp) return NULL; } @@ -489,7 +489,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, &iph.daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, - NULL); + NULL, skb->mark); if (!cp) return NF_DROP; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3e92558dfcc2..a5bd00279047 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -657,12 +657,12 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) + __be16 vport, __u16 protocol, __u32 fwmark) { struct ip_vs_dest *dest; struct ip_vs_service *svc; - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 75455000ad1c..84aef65b37d1 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -208,7 +208,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, n_cp = ip_vs_conn_new(&p, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, - cp->dest); + cp->dest, skb->mark); if (!n_cp) return 0; @@ -365,7 +365,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!n_cp) { n_cp = ip_vs_conn_new(&p, &cp->daddr, htons(ntohs(cp->dport)-1), - IP_VS_CONN_F_NFCT, cp->dest); + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); if (!n_cp) return 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3897d6bf3b29..47eed672dc08 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -404,7 +404,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) s->dport, (union nf_inet_addr *)&s->vaddr, s->vport, - s->protocol); + s->protocol, 0); /* Set the approprite ativity flag */ if (s->protocol == IPPROTO_TCP) { if (state != IP_VS_TCP_S_ESTABLISHED) @@ -419,7 +419,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) } cp = ip_vs_conn_new(¶m, (union nf_inet_addr *)&s->daddr, - s->dport, flags, dest); + s->dport, flags, dest, 0); if (dest) atomic_dec(&dest->refcnt); if (!cp) { From ce144f249f3f21a095a093d5d1ebd845177858da Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:08 +0100 Subject: [PATCH 25/80] IPVS: Split ports[2] into src_port and dst_port Avoid sending invalid pointer due to skb_linearize() call. This patch prepares for next patch where skb_linearize is a part. In ip_vs_sched_persist() params the ports ptr will be replaced by src and dst port. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e2bb3cd41c07..9acdd79a4a05 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -200,7 +200,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, - __be16 ports[2]) + __be16 src_port, __be16 dst_port) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -224,8 +224,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -247,14 +247,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; __be16 vport = 0; - if (ports[1] == svc->port) { + if (dst_port == svc->port) { /* non-FTP template: * * FTP template: * */ if (svc->port != FTPPORT) - vport = ports[1]; + vport = dst_port; } else { /* Note: persistent fwmark-based services and * persistent port zero service are handled here. @@ -285,7 +285,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, return NULL; } - if (ports[1] == svc->port && svc->port != FTPPORT) + if (dst_port == svc->port && svc->port != FTPPORT) dport = dest->port; /* Create a template @@ -306,7 +306,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, kfree(param.pe_data); } - dport = ports[1]; + dport = dst_port; if (dport == svc->port && dest->port) dport = dest->port; @@ -317,8 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], - &iph.daddr, ports[1], ¶m); + ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port, + &iph.daddr, dst_port, ¶m); + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); @@ -388,7 +389,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) { *ignored = 0; - return ip_vs_sched_persist(svc, skb, pptr); + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]); } /* From 3716522653a79b724b02ee911f1b60c41932f847 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:09 +0100 Subject: [PATCH 26/80] IPVS: skb defrag in L7 helpers L7 helpers like sip needs skb defrag since L7 data can be fragmented. This patch requires "IPVS Break ports-2 into src_port and dst_port" patch Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_pe_sip.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index b8b4e9620f3e..0d83bc01fed4 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) struct ip_vs_iphdr iph; unsigned int dataoff, datalen, matchoff, matchlen; const char *dptr; + int retc; ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); @@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) if (dataoff >= skb->len) return -EINVAL; + if ((retc=skb_linearize(skb)) < 0) + return retc; dptr = skb->data + dataoff; datalen = skb->len - dataoff; From a5959d53d6048a56103ee0ade1eb6f2c0c733b1d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:10 +0100 Subject: [PATCH 27/80] IPVS: Handle Scheduling errors. If ip_vs_conn_fill_param_persist return an error to ip_vs_sched_persist, this error must propagate as ignored=-1 to ip_vs_schedule(). Errors from ip_vs_conn_new() in ip_vs_sched_persist() and ip_vs_schedule() should also return *ignored=-1; This patch just relies on the fact that ignored is 1 before calling ip_vs_sched_persist(). Sent from Julian: "The new case when ip_vs_conn_fill_param_persist fails should set *ignored = -1, so that we can use NF_DROP, see below. *ignored = -1 should be also used for ip_vs_conn_new failure in ip_vs_sched_persist() and ip_vs_schedule(). The new negative value should be handled in tcp,udp,sctp" "To summarize: - *ignored = 1: protocol tried to schedule (eg. on SYN), found svc but the svc/scheduler decides that this packet should be accepted with NF_ACCEPT because it must not be scheduled. - *ignored = 0: scheduler can not find destination, so try bypass or return ICMP and then NF_DROP (ip_vs_leave). - *ignored = -1: scheduler tried to schedule but fatal error occurred, eg. ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param failure such as missing Call-ID, ENOMEM on skb_linearize or pe_data. In this case we should return NF_DROP without any attempts to send ICMP with ip_vs_leave." More or less all ideas and input to this patch is work from Julian Anastasov Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 56 +++++++++++++++++++-------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 11 ++++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 ++++- net/netfilter/ipvs/ip_vs_proto_udp.c | 10 ++++- 4 files changed, 64 insertions(+), 23 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 9acdd79a4a05..3445da6e8c95 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -177,7 +177,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, return pp->state_transition(cp, direction, skb, pp); } -static inline void +static inline int ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, __be16 cport, @@ -187,7 +187,9 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = svc->pe; if (p->pe && p->pe->fill_param) - p->pe->fill_param(p, skb); + return p->pe->fill_param(p, skb); + + return 0; } /* @@ -200,7 +202,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, - __be16 src_port, __be16 dst_port) + __be16 src_port, __be16 dst_port, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -268,20 +270,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } - ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, - vaddr, vport, ¶m); + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } } /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { - /* No template found or the dest of the connection + /* + * No template found or the dest of the connection * template is not available. + * return *ignored=0 i.e. ICMP and NF_DROP */ dest = svc->scheduler->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); + *ignored = 0; return NULL; } @@ -296,6 +305,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); + *ignored = -1; return NULL; } @@ -323,6 +333,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); + *ignored = -1; return NULL; } @@ -342,6 +353,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, @@ -372,11 +398,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } /* - * Do not schedule replies from local real server. It is risky - * for fwmark services but mostly for persistent services. + * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); @@ -387,10 +411,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) { - *ignored = 0; - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]); - } + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + + *ignored = 0; /* * Non-persistent service @@ -403,8 +427,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - *ignored = 0; - dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); @@ -425,8 +447,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); - if (!cp) + if (!cp) { + *ignored = -1; return NULL; + } } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1ea96bcd342b..a315159983ad 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -47,13 +47,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pp); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } - + /* NF_ACCEPT */ return 1; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index f6c5200e2146..1cdab12abfef 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -64,12 +64,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pp); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } + /* NF_ACCEPT */ return 1; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9d106a06bb0a..cd398de010cc 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -63,12 +63,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pp); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } + /* NF_ACCEPT */ return 1; } From 2981bc9a63456500037ca1f434b93a561e63f384 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:11 +0100 Subject: [PATCH 28/80] IPVS: Backup, Adding structs for new sync format New structs defined for version 1 of sync. * ip_vs_sync_v4 Ipv4 base format struct * ip_vs_sync_v6 Ipv6 base format struct Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 154 +++++++++++++++++++++++++++++--- 1 file changed, 142 insertions(+), 12 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 47eed672dc08..566482f227fa 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -43,11 +43,13 @@ #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ #define IP_VS_SYNC_PORT 8848 /* multicast port */ +#define SYNC_PROTO_VER 1 /* Protocol version in header */ /* * IPVS sync connection entry + * Version 0, i.e. original version. */ -struct ip_vs_sync_conn { +struct ip_vs_sync_conn_v0 { __u8 reserved; /* Protocol, addresses and port numbers */ @@ -71,40 +73,157 @@ struct ip_vs_sync_conn_options { struct ip_vs_seq out_seq; /* outgoing seq. struct */ }; +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + struct ip_vs_sync_thread_data { struct socket *sock; char *buf; }; -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) #define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) /* - The master mulitcasts messages to the backup load balancers in the - following format. + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. 0 1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (1) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | . | - | . | + ~ . ~ | . | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (n) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | */ #define SYNC_MESG_HEADER_LEN 4 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ +/* Version 0 header */ struct ip_vs_sync_mesg { __u8 nr_conns; __u8 syncid; @@ -113,6 +232,17 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +/* Version 1 header */ +struct ip_vs_sync_mesg_v2 { + __u8 reserved; /* must be zero */ + __u8 syncid; + __u16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; + /* the maximum length of sync (sending/receiving) message */ static int sync_send_mesg_maxlen; static int sync_recv_mesg_maxlen; @@ -239,7 +369,7 @@ get_curr_sync_buff(unsigned long time) void ip_vs_sync_conn(const struct ip_vs_conn *cp) { struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_v0 *s; int len; spin_lock(&curr_sb_lock); @@ -254,7 +384,7 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp) len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; + s = (struct ip_vs_sync_conn_v0 *)curr_sb->head; /* copy members */ s->protocol = cp->protocol; @@ -306,7 +436,7 @@ ip_vs_conn_fill_param_sync(int af, int protocol, static void ip_vs_process_message(char *buffer, const size_t buflen) { struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_conn_options *opt; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; @@ -343,7 +473,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) IP_VS_ERR_RL("bogus conn in sync message\n"); return; } - s = (struct ip_vs_sync_conn *) p; + s = (struct ip_vs_sync_conn_v0 *) p; flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; flags &= ~IP_VS_CONN_F_HASHED; if (flags & IP_VS_CONN_F_SEQ_MASK) { @@ -849,7 +979,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); + sizeof(struct ip_vs_sync_conn_v0)); if (state == IP_VS_STATE_MASTER) { if (sync_master_thread) From fe5e7a1efb664df0280f10377813d7099fb7eb0f Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:12 +0100 Subject: [PATCH 29/80] IPVS: Backup, Adding Version 1 receive capability Functionality improvements * flags changed from 16 to 32 bits * fwmark added (32 bits) * timeout in sec. added (32 bits) * pe data added (Variable length) * IPv6 capabilities (3x16 bytes for addr.) * Version and type in every conn msg. ip_vs_process_message() now handles Version 1 messages and will call ip_vs_process_message_v0() for version 0 messages. ip_vs_proc_conn() is common for both version, and handles the update of connection hash. ip_vs_conn_fill_param_sync() - Version 1 messages only ip_vs_conn_fill_param_sync_v0() - Version 0 messages only Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/linux/ip_vs.h | 8 + include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_pe.c | 5 +- net/netfilter/ipvs/ip_vs_sync.c | 551 +++++++++++++++++++++++++------- 4 files changed, 441 insertions(+), 124 deletions(-) diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 5f43a3b2e3ad..4deb3834d62c 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -89,6 +89,14 @@ #define IP_VS_CONN_F_TEMPLATE 0x1000 /* template, not connection */ #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ +#define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \ + IP_VS_CONN_F_NOOUTPUT | \ + IP_VS_CONN_F_INACTIVE | \ + IP_VS_CONN_F_SEQ_MASK | \ + IP_VS_CONN_F_NO_CPORT | \ + IP_VS_CONN_F_TEMPLATE \ + ) + /* Flags that are not sent to backup server start from bit 16 */ #define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */ diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 890f01c215e9..4069484df7bb 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -817,6 +817,7 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); static inline void ip_vs_pe_get(const struct ip_vs_pe *pe) { diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index e99f920b93d1..5cf859ccb31b 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc) } /* Get pe in the pe list by name */ -static struct ip_vs_pe * -__ip_vs_pe_getbyname(const char *pe_name) +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; - IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); spin_lock_bh(&ip_vs_pe_lock); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 566482f227fa..e071508901d1 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -35,6 +35,8 @@ #include #include +#include /* Used for ntoh_seq and hton_seq */ + #include #include @@ -286,6 +288,16 @@ static struct sockaddr_in mcast_addr = { .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), }; +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} static inline struct ip_vs_sync_buff *sb_dequeue(void) { @@ -418,59 +430,186 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp) ip_vs_sync_conn(cp->control); } +/* + * fill_param used by version 1 + */ static inline int -ip_vs_conn_fill_param_sync(int af, int protocol, - const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - struct ip_vs_conn_param *p) +ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) { - /* XXX: Need to take into account persistence engine */ - ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + memcpy(p->pe_data, pe_data, pe_data_len); + p->pe_data_len = pe_data_len; + } return 0; } /* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. */ -static void ip_vs_process_message(char *buffer, const size_t buflen) +static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, + unsigned state, unsigned protocol, unsigned type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt, + struct ip_vs_protocol *pp) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); + + if (cp && param->pe_data) /* Free pe_data */ + kfree(param->pe_data); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark); + + /* Set the approprite ativity flag */ + if (protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } else if (protocol == IPPROTO_SCTP) { + if (state != IP_VS_SCTP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + if (param->pe_data) + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && + (cp->state != state)) { + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. + */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + ip_vs_conn_put(cp); +} + +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) { struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; struct ip_vs_conn_param param; char *p; int i; - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); for (i=0; inr_conns; i++) { unsigned flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); return; } s = (struct ip_vs_sync_conn_v0 *) p; @@ -480,7 +619,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) opt = (struct ip_vs_sync_conn_options *)&s[1]; p += FULL_CONN_SIZE; if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); return; } } else { @@ -492,12 +631,12 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) if (!(flags & IP_VS_CONN_F_TEMPLATE)) { pp = ip_vs_proto_get(s->protocol); if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", s->protocol); continue; } if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", pp->name, state); continue; } @@ -505,103 +644,273 @@ static void ip_vs_process_message(char *buffer, const size_t buflen) /* protocol in templates is not used for state/timeout */ pp = NULL; if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", state); state = 0; } } - if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, ¶m)) { - pr_err("ip_vs_conn_fill_param_sync failed"); - return; - } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(¶m); - else - cp = ip_vs_ct_in_get(¶m); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol, 0); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (s->protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; + ip_vs_conn_fill_param(AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt, pp); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. connection + */ +static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? */ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; } - cp = ip_vs_conn_new(¶m, - (union nf_inet_addr *)&s->daddr, - s->dport, flags, dest, 0); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - pr_err("ip_vs_conn_new failed\n"); + } + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", + state); + state = 0; + } + } + if (ip_vs_conn_fill_param_sync(af, s, ¶m, + pe_data, pe_data_len, + pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL), + pp); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL), + pp); +#endif + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + * Handles Version 0 & 1 + */ +static void ip_vs_process_message(__u8 *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer; + __u8 *p, *msg_end; + unsigned int i, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_DBG(2, "BACKUP, message header too short\n"); + return; + } + /* Convert size back to host byte order */ + m2->size = ntohs(m2->size); + + if (buflen != m2->size) { + IP_VS_DBG(2, "BACKUP, bogus message size\n"); + return; + } + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); + return; + } + /* Handle version 1 message */ + if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) + && (m2->spare == 0)) { + + msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2); + nr_conns = m2->nr_conns; + + for (i=0; iv4) > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + s = (union ip_vs_sync_conn *)p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msg_end = p + size; + /* Basic sanity checks */ + if (msg_end > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); + return; } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", + ntohs(s->v4.ver_size) >> SVER_SHIFT); + return; } + /* Process a single sync_conn */ + if ((retc=ip_vs_proc_sync_conn(p, msg_end)) < 0) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", + retc); + return; + } + /* Make sure we have 32 bit alignment */ + msg_end = p + ((size + 3) & ~3); } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); + } else { + /* Old type of message */ + ip_vs_process_message_v0(buffer, buflen); + return; } } From 986a075795339c5ea1122ce9290dfd5504252eb0 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Fri, 19 Nov 2010 14:25:13 +0100 Subject: [PATCH 30/80] IPVS: Backup, Change sending to Version 1 format Enable sending and removal of version 0 sending Affected functions, ip_vs_sync_buff_create() ip_vs_sync_conn() ip_vs_core.c removal of IPv4 check. *v5 Just check cp->pe_data_len in ip_vs_sync_conn Check if padding needed before adding a new sync_conn to the buffer, i.e. avoid sending padding at the end. *v4 moved sanity check and pe_name_len after sloop. use cp->pe instead of cp->dest->svc->pe real length in each sync_conn, not padded length however total size of a sync_msg includes padding. *v3 Sending ip_vs_sync_conn_options in network order. Sending Templates for ONE_PACKET conn. Renaming of ip_vs_sync_mesg to ip_vs_sync_mesg_v0 Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_core.c | 13 ++- net/netfilter/ipvs/ip_vs_sync.c | 185 +++++++++++++++++++++++++------- 3 files changed, 154 insertions(+), 46 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4069484df7bb..a715f3db179a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -919,7 +919,7 @@ extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); extern int stop_sync_thread(int state); -extern void ip_vs_sync_conn(const struct ip_vs_conn *cp); +extern void ip_vs_sync_conn(struct ip_vs_conn *cp); /* diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 3445da6e8c95..5287771d0647 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1560,9 +1560,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * * Sync connection if it is about to close to * encorage the standby servers to update the connections timeout + * + * For ONE_PKT let ip_vs_sync_conn() do the filter work. */ - pkts = atomic_add_return(1, &cp->in_pkts); - if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = sysctl_ip_vs_sync_threshold[0]; + else + pkts = atomic_add_return(1, &cp->in_pkts); + + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && (pkts % sysctl_ip_vs_sync_threshold[1] @@ -1577,8 +1583,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && + else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && (pkts % sysctl_ip_vs_sync_threshold[1] diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index e071508901d1..df5abf0e25af 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -226,7 +226,7 @@ struct ip_vs_sync_thread_data { #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ /* Version 0 header */ -struct ip_vs_sync_mesg { +struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; __u16 size; @@ -235,7 +235,7 @@ struct ip_vs_sync_mesg { }; /* Version 1 header */ -struct ip_vs_sync_mesg_v2 { +struct ip_vs_sync_mesg { __u8 reserved; /* must be zero */ __u8 syncid; __u16 size; @@ -299,6 +299,17 @@ static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) ho->previous_delta = get_unaligned_be32(&no->previous_delta); } +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} + static inline struct ip_vs_sync_buff *sb_dequeue(void) { struct ip_vs_sync_buff *sb; @@ -317,6 +328,9 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void) return sb; } +/* + * Create a new sync buffer for Version 1 proto. + */ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) { struct ip_vs_sync_buff *sb; @@ -328,11 +342,15 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) kfree(sb); return NULL; } - sb->mesg->nr_conns = 0; + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; + sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->nr_conns = 0; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; return sb; } @@ -373,18 +391,60 @@ get_curr_sync_buff(unsigned long time) return sb; } - /* * Add an ip_vs_conn information into the current sync_buff. * Called by ip_vs_in. + * Sending Version 1 messages */ -void ip_vs_sync_conn(const struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct ip_vs_conn *cp) { struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn_v0 *s; - int len; + union ip_vs_sync_conn *s; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } spin_lock(&curr_sb_lock); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + if (curr_sb) { + pad = (4 - (size_t)curr_sb->head) & 3; + if (curr_sb->head + len + pad > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + pad = 0; + } + } + if (!curr_sb) { if (!(curr_sb=ip_vs_sync_buff_create())) { spin_unlock(&curr_sb_lock); @@ -393,41 +453,84 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp) } } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = curr_sb->mesg; - s = (struct ip_vs_sync_conn_v0 *)curr_sb->head; + p = curr_sb->head; + curr_sb->head += pad + len; + m->size += pad + len; + /* Add ev. padding from prev. sync_conn */ + while (pad--) + *(p++) = 0; - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr.ip; - s->vaddr = cp->vaddr.ip; - s->daddr = cp->daddr.ip; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } + s = (union ip_vs_sync_conn *)p; + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); m->nr_conns++; - m->size += len; - curr_sb->head += len; - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6); + ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6); + ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6); + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + spin_unlock(&curr_sb_lock); +control: /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); + cp = cp->control; + if (!cp) + return; + /* + * Reduce sync rate for templates + * i.e only increment in_pkts for Templates. + */ + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + int pkts = atomic_add_return(1, &cp->in_pkts); + + if (pkts % sysctl_ip_vs_sync_threshold[1] != 1) + return; + } + goto sloop; } /* @@ -596,7 +699,7 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, */ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) { - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_conn_options *opt; struct ip_vs_protocol *pp; @@ -604,7 +707,7 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) char *p; int i; - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; inr_conns; i++) { unsigned flags, state; @@ -848,11 +951,11 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) */ static void ip_vs_process_message(__u8 *buffer, const size_t buflen) { - struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer; + struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; - unsigned int i, nr_conns; + int i, nr_conns; - if (buflen < sizeof(struct ip_vs_sync_mesg)) { + if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { IP_VS_DBG(2, "BACKUP, message header too short\n"); return; } @@ -872,7 +975,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen) if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) && (m2->spare == 0)) { - msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2); + msg_end = buffer + sizeof(struct ip_vs_sync_mesg); nr_conns = m2->nr_conns; for (i=0; i Date: Fri, 19 Nov 2010 14:25:14 +0100 Subject: [PATCH 31/80] IPVS: Backup, adding version 0 sending capabilities This patch adds a sysclt net.ipv4.vs.sync_version that can be used to send sync msg in version 0 or 1 format. sync_version value is logical, Value 1 (default) New version 0 Plain old version Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 + net/netfilter/ipvs/ip_vs_ctl.c | 28 ++++++- net/netfilter/ipvs/ip_vs_sync.c | 134 ++++++++++++++++++++++++++++++++ 3 files changed, 163 insertions(+), 1 deletion(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a715f3db179a..d858264217ba 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -883,7 +883,9 @@ extern int sysctl_ip_vs_conntrack; extern int sysctl_ip_vs_snat_reroute; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; +extern int sysctl_ip_vs_sync_ver; +extern void ip_vs_sync_switch_mode(int mode); extern struct ip_vs_service * ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a5bd00279047..d12a13c497ba 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -92,7 +92,7 @@ int sysctl_ip_vs_nat_icmp_send = 0; int sysctl_ip_vs_conntrack; #endif int sysctl_ip_vs_snat_reroute = 1; - +int sysctl_ip_vs_sync_ver = 1; /* Default version of sync proto */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; @@ -1536,6 +1536,25 @@ proc_do_sync_threshold(ctl_table *table, int write, return rc; } +static int +proc_do_sync_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } else { + ip_vs_sync_switch_mode(val); + } + } + return rc; +} /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) @@ -1602,6 +1621,13 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .procname = "sync_version", + .data = &sysctl_ip_vs_sync_ver, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, #if 0 { .procname = "timeout_established", diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index df5abf0e25af..c1c167ab73ee 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -5,6 +5,18 @@ * high-performance and highly available server based on a * cluster of servers. * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * * Authors: Wensong Zhang * * ip_vs_sync: sync connection info from master load balancer to backups @@ -15,6 +27,8 @@ * Alexandre Cassen : Added SyncID support for incoming sync * messages filtering. * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. */ #define KMSG_COMPONENT "IPVS" @@ -391,6 +405,121 @@ get_curr_sync_buff(unsigned long time) return sb; } +/* + * Switch mode from sending version 0 or 1 + * - must handle sync_buf + */ +void ip_vs_sync_switch_mode(int mode) { + + if (!ip_vs_sync_state & IP_VS_STATE_MASTER) + return; + if (mode == sysctl_ip_vs_sync_ver || !curr_sb) + return; + + spin_lock_bh(&curr_sb_lock); + /* Buffer empty ? then let buf_create do the job */ + if ( curr_sb->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { + kfree(curr_sb); + curr_sb = NULL; + } else { + spin_lock_bh(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&curr_sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(curr_sb); + spin_unlock_bh(&ip_vs_sync_lock); + } + spin_unlock_bh(&curr_sb_lock); +} + +/* + * Create a new sync buffer for Version 0 proto. + */ +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ip_vs_master_syncid; + mesg->size = 4; + sb->head = (unsigned char *)mesg + 4; + sb->end = (unsigned char *)mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +/* + * Version 0 , could be switched in by sys_ctl. + * Add an ip_vs_conn information into the current sync_buff. + */ +void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; + int len; + + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create_v0())) { + spin_unlock(&curr_sb_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = (struct ip_vs_sync_mesg_v0 *)curr_sb->mesg; + s = (struct ip_vs_sync_conn_v0 *)curr_sb->head; + + /* copy members */ + s->reserved = 0; + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + /* * Add an ip_vs_conn information into the current sync_buff. * Called by ip_vs_in. @@ -403,6 +532,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) __u8 *p; unsigned int len, pe_name_len, pad; + /* Handle old version of the protocol */ + if (sysctl_ip_vs_sync_ver == 0) { + ip_vs_sync_conn_v0(cp); + return; + } /* Do not sync ONE PACKET */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) goto control; From f1c722295e029eace7960fc687efd5afd67dc555 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 15 Dec 2010 22:58:53 +0100 Subject: [PATCH 32/80] netfilter: xtables: use guarded types We are supposed to use the kernel's own types in userspace exports. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_CT.h | 10 +++++----- include/linux/netfilter/xt_TCPOPTSTRIP.h | 2 +- include/linux/netfilter/xt_TPROXY.h | 8 ++++---- include/linux/netfilter/xt_cluster.h | 8 ++++---- include/linux/netfilter/xt_quota.h | 6 +++--- include/linux/netfilter/xt_time.h | 14 +++++++------- include/linux/netfilter/xt_u32.h | 16 ++++++++-------- 7 files changed, 32 insertions(+), 32 deletions(-) diff --git a/include/linux/netfilter/xt_CT.h b/include/linux/netfilter/xt_CT.h index 1b564106891d..fbf4c5658554 100644 --- a/include/linux/netfilter/xt_CT.h +++ b/include/linux/netfilter/xt_CT.h @@ -4,11 +4,11 @@ #define XT_CT_NOTRACK 0x1 struct xt_ct_target_info { - u_int16_t flags; - u_int16_t zone; - u_int32_t ct_events; - u_int32_t exp_events; - char helper[16]; + __u16 flags; + __u16 zone; + __u32 ct_events; + __u32 exp_events; + char helper[16]; /* Used internally by the kernel */ struct nf_conn *ct __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_TCPOPTSTRIP.h b/include/linux/netfilter/xt_TCPOPTSTRIP.h index 2db543214ff5..342ef14b1761 100644 --- a/include/linux/netfilter/xt_TCPOPTSTRIP.h +++ b/include/linux/netfilter/xt_TCPOPTSTRIP.h @@ -7,7 +7,7 @@ (((1U << (idx & 31)) & bmap[(idx) >> 5]) != 0) struct xt_tcpoptstrip_target_info { - u_int32_t strip_bmap[8]; + __u32 strip_bmap[8]; }; #endif /* _XT_TCPOPTSTRIP_H */ diff --git a/include/linux/netfilter/xt_TPROXY.h b/include/linux/netfilter/xt_TPROXY.h index 3f3d69361289..8097e0b4c15e 100644 --- a/include/linux/netfilter/xt_TPROXY.h +++ b/include/linux/netfilter/xt_TPROXY.h @@ -5,15 +5,15 @@ * redirection. We can get rid of that whenever we get support for * mutliple targets in the same rule. */ struct xt_tproxy_target_info { - u_int32_t mark_mask; - u_int32_t mark_value; + __u32 mark_mask; + __u32 mark_value; __be32 laddr; __be16 lport; }; struct xt_tproxy_target_info_v1 { - u_int32_t mark_mask; - u_int32_t mark_value; + __u32 mark_mask; + __u32 mark_value; union nf_inet_addr laddr; __be16 lport; }; diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h index 886682656f09..66cfa3c782ac 100644 --- a/include/linux/netfilter/xt_cluster.h +++ b/include/linux/netfilter/xt_cluster.h @@ -6,10 +6,10 @@ enum xt_cluster_flags { }; struct xt_cluster_match_info { - u_int32_t total_nodes; - u_int32_t node_mask; - u_int32_t hash_seed; - u_int32_t flags; + __u32 total_nodes; + __u32 node_mask; + __u32 hash_seed; + __u32 flags; }; #define XT_CLUSTER_NODES_MAX 32 diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h index b0d28c659ab7..8bda65f0bc92 100644 --- a/include/linux/netfilter/xt_quota.h +++ b/include/linux/netfilter/xt_quota.h @@ -9,9 +9,9 @@ enum xt_quota_flags { struct xt_quota_priv; struct xt_quota_info { - u_int32_t flags; - u_int32_t pad; - aligned_u64 quota; + __u32 flags; + __u32 pad; + aligned_u64 quota; /* Used internally by the kernel */ struct xt_quota_priv *master; diff --git a/include/linux/netfilter/xt_time.h b/include/linux/netfilter/xt_time.h index 14b6df412c9f..b8bd4568efdb 100644 --- a/include/linux/netfilter/xt_time.h +++ b/include/linux/netfilter/xt_time.h @@ -2,13 +2,13 @@ #define _XT_TIME_H 1 struct xt_time_info { - u_int32_t date_start; - u_int32_t date_stop; - u_int32_t daytime_start; - u_int32_t daytime_stop; - u_int32_t monthdays_match; - u_int8_t weekdays_match; - u_int8_t flags; + __u32 date_start; + __u32 date_stop; + __u32 daytime_start; + __u32 daytime_stop; + __u32 monthdays_match; + __u8 weekdays_match; + __u8 flags; }; enum { diff --git a/include/linux/netfilter/xt_u32.h b/include/linux/netfilter/xt_u32.h index 9947f56cdbdd..e8c3d8722bae 100644 --- a/include/linux/netfilter/xt_u32.h +++ b/include/linux/netfilter/xt_u32.h @@ -9,13 +9,13 @@ enum xt_u32_ops { }; struct xt_u32_location_element { - u_int32_t number; - u_int8_t nextop; + __u32 number; + __u8 nextop; }; struct xt_u32_value_element { - u_int32_t min; - u_int32_t max; + __u32 min; + __u32 max; }; /* @@ -27,14 +27,14 @@ struct xt_u32_value_element { struct xt_u32_test { struct xt_u32_location_element location[XT_U32_MAXSIZE+1]; struct xt_u32_value_element value[XT_U32_MAXSIZE+1]; - u_int8_t nnums; - u_int8_t nvalues; + __u8 nnums; + __u8 nvalues; }; struct xt_u32 { struct xt_u32_test tests[XT_U32_MAXSIZE+1]; - u_int8_t ntests; - u_int8_t invert; + __u8 ntests; + __u8 invert; }; #endif /* _XT_U32_H */ From ae90bdeaeac6b964b7a1e853a90a19f358a9ac20 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 15 Dec 2010 23:53:41 +0100 Subject: [PATCH 33/80] netfilter: fix compilation when conntrack is disabled but tproxy is enabled The IPv6 tproxy patches split IPv6 defragmentation off of conntrack, but failed to update the #ifdef stanzas guarding the defragmentation related fields and code in skbuff and conntrack related code in nf_defrag_ipv6.c. This patch adds the required #ifdefs so that IPv6 tproxy can truly be used without connection tracking. Original report: http://marc.info/?l=linux-netdev&m=129010118516341&w=2 Reported-by: Randy Dunlap Signed-off-by: KOVACS Krisztian Acked-by: Randy Dunlap Signed-off-by: Patrick McHardy --- include/linux/skbuff.h | 15 +++++++++++++++ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 10 ---------- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 10 ++++++++++ net/core/skbuff.c | 2 ++ net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 8 +++++++- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e6ba898de61c..4f2db79a2abb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -255,6 +255,11 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +#if defined(CONFIG_NF_DEFRAG_IPV4) || defined(CONFIG_NF_DEFRAG_IPV4_MODULE) || \ + defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) +#define NET_SKBUFF_NF_DEFRAG_NEEDED 1 +#endif + /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -362,6 +367,8 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED struct sk_buff *nfct_reasm; #endif #ifdef CONFIG_BRIDGE_NETFILTER @@ -2051,6 +2058,8 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) if (nfct) atomic_inc(&nfct->use); } +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED static inline void nf_conntrack_get_reasm(struct sk_buff *skb) { if (skb) @@ -2079,6 +2088,8 @@ static inline void nf_reset(struct sk_buff *skb) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb->nfct); skb->nfct = NULL; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(skb->nfct_reasm); skb->nfct_reasm = NULL; #endif @@ -2095,6 +2106,8 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) dst->nfct = src->nfct; nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED dst->nfct_reasm = src->nfct_reasm; nf_conntrack_get_reasm(src->nfct_reasm); #endif @@ -2108,6 +2121,8 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(dst->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 1ee717eb5b09..a4c993685795 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -7,16 +7,6 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; -extern int nf_ct_frag6_init(void); -extern void nf_ct_frag6_cleanup(void); -extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); -extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, - struct net_device *out, - int (*okfn)(struct sk_buff *)); - -struct inet_frags_ctl; - #include extern struct ctl_table nf_ct_ipv6_sysctl_table[]; diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 94dd54d76b48..fd79c9a1779d 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -3,4 +3,14 @@ extern void nf_defrag_ipv6_enable(void); +extern int nf_ct_frag6_init(void); +extern void nf_ct_frag6_cleanup(void); +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); +extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, + struct net_device *out, + int (*okfn)(struct sk_buff *)); + +struct inet_frags_ctl; + #endif /* _NF_DEFRAG_IPV6_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 104f8444754a..74ebf4b5258a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -380,6 +380,8 @@ static void skb_release_head_state(struct sk_buff *skb) } #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(skb->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 99abfb53bab9..97c5b21b9674 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -19,13 +19,15 @@ #include #include +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include #include #include #include #include -#include #include +#endif +#include #include static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -33,8 +35,10 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, { u16 zone = NF_CT_DEFAULT_ZONE; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge && @@ -56,9 +60,11 @@ static unsigned int ipv6_defrag(unsigned int hooknum, { struct sk_buff *reasm; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) /* Previously seen (loopback)? */ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; +#endif reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); /* queued */ From 61b1ab4583e275af216c8454b9256de680499b19 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:42 +0100 Subject: [PATCH 34/80] IPVS: netns, add basic init per netns. Preparation for network name-space init, in this stage some empty functions exists. In most files there is a check if it is root ns i.e. init_net if (!net_eq(net, &init_net)) return ... this will be removed by the last patch, when enabling name-space. *v3 ip_vs_conn.c merge error corrected. net_ipvs #ifdef removed as sugested by Jan Engelhardt [ horms@verge.net.au: Removed whitespace-change-only hunks ] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 11 ++++++ include/net/net_namespace.h | 2 + include/net/netns/ip_vs.h | 25 +++++++++++++ net/netfilter/ipvs/ip_vs_app.c | 28 ++++++++++++-- net/netfilter/ipvs/ip_vs_conn.c | 34 ++++++++++++++--- net/netfilter/ipvs/ip_vs_core.c | 63 +++++++++++++++++++++++++++++++- net/netfilter/ipvs/ip_vs_ctl.c | 49 ++++++++++++++++++++----- net/netfilter/ipvs/ip_vs_est.c | 20 +++++++++- net/netfilter/ipvs/ip_vs_ftp.c | 34 +++++++++++++++-- net/netfilter/ipvs/ip_vs_lblc.c | 37 +++++++++++++++++-- net/netfilter/ipvs/ip_vs_lblcr.c | 38 +++++++++++++++++-- net/netfilter/ipvs/ip_vs_proto.c | 19 ++++++++++ net/netfilter/ipvs/ip_vs_sync.c | 27 ++++++++++++++ 13 files changed, 354 insertions(+), 33 deletions(-) create mode 100644 include/net/netns/ip_vs.h diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d858264217ba..c1c2ece3ed94 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -28,6 +28,15 @@ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include #endif +#include /* Netw namespace */ + +/* + * Generic access of ipvs struct + */ +static inline struct netns_ipvs *net_ipvs(struct net* net) +{ + return net->ipvs; +} /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -922,6 +931,8 @@ extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); extern int stop_sync_thread(int state); extern void ip_vs_sync_conn(struct ip_vs_conn *cp); +extern int ip_vs_sync_init(void); +extern void ip_vs_sync_cleanup(void); /* diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1bf812b21fb7..b3b4a34cb2cc 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -20,6 +20,7 @@ #include #endif #include +#include struct proc_dir_entry; struct net_device; @@ -94,6 +95,7 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif + struct netns_ipvs *ipvs; }; diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h new file mode 100644 index 000000000000..12fe84087cec --- /dev/null +++ b/include/net/netns/ip_vs.h @@ -0,0 +1,25 @@ +/* + * IP Virtual Server + * Data structure for network namspace + * + */ + +#ifndef IP_VS_H_ +#define IP_VS_H_ + +#include +#include +#include +#include +#include +#include + +struct ip_vs_stats; +struct ip_vs_sync_buff; +struct ctl_table_header; + +struct netns_ipvs { + int gen; /* Generation */ +}; + +#endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index a475edee0912..40b09ccc4896 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -569,15 +569,35 @@ static const struct file_operations ip_vs_app_fops = { }; #endif +static int __net_init __ip_vs_app_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + +static void __net_exit __ip_vs_app_cleanup(struct net *net) +{ + proc_net_remove(net, "ip_vs_app"); +} + +static struct pernet_operations ip_vs_app_ops = { + .init = __ip_vs_app_init, + .exit = __ip_vs_app_cleanup, +}; + int __init ip_vs_app_init(void) { - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; + int rv; + + rv = register_pernet_subsys(&ip_vs_app_ops); + return rv; } void ip_vs_app_cleanup(void) { - proc_net_remove(&init_net, "ip_vs_app"); + unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 66e4662925d5..7c1b502f8d8d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1201,11 +1201,36 @@ static void ip_vs_conn_flush(void) goto flush_again; } } +/* + * per netns init and exit + */ +int __net_init __ip_vs_conn_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + return 0; +} + +static void __net_exit __ip_vs_conn_cleanup(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + + proc_net_remove(net, "ip_vs_conn"); + proc_net_remove(net, "ip_vs_conn_sync"); +} +static struct pernet_operations ipvs_conn_ops = { + .init = __ip_vs_conn_init, + .exit = __ip_vs_conn_cleanup, +}; int __init ip_vs_conn_init(void) { int idx; + int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1243,24 +1268,21 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + retc = register_pernet_subsys(&ipvs_conn_ops); /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; + return retc; } - void ip_vs_conn_cleanup(void) { + unregister_pernet_subsys(&ipvs_conn_ops); /* flush all the connection entries first */ ip_vs_conn_flush(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); vfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5287771d0647..206f40c548d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -41,6 +41,7 @@ #include /* for icmp_send */ #include #include +#include /* net_generic() */ #include #include @@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +int ip_vs_net_id __read_mostly; +#ifdef IP_VS_GENERIC_NETNS +EXPORT_SYMBOL(ip_vs_net_id); +#endif +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) @@ -1813,6 +1820,44 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { #endif }; +/* + * Initialize IP Virtual Server netns mem. + */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + if (!net_eq(net, &init_net)) { + pr_err("The final patch for enabling netns is missing\n"); + return -EPERM; + } + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + printk(KERN_INFO "IPVS: Creating netns size=%lu id=%d\n", + sizeof(struct netns_ipvs), ipvs->gen); + return 0; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->gen); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit = __ip_vs_cleanup, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; /* * Initialize IP Virtual Server @@ -1821,8 +1866,11 @@ static int __init ip_vs_init(void) { int ret; - ip_vs_estimator_init(); + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + return ret; + ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); @@ -1843,15 +1891,23 @@ static int __init ip_vs_init(void) goto cleanup_app; } + ret = ip_vs_sync_init(); + if (ret < 0) { + pr_err("can't setup sync data.\n"); + goto cleanup_conn; + } + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_conn; + goto cleanup_sync; } pr_info("ipvs loaded.\n"); return ret; +cleanup_sync: + ip_vs_sync_cleanup(); cleanup_conn: ip_vs_conn_cleanup(); cleanup_app: @@ -1861,17 +1917,20 @@ static int __init ip_vs_init(void) ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ca49e928f302..ceeef4352d34 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3406,6 +3406,42 @@ static void ip_vs_genl_unregister(void) /* End of Generic Netlink interface definitions */ +/* + * per netns intit/exit func. + */ +int __net_init __ip_vs_control_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, + vs_vars); + if (sysctl_header == NULL) + goto err_reg; + ip_vs_new_estimator(&ip_vs_stats); + return 0; + +err_reg: + return -ENOMEM; +} + +static void __net_exit __ip_vs_control_cleanup(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + + ip_vs_kill_estimator(&ip_vs_stats); + unregister_net_sysctl_table(sysctl_header); + proc_net_remove(net, "ip_vs_stats"); + proc_net_remove(net, "ip_vs"); +} + +static struct pernet_operations ipvs_control_ops = { + .init = __ip_vs_control_init, + .exit = __ip_vs_control_cleanup, +}; int __init ip_vs_control_init(void) { @@ -3437,12 +3473,9 @@ int __init ip_vs_control_init(void) return ret; } - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - ip_vs_new_estimator(&ip_vs_stats); + ret = register_pernet_subsys(&ipvs_control_ops); + if (ret) + return ret; /* Hook the defense timer */ schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); @@ -3459,9 +3492,7 @@ void ip_vs_control_cleanup(void) cancel_delayed_work_sync(&defense_work); cancel_work_sync(&defense_work.work); ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); + unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index ff28801962e0..7417a0c1408b 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -157,13 +157,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->outbps = 0; } +static int __net_init __ip_vs_estimator_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + return 0; +} + +static struct pernet_operations ip_vs_app_ops = { + .init = __ip_vs_estimator_init, +}; + int __init ip_vs_estimator_init(void) { + int rv; + + rv = register_pernet_subsys(&ip_vs_app_ops); + if (rv < 0) + return rv; mod_timer(&est_timer, jiffies + 2 * HZ); - return 0; + return rv; } void ip_vs_estimator_cleanup(void) { del_timer_sync(&est_timer); + unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 84aef65b37d1..0e762f322aa3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -399,15 +399,17 @@ static struct ip_vs_app ip_vs_ftp = { .pkt_in = ip_vs_ftp_in, }; - /* - * ip_vs_ftp initialization + * per netns ip_vs_ftp initialization */ -static int __init ip_vs_ftp_init(void) +static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; struct ip_vs_app *app = &ip_vs_ftp; + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + ret = register_ip_vs_app(app); if (ret) return ret; @@ -427,14 +429,38 @@ static int __init ip_vs_ftp_init(void) return ret; } +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + struct ip_vs_app *app = &ip_vs_ftp; + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + + unregister_ip_vs_app(app); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; + +int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + return rv; +} /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { - unregister_ip_vs_app(&ip_vs_ftp); + unregister_pernet_subsys(&ip_vs_ftp_ops); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 9323f8944199..84278fb4e055 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -543,23 +543,54 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .schedule = ip_vs_lblc_schedule, }; +/* + * per netns init. + */ +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, + vs_vars_table); + if (!sysctl_header) + return -ENOMEM; + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + + unregister_net_sysctl_table(sysctl_header); +} + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; static int __init ip_vs_lblc_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblc_ops); return ret; } static void __exit ip_vs_lblc_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index dbeed8ea421a..7c7396a6acbf 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -744,23 +744,53 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .schedule = ip_vs_lblcr_schedule, }; +/* + * per netns init. + */ +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, + vs_vars_table); + if (!sysctl_header) + return -ENOMEM; + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + + unregister_net_sysctl_table(sysctl_header); +} + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; static int __init ip_vs_lblcr_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblcr_ops); return ret; } - static void __exit ip_vs_lblcr_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index c53998390877..45392942d0e7 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -236,6 +236,23 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } +/* + * per network name-space init + */ +static int __net_init __ip_vs_protocol_init(struct net *net) +{ + return 0; +} + +static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +{ + /* empty */ +} + +static struct pernet_operations ipvs_proto_ops = { + .init = __ip_vs_protocol_init, + .exit = __ip_vs_protocol_cleanup, +}; int __init ip_vs_protocol_init(void) { @@ -265,6 +282,7 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); + return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -275,6 +293,7 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; + unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c1c167ab73ee..3668739a6d06 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1639,3 +1639,30 @@ int stop_sync_thread(int state) return 0; } + +/* + * Initialize data struct for each netns + */ +static int __net_init __ip_vs_sync_init(struct net *net) +{ + return 0; +} + +static void __ip_vs_sync_cleanup(struct net *net) +{ +} +static struct pernet_operations ipvs_sync_ops = { + .init = __ip_vs_sync_init, + .exit = __ip_vs_sync_cleanup, +}; + + +int __init ip_vs_sync_init(void) +{ + return register_pernet_subsys(&ipvs_sync_ops); +} + +void __exit ip_vs_sync_cleanup(void) +{ + unregister_pernet_subsys(&ipvs_sync_ops); +} From fc723250c9cb046cc19833a2b1c4309bbf59ac36 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:43 +0100 Subject: [PATCH 35/80] IPVS: netns to services part 1 Services hash tables got netns ptr a hash arg, While Real Servers (rs) has been moved to ipvs struct. Two new inline functions added to get net ptr from skb. Since ip_vs is called from different contexts there is two places to dig for the net ptr skb->dev or skb->sk this is handled in skb_net() and skb_sknet() Global functions, ip_vs_service_get() ip_vs_lookup_real_service() etc have got struct net *net as first param. If possible get net ptr skb etc, - if not &init_net is used at this early stage of patching. ip_vs_ctl.c procfs not ready for netns yet. *v3 Comments by Julian - __ip_vs_service_find and __ip_vs_svc_fwm_find are fast path, net_eq(svc->net, net) so the check is at the end now. - net = skb_net(skb) in ip_vs_out moved after check for skb_dst. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 64 ++++++- include/net/netns/ip_vs.h | 8 + net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_core.c | 4 +- net/netfilter/ipvs/ip_vs_ctl.c | 232 +++++++++++++++----------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 5 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 7 +- net/netfilter/ipvs/ip_vs_proto_udp.c | 5 +- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 9 files changed, 214 insertions(+), 115 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index c1c2ece3ed94..d551e0d8fd9a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -37,6 +37,59 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) { return net->ipvs; } +/* + * Get net ptr from skb in traffic cases + * use skb_sknet when call is from userland (ioctl or netlink) + */ +static inline struct net *skb_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS +#ifdef CONFIG_IP_VS_DEBUG + /* + * This is used for debug only. + * Start with the most likely hit + * End with BUG + */ + if (likely(skb->dev && skb->dev->nd_net)) + return dev_net(skb->dev); + if (skb_dst(skb)->dev) + return dev_net(skb_dst(skb)->dev); + WARN(skb->sk, "Maybe skb_sknet should be used in %s() at line:%d\n", + __func__, __LINE__); + if (likely(skb->sk && skb->sk->sk_net)) + return sock_net(skb->sk); + pr_err("There is no net ptr to find in the skb in %s() line:%d\n", + __func__, __LINE__); + BUG(); +#else + return dev_net(skb->dev ? : skb_dst(skb)->dev); +#endif +#else + return &init_net; +#endif +} + +static inline struct net *skb_sknet(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS +#ifdef CONFIG_IP_VS_DEBUG + /* Start with the most likely hit */ + if (likely(skb->sk && skb->sk->sk_net)) + return sock_net(skb->sk); + WARN(skb->dev, "Maybe skb_net should be used instead in %s() line:%d\n", + __func__, __LINE__); + if (likely(skb->dev && skb->dev->nd_net)) + return dev_net(skb->dev); + pr_err("There is no net ptr to find in the skb in %s() line:%d\n", + __func__, __LINE__); + BUG(); +#else + return sock_net(skb->sk); +#endif +#else + return &init_net; +#endif +} /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -496,6 +549,7 @@ struct ip_vs_service { unsigned flags; /* service status flags */ unsigned timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity */ + struct net *net; struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ @@ -896,7 +950,7 @@ extern int sysctl_ip_vs_sync_ver; extern void ip_vs_sync_switch_mode(int mode); extern struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); static inline void ip_vs_service_put(struct ip_vs_service *svc) @@ -905,7 +959,7 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc) } extern struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, +ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); extern int ip_vs_use_count_inc(void); @@ -913,9 +967,9 @@ extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); extern struct ip_vs_dest * -ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport, - const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, - __u32 fwmark); +ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, + __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, + __u16 protocol, __u32 fwmark); extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 12fe84087cec..5b87d22a39fb 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -20,6 +20,14 @@ struct ctl_table_header; struct netns_ipvs { int gen; /* Generation */ + /* + * Hash table: for real service lookups + */ + #define IP_VS_RTAB_BITS 4 + #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) + #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + + struct list_head rs_table[IP_VS_RTAB_SIZE]; }; #endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 7c1b502f8d8d..7a0e79e3ad0f 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -611,7 +611,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + dest = ip_vs_find_dest(&init_net, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark); ip_vs_bind_dest(cp, dest); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 206f40c548d7..d0616ea1eebf 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1031,6 +1031,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; @@ -1054,6 +1055,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!skb_dst(skb))) return NF_ACCEPT; + net = skb_net(skb); ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1119,7 +1121,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, + if (ip_vs_lookup_real_service(net, af, iph.protocol, &iph.saddr, pptr[0])) { /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ceeef4352d34..2d7c96bd2114 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -287,15 +287,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - /* * Trash for destinations */ @@ -311,9 +302,9 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); /* * Returns hash value for virtual service */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) +static inline unsigned +ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, + const union nf_inet_addr *addr, __be16 port) { register unsigned porth = ntohs(port); __be32 addr_fold = addr->ip; @@ -323,6 +314,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif + addr_fold ^= ((size_t)net>>8); return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) & IP_VS_SVC_TAB_MASK; @@ -331,13 +323,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, /* * Returns hash value of fwmark for virtual service lookup */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { - return fwmark & IP_VS_SVC_TAB_MASK; + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* - * Hashes a service in the ip_vs_svc_table by + * Hashes a service in the ip_vs_svc_table by * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ @@ -353,16 +345,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by in ip_vs_svc_table + * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* - * Hash it by fwmark in ip_vs_svc_fwm_table + * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } @@ -374,7 +366,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) @@ -386,10 +378,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) } if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ + /* Remove it from the svc_table table */ list_del(&svc->s_list); } else { - /* Remove it from the ip_vs_svc_fwm_table table */ + /* Remove it from the svc_fwm_table table */ list_del(&svc->f_list); } @@ -400,23 +392,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* - * Get service by {proto,addr,port} in the service table. + * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { unsigned hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) - && (svc->protocol == protocol)) { + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { /* HIT */ return svc; } @@ -430,16 +423,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); + hash = ip_vs_svc_fwm_hashkey(net, fwmark); list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { /* HIT */ return svc; } @@ -449,7 +443,7 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark) } struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; @@ -459,14 +453,15 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark))) + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (fwmark && svc) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_find(af, protocol, vaddr, vport); + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -476,7 +471,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -484,7 +479,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_find(af, protocol, vaddr, 0); + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); } out: @@ -545,10 +540,10 @@ static inline unsigned ip_vs_rs_hashkey(int af, } /* - * Hashes ip_vs_dest in ip_vs_rtable by . + * Hashes ip_vs_dest in rs_table by . * should be called with locked tables. */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) +static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned hash; @@ -562,19 +557,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ip_vs_rtable[hash]); + list_add(&dest->d_list, &ipvs->rs_table[hash]); return 1; } /* - * UNhashes ip_vs_dest from ip_vs_rtable. + * UNhashes ip_vs_dest from rs_table. * should be called with locked tables. */ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* - * Remove it from the ip_vs_rtable table. + * Remove it from the rs_table table. */ if (!list_empty(&dest->d_list)) { list_del(&dest->d_list); @@ -588,10 +583,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) * Lookup real service by in the real service table. */ struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, +ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { + struct netns_ipvs *ipvs = net_ipvs(net); unsigned hash; struct ip_vs_dest *dest; @@ -602,7 +598,7 @@ ip_vs_lookup_real_service(int af, __u16 protocol, hash = ip_vs_rs_hashkey(af, daddr, dport); read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { if ((dest->af == af) && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->port == dport) @@ -652,7 +648,8 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * ip_vs_lookup_real_service() looked promissing, but * seems not working as expected. */ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, + const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark) @@ -660,7 +657,7 @@ struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, struct ip_vs_dest *dest; struct ip_vs_service *svc; - svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); @@ -768,6 +765,7 @@ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { + struct netns_ipvs *ipvs = net_ipvs(svc->net); int conn_flags; /* set the weight and the flags */ @@ -780,11 +778,11 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* - * Put the real service in ip_vs_rtable if not present. + * Put the real service in rs_table if not present. * For now only for NAT! */ write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); + ip_vs_rs_hash(ipvs, dest); write_unlock_bh(&__ip_vs_rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -1117,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Add a service into the service hash table */ static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; @@ -1172,6 +1170,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; + svc->net = net; INIT_LIST_HEAD(&svc->destinations); rwlock_init(&svc->sched_lock); @@ -1428,17 +1427,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(void) +static int ip_vs_flush(struct net *net) { int idx; struct ip_vs_service *svc, *nxt; /* - * Flush the service table hashed by + * Flush the service table hashed by */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - ip_vs_unlink_service(svc); + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); } } @@ -1448,7 +1449,8 @@ static int ip_vs_flush(void) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry_safe(svc, nxt, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_unlink_service(svc); + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); } } @@ -1472,20 +1474,22 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) return 0; } -static int ip_vs_zero_all(void) +static int ip_vs_zero_all(struct net *net) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } @@ -1763,6 +1767,7 @@ static struct ctl_table_header * sysctl_header; #ifdef CONFIG_PROC_FS struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ struct list_head *table; int bucket; }; @@ -1789,6 +1794,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags) /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { + struct net *net = seq_file_net(seq); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; @@ -1796,7 +1802,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1807,7 +1813,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -1961,7 +1967,7 @@ static const struct seq_operations ip_vs_info_seq_ops = { static int ip_vs_info_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ip_vs_info_seq_ops, + return seq_open_net(inode, file, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)); } @@ -2011,7 +2017,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, ip_vs_stats_show, NULL); + return single_open_net(inode, file, ip_vs_stats_show); } static const struct file_operations ip_vs_stats_fops = { @@ -2113,6 +2119,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, static int do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { + struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_ARG_LEN]; struct ip_vs_service_user *usvc_compat; @@ -2147,7 +2154,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(); + ret = ip_vs_flush(net); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ @@ -2174,7 +2181,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out_unlock; } } @@ -2191,10 +2198,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_find(usvc.af, usvc.protocol, + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2207,7 +2214,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2267,7 +2274,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; @@ -2278,7 +2286,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2297,7 +2305,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2317,7 +2325,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, } static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; @@ -2325,9 +2333,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, int ret = 0; if (get->fwmark) - svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else - svc = __ip_vs_service_find(AF_INET, get->protocol, &addr, + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); if (svc) { @@ -2401,7 +2409,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) unsigned char arg[128]; int ret = 0; unsigned int copylen; + struct net *net = sock_net(sk); + BUG_ON(!net); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -2463,7 +2473,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(get, user); + ret = __ip_vs_get_service_entries(net, get, user); } break; @@ -2476,10 +2486,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; if (entry->fwmark) - svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else - svc = __ip_vs_service_find(AF_INET, entry->protocol, - &addr, entry->port); + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2502,7 +2513,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(get, user); + ret = __ip_vs_get_dest_entries(net, get, user); } break; @@ -2722,11 +2733,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2737,7 +2749,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2753,7 +2765,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, return skb->len; } -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, struct ip_vs_service **ret_svc) { @@ -2796,9 +2809,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, } if (usvc->fwmark) - svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark); + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else - svc = __ip_vs_service_find(usvc->af, usvc->protocol, + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); *ret_svc = svc; @@ -2835,13 +2848,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc); + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); return ret ? ERR_PTR(ret) : svc; } @@ -2909,6 +2923,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net; mutex_lock(&__ip_vs_mutex); @@ -2917,7 +2932,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) goto out_err; - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + net = skb_sknet(skb); + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3102,13 +3118,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; + struct net *net; + net = skb_sknet(skb); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); + ret = ip_vs_flush(net); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(info->attrs); @@ -3133,7 +3151,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out; } @@ -3143,7 +3161,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(&usvc, + ret = ip_vs_genl_parse_service(net, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) @@ -3173,7 +3191,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); else ret = -EEXIST; break; @@ -3211,7 +3229,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; + struct net *net; + net = skb_sknet(skb); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3240,7 +3260,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; @@ -3411,9 +3432,15 @@ static void ip_vs_genl_unregister(void) */ int __net_init __ip_vs_control_init(struct net *net) { + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, @@ -3445,43 +3472,48 @@ static struct pernet_operations ipvs_control_ops = { int __init ip_vs_control_init(void) { - int ret; int idx; + int ret; EnterFunction(2); - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_LIST_HEAD(&ip_vs_svc_table[idx]); INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); + + ret = register_pernet_subsys(&ipvs_control_ops); + if (ret) { + pr_err("cannot register namespace.\n"); + goto err; } - smp_wmb(); + + smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - return ret; + goto err_net; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); nf_unregister_sockopt(&ip_vs_sockopts); - return ret; + goto err_net; } - ret = register_pernet_subsys(&ipvs_control_ops); - if (ret) - return ret; - /* Hook the defense timer */ schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); LeaveFunction(2); return 0; + +err_net: + unregister_pernet_subsys(&ipvs_control_ops); +err: + return ret; } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a315159983ad..521b827083fe 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -12,6 +12,7 @@ static int sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; @@ -27,9 +28,9 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, sizeof(_schunkh), &_schunkh); if (sch == NULL) return 0; - + net = skb_net(skb); if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr, sh->dest))) { int ignored; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 1cdab12abfef..c175d3166263 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -31,6 +31,7 @@ static int tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; struct ip_vs_iphdr iph; @@ -42,11 +43,11 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, *verdict = NF_DROP; return 0; } - + net = skb_net(skb); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { int ignored; if (ip_vs_todrop()) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index cd398de010cc..5ab54f648654 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -31,6 +31,7 @@ static int udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; struct ip_vs_iphdr iph; @@ -42,8 +43,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, *verdict = NF_DROP; return 0; } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, + net = skb_net(skb); + svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { int ignored; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3668739a6d06..662aa2c22a05 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -749,7 +749,7 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, * If it is not found the connection will remain unbound * but still handled. */ - dest = ip_vs_find_dest(type, daddr, dport, param->vaddr, + dest = ip_vs_find_dest(&init_net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark); /* Set the approprite ativity flag */ From d0a1eef9c38218af20c809b2220a960b7ed81a36 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:44 +0100 Subject: [PATCH 36/80] IPVS: netns awarness to lblcr sheduler var sysctl_ip_vs_lblcr_expiration moved to ipvs struct as sysctl_lblcr_expiration procfs updated to handle this. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/netns/ip_vs.h | 5 +++ net/netfilter/ipvs/ip_vs_lblcr.c | 52 +++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 5b87d22a39fb..51a92ee1b167 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -28,6 +28,11 @@ struct netns_ipvs { #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct list_head rs_table[IP_VS_RTAB_SIZE]; + + /* ip_vs_lblcr */ + int sysctl_lblcr_expiration; + struct ctl_table_header *lblcr_ctl_header; + struct ctl_table *lblcr_ctl_table; }; #endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 7c7396a6acbf..61ae8cfcf0b4 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -70,8 +70,6 @@ * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - /* * for IPVS lblcr entry hash table @@ -296,7 +294,7 @@ struct ip_vs_lblcr_table { static ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -304,8 +302,6 @@ static ctl_table vs_vars_table[] = { { } }; -static struct ctl_table_header * sysctl_header; - static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { list_del(&en->list); @@ -425,14 +421,15 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en, *nxt; + struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; isched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) + if (time_after(en->lastuse + + ipvs->sysctl_lblcr_expiration, now)) continue; ip_vs_lblcr_free(en); @@ -664,6 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); if (en) { + struct netns_ipvs *ipvs = net_ipvs(svc->net); /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { + ipvs->sysctl_lblcr_expiration)) { struct ip_vs_dest *m; write_lock(&en->set.lock); @@ -749,23 +747,43 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = */ static int __net_init __ip_vs_lblcr_init(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; + struct netns_ipvs *ipvs = net_ipvs(net); - sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, - vs_vars_table); - if (!sysctl_header) - return -ENOMEM; + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + goto err_dup; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = 24*60*60*HZ; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) + goto err_reg; return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + +err_dup: + return -ENOMEM; } static void __net_exit __ip_vs_lblcr_exit(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; + struct netns_ipvs *ipvs = net_ipvs(net); - unregister_net_sysctl_table(sysctl_header); + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); } static struct pernet_operations ip_vs_lblcr_ops = { From b6e885ddb903e681b7cbb4e68ad775154660e1f4 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:45 +0100 Subject: [PATCH 37/80] IPVS: netns awarness to lblc sheduler var sysctl_ip_vs_lblc_expiration moved to ipvs struct as sysctl_lblc_expiration procfs updated to handle this. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/netns/ip_vs.h | 4 +++ net/netfilter/ipvs/ip_vs_lblc.c | 48 ++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 51a92ee1b167..d14581cc4fe0 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -29,6 +29,10 @@ struct netns_ipvs { struct list_head rs_table[IP_VS_RTAB_SIZE]; + /* ip_vs_lblc */ + int sysctl_lblc_expiration; + struct ctl_table_header *lblc_ctl_header; + struct ctl_table *lblc_ctl_table; /* ip_vs_lblcr */ int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 84278fb4e055..d5bec3371871 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -70,7 +70,6 @@ * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; /* @@ -117,7 +116,7 @@ struct ip_vs_lblc_table { static ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -125,8 +124,6 @@ static ctl_table vs_vars_table[] = { { } }; -static struct ctl_table_header * sysctl_header; - static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { list_del(&en->list); @@ -248,6 +245,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) struct ip_vs_lblc_entry *en, *nxt; unsigned long now = jiffies; int i, j; + struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; isched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) + en->lastuse + + ipvs->sysctl_lblc_expiration)) continue; ip_vs_lblc_free(en); @@ -548,23 +547,43 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = */ static int __net_init __ip_vs_lblc_init(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; + struct netns_ipvs *ipvs = net_ipvs(net); - sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, - vs_vars_table); - if (!sysctl_header) - return -ENOMEM; + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + goto err_dup; + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = 24*60*60*HZ; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) + goto err_reg; return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + +err_dup: + return -ENOMEM; } static void __net_exit __ip_vs_lblc_exit(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; + struct netns_ipvs *ipvs = net_ipvs(net); - unregister_net_sysctl_table(sysctl_header); + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); } static struct pernet_operations ip_vs_lblc_ops = { @@ -586,7 +605,6 @@ static int __init ip_vs_lblc_init(void) return ret; } - static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); From 252c64103237f1841088f0f29b4f084b1c774546 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:46 +0100 Subject: [PATCH 38/80] IPVS: netns, prepare protocol Add support for protocol data per name-space. in struct ip_vs_protocol, appcnt will be removed when all protos are modified for network name-space. This patch causes warnings of unused functions, they will be used when next patch will be applied. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 20 +++++++++- include/net/netns/ip_vs.h | 3 ++ net/netfilter/ipvs/ip_vs_proto.c | 66 ++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d551e0d8fd9a..88d4e40b538a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -352,6 +352,7 @@ struct iphdr; struct ip_vs_conn; struct ip_vs_app; struct sk_buff; +struct ip_vs_proto_data; struct ip_vs_protocol { struct ip_vs_protocol *next; @@ -366,6 +367,10 @@ struct ip_vs_protocol { void (*exit)(struct ip_vs_protocol *pp); + void (*init_netns)(struct net *net, struct ip_vs_proto_data *pd); + + void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd); + int (*conn_schedule)(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp); @@ -417,7 +422,20 @@ struct ip_vs_protocol { int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to); }; -extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto); +/* + * protocol data per netns + */ +struct ip_vs_proto_data { + struct ip_vs_proto_data *next; + struct ip_vs_protocol *pp; + int *timeout_table; /* protocol timeout table */ + atomic_t appcnt; /* counter of proto app incs. */ + struct tcp_states_t *tcp_state_table; +}; + +extern struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto); +extern struct ip_vs_proto_data *ip_vs_proto_data_get(struct net *net, + unsigned short proto); struct ip_vs_conn_param { const union nf_inet_addr *caddr; diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index d14581cc4fe0..6f4e089b8db2 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -28,6 +28,9 @@ struct netns_ipvs { #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct list_head rs_table[IP_VS_RTAB_SIZE]; + /* ip_vs_proto */ + #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ + struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; /* ip_vs_lblc */ int sysctl_lblc_expiration; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 45392942d0e7..576e29648c53 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -60,6 +60,31 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + + if (!pd) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) + pp->init_netns(net, pd); + + return 0; +} /* * unregister an ipvs protocol @@ -82,6 +107,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) return -ESRCH; } +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} /* * get ip_vs_protocol object by its proto. @@ -100,6 +148,24 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) } EXPORT_SYMBOL(ip_vs_proto_get); +/* + * get ip_vs_protocol object data by netns and proto + */ +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols From 4a85b96c08ef84076f84e87280223a4301988ed9 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:47 +0100 Subject: [PATCH 39/80] IPVS: netns preparation for proto_tcp In this phase (one), all local vars will be moved to ipvs struct. Remaining work, add param struct net *net to a couple of functions that is common for all protos and use all ip_vs_proto_data *v3 Removed unused function as sugested by Simon Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- include/net/netns/ip_vs.h | 8 +++ net/netfilter/ipvs/ip_vs_ftp.c | 8 ++- net/netfilter/ipvs/ip_vs_proto.c | 13 +++- net/netfilter/ipvs/ip_vs_proto_tcp.c | 97 +++++++++++++++------------- 5 files changed, 79 insertions(+), 49 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 88d4e40b538a..3c45a00cdc3e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -807,7 +807,7 @@ extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); extern const char * ip_vs_state_name(__u16 proto, int state); -extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); +extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp); extern int ip_vs_check_template(struct ip_vs_conn *ct); extern void ip_vs_random_dropentry(void); extern int ip_vs_conn_init(void); diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 6f4e089b8db2..ac77363647ab 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -31,6 +31,14 @@ struct netns_ipvs { /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; + /* ip_vs_proto_tcp */ +#ifdef CONFIG_IP_VS_PROTO_TCP + #define TCP_APP_TAB_BITS 4 + #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) + #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + struct list_head tcp_apps[TCP_APP_TAB_SIZE]; + spinlock_t tcp_app_lock; +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 0e762f322aa3..b38ae941f677 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -257,8 +258,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * would be adjusted twice. */ + net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -287,6 +289,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -378,7 +381,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Move tunnel to listen state */ - ip_vs_tcp_conn_listen(n_cp); + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return 1; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 576e29648c53..320c6a65f370 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -307,12 +307,23 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, */ static int __net_init __ip_vs_protocol_init(struct net *net) { +#ifdef CONFIG_IP_VS_PROTO_TCP + register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); +#endif return 0; } static void __net_exit __ip_vs_protocol_cleanup(struct net *net) { - /* empty */ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } } static struct pernet_operations ipvs_proto_ops = { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index c175d3166263..9d9df3d61093 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -9,8 +9,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" @@ -345,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = { /* * Timeout table[state] */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, @@ -460,13 +464,6 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) tcp_state_table = (on? tcp_states_dos : tcp_states); } -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) @@ -487,6 +484,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; + struct ip_vs_proto_data *pd; /* Temp fix */ /* * Update state offset to INPUT_ONLY if necessary @@ -542,10 +540,13 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } - cp->timeout = pp->timeout_table[cp->state = new_state]; + pd = ip_vs_proto_data_get(&init_net, pp->protocol); + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; } - /* * Handle state transitions */ @@ -573,17 +574,6 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, return 1; } - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) @@ -597,21 +587,23 @@ static int tcp_register_app(struct ip_vs_app *inc) __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP); hash = tcp_app_hashkey(port); - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { + spin_lock_bh(&ipvs->tcp_app_lock); + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); + list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->pp->appcnt); out: - spin_unlock_bh(&tcp_app_lock); + spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } @@ -619,16 +611,20 @@ static int tcp_register_app(struct ip_vs_app *inc) static void tcp_unregister_app(struct ip_vs_app *inc) { - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); + struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP); + + spin_lock_bh(&ipvs->tcp_app_lock); + atomic_dec(&pd->pp->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); + spin_unlock_bh(&ipvs->tcp_app_lock); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(&init_net); int hash; struct ip_vs_app *inc; int result = 0; @@ -640,12 +636,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { + spin_lock(&ipvs->tcp_app_lock); + list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&tcp_app_lock); + spin_unlock(&ipvs->tcp_app_lock); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -662,7 +658,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&tcp_app_lock); + spin_unlock(&ipvs->tcp_app_lock); out: return result; @@ -672,24 +668,34 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + spin_lock(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock(&cp->lock); } - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); } - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -699,8 +705,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, @@ -714,5 +722,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, }; From 78b16bde104cc74bedbf462b0ebed2990f35ff6b Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:48 +0100 Subject: [PATCH 40/80] IPVS: netns preparation for proto_udp In this phase (one), all local vars will be moved to ipvs struct. Remaining work, add param struct net *net to a couple of functions that is common for all protos and use ip_vs_proto_data *v3 Removed unused function set_state_timeout() Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/netns/ip_vs.h | 8 +++ net/netfilter/ipvs/ip_vs_proto.c | 3 + net/netfilter/ipvs/ip_vs_proto_udp.c | 86 ++++++++++++++-------------- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index ac77363647ab..62b1448d3795 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -39,6 +39,14 @@ struct netns_ipvs { struct list_head tcp_apps[TCP_APP_TAB_SIZE]; spinlock_t tcp_app_lock; #endif + /* ip_vs_proto_udp */ +#ifdef CONFIG_IP_VS_PROTO_UDP + #define UDP_APP_TAB_BITS 4 + #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) + #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + struct list_head udp_apps[UDP_APP_TAB_SIZE]; + spinlock_t udp_app_lock; +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 320c6a65f370..cdc414238fcb 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -309,6 +309,9 @@ static int __net_init __ip_vs_protocol_init(struct net *net) { #ifdef CONFIG_IP_VS_PROTO_TCP register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); #endif return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 5ab54f648654..71a4721a8f8a 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,7 +9,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom + * Network name space (netns) aware. * */ @@ -345,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 1; } - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - static inline __u16 udp_app_hashkey(__be16 port) { return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) @@ -371,22 +359,24 @@ static int udp_register_app(struct ip_vs_app *inc) __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); hash = udp_app_hashkey(port); - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { + spin_lock_bh(&ipvs->udp_app_lock); + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); + list_add(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->pp->appcnt); out: - spin_unlock_bh(&udp_app_lock); + spin_unlock_bh(&ipvs->udp_app_lock); return ret; } @@ -394,15 +384,19 @@ static int udp_register_app(struct ip_vs_app *inc) static void udp_unregister_app(struct ip_vs_app *inc) { - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(&init_net); + + spin_lock_bh(&ipvs->udp_app_lock); + atomic_dec(&pd->pp->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); + spin_unlock_bh(&ipvs->udp_app_lock); } static int udp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(&init_net); int hash; struct ip_vs_app *inc; int result = 0; @@ -414,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { + spin_lock(&ipvs->udp_app_lock); + list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&udp_app_lock); + spin_unlock(&ipvs->udp_app_lock); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -436,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&udp_app_lock); + spin_unlock(&ipvs->udp_app_lock); out: return result; } -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = 5*60*HZ, [IP_VS_UDP_S_LAST] = 2*HZ, }; @@ -453,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_LAST] = "BUG!", }; - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - static const char * udp_state_name(int state) { if (state >= IP_VS_UDP_S_LAST) @@ -473,18 +459,31 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_protocol *pp) { - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + struct ip_vs_proto_data *pd; /* Temp fix, pp will be replaced by pd */ + + pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return 0; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; return 1; } -static void udp_init(struct ip_vs_protocol *pp) +static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + spin_lock_init(&ipvs->udp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); } -static void udp_exit(struct ip_vs_protocol *pp) +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -493,8 +492,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .protocol = IPPROTO_UDP, .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, .conn_schedule = udp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, @@ -508,5 +509,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .app_conn_bind = udp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, }; From 9d934878e7870fbbbd8eaed2e467552536877def Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:49 +0100 Subject: [PATCH 41/80] IPVS: netns preparation for proto_sctp In this phase (one), all local vars will be moved to ipvs struct. Remaining work, add param struct net *net to a couple of functions that is common for all protos and use ip_vs_proto_data *v3 Removed unuset function set_state_timeout() Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/netns/ip_vs.h | 9 ++ net/netfilter/ipvs/ip_vs_proto.c | 3 + net/netfilter/ipvs/ip_vs_proto_sctp.c | 121 ++++++++++++-------------- 3 files changed, 70 insertions(+), 63 deletions(-) diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 62b1448d3795..58bd3fd85a97 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -47,6 +47,15 @@ struct netns_ipvs { struct list_head udp_apps[UDP_APP_TAB_SIZE]; spinlock_t udp_app_lock; #endif + /* ip_vs_proto_sctp */ +#ifdef CONFIG_IP_VS_PROTO_SCTP + #define SCTP_APP_TAB_BITS 4 + #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) + #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) + /* Hash table for SCTP application incarnations */ + struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; + spinlock_t sctp_app_lock; +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index cdc414238fcb..001b2f825043 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -312,6 +312,9 @@ static int __net_init __ip_vs_protocol_init(struct net *net) #endif #ifdef CONFIG_IP_VS_PROTO_UDP register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); #endif return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 521b827083fe..f826dd1e4630 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -862,7 +862,7 @@ static struct ipvs_sctp_nextstate /* * Timeout table[state] */ -static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = 2 * HZ, [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, @@ -906,18 +906,6 @@ static const char *sctp_state_name(int state) return "?"; } -static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ -} - -static int -sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - -return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST, - sctp_state_name_table, sname, to); -} - static inline int set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) @@ -926,6 +914,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, unsigned char chunk_type; int event, next_state; int ihl; + struct ip_vs_proto_data *pd; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -1001,10 +990,13 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } } + pd = ip_vs_proto_data_get(&init_net, pp->protocol); /* tmp fix */ + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? */ + cp->timeout = sctp_timeouts[cp->state = next_state]; - cp->timeout = pp->timeout_table[cp->state = next_state]; - - return 1; + return 1; } static int @@ -1020,16 +1012,6 @@ sctp_state_transition(struct ip_vs_conn *cp, int direction, return ret; } -/* - * Hash table for SCTP application incarnations - */ -#define SCTP_APP_TAB_BITS 4 -#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) -#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) - -static struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(sctp_app_lock); - static inline __u16 sctp_app_hashkey(__be16 port) { return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) @@ -1042,34 +1024,40 @@ static int sctp_register_app(struct ip_vs_app *inc) __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP); hash = sctp_app_hashkey(port); - spin_lock_bh(&sctp_app_lock); - list_for_each_entry(i, &sctp_apps[hash], p_list) { + spin_lock_bh(&ipvs->sctp_app_lock); + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &sctp_apps[hash]); - atomic_inc(&ip_vs_protocol_sctp.appcnt); + list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->pp->appcnt); out: - spin_unlock_bh(&sctp_app_lock); + spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } static void sctp_unregister_app(struct ip_vs_app *inc) { - spin_lock_bh(&sctp_app_lock); - atomic_dec(&ip_vs_protocol_sctp.appcnt); + struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP); + + spin_lock_bh(&ipvs->sctp_app_lock); + atomic_dec(&pd->pp->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&sctp_app_lock); + spin_unlock_bh(&ipvs->sctp_app_lock); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(&init_net); int hash; struct ip_vs_app *inc; int result = 0; @@ -1080,12 +1068,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&sctp_app_lock); - list_for_each_entry(inc, &sctp_apps[hash], p_list) { + spin_lock(&ipvs->sctp_app_lock); + list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&sctp_app_lock); + spin_unlock(&ipvs->sctp_app_lock); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1101,43 +1089,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&sctp_app_lock); + spin_unlock(&ipvs->sctp_app_lock); out: return result; } -static void ip_vs_sctp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(sctp_apps); - pp->timeout_table = sctp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); } - -static void ip_vs_sctp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) { - + kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_sctp = { - .name = "SCTP", - .protocol = IPPROTO_SCTP, - .num_states = IP_VS_SCTP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_sctp_init, - .exit = ip_vs_sctp_exit, - .register_app = sctp_register_app, + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, - .conn_schedule = sctp_conn_schedule, - .conn_in_get = ip_vs_conn_in_get_proto, - .conn_out_get = ip_vs_conn_out_get_proto, - .snat_handler = sctp_snat_handler, - .dnat_handler = sctp_dnat_handler, - .csum_check = sctp_csum_check, - .state_name = sctp_state_name, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, .state_transition = sctp_state_transition, - .app_conn_bind = sctp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = sctp_timeout_change, - .set_state_timeout = sctp_set_state_timeout, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, }; From 88fe2d372793a71ae4f6319a16f537d56a83906c Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:50 +0100 Subject: [PATCH 42/80] IPVS: netns preparation for proto_ah_esp In this phase (one), all local vars will be moved to ipvs struct. Remaining work, add param struct net *net to a couple of functions that common for all protos. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto.c | 6 ++++++ net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 20 ++++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 001b2f825043..9f609d4d5d58 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -315,6 +315,12 @@ static int __net_init __ip_vs_protocol_init(struct net *net) #endif #ifdef CONFIG_IP_VS_PROTO_SCTP register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); #endif return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 3a0461117d3f..b8b37fafc988 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -117,26 +117,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, @@ -159,8 +147,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, From 9330419d9aa4f97df412ac9be9fc0388c67dd315 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:51 +0100 Subject: [PATCH 43/80] IPVS: netns, use ip_vs_proto_data as param. ip_vs_protocol *pp is replaced by ip_vs_proto_data *pd in function call in ip_vs_protocol struct i.e. :, - timeout_change() - state_transition() ip_vs_protocol_timeout_change() got ipvs as param, due to above and a upcoming patch - defence work Most of this changes are triggered by Julians comment: "tcp_timeout_change should work with the new struct ip_vs_proto_data so that tcp_state_table will go to pd->state_table and set_tcp_state will get pd instead of pp" *v3 Mostly comments from Julian The pp -> pd conversion should start from functions like ip_vs_out() that use pp = ip_vs_proto_get(iph.protocol), now they should use ip_vs_proto_data_get(net, iph.protocol). conn_in_get() and conn_out_get() unused param *pp, removed. *v4 ip_vs_protocol_timeout_change() walk the proto_data path. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 18 ++---- net/netfilter/ipvs/ip_vs_conn.c | 2 - net/netfilter/ipvs/ip_vs_core.c | 77 +++++++++++++++---------- net/netfilter/ipvs/ip_vs_ctl.c | 55 +++++++++++------- net/netfilter/ipvs/ip_vs_proto.c | 21 ++++--- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 10 ++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 16 +++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 27 ++++----- net/netfilter/ipvs/ip_vs_proto_udp.c | 11 ++-- net/netfilter/xt_ipvs.c | 2 +- 10 files changed, 129 insertions(+), 110 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3c45a00cdc3e..464ea365ca07 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -372,13 +372,12 @@ struct ip_vs_protocol { void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd); int (*conn_schedule)(int af, struct sk_buff *skb, - struct ip_vs_protocol *pp, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -386,7 +385,6 @@ struct ip_vs_protocol { struct ip_vs_conn * (*conn_out_get)(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -404,7 +402,7 @@ struct ip_vs_protocol { int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp); + struct ip_vs_proto_data *pd); int (*register_app)(struct ip_vs_app *inc); @@ -417,9 +415,7 @@ struct ip_vs_protocol { int offset, const char *msg); - void (*timeout_change)(struct ip_vs_protocol *pp, int flags); - - int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to); + void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); }; /* @@ -778,7 +774,6 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -786,7 +781,6 @@ struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -917,7 +911,7 @@ static inline void ip_vs_pe_put(const struct ip_vs_pe *pe) */ extern int ip_vs_protocol_init(void); extern void ip_vs_protocol_cleanup(void); -extern void ip_vs_protocol_timeout_change(int flags); +extern void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); extern int *ip_vs_create_timeout_table(int *table, int size); extern int ip_vs_set_state_timeout(int *table, int num, const char *const *names, @@ -947,9 +941,9 @@ extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp, int *ignored); + struct ip_vs_proto_data *pd, int *ignored); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp); + struct ip_vs_proto_data *pd); /* diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 7a0e79e3ad0f..a7aba6a4697e 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -329,7 +329,6 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { @@ -428,7 +427,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d0616ea1eebf..9317affc5ea1 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -177,11 +177,11 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) static inline int ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - if (unlikely(!pp->state_transition)) + if (unlikely(!pd->pp->state_transition)) return 0; - return pp->state_transition(cp, direction, skb, pp); + return pd->pp->state_transition(cp, direction, skb, pd); } static inline int @@ -378,8 +378,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp, int *ignored) + struct ip_vs_proto_data *pd, int *ignored) { + struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; struct ip_vs_dest *dest; @@ -408,7 +409,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); @@ -479,11 +480,12 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); @@ -530,10 +532,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_in_stats(cp, skb); /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pd->pp); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); @@ -840,7 +842,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -917,7 +919,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -956,9 +958,11 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) * Used for NAT and local client. */ static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int ihl) { + struct ip_vs_protocol *pp = pd->pp; + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) @@ -1007,7 +1011,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); @@ -1034,6 +1038,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; EnterFunction(11); @@ -1079,9 +1084,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; + pp = pd->pp; /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 @@ -1107,10 +1113,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); if (likely(cp)) - return handle_response(af, skb, pp, cp, iph.len); + return handle_response(af, skb, pd, cp, iph.len); if (sysctl_ip_vs_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || @@ -1236,12 +1242,14 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, static int ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) { + struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; unsigned int offset, ihl, verdict; union nf_inet_addr snet; @@ -1283,9 +1291,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(cih->protocol); - if (!pp) + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) return NF_ACCEPT; + pp = pd->pp; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && @@ -1299,10 +1309,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); if (!cp) { /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); if (cp) { snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, @@ -1346,6 +1356,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) { + struct net *net = NULL; struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; struct ipv6hdr _ciph, *cih; /* The ip header contained @@ -1353,6 +1364,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; unsigned int offset, verdict; union nf_inet_addr snet; struct rt6_info *rt; @@ -1395,9 +1407,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->nexthdr); + if (!pd) return NF_ACCEPT; + pp = pd->pp; /* Is the embedded protocol header present? */ /* TODO: we don't support fragmentation at the moment anyways */ @@ -1411,10 +1425,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) { /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); if (cp) { ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, @@ -1457,8 +1471,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) static unsigned int ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, restart, pkts; @@ -1514,20 +1530,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } + net = skb_net(skb); /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; - + pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + if (!pp->conn_schedule(af, skb, pd, &v, &cp)) return v; } @@ -1555,7 +1572,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) ret = cp->packet_xmit(skb, cp, pp); /* do not touch skb anymore */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 2d7c96bd2114..88474f1e828a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -38,6 +38,7 @@ #include #include +#include #include #ifdef CONFIG_IP_VS_IPV6 #include @@ -125,7 +126,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ -static void update_defense_level(void) +static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; static int old_secure_tcp = 0; @@ -239,7 +240,8 @@ static void update_defense_level(void) } old_secure_tcp = sysctl_ip_vs_secure_tcp; if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + ip_vs_protocol_timeout_change(ipvs, + sysctl_ip_vs_secure_tcp > 1); spin_unlock(&ip_vs_securetcp_lock); local_bh_enable(); @@ -255,7 +257,10 @@ static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); static void defense_work_handler(struct work_struct *work) { - update_defense_level(); + struct net *net = &init_net; + struct netns_ipvs *ipvs = net_ipvs(net); + + update_defense_level(ipvs); if (atomic_read(&ip_vs_dropentry)) ip_vs_random_dropentry(); @@ -1502,6 +1507,7 @@ static int proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = current->nsproxy->net_ns; int *valp = table->data; int val = *valp; int rc; @@ -1512,7 +1518,7 @@ proc_do_defense_mode(ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - update_defense_level(); + update_defense_level(net_ipvs(net)); } } return rc; @@ -2033,8 +2039,10 @@ static const struct file_operations ip_vs_stats_fops = { /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) { + struct ip_vs_proto_data *pd; + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, @@ -2042,19 +2050,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif @@ -2158,7 +2169,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); goto out_unlock; } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; @@ -2370,17 +2381,19 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, } static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) { + struct ip_vs_proto_data *pd; + #ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } @@ -2521,7 +2534,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } @@ -3092,11 +3105,11 @@ static int ip_vs_genl_del_daemon(struct nlattr **attrs) return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); } -static int ip_vs_genl_set_config(struct nlattr **attrs) +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3108,7 +3121,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(&t); + return ip_vs_set_timeout(net, &t); } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) @@ -3129,7 +3142,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ret = ip_vs_flush(net); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); + ret = ip_vs_genl_set_config(net, info->attrs); goto out; } else if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { @@ -3281,7 +3294,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 9f609d4d5d58..6ac986cdcff3 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -152,9 +152,8 @@ EXPORT_SYMBOL(ip_vs_proto_get); * get ip_vs_protocol object data by netns and proto */ struct ip_vs_proto_data * -ip_vs_proto_data_get(struct net *net, unsigned short proto) +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; unsigned hash = IP_VS_PROTO_HASH(proto); @@ -165,20 +164,28 @@ ip_vs_proto_data_get(struct net *net, unsigned short proto) return NULL; } + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + return __ipvs_proto_data_get(ipvs, proto); +} EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ -void ip_vs_protocol_timeout_change(int flags) +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { - struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); } } } diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index b8b37fafc988..28039cbfcff4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -55,7 +55,7 @@ ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, } static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { @@ -72,7 +72,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", inverse ? "ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -83,7 +83,6 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, static struct ip_vs_conn * ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) @@ -97,7 +96,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", inverse ? "ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -107,7 +106,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { /* @@ -137,7 +136,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, }; #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index f826dd1e4630..19bc37976ea7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,7 +9,7 @@ #include static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { struct net *net; @@ -47,10 +47,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pp); + *verdict = ip_vs_leave(svc, skb, pd); else { ip_vs_service_put(svc); *verdict = NF_DROP; @@ -907,14 +907,13 @@ static const char *sctp_state_name(int state) } static inline int -set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; int ihl; - struct ip_vs_proto_data *pd; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -966,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, IP_VS_DBG_BUF(8, "%s %s %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, + pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), IP_VS_DBG_ADDR(cp->af, &cp->daddr), @@ -990,7 +989,6 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } } - pd = ip_vs_proto_data_get(&init_net, pp->protocol); /* tmp fix */ if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; else /* What to do ? */ @@ -1001,12 +999,12 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, static int sctp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, struct ip_vs_protocol *pp) + const struct sk_buff *skb, struct ip_vs_proto_data *pd) { int ret = 0; spin_lock(&cp->lock); - ret = set_sctp_state(pp, cp, direction, skb); + ret = set_sctp_state(pd, cp, direction, skb); spin_unlock(&cp->lock); return ret; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 9d9df3d61093..d7c245532798 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -32,7 +32,7 @@ #include static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { struct net *net; @@ -68,10 +68,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pp); + *verdict = ip_vs_leave(svc, skb, pd); else { ip_vs_service_put(svc); *verdict = NF_DROP; @@ -448,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = { /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ @@ -461,7 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ - tcp_state_table = (on? tcp_states_dos : tcp_states); + pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) @@ -478,13 +475,12 @@ static inline int tcp_state_idx(struct tcphdr *th) } static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; - struct ip_vs_proto_data *pd; /* Temp fix */ /* * Update state offset to INPUT_ONLY if necessary @@ -502,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, goto tcp_state_out; } - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { @@ -510,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, + pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', @@ -540,7 +537,6 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } - pd = ip_vs_proto_data_get(&init_net, pp->protocol); if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ @@ -553,7 +549,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, static int tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; @@ -568,7 +564,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, return 0; spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); + set_tcp_state(pd, cp, direction, th); spin_unlock(&cp->lock); return 1; @@ -691,6 +687,7 @@ static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); + pd->tcp_state_table = tcp_states; } static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 71a4721a8f8a..aa85df2f14a0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -29,7 +29,7 @@ #include static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { struct net *net; @@ -64,10 +64,10 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pp); + *verdict = ip_vs_leave(svc, skb, pd); else { ip_vs_service_put(svc); *verdict = NF_DROP; @@ -457,11 +457,8 @@ static const char * udp_state_name(int state) static int udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - struct ip_vs_proto_data *pd; /* Temp fix, pp will be replaced by pd */ - - pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); if (unlikely(!pd)) { pr_err("UDP no ns data\n"); return 0; diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 9127a3d8aa35..bb10b0717f1b 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); + cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */); if (unlikely(cp == NULL)) { match = false; goto out; From 9bbac6a904d0816dae58b454692c54d6773cc20d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:52 +0100 Subject: [PATCH 44/80] IPVS: netns, common protocol changes and use of appcnt. appcnt and timeout_table moved from struct ip_vs_protocol to ip_vs proto_data. struct net *net added as first param to - register_app() - unregister_app() - app_conn_bind() - ip_vs_conn_new() [horms@verge.net.au: removed cosmetic-change-only hunk] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 - net/netfilter/ipvs/ip_vs_conn.c | 6 +-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 4 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 5 +-- net/netfilter/ipvs/ip_vs_proto_udp.c | 4 +- net/netfilter/ipvs/ip_vs_sync.c | 55 +++++++++++++++------------ 6 files changed, 39 insertions(+), 37 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 464ea365ca07..cc6ae621a9b5 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -360,8 +360,6 @@ struct ip_vs_protocol { u16 protocol; u16 num_states; int dont_defrag; - atomic_t appcnt; /* counter of proto app incs */ - int *timeout_table; /* protocol timeout table */ void (*init)(struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a7aba6a4697e..b2024c942345 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -804,7 +804,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, p->protocol); cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { @@ -863,8 +863,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, #endif ip_vs_bind_xmit(cp); - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); /* * Allow conntrack to be preserved. By default, conntrack diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 19bc37976ea7..0f14f793318a 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -1035,7 +1035,7 @@ static int sctp_register_app(struct ip_vs_app *inc) } } list_add(&inc->p_list, &ipvs->sctp_apps[hash]); - atomic_inc(&pd->pp->appcnt); + atomic_inc(&pd->appcnt); out: spin_unlock_bh(&ipvs->sctp_app_lock); @@ -1048,7 +1048,7 @@ static void sctp_unregister_app(struct ip_vs_app *inc) struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP); spin_lock_bh(&ipvs->sctp_app_lock); - atomic_dec(&pd->pp->appcnt); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); spin_unlock_bh(&ipvs->sctp_app_lock); } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index d7c245532798..290b3803d8ce 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -596,7 +596,7 @@ static int tcp_register_app(struct ip_vs_app *inc) } } list_add(&inc->p_list, &ipvs->tcp_apps[hash]); - atomic_inc(&pd->pp->appcnt); + atomic_inc(&pd->appcnt); out: spin_unlock_bh(&ipvs->tcp_app_lock); @@ -611,7 +611,7 @@ tcp_unregister_app(struct ip_vs_app *inc) struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP); spin_lock_bh(&ipvs->tcp_app_lock); - atomic_dec(&pd->pp->appcnt); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); spin_unlock_bh(&ipvs->tcp_app_lock); } @@ -701,7 +701,6 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index aa85df2f14a0..3719837a8fdc 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -373,7 +373,7 @@ static int udp_register_app(struct ip_vs_app *inc) } } list_add(&inc->p_list, &ipvs->udp_apps[hash]); - atomic_inc(&pd->pp->appcnt); + atomic_inc(&pd->appcnt); out: spin_unlock_bh(&ipvs->udp_app_lock); @@ -388,7 +388,7 @@ udp_unregister_app(struct ip_vs_app *inc) struct netns_ipvs *ipvs = net_ipvs(&init_net); spin_lock_bh(&ipvs->udp_app_lock); - atomic_dec(&pd->pp->appcnt); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); spin_unlock_bh(&ipvs->udp_app_lock); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 662aa2c22a05..6831e8fac8db 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -725,17 +725,16 @@ ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc, * Param: ... * timeout is in sec. */ -static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, - unsigned state, unsigned protocol, unsigned type, +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, const union nf_inet_addr *daddr, __be16 dport, unsigned long timeout, __u32 fwmark, - struct ip_vs_sync_conn_options *opt, - struct ip_vs_protocol *pp) + struct ip_vs_sync_conn_options *opt) { struct ip_vs_dest *dest; struct ip_vs_conn *cp; - if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(param); else @@ -821,17 +820,23 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) timeout = MAX_SCHEDULE_TIMEOUT / HZ; cp->timeout = timeout*HZ; - } else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(net, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + } ip_vs_conn_put(cp); } /* * Process received multicast message for Version 0 */ -static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) +static void ip_vs_process_message_v0(struct net *net, const char *buffer, + const size_t buflen) { struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; struct ip_vs_sync_conn_v0 *s; @@ -879,7 +884,6 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) } } else { /* protocol in templates is not used for state/timeout */ - pp = NULL; if (state > 0) { IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", state); @@ -894,9 +898,9 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) s->vport, ¶m); /* Send timeout as Zero */ - ip_vs_proc_conn(¶m, flags, state, s->protocol, AF_INET, + ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, (union nf_inet_addr *)&s->daddr, s->dport, - 0, 0, opt, pp); + 0, 0, opt); } } @@ -945,7 +949,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, /* * Process a Version 1 sync. connection */ -static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) { struct ip_vs_sync_conn_options opt; union ip_vs_sync_conn *s; @@ -1043,7 +1047,6 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) } } else { /* protocol in templates is not used for state/timeout */ - pp = NULL; if (state > 0) { IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", state); @@ -1058,18 +1061,18 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) } /* If only IPv4, just silent skip IPv6 */ if (af == AF_INET) - ip_vs_proc_conn(¶m, flags, state, s->v4.protocol, af, + ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, ntohl(s->v4.timeout), ntohl(s->v4.fwmark), - (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL), - pp); + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); #ifdef CONFIG_IP_VS_IPV6 else - ip_vs_proc_conn(¶m, flags, state, s->v6.protocol, af, + ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, ntohl(s->v6.timeout), ntohl(s->v6.fwmark), - (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL), - pp); + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); #endif return 0; /* Error exit */ @@ -1083,7 +1086,8 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end) * ip_vs_conn entries. * Handles Version 0 & 1 */ -static void ip_vs_process_message(__u8 *buffer, const size_t buflen) +static void ip_vs_process_message(struct net *net, __u8 *buffer, + const size_t buflen) { struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; @@ -1136,7 +1140,8 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen) return; } /* Process a single sync_conn */ - if ((retc=ip_vs_proc_sync_conn(p, msg_end)) < 0) { + retc = ip_vs_proc_sync_conn(net, p, msg_end); + if (retc < 0) { IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", retc); return; @@ -1146,7 +1151,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen) } } else { /* Old type of message */ - ip_vs_process_message_v0(buffer, buflen); + ip_vs_process_message_v0(net, buffer, buflen); return; } } @@ -1500,7 +1505,7 @@ static int sync_thread_backup(void *data) /* disable bottom half, because it accesses the data shared by softirq while getting/creating conns */ local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); + ip_vs_process_message(&init_net, tinfo->buf, len); local_bh_enable(); } } From ab8a5e8408c3df2d654611bffc3aaf04f418b266 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:53 +0100 Subject: [PATCH 45/80] IPVS: netns awareness to ip_vs_app All variables moved to struct ipvs, most external changes fixed (i.e. init_net removed) in ip_vs_protocol param struct net *net added to: - register_app() - unregister_app() This affected almost all proto_xxx.c files Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 12 ++--- include/net/netns/ip_vs.h | 5 ++ net/netfilter/ipvs/ip_vs_app.c | 73 ++++++++++++++++----------- net/netfilter/ipvs/ip_vs_ftp.c | 8 +-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 12 ++--- net/netfilter/ipvs/ip_vs_proto_tcp.c | 12 ++--- net/netfilter/ipvs/ip_vs_proto_udp.c | 12 ++--- 7 files changed, 76 insertions(+), 58 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index cc6ae621a9b5..0cdd8ce454c2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -402,9 +402,9 @@ struct ip_vs_protocol { const struct sk_buff *skb, struct ip_vs_proto_data *pd); - int (*register_app)(struct ip_vs_app *inc); + int (*register_app)(struct net *net, struct ip_vs_app *inc); - void (*unregister_app)(struct ip_vs_app *inc); + void (*unregister_app)(struct net *net, struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp); @@ -871,12 +871,12 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 -extern int register_ip_vs_app(struct ip_vs_app *app); -extern void unregister_ip_vs_app(struct ip_vs_app *app); +extern int register_ip_vs_app(struct net *net, struct ip_vs_app *app); +extern void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app); extern int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); extern void ip_vs_unbind_app(struct ip_vs_conn *cp); -extern int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port); +extern int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, + __u16 proto, __u16 port); extern int ip_vs_app_inc_get(struct ip_vs_app *inc); extern void ip_vs_app_inc_put(struct ip_vs_app *inc); diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 58bd3fd85a97..03f7fe1bede6 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -28,6 +28,11 @@ struct netns_ipvs { #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct list_head rs_table[IP_VS_RTAB_SIZE]; + /* ip_vs_app */ + struct list_head app_list; + struct mutex app_mutex; + struct lock_class_key app_key; /* mutex debuging */ + /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 40b09ccc4896..286f46594e0e 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - /* * Get an ip_vs_app object */ @@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) * Allocate/initialize app incarnation and register it in proto apps. */ static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { struct ip_vs_protocol *pp; struct ip_vs_app *inc; @@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) } } - ret = pp->register_app(inc); + ret = pp->register_app(net, inc); if (ret) goto out; @@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) * Release app incarnation */ static void -ip_vs_app_inc_release(struct ip_vs_app *inc) +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(inc); + pp->unregister_app(net, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); @@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { + struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); - result = ip_vs_app_inc_new(app, proto, port); + result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); return result; } @@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) /* * ip_vs_app registration routine */ -int register_ip_vs_app(struct ip_vs_app *app) +int register_ip_vs_app(struct net *net, struct ip_vs_app *app) { + struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); - list_add(&app->a_list, &ip_vs_app_list); + list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); return 0; } @@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app) * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services */ -void unregister_ip_vs_app(struct ip_vs_app *app) +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); + ip_vs_app_inc_release(net, inc); } list_del(&app->a_list); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app) /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) { return pp->app_conn_bind(cp); } @@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) * /proc/net/ip_vs_app entry function */ -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) { struct ip_vs_app *app, *inc; - list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(app, &ipvs->app_list, a_list) { list_for_each_entry(inc, &app->incs_list, a_list) { if (pos-- == 0) return inc; @@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos) static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) { - mutex_lock(&__ip_vs_app_mutex); + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; + mutex_lock(&ipvs->app_mutex); + + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_app *inc, *app; struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); + return ip_vs_app_idx(ipvs, 0); inc = v; app = inc->app; @@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) return list_entry(e, struct ip_vs_app, a_list); /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { app = list_entry(e, struct ip_vs_app, a_list); list_for_each_entry(inc, &app->incs_list, a_list) { return inc; @@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - mutex_unlock(&__ip_vs_app_mutex); + struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); + + mutex_unlock(&ipvs->app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = { static int ip_vs_app_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_app_seq_ops); + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ip_vs_app_fops = { @@ -571,9 +580,13 @@ static const struct file_operations ip_vs_app_fops = { static int __net_init __ip_vs_app_init(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); + if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + INIT_LIST_HEAD(&ipvs->app_list); + __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index b38ae941f677..77b0036dcb73 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -414,14 +414,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; - ret = register_ip_vs_app(app); + ret = register_ip_vs_app(net, app); if (ret) return ret; for (i=0; iprotocol, ports[i]); + ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); if (ret) break; pr_info("%s: loaded support on port[%d] = %d\n", @@ -429,7 +429,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) } if (ret) - unregister_ip_vs_app(app); + unregister_ip_vs_app(net, app); return ret; } @@ -443,7 +443,7 @@ static void __ip_vs_ftp_exit(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return; - unregister_ip_vs_app(app); + unregister_ip_vs_app(net, app); } static struct pernet_operations ip_vs_ftp_ops = { diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 0f14f793318a..569e77bf08c4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -1016,14 +1016,14 @@ static inline __u16 sctp_app_hashkey(__be16 port) & SCTP_APP_TAB_MASK; } -static int sctp_register_app(struct ip_vs_app *inc) +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(&init_net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); hash = sctp_app_hashkey(port); @@ -1042,10 +1042,10 @@ static int sctp_register_app(struct ip_vs_app *inc) return ret; } -static void sctp_unregister_app(struct ip_vs_app *inc) +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); spin_lock_bh(&ipvs->sctp_app_lock); atomic_dec(&pd->appcnt); diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 290b3803d8ce..757aaaf083bb 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -577,14 +577,14 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct ip_vs_app *inc) +static int tcp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(&init_net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); hash = tcp_app_hashkey(port); @@ -605,10 +605,10 @@ static int tcp_register_app(struct ip_vs_app *inc) static void -tcp_unregister_app(struct ip_vs_app *inc) +tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); spin_lock_bh(&ipvs->tcp_app_lock); atomic_dec(&pd->appcnt); diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 3719837a8fdc..1dc394100fa8 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -353,14 +353,14 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct ip_vs_app *inc) +static int udp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(&init_net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); hash = udp_app_hashkey(port); @@ -382,10 +382,10 @@ static int udp_register_app(struct ip_vs_app *inc) static void -udp_unregister_app(struct ip_vs_app *inc) +udp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_UDP); - struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(net); spin_lock_bh(&ipvs->udp_app_lock); atomic_dec(&pd->appcnt); From 29c2026fd4980c144d9c746dc1565060f08e5796 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:54 +0100 Subject: [PATCH 46/80] IPVS: netns awareness to ip_vs_est All variables moved to struct ipvs, most external changes fixed (i.e. init_net removed) *v3 timer per ns instead of a common timer in estimator. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 +- include/net/netns/ip_vs.h | 4 ++ net/netfilter/ipvs/ip_vs_ctl.c | 20 ++++---- net/netfilter/ipvs/ip_vs_est.c | 86 +++++++++++++++++++--------------- 4 files changed, 64 insertions(+), 50 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0cdd8ce454c2..c08927bb1728 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1004,8 +1004,8 @@ extern void ip_vs_sync_cleanup(void); */ extern int ip_vs_estimator_init(void); extern void ip_vs_estimator_cleanup(void); -extern void ip_vs_new_estimator(struct ip_vs_stats *stats); -extern void ip_vs_kill_estimator(struct ip_vs_stats *stats); +extern void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats); +extern void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats); extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); /* diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 03f7fe1bede6..db0240198339 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -70,6 +70,10 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + /* ip_vs_est */ + struct list_head est_list; /* estimator list */ + spinlock_t est_lock; + struct timer_list est_timer; /* Estimation timer */ }; #endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 88474f1e828a..c89beb8eafbb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -816,7 +816,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock(&dest->dst_lock); if (add) - ip_vs_new_estimator(&dest->stats); + ip_vs_new_estimator(svc->net, &dest->stats); write_lock_bh(&__ip_vs_svc_lock); @@ -1009,9 +1009,9 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) { - ip_vs_kill_estimator(&dest->stats); + ip_vs_kill_estimator(net, &dest->stats); /* * Remove it from the d-linked list with the real services. @@ -1080,6 +1080,7 @@ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; + struct net *net = svc->net; __be16 dport = udest->port; EnterFunction(2); @@ -1108,7 +1109,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(dest); + __ip_vs_del_dest(net, dest); LeaveFunction(2); @@ -1197,7 +1198,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, else if (svc->port == 0) atomic_inc(&ip_vs_nullsvc_counter); - ip_vs_new_estimator(&svc->stats); + ip_vs_new_estimator(net, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1345,7 +1346,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) if (svc->af == AF_INET) ip_vs_num_services--; - ip_vs_kill_estimator(&svc->stats); + ip_vs_kill_estimator(svc->net, &svc->stats); /* Unbind scheduler */ old_sched = svc->scheduler; @@ -1368,7 +1369,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); + __ip_vs_del_dest(svc->net, dest); } /* @@ -3460,7 +3461,7 @@ int __net_init __ip_vs_control_init(struct net *net) vs_vars); if (sysctl_header == NULL) goto err_reg; - ip_vs_new_estimator(&ip_vs_stats); + ip_vs_new_estimator(net, &ip_vs_stats); return 0; err_reg: @@ -3472,7 +3473,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return; - ip_vs_kill_estimator(&ip_vs_stats); + ip_vs_kill_estimator(net, &ip_vs_stats); unregister_net_sysctl_table(sysctl_header); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); @@ -3536,7 +3537,6 @@ void ip_vs_control_cleanup(void) ip_vs_trash_cleanup(); cancel_delayed_work_sync(&defense_work); cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 7417a0c1408b..07d839bef537 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -8,8 +8,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: - * + * Changes: Hans Schillstrom + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" @@ -48,12 +52,6 @@ */ -static void estimation_timer(unsigned long arg); - -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); - static void estimation_timer(unsigned long arg) { struct ip_vs_estimator *e; @@ -62,9 +60,12 @@ static void estimation_timer(unsigned long arg) u32 n_inpkts, n_outpkts; u64 n_inbytes, n_outbytes; u32 rate; + struct net *net = (struct net *)arg; + struct netns_ipvs *ipvs; - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { + ipvs = net_ipvs(net); + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); @@ -75,38 +76,39 @@ static void estimation_timer(unsigned long arg) n_outbytes = s->ustats.outbytes; /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; + rate = (n_conns - e->last_conns) << 9; e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; + e->cps += ((long)rate - (long)e->cps) >> 2; + s->ustats.cps = (e->cps + 0x1FF) >> 10; - rate = (n_inpkts - e->last_inpkts)<<9; + rate = (n_inpkts - e->last_inpkts) << 9; e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; + e->inpps += ((long)rate - (long)e->inpps) >> 2; + s->ustats.inpps = (e->inpps + 0x1FF) >> 10; - rate = (n_outpkts - e->last_outpkts)<<9; + rate = (n_outpkts - e->last_outpkts) << 9; e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; + e->outpps += ((long)rate - (long)e->outpps) >> 2; + s->ustats.outpps = (e->outpps + 0x1FF) >> 10; - rate = (n_inbytes - e->last_inbytes)<<4; + rate = (n_inbytes - e->last_inbytes) << 4; e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; + e->inbps += ((long)rate - (long)e->inbps) >> 2; + s->ustats.inbps = (e->inbps + 0xF) >> 5; - rate = (n_outbytes - e->last_outbytes)<<4; + rate = (n_outbytes - e->last_outbytes) << 4; e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; + e->outbps += ((long)rate - (long)e->outbps) >> 2; + s->ustats.outbps = (e->outbps + 0xF) >> 5; spin_unlock(&s->lock); } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_new_estimator(struct ip_vs_stats *stats) +void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); @@ -126,18 +128,19 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) est->last_outbytes = stats->ustats.outbytes; est->outbps = stats->ustats.outbps<<5; - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_kill_estimator(struct ip_vs_stats *stats) +void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; - spin_lock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); list_del(&est->list); - spin_unlock_bh(&est_lock); + spin_unlock_bh(&ipvs->est_lock); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -159,14 +162,25 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) static int __net_init __ip_vs_estimator_init(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); + if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } +static void __net_exit __ip_vs_estimator_exit(struct net *net) +{ + del_timer_sync(&net_ipvs(net)->est_timer); +} static struct pernet_operations ip_vs_app_ops = { .init = __ip_vs_estimator_init, + .exit = __ip_vs_estimator_exit, }; int __init ip_vs_estimator_init(void) @@ -174,14 +188,10 @@ int __init ip_vs_estimator_init(void) int rv; rv = register_pernet_subsys(&ip_vs_app_ops); - if (rv < 0) - return rv; - mod_timer(&est_timer, jiffies + 2 * HZ); return rv; } void ip_vs_estimator_cleanup(void) { - del_timer_sync(&est_timer); unregister_pernet_subsys(&ip_vs_app_ops); } From f131315fa272d337dfca7dad2f033ff5296dad65 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:55 +0100 Subject: [PATCH 47/80] IPVS: netns awareness to ip_vs_sync All global variables moved to struct ipvs, most external changes fixed (i.e. init_net removed) in sync_buf create + 4 replaced by sizeof(struct..) Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 14 +- include/net/netns/ip_vs.h | 16 ++ net/netfilter/ipvs/ip_vs_core.c | 15 +- net/netfilter/ipvs/ip_vs_ctl.c | 52 +++-- net/netfilter/ipvs/ip_vs_sync.c | 334 +++++++++++++++++--------------- 5 files changed, 240 insertions(+), 191 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index c08927bb1728..4265b5e00c94 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -958,7 +958,7 @@ extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; extern int sysctl_ip_vs_sync_ver; -extern void ip_vs_sync_switch_mode(int mode); +extern void ip_vs_sync_switch_mode(struct net *net, int mode); extern struct ip_vs_service * ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); @@ -987,14 +987,10 @@ extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); * IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ -extern volatile int ip_vs_sync_state; -extern volatile int ip_vs_master_syncid; -extern volatile int ip_vs_backup_syncid; -extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); -extern int stop_sync_thread(int state); -extern void ip_vs_sync_conn(struct ip_vs_conn *cp); +extern int start_sync_thread(struct net *net, int state, char *mcast_ifn, + __u8 syncid); +extern int stop_sync_thread(struct net *net, int state); +extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp); extern int ip_vs_sync_init(void); extern void ip_vs_sync_cleanup(void); diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index db0240198339..aba78f3c8341 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -74,6 +74,22 @@ struct netns_ipvs { struct list_head est_list; /* estimator list */ spinlock_t est_lock; struct timer_list est_timer; /* Estimation timer */ + /* ip_vs_sync */ + struct list_head sync_queue; + spinlock_t sync_lock; + struct ip_vs_sync_buff *sync_buff; + spinlock_t sync_buff_lock; + struct sockaddr_in sync_mcast_addr; + struct task_struct *master_thread; + struct task_struct *backup_thread; + int send_mesg_maxlen; + int recv_mesg_maxlen; + volatile int sync_state; + volatile int master_syncid; + volatile int backup_syncid; + /* multicast interface name */ + char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; #endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 9317affc5ea1..5531d569aa5e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1471,12 +1471,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) static unsigned int ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net = NULL; + struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, restart, pkts; + struct netns_ipvs *ipvs; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -1556,7 +1557,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - + net = skb_net(skb); + ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1589,12 +1591,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * * For ONE_PKT let ip_vs_sync_conn() do the filter work. */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_ip_vs_sync_threshold[0]; else pkts = atomic_add_return(1, &cp->in_pkts); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && (pkts % sysctl_ip_vs_sync_threshold[1] @@ -1603,13 +1606,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ((cp->state == IP_VS_SCTP_S_CLOSED) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(cp); + ip_vs_sync_conn(net, cp); goto out; } } /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && (pkts % sysctl_ip_vs_sync_threshold[1] @@ -1619,7 +1622,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) (cp->state == IP_VS_TCP_S_CLOSE) || (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); + ip_vs_sync_conn(net, cp); out: cp->old_state = cp->state; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c89beb8eafbb..03f86312b4bb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1559,7 +1559,8 @@ proc_do_sync_mode(ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - ip_vs_sync_switch_mode(val); + struct net *net = current->nsproxy->net_ns; + ip_vs_sync_switch_mode(net, val); } } return rc; @@ -2174,11 +2175,12 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_unlock; } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); goto out_unlock; } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); + ret = stop_sync_thread(net, dm->state); goto out_unlock; } @@ -2424,6 +2426,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); if (!capable(CAP_NET_ADMIN)) @@ -2546,15 +2549,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -3061,20 +3066,23 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb_net(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3086,24 +3094,26 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, return skb->len; } -static int ip_vs_genl_new_daemon(struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); } -static int ip_vs_genl_del_daemon(struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) { if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); } static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) @@ -3159,9 +3169,9 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); + ret = ip_vs_genl_new_daemon(net, daemon_attrs); else - ret = ip_vs_genl_del_daemon(daemon_attrs); + ret = ip_vs_genl_del_daemon(net, daemon_attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6831e8fac8db..c29e73d686fb 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -192,6 +192,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { + struct net *net; struct socket *sock; char *buf; }; @@ -259,10 +260,6 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -273,28 +270,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - /* multicast addr */ static struct sockaddr_in mcast_addr = { .sin_family = AF_INET, @@ -324,20 +299,20 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) put_unaligned_be32(ho->previous_delta, &no->previous_delta); } -static inline struct ip_vs_sync_buff *sb_dequeue(void) +static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ipvs->sync_queue)) { sb = NULL; } else { - sb = list_entry(ip_vs_sync_queue.next, + sb = list_entry(ipvs->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); } - spin_unlock_bh(&ip_vs_sync_lock); + spin_unlock_bh(&ipvs->sync_lock); return sb; } @@ -345,25 +320,27 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void) /* * Create a new sync buffer for Version 1 proto. */ -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->syncid = ipvs->master_syncid; sb->mesg->size = sizeof(struct ip_vs_sync_mesg); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; return sb; @@ -375,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +static inline void sb_queue_tail(struct netns_ipvs *ipvs) { - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); + struct ip_vs_sync_buff *sb = ipvs->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ipvs->sync_queue); else ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); + spin_unlock(&ipvs->sync_lock); } /* @@ -390,18 +369,18 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; + spin_lock_bh(&ipvs->sync_buff_lock); + if (ipvs->sync_buff && (time == 0 || + time_before(jiffies - ipvs->sync_buff->firstuse, time))) { + sb = ipvs->sync_buff; + ipvs->sync_buff = NULL; } else sb = NULL; - spin_unlock_bh(&curr_sb_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } @@ -409,33 +388,37 @@ get_curr_sync_buff(unsigned long time) * Switch mode from sending version 0 or 1 * - must handle sync_buf */ -void ip_vs_sync_switch_mode(int mode) { +void ip_vs_sync_switch_mode(struct net *net, int mode) +{ + struct netns_ipvs *ipvs = net_ipvs(net); - if (!ip_vs_sync_state & IP_VS_STATE_MASTER) + if (!ipvs->sync_state & IP_VS_STATE_MASTER) return; - if (mode == sysctl_ip_vs_sync_ver || !curr_sb) + if (mode == sysctl_ip_vs_sync_ver || !ipvs->sync_buff) return; - spin_lock_bh(&curr_sb_lock); + spin_lock_bh(&ipvs->sync_buff_lock); /* Buffer empty ? then let buf_create do the job */ - if ( curr_sb->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { - kfree(curr_sb); - curr_sb = NULL; + if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { + kfree(ipvs->sync_buff); + ipvs->sync_buff = NULL; } else { - spin_lock_bh(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&curr_sb->list, &ip_vs_sync_queue); + spin_lock_bh(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&ipvs->sync_buff->list, + &ipvs->sync_queue); else - ip_vs_sync_buff_release(curr_sb); - spin_unlock_bh(&ip_vs_sync_lock); + ip_vs_sync_buff_release(ipvs->sync_buff); + spin_unlock_bh(&ipvs->sync_lock); } - spin_unlock_bh(&curr_sb_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); } /* * Create a new sync buffer for Version 0 proto. */ -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void) +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -443,16 +426,17 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ip_vs_master_syncid; - mesg->size = 4; - sb->head = (unsigned char *)mesg + 4; - sb->end = (unsigned char *)mesg + sync_send_mesg_maxlen; + mesg->syncid = ipvs->master_syncid; + mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; return sb; } @@ -461,8 +445,9 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void) * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) +void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; int len; @@ -473,10 +458,12 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create_v0())) { - spin_unlock(&curr_sb_lock); + spin_lock(&ipvs->sync_buff_lock); + if (!ipvs->sync_buff) { + ipvs->sync_buff = + ip_vs_sync_buff_create_v0(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -484,8 +471,8 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = (struct ip_vs_sync_mesg_v0 *)curr_sb->mesg; - s = (struct ip_vs_sync_conn_v0 *)curr_sb->head; + m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; + s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; /* copy members */ s->reserved = 0; @@ -506,18 +493,18 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) m->nr_conns++; m->size += len; - curr_sb->head += len; + ipvs->sync_buff->head += len; /* check if there is a space for next one */ - if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; + if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; } - spin_unlock(&curr_sb_lock); + spin_unlock(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ if (cp->control) - ip_vs_sync_conn(cp->control); + ip_vs_sync_conn(net, cp->control); } /* @@ -525,8 +512,9 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp) * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; __u8 *p; @@ -534,7 +522,7 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) /* Handle old version of the protocol */ if (sysctl_ip_vs_sync_ver == 0) { - ip_vs_sync_conn_v0(cp); + ip_vs_sync_conn_v0(net, cp); return; } /* Do not sync ONE PACKET */ @@ -551,7 +539,7 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); } - spin_lock(&curr_sb_lock); + spin_lock(&ipvs->sync_buff_lock); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -570,26 +558,27 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) /* check if there is a space for this one */ pad = 0; - if (curr_sb) { - pad = (4 - (size_t)curr_sb->head) & 3; - if (curr_sb->head + len + pad > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; + if (ipvs->sync_buff) { + pad = (4 - (size_t)ipvs->sync_buff->head) & 3; + if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; pad = 0; } } - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); + if (!ipvs->sync_buff) { + ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } } - m = curr_sb->mesg; - p = curr_sb->head; - curr_sb->head += pad + len; + m = ipvs->sync_buff->mesg; + p = ipvs->sync_buff->head; + ipvs->sync_buff->head += pad + len; m->size += pad + len; /* Add ev. padding from prev. sync_conn */ while (pad--) @@ -647,7 +636,7 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) } } - spin_unlock(&curr_sb_lock); + spin_unlock(&ipvs->sync_buff_lock); control: /* synchronize its controller if it has */ @@ -699,7 +688,8 @@ ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc, buff[pe_name_len]=0; p->pe = __ip_vs_pe_getbyname(buff); if (!p->pe) { - IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff); + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); return 1; } } else { @@ -748,7 +738,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * If it is not found the connection will remain unbound * but still handled. */ - dest = ip_vs_find_dest(&init_net, type, daddr, dport, param->vaddr, + dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark); /* Set the approprite ativity flag */ @@ -1089,6 +1079,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) static void ip_vs_process_message(struct net *net, __u8 *buffer, const size_t buflen) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; int i, nr_conns; @@ -1105,7 +1096,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) { + if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1190,8 +1181,10 @@ static int set_mcast_if(struct sock *sk, char *ifname) { struct net_device *dev; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -1210,30 +1203,33 @@ static int set_mcast_if(struct sock *sk, char *ifname) * Set the maximum length of sync message according to the * specified interface's MTU. */ -static int set_sync_mesg_maxlen(int sync_state) +static int set_sync_mesg_maxlen(struct net *net, int sync_state) { + struct netns_ipvs *ipvs = net_ipvs(net); struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); + if (!dev) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); + "message %d.\n", ipvs->send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); + if (!dev) return -ENODEV; - sync_recv_mesg_maxlen = dev->mtu - + ipvs->recv_mesg_maxlen = dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr); IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); + "message %d.\n", ipvs->recv_mesg_maxlen); } return 0; @@ -1248,6 +1244,7 @@ static int set_sync_mesg_maxlen(int sync_state) static int join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) { + struct net *net = sock_net(sk); struct ip_mreqn mreq; struct net_device *dev; int ret; @@ -1255,7 +1252,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -1272,11 +1270,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) static int bind_mcastif_addr(struct socket *sock, char *ifname) { + struct net *net = sock_net(sock->sk); struct net_device *dev; __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); @@ -1298,8 +1298,9 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket * make_send_sock(void) +static struct socket *make_send_sock(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); struct socket *sock; int result; @@ -1310,7 +1311,7 @@ static struct socket * make_send_sock(void) return ERR_PTR(result); } - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; @@ -1319,7 +1320,7 @@ static struct socket * make_send_sock(void) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; @@ -1343,8 +1344,9 @@ static struct socket * make_send_sock(void) /* * Set up receiving multicast socket over UDP */ -static struct socket * make_receive_sock(void) +static struct socket *make_receive_sock(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); struct socket *sock; int result; @@ -1368,7 +1370,7 @@ static struct socket * make_receive_sock(void) /* join the multicast group */ result = join_mcast_group(sock->sk, (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); + ipvs->backup_mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1439,20 +1441,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); + ipvs->master_mcast_ifn, ipvs->master_syncid); while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { + while ((sb = sb_dequeue(ipvs))) { ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); + /* check if entries stay in ipvs->sync_buff for 2 seconds */ + sb = get_curr_sync_buff(ipvs, 2 * HZ); if (sb) { ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); @@ -1462,14 +1465,13 @@ static int sync_thread_master(void *data) } /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { + while ((sb = sb_dequeue(ipvs))) ip_vs_sync_buff_release(sb); - } /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { + sb = get_curr_sync_buff(ipvs, 0); + if (sb) ip_vs_sync_buff_release(sb); - } /* release the sending multicast socket */ sock_release(tinfo->sock); @@ -1482,11 +1484,12 @@ static int sync_thread_master(void *data) static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + ipvs->backup_mcast_ifn, ipvs->backup_syncid); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1496,7 +1499,7 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); + ipvs->recv_mesg_maxlen); if (len <= 0) { pr_err("receiving message error\n"); break; @@ -1505,7 +1508,7 @@ static int sync_thread_backup(void *data) /* disable bottom half, because it accesses the data shared by softirq while getting/creating conns */ local_bh_disable(); - ip_vs_process_message(&init_net, tinfo->buf, len); + ip_vs_process_message(tinfo->net, tinfo->buf, len); local_bh_enable(); } } @@ -1519,11 +1522,12 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **realtask, *task; struct socket *sock; + struct netns_ipvs *ipvs = net_ipvs(net); char *name, *buf = NULL; int (*threadfn)(void *data); int result = -ENOMEM; @@ -1533,27 +1537,27 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) sizeof(struct ip_vs_sync_conn_v0)); if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) + if (ipvs->master_thread) return -EEXIST; - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; + strlcpy(ipvs->master_mcast_ifn, mcast_ifn, + sizeof(ipvs->master_mcast_ifn)); + ipvs->master_syncid = syncid; + realtask = &ipvs->master_thread; + name = "ipvs_master:%d"; threadfn = sync_thread_master; - sock = make_send_sock(); + sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) + if (ipvs->backup_thread) return -EEXIST; - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; + strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, + sizeof(ipvs->backup_mcast_ifn)); + ipvs->backup_syncid = syncid; + realtask = &ipvs->backup_thread; + name = "ipvs_backup:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(); + sock = make_receive_sock(net); } else { return -EINVAL; } @@ -1563,9 +1567,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) goto out; } - set_sync_mesg_maxlen(state); + set_sync_mesg_maxlen(net, state); if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); if (!buf) goto outsocket; } @@ -1574,10 +1578,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) if (!tinfo) goto outbuf; + tinfo->net = net; tinfo->sock = sock; tinfo->buf = buf; - task = kthread_run(threadfn, tinfo, name); + task = kthread_run(threadfn, tinfo, name, ipvs->gen); if (IS_ERR(task)) { result = PTR_ERR(task); goto outtinfo; @@ -1585,7 +1590,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) /* mark as active */ *realtask = task; - ip_vs_sync_state |= state; + ipvs->sync_state |= state; /* increase the module use count */ ip_vs_use_count_inc(); @@ -1603,16 +1608,18 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) } -int stop_sync_thread(int state) +int stop_sync_thread(struct net *net, int state) { + struct netns_ipvs *ipvs = net_ipvs(net); + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) + if (!ipvs->master_thread) return -ESRCH; pr_info("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); + task_pid_nr(ipvs->master_thread)); /* * The lock synchronizes with sb_queue_tail(), so that we don't @@ -1620,21 +1627,21 @@ int stop_sync_thread(int state) * progress of stopping the master sync daemon. */ - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; + spin_lock_bh(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ipvs->sync_lock); + kthread_stop(ipvs->master_thread); + ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) + if (!ipvs->backup_thread) return -ESRCH; pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); + task_pid_nr(ipvs->backup_thread)); - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(ipvs->backup_thread); + ipvs->backup_thread = NULL; } else { return -EINVAL; } @@ -1650,12 +1657,29 @@ int stop_sync_thread(int state) */ static int __net_init __ip_vs_sync_init(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return -EPERM; + + INIT_LIST_HEAD(&ipvs->sync_queue); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + + ipvs->sync_mcast_addr.sin_family = AF_INET; + ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); + ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); return 0; } static void __ip_vs_sync_cleanup(struct net *net) { + if (!net_eq(net, &init_net)) /* netns not enabled yet */ + return; + stop_sync_thread(net, IP_VS_STATE_MASTER); + stop_sync_thread(net, IP_VS_STATE_BACKUP); } + static struct pernet_operations ipvs_sync_ops = { .init = __ip_vs_sync_init, .exit = __ip_vs_sync_cleanup, From b17fc9963f837ef1acfe36e193108fb16ed58647 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:56 +0100 Subject: [PATCH 48/80] IPVS: netns, ip_vs_stats and its procfs The statistic counter locks for every packet are now removed, and that statistic is now per CPU, i.e. no locks needed. However summing is made in ip_vs_est into ip_vs_stats struct which is moved to ipvs struc. procfs, ip_vs_stats now have a "per cpu" count and a grand total. A new function seq_file_single_net() in ip_vs.h created for handling of single_open_net() since it does not place net ptr in a struct, like others. /var/lib/lxc # cat /proc/net/ip_vs_stats_percpu Total Incoming Outgoing Incoming Outgoing CPU Conns Packets Packets Bytes Bytes 0 0 3 1 9D 34 1 0 1 2 49 70 2 0 1 2 34 76 3 1 2 2 70 74 ~ 1 7 7 18A 18E Conns/s Pkts/s Pkts/s Bytes/s Bytes/s 0 0 0 0 0 *v3 ip_vs_stats reamains as before, instead ip_vs_stats_percpu is added. u64 seq lock added *v4 Bug correction inbytes and outbytes as own vars.. per_cpu counter for all stats now as suggested by Julian. [horms@verge.net.au: removed whitespace-change-only hunk] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 51 +++++++++++- include/net/netns/ip_vs.h | 4 + net/netfilter/ipvs/ip_vs_core.c | 81 +++++++++++-------- net/netfilter/ipvs/ip_vs_ctl.c | 134 +++++++++++++++++++++++++++----- net/netfilter/ipvs/ip_vs_est.c | 39 ++++++++++ 5 files changed, 252 insertions(+), 57 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4265b5e00c94..605d5db81a39 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -90,6 +90,18 @@ static inline struct net *skb_sknet(struct sk_buff *skb) return &init_net; #endif } +/* + * This one needed for single_open_net since net is stored directly in + * private not as a struct i.e. seq_file_net cant be used. + */ +static inline struct net *seq_file_single_net(struct seq_file *seq) +{ +#ifdef CONFIG_NET_NS + return (struct net *)seq->private; +#else + return &init_net; +#endif +} /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -320,6 +332,23 @@ struct ip_vs_seq { before last resized pkt */ }; +/* + * counters per cpu + */ +struct ip_vs_counters { + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ +}; +/* + * Stats per cpu + */ +struct ip_vs_cpu_stats { + struct ip_vs_counters ustats; + struct u64_stats_sync syncp; +}; /* * IPVS statistics objects @@ -341,12 +370,28 @@ struct ip_vs_estimator { }; struct ip_vs_stats { - struct ip_vs_stats_user ustats; /* statistics */ + struct ip_vs_stats_user ustats; /* statistics */ struct ip_vs_estimator est; /* estimator */ - - spinlock_t lock; /* spin lock */ + struct ip_vs_cpu_stats *cpustats; /* per cpu counters */ + spinlock_t lock; /* spin lock */ }; +/* + * Helper Macros for per cpu + * ipvs->tot_stats->ustats.count + */ +#define IPVS_STAT_INC(ipvs, count) \ + __this_cpu_inc((ipvs)->ustats->count) + +#define IPVS_STAT_ADD(ipvs, count, value) \ + do {\ + write_seqcount_begin(per_cpu_ptr((ipvs)->ustats_seq, \ + raw_smp_processor_id())); \ + __this_cpu_add((ipvs)->ustats->count, value); \ + write_seqcount_end(per_cpu_ptr((ipvs)->ustats_seq, \ + raw_smp_processor_id())); \ + } while (0) + struct dst_entry; struct iphdr; struct ip_vs_conn; diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index aba78f3c8341..bd1dad872178 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -61,6 +61,10 @@ struct netns_ipvs { struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; spinlock_t sctp_app_lock; #endif + /* ip_vs_ctl */ + struct ip_vs_stats *tot_stats; /* Statistics & est. */ + struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ + seqcount_t *ustats_seq; /* u64 read retry */ /* ip_vs_lblc */ int sysctl_lblc_expiration; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5531d569aa5e..7e6a2a046bf5 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -115,21 +115,28 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); + struct ip_vs_cpu_stats *s; - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -138,21 +145,28 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); + struct ip_vs_cpu_stats *s; - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -160,17 +174,17 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_cpu_stats *s; - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); + s = this_cpu_ptr(cp->dest->stats.cpustats); + s->ustats.conns++; - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.conns++; + + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.conns++; } @@ -1841,7 +1855,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { }, #endif }; - /* * Initialize IP Virtual Server netns mem. */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 03f86312b4bb..cbd58c60e1bf 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -257,8 +257,7 @@ static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); static void defense_work_handler(struct work_struct *work) { - struct net *net = &init_net; - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = net_ipvs(&init_net); update_defense_level(ipvs); if (atomic_read(&ip_vs_dropentry)) @@ -519,6 +518,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); kfree(svc); } } @@ -722,6 +722,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); kfree(dest); } } @@ -747,6 +748,7 @@ static void ip_vs_trash_cleanup(void) list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); kfree(dest); } } @@ -868,6 +870,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto err_alloc; + } dest->af = svc->af; dest->protocol = svc->protocol; @@ -891,6 +898,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, LeaveFunction(2); return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; } @@ -1037,6 +1048,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) and only one user context can update virtual service at a time, so the operation here is OK */ atomic_dec(&dest->svc->refcnt); + free_percpu(dest->stats.cpustats); kfree(dest); } else { IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " @@ -1163,6 +1175,11 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto out_err; + } /* I'm the first user of the service */ atomic_set(&svc->usecnt, 0); @@ -1212,6 +1229,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, *svc_p = svc; return 0; + out_err: if (svc != NULL) { ip_vs_unbind_scheduler(svc); @@ -1220,6 +1238,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ip_vs_app_inc_put(svc->inc); local_bh_enable(); } + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); kfree(svc); } ip_vs_scheduler_put(sched); @@ -1388,6 +1408,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); kfree(svc); } @@ -1499,7 +1520,7 @@ static int ip_vs_zero_all(struct net *net) } } - ip_vs_zero_stats(&ip_vs_stats); + ip_vs_zero_stats(net_ipvs(net)->tot_stats); return 0; } @@ -1989,13 +2010,11 @@ static const struct file_operations ip_vs_info_fops = { #endif -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - #ifdef CONFIG_PROC_FS static int ip_vs_stats_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -2003,22 +2022,22 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_printf(seq, " Conns Packets Packets Bytes Bytes\n"); - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); + spin_lock_bh(&tot_stats->lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns, + tot_stats->ustats.inpkts, tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); + tot_stats->ustats.cps, + tot_stats->ustats.inpps, + tot_stats->ustats.outpps, + tot_stats->ustats.inbps, + tot_stats->ustats.outbps); + spin_unlock_bh(&tot_stats->lock); return 0; } @@ -2036,6 +2055,59 @@ static const struct file_operations ip_vs_stats_fops = { .release = single_release, }; +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i); + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)u->ustats.inbytes, + (__u64)u->ustats.outbytes); + } + + spin_lock_bh(&tot_stats->lock); + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + tot_stats->ustats.cps, + tot_stats->ustats.inpps, + tot_stats->ustats.outpps, + tot_stats->ustats.inbps, + tot_stats->ustats.outbps); + spin_unlock_bh(&tot_stats->lock); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* @@ -3461,32 +3533,54 @@ int __net_init __ip_vs_control_init(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + /* procfs stats */ + ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); + if (ipvs->tot_stats == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto err_alloc; + } + spin_lock_init(&ipvs->tot_stats->lock); for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_LIST_HEAD(&ipvs->rs_table[idx]); proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + proc_net_fops_create(net, "ip_vs_stats_percpu", 0, + &ip_vs_stats_percpu_fops); sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars); if (sysctl_header == NULL) goto err_reg; - ip_vs_new_estimator(net, &ip_vs_stats); + ip_vs_new_estimator(net, ipvs->tot_stats); return 0; err_reg: + free_percpu(ipvs->cpustats); +err_alloc: + kfree(ipvs->tot_stats); return -ENOMEM; } static void __net_exit __ip_vs_control_cleanup(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); + if (!net_eq(net, &init_net)) /* netns not enabled yet */ return; - ip_vs_kill_estimator(net, &ip_vs_stats); + ip_vs_kill_estimator(net, ipvs->tot_stats); unregister_net_sysctl_table(sysctl_header); + proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); + free_percpu(ipvs->cpustats); + kfree(ipvs->tot_stats); } static struct pernet_operations ipvs_control_ops = { diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 07d839bef537..d13616b138cd 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -52,6 +52,43 @@ */ +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, + struct ip_vs_cpu_stats *stats) +{ + int i; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + __u64 inbytes, outbytes; + if (i) { + sum->conns += s->ustats.conns; + sum->inpkts += s->ustats.inpkts; + sum->outpkts += s->ustats.outpkts; + do { + start = u64_stats_fetch_begin_bh(&s->syncp); + inbytes = s->ustats.inbytes; + outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + sum->conns = s->ustats.conns; + sum->inpkts = s->ustats.inpkts; + sum->outpkts = s->ustats.outpkts; + do { + start = u64_stats_fetch_begin_bh(&s->syncp); + sum->inbytes = s->ustats.inbytes; + sum->outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + } + } +} + + static void estimation_timer(unsigned long arg) { struct ip_vs_estimator *e; @@ -64,10 +101,12 @@ static void estimation_timer(unsigned long arg) struct netns_ipvs *ipvs; ipvs = net_ipvs(net); + ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); spin_lock(&s->lock); n_conns = s->ustats.conns; n_inpkts = s->ustats.inpkts; From 6e67e586e7289c144d5a189d6e0fa7141d025746 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:57 +0100 Subject: [PATCH 49/80] IPVS: netns, connection hash got net as param. Connection hash table is now name space aware. i.e. net ptr >> 8 is xor:ed to the hash, and this is the first param to be compared. The net struct is 0xa40 in size ( a little bit smaller for 32 bit arch:s) and cache-line aligned, so a ptr >> 5 might be a more clever solution ? All lookups where net is compared uses net_eq() which returns 1 when netns is disabled, and the compiler seems to do something clever in that case. ip_vs_conn_fill_param() have *net as first param now. Three new inlines added to keep conn struct smaller when names space is disabled. - ip_vs_conn_net() - ip_vs_conn_net_set() - ip_vs_conn_net_eq() *v3 moved net compare to the end in "fast path" Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 53 ++++++++--- include/net/netns/ip_vs.h | 2 + net/netfilter/ipvs/ip_vs_conn.c | 112 +++++++++++++++--------- net/netfilter/ipvs/ip_vs_core.c | 15 ++-- net/netfilter/ipvs/ip_vs_ftp.c | 14 +-- net/netfilter/ipvs/ip_vs_nfct.c | 6 +- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 15 ++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_udp.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 13 ++- 11 files changed, 153 insertions(+), 83 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 605d5db81a39..f82c0ffdee74 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -477,6 +477,7 @@ extern struct ip_vs_proto_data *ip_vs_proto_data_get(struct net *net, unsigned short proto); struct ip_vs_conn_param { + struct net *net; const union nf_inet_addr *caddr; const union nf_inet_addr *vaddr; __be16 cport; @@ -494,17 +495,19 @@ struct ip_vs_conn_param { */ struct ip_vs_conn { struct list_head c_list; /* hashed list heads */ - +#ifdef CONFIG_NET_NS + struct net *net; /* Name space */ +#endif /* Protocol, addresses and port numbers */ - u16 af; /* address family */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ - volatile __u32 flags; /* status flags */ - __u32 fwmark; /* Fire wall mark from skb */ - __be16 cport; - __be16 vport; - __be16 dport; + u16 af; /* address family */ + __be16 cport; + __be16 vport; + __be16 dport; + __u32 fwmark; /* Fire wall mark from skb */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + union nf_inet_addr daddr; /* destination address */ + volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ /* counter and timer */ @@ -547,6 +550,33 @@ struct ip_vs_conn { __u8 pe_data_len; }; +/* + * To save some memory in conn table when name space is disabled. + */ +static inline struct net *ip_vs_conn_net(const struct ip_vs_conn *cp) +{ +#ifdef CONFIG_NET_NS + return cp->net; +#else + return &init_net; +#endif +} +static inline void ip_vs_conn_net_set(struct ip_vs_conn *cp, struct net *net) +{ +#ifdef CONFIG_NET_NS + cp->net = net; +#endif +} + +static inline int ip_vs_conn_net_eq(const struct ip_vs_conn *cp, + struct net *net) +{ +#ifdef CONFIG_NET_NS + return cp->net == net; +#else + return 1; +#endif +} /* * Extended internal versions of struct ip_vs_service_user and @@ -796,13 +826,14 @@ enum { IP_VS_DIR_LAST, }; -static inline void ip_vs_conn_fill_param(int af, int protocol, +static inline void ip_vs_conn_fill_param(struct net *net, int af, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { + p->net = net; p->af = af; p->protocol = protocol; p->caddr = caddr; diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index bd1dad872178..1acfb334e69b 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -66,6 +66,8 @@ struct netns_ipvs { struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ seqcount_t *ustats_seq; /* u64 read retry */ + /* ip_vs_conn */ + atomic_t conn_count; /* connection counter */ /* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index b2024c942345..0d5e4feabc1b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -64,9 +64,6 @@ static struct list_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - /* counter for no client port connections */ static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); @@ -76,7 +73,7 @@ static unsigned int ip_vs_conn_rnd __read_mostly; /* * Fine locking granularity for big connection hash table */ -#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_BITS 5 #define CT_LOCKARRAY_SIZE (1<>8)) & ip_vs_conn_tab_mask; #endif - return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & ip_vs_conn_tab_mask; + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -166,15 +163,15 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); } static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, - NULL, 0, &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); if (cp->pe) { p.pe = cp->pe; @@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) } /* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. * returns bool success. */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) @@ -269,11 +266,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && + p->cport == cp->cport && p->vport == cp->vport && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol) { + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -313,17 +311,18 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); if (pptr == NULL) return 1; if (likely(!inverse)) - ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], - &iph->daddr, pptr[1], p); + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); else - ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], - &iph->saddr, pptr[0], p); + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); return 0; } @@ -352,6 +351,8 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; if (p->pe_data && p->pe->ct_match) { if (p->pe == cp->pe && p->pe->ct_match(p, cp)) goto out; @@ -403,10 +404,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && + p->vport == cp->cport && p->cport == cp->dport && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && - p->vport == cp->cport && p->cport == cp->dport && - p->protocol == cp->protocol) { + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { /* HIT */ atomic_inc(&cp->refcnt); ret = cp; @@ -609,8 +611,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(&init_net, cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark); ip_vs_bind_dest(cp, dest); return dest; @@ -728,6 +730,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); cp->timeout = 60*HZ; @@ -770,7 +773,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); + atomic_dec(&ipvs->conn_count); kmem_cache_free(ip_vs_conn_cachep, cp); return; @@ -804,7 +807,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, p->protocol); + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { @@ -814,6 +819,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); @@ -844,7 +850,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); - atomic_inc(&ip_vs_conn_count); + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); @@ -886,17 +892,22 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, * /proc/net/ip_vs_conn entries */ #ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct list_head *l; +}; static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) { int idx; struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; + iter->l = &ip_vs_conn_tab[idx]; return cp; } } @@ -908,14 +919,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { - seq->private = NULL; + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; + struct ip_vs_iter_state *iter = seq->private; + struct list_head *e, *l = iter->l; int idx; ++*pos; @@ -932,18 +946,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; + iter->l = &ip_vs_conn_tab[idx]; return cp; } ct_read_unlock_bh(idx); } - seq->private = NULL; + iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) { - struct list_head *l = seq->private; + struct ip_vs_iter_state *iter = seq->private; + struct list_head *l = iter->l; if (l) ct_read_unlock_bh(l - ip_vs_conn_tab); @@ -957,9 +972,12 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; + if (!ip_vs_conn_net_eq(cp, net)) + return 0; if (cp->pe_data) { pe_data[0] = ' '; len = strlen(cp->pe->name); @@ -1004,7 +1022,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = { static int ip_vs_conn_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_fops = { @@ -1031,6 +1050,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -1067,7 +1090,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_sync_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_sync_fops = { @@ -1168,10 +1192,11 @@ void ip_vs_random_dropentry(void) /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(void) +static void ip_vs_conn_flush(struct net *net) { int idx; struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); flush_again: for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { @@ -1181,7 +1206,8 @@ static void ip_vs_conn_flush(void) ct_write_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - + if (!ip_vs_conn_net_eq(cp, net)) + continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); if (cp->control) { @@ -1194,7 +1220,7 @@ static void ip_vs_conn_flush(void) /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { + if (atomic_read(&ipvs->conn_count) != 0) { schedule(); goto flush_again; } @@ -1204,8 +1230,11 @@ static void ip_vs_conn_flush(void) */ int __net_init __ip_vs_conn_init(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); + if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + atomic_set(&ipvs->conn_count, 0); proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); @@ -1217,6 +1246,8 @@ static void __net_exit __ip_vs_conn_cleanup(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return; + /* flush all the connection entries first */ + ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); proc_net_remove(net, "ip_vs_conn_sync"); } @@ -1277,9 +1308,6 @@ int __init ip_vs_conn_init(void) void ip_vs_conn_cleanup(void) { unregister_pernet_subsys(&ipvs_conn_ops); - /* flush all the connection entries first */ - ip_vs_conn_flush(); - /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7e6a2a046bf5..7205b49c56c1 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -205,7 +205,8 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { - ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); + ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + vport, p); p->pe = svc->pe; if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); @@ -348,8 +349,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port, - &iph.daddr, dst_port, ¶m); + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, + src_port, &iph.daddr, dst_port, ¶m); cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { @@ -464,8 +465,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, - pptr[0], &iph.daddr, pptr[1], &p); + + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, + &iph.saddr, pptr[0], &iph.daddr, pptr[1], + &p); cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); @@ -532,7 +535,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->af, iph.protocol, + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &daddr, 0, diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 77b0036dcb73..6a04f9ab9d0d 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -198,13 +198,15 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, iph->protocol, - &from, port, &cp->caddr, 0, &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &from, port, + &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, + ip_vs_conn_fill_param(ip_vs_conn_net(cp), + AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); n_cp = ip_vs_conn_new(&p, &from, port, IP_VS_CONN_F_NO_CPORT | @@ -361,9 +363,9 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { n_cp = ip_vs_conn_new(&p, &cp->daddr, diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 4680647cd450..f454c80df0a7 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct nf_conntrack_tuple *orig, new_reply; struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); if (exp->tuple.src.l3num != PF_INET) return; @@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); @@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); /* Show what happens instead of calling nf_ct_kill() */ diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 28039cbfcff4..5b8eb8b12c3e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,15 +41,16 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 static void -ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, - int inverse, struct ip_vs_conn_param *p) +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) { if (likely(!inverse)) - ip_vs_conn_fill_param(af, IPPROTO_UDP, + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else - ip_vs_conn_fill_param(af, IPPROTO_UDP, + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } @@ -61,8 +62,9 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, { struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* @@ -89,8 +91,9 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, { struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 569e77bf08c4..550365a690c7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -1055,7 +1055,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) static int sctp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 757aaaf083bb..d8b3f9f15826 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -620,7 +620,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc) static int tcp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 1dc394100fa8..581157bbded5 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -396,7 +396,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc) static int udp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c29e73d686fb..f85e47daecc3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -660,21 +660,21 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) * fill_param used by version 1 */ static inline int -ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc, +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, struct ip_vs_conn_param *p, __u8 *pe_data, unsigned int pe_data_len, __u8 *pe_name, unsigned int pe_name_len) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - ip_vs_conn_fill_param(af, sc->v6.protocol, + ip_vs_conn_fill_param(net, af, sc->v6.protocol, (const union nf_inet_addr *)&sc->v6.caddr, sc->v6.cport, (const union nf_inet_addr *)&sc->v6.vaddr, sc->v6.vport, p); else #endif - ip_vs_conn_fill_param(af, sc->v4.protocol, + ip_vs_conn_fill_param(net, af, sc->v4.protocol, (const union nf_inet_addr *)&sc->v4.caddr, sc->v4.cport, (const union nf_inet_addr *)&sc->v4.vaddr, @@ -881,7 +881,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, } } - ip_vs_conn_fill_param(AF_INET, s->protocol, + ip_vs_conn_fill_param(net, AF_INET, s->protocol, (const union nf_inet_addr *)&s->caddr, s->cport, (const union nf_inet_addr *)&s->vaddr, @@ -1043,9 +1043,8 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) state = 0; } } - if (ip_vs_conn_fill_param_sync(af, s, ¶m, - pe_data, pe_data_len, - pe_name, pe_name_len)) { + if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { retc = 50; goto out; } From a0840e2e165a370ca24a59545e564e9881a55891 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:58 +0100 Subject: [PATCH 50/80] IPVS: netns, ip_vs_ctl local vars moved to ipvs struct. Moving global vars to ipvs struct, except for svc table lock. Next patch for ctl will be drop-rate handling. *v3 __ip_vs_mutex remains global ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 27 +- include/net/netns/ip_vs.h | 37 ++- net/netfilter/ipvs/ip_vs_conn.c | 7 +- net/netfilter/ipvs/ip_vs_core.c | 34 ++- net/netfilter/ipvs/ip_vs_ctl.c | 361 +++++++++++++------------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_udp.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 9 +- 9 files changed, 265 insertions(+), 216 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f82c0ffdee74..af9acf44e40a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -41,7 +41,7 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) * Get net ptr from skb in traffic cases * use skb_sknet when call is from userland (ioctl or netlink) */ -static inline struct net *skb_net(struct sk_buff *skb) +static inline struct net *skb_net(const struct sk_buff *skb) { #ifdef CONFIG_NET_NS #ifdef CONFIG_IP_VS_DEBUG @@ -69,7 +69,7 @@ static inline struct net *skb_net(struct sk_buff *skb) #endif } -static inline struct net *skb_sknet(struct sk_buff *skb) +static inline struct net *skb_sknet(const struct sk_buff *skb) { #ifdef CONFIG_NET_NS #ifdef CONFIG_IP_VS_DEBUG @@ -1023,13 +1023,6 @@ extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* * IPVS control data and functions (from ip_vs_ctl.c) */ -extern int sysctl_ip_vs_cache_bypass; -extern int sysctl_ip_vs_expire_nodest_conn; -extern int sysctl_ip_vs_expire_quiescent_template; -extern int sysctl_ip_vs_sync_threshold[2]; -extern int sysctl_ip_vs_nat_icmp_send; -extern int sysctl_ip_vs_conntrack; -extern int sysctl_ip_vs_snat_reroute; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; extern int sysctl_ip_vs_sync_ver; @@ -1119,11 +1112,13 @@ extern int ip_vs_icmp_xmit_v6 extern int ip_vs_drop_rate; extern int ip_vs_drop_counter; -static __inline__ int ip_vs_todrop(void) +static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { - if (!ip_vs_drop_rate) return 0; - if (--ip_vs_drop_counter > 0) return 0; - ip_vs_drop_counter = ip_vs_drop_rate; + if (!ipvs->drop_rate) + return 0; + if (--ipvs->drop_counter > 0) + return 0; + ipvs->drop_counter = ipvs->drop_rate; return 1; } @@ -1211,9 +1206,9 @@ static inline void ip_vs_notrack(struct sk_buff *skb) * Netfilter connection tracking * (from ip_vs_nfct.c) */ -static inline int ip_vs_conntrack_enabled(void) +static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { - return sysctl_ip_vs_conntrack; + return ipvs->sysctl_conntrack; } extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, @@ -1226,7 +1221,7 @@ extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); #else -static inline int ip_vs_conntrack_enabled(void) +static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { return 0; } diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 1acfb334e69b..c4b1abf258e4 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -61,13 +61,46 @@ struct netns_ipvs { struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; spinlock_t sctp_app_lock; #endif + /* ip_vs_conn */ + atomic_t conn_count; /* connection counter */ + /* ip_vs_ctl */ struct ip_vs_stats *tot_stats; /* Statistics & est. */ struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ seqcount_t *ustats_seq; /* u64 read retry */ - /* ip_vs_conn */ - atomic_t conn_count; /* connection counter */ + int num_services; /* no of virtual services */ + /* 1/rate drop and drop-entry variables */ + int drop_rate; + int drop_counter; + atomic_t dropentry; + /* locks in ctl.c */ + spinlock_t dropentry_lock; /* drop entry handling */ + spinlock_t droppacket_lock; /* drop packet handling */ + spinlock_t securetcp_lock; /* state and timeout tables */ + rwlock_t rs_lock; /* real services table */ + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ + struct lock_class_key ctl_key; /* ctl_mutex debuging */ + /* sys-ctl struct */ + struct ctl_table_header *sysctl_hdr; + struct ctl_table *sysctl_tbl; + /* sysctl variables */ + int sysctl_amemthresh; + int sysctl_am_droprate; + int sysctl_drop_entry; + int sysctl_drop_packet; + int sysctl_secure_tcp; +#ifdef CONFIG_IP_VS_NFCT + int sysctl_conntrack; +#endif + int sysctl_snat_reroute; + int sysctl_sync_ver; + int sysctl_cache_bypass; + int sysctl_expire_nodest_conn; + int sysctl_expire_quiescent_template; + int sysctl_sync_threshold[2]; + int sysctl_nat_icmp_send; + /* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 0d5e4feabc1b..5ba205a4d79c 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -686,13 +686,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); /* * Checking the dest server status. */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && + (ipvs->sysctl_expire_quiescent_template && (atomic_read(&dest->weight) == 0))) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " @@ -879,7 +880,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, * IP_VS_CONN_F_ONE_PACKET too. */ - if (ip_vs_conntrack_enabled()) + if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; /* Hash it in the ip_vs_conn_tab finally */ @@ -1198,7 +1199,7 @@ static void ip_vs_conn_flush(struct net *net) struct ip_vs_conn *cp; struct netns_ipvs *ipvs = net_ipvs(net); - flush_again: +flush_again: for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { /* * Lock is actually needed in this loop. diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7205b49c56c1..a7c59a722af3 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -499,6 +499,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd) { + struct netns_ipvs *ipvs; __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; int unicast; @@ -521,7 +522,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { + ipvs = net_ipvs(skb_net(skb)); + if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && @@ -733,6 +735,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl) { + struct netns_ipvs *ipvs; unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != 0) { @@ -754,6 +757,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (!skb_make_writable(skb, offset)) goto out; + ipvs = net_ipvs(skb_net(skb)); + #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); @@ -763,11 +768,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) goto out; } else #endif - if ((sysctl_ip_vs_snat_reroute || + if ((ipvs->sysctl_snat_reroute || skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) goto out; @@ -979,6 +984,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int ihl) { struct ip_vs_protocol *pp = pd->pp; + struct netns_ipvs *ipvs; IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); @@ -1014,13 +1020,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ + ipvs = net_ipvs(skb_net(skb)); + #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) goto drop; } else #endif - if ((sysctl_ip_vs_snat_reroute || + if ((ipvs->sysctl_snat_reroute || skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; @@ -1057,6 +1065,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + struct netns_ipvs *ipvs; EnterFunction(11); @@ -1131,10 +1140,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if the packet belongs to an existing entry */ cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); + ipvs = net_ipvs(net); if (likely(cp)) return handle_response(af, skb, pd, cp, iph.len); - if (sysctl_ip_vs_nat_icmp_send && + if (ipvs->sysctl_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { @@ -1580,7 +1590,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (sysctl_ip_vs_expire_nodest_conn) { + if (ipvs->sysctl_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } @@ -1610,15 +1620,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) - pkts = sysctl_ip_vs_sync_threshold[0]; + pkts = ipvs->sysctl_sync_threshold[0]; else pkts = atomic_add_return(1, &cp->in_pkts); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || + (pkts % ipvs->sysctl_sync_threshold[1] + == ipvs->sysctl_sync_threshold[0])) || (cp->old_state != cp->state && ((cp->state == IP_VS_SCTP_S_CLOSED) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || @@ -1632,8 +1642,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || + (pkts % ipvs->sysctl_sync_threshold[1] + == ipvs->sysctl_sync_threshold[0])) || ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && ((cp->state == IP_VS_TCP_S_FIN_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index cbd58c60e1bf..183ac18bded5 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -58,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex); /* lock for service table */ static DEFINE_RWLOCK(__ip_vs_svc_lock); -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_SPINLOCK(ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - /* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; -#ifdef CONFIG_IP_VS_NFCT -int sysctl_ip_vs_conntrack; -#endif -int sysctl_ip_vs_snat_reroute = 1; -int sysctl_ip_vs_sync_ver = 1; /* Default version of sync proto */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; @@ -142,73 +107,73 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < sysctl_ip_vs_amemthresh); + nomem = (availmem < ipvs->sysctl_amemthresh); local_bh_disable(); /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { case 0: - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; } else { - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; }; break; case 3: - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); break; } - spin_unlock(&__ip_vs_dropentry_lock); + spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { case 0: - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; break; case 1: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; } else { - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; } break; case 2: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; } break; case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } - spin_unlock(&__ip_vs_droppacket_lock); + spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ - spin_lock(&ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { case 0: if (old_secure_tcp >= 2) to_change = 0; @@ -217,7 +182,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) if (nomem) { if (old_secure_tcp < 2) to_change = 1; - sysctl_ip_vs_secure_tcp = 2; + ipvs->sysctl_secure_tcp = 2; } else { if (old_secure_tcp >= 2) to_change = 0; @@ -230,7 +195,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) } else { if (old_secure_tcp >= 2) to_change = 0; - sysctl_ip_vs_secure_tcp = 1; + ipvs->sysctl_secure_tcp = 1; } break; case 3: @@ -238,11 +203,11 @@ static void update_defense_level(struct netns_ipvs *ipvs) to_change = 1; break; } - old_secure_tcp = sysctl_ip_vs_secure_tcp; + old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, - sysctl_ip_vs_secure_tcp > 1); - spin_unlock(&ip_vs_securetcp_lock); + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } @@ -260,7 +225,7 @@ static void defense_work_handler(struct work_struct *work) struct netns_ipvs *ipvs = net_ipvs(&init_net); update_defense_level(ipvs); - if (atomic_read(&ip_vs_dropentry)) + if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(); schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); @@ -602,7 +567,7 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&__ip_vs_rs_lock); + read_lock(&ipvs->rs_lock); list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { if ((dest->af == af) && ip_vs_addr_equal(af, &dest->addr, daddr) @@ -610,11 +575,11 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, && ((dest->protocol == protocol) || dest->vfwmark)) { /* HIT */ - read_unlock(&__ip_vs_rs_lock); + read_unlock(&ipvs->rs_lock); return dest; } } - read_unlock(&__ip_vs_rs_lock); + read_unlock(&ipvs->rs_lock); return NULL; } @@ -788,9 +753,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&__ip_vs_rs_lock); + write_lock_bh(&ipvs->rs_lock); ip_vs_rs_hash(ipvs, dest); - write_unlock_bh(&__ip_vs_rs_lock); + write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -1022,14 +987,16 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) */ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) { + struct netns_ipvs *ipvs = net_ipvs(net); + ip_vs_kill_estimator(net, &dest->stats); /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&__ip_vs_rs_lock); + write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); + write_unlock_bh(&ipvs->rs_lock); /* * Decrease the refcnt of the dest, and free the dest @@ -1092,7 +1059,6 @@ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - struct net *net = svc->net; __be16 dport = udest->port; EnterFunction(2); @@ -1121,7 +1087,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(net, dest); + __ip_vs_del_dest(svc->net, dest); LeaveFunction(2); @@ -1140,6 +1106,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); @@ -1219,7 +1186,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services++; + ipvs->num_services++; /* Hash the service into the service table */ write_lock_bh(&__ip_vs_svc_lock); @@ -1359,12 +1326,13 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); pr_info("%s: enter\n", __func__); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services--; + ipvs->num_services--; ip_vs_kill_estimator(svc->net, &svc->stats); @@ -1589,12 +1557,88 @@ proc_do_sync_mode(ctl_table *table, int write, /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in __ip_vs_control_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "am_droprate", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_entry", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "secure_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1608,57 +1652,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_do_defense_mode, - }, -#ifdef CONFIG_IP_VS_NFCT - { - .procname = "conntrack", - .data = &sysctl_ip_vs_conntrack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_do_defense_mode, - }, - { - .procname = "snat_reroute", - .data = &sysctl_ip_vs_snat_reroute, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_version", - .data = &sysctl_ip_vs_sync_ver, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_sync_mode, - }, #if 0 { .procname = "timeout_established", @@ -1745,41 +1738,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec_jiffies, }, #endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; @@ -1791,8 +1749,6 @@ const struct ctl_path net_vs_ctl_path[] = { }; EXPORT_SYMBOL_GPL(net_vs_ctl_path); -static struct ctl_table_header * sysctl_header; - #ifdef CONFIG_PROC_FS struct ip_vs_iter { @@ -2543,7 +2499,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; - info.num_services = ip_vs_num_services; + info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } @@ -3014,7 +2970,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - struct net *net; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); @@ -3023,7 +2979,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) goto out_err; - net = skb_sknet(skb); + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3215,8 +3171,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; struct net *net; + struct netns_ipvs *ipvs; net = skb_sknet(skb); + ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); @@ -3326,8 +3284,10 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) void *reply; int ret, cmd, reply_cmd; struct net *net; + struct netns_ipvs *ipvs; net = skb_sknet(skb); + ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3530,9 +3490,21 @@ int __net_init __ip_vs_control_init(struct net *net) { int idx; struct netns_ipvs *ipvs = net_ipvs(net); + struct ctl_table *tbl; if (!net_eq(net, &init_net)) /* netns not enabled yet */ return -EPERM; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); if (ipvs->tot_stats == NULL) { @@ -3553,14 +3525,51 @@ int __net_init __ip_vs_control_init(struct net *net) proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); proc_net_fops_create(net, "ip_vs_stats_percpu", 0, &ip_vs_stats_percpu_fops); - sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = 3; + ipvs->sysctl_sync_threshold[1] = 50; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + + + ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars); - if (sysctl_header == NULL) + if (ipvs->sysctl_hdr == NULL) goto err_reg; ip_vs_new_estimator(net, ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; return 0; err_reg: + if (!net_eq(net, &init_net)) + kfree(tbl); +err_dup: free_percpu(ipvs->cpustats); err_alloc: kfree(ipvs->tot_stats); @@ -3575,7 +3584,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) return; ip_vs_kill_estimator(net, ipvs->tot_stats); - unregister_net_sysctl_table(sysctl_header); + unregister_net_sysctl_table(ipvs->sysctl_hdr); proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 550365a690c7..fb2d04ac5d4e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -34,7 +34,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, &iph.daddr, sh->dest))) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index d8b3f9f15826..c0cc341b840d 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -54,7 +54,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, &iph.daddr, th->dest))) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 581157bbded5..f1282cbe6fe3 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -50,7 +50,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (svc) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f85e47daecc3..b1780562c42b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -394,7 +394,7 @@ void ip_vs_sync_switch_mode(struct net *net, int mode) if (!ipvs->sync_state & IP_VS_STATE_MASTER) return; - if (mode == sysctl_ip_vs_sync_ver || !ipvs->sync_buff) + if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff) return; spin_lock_bh(&ipvs->sync_buff_lock); @@ -521,7 +521,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ - if (sysctl_ip_vs_sync_ver == 0) { + if (ipvs->sysctl_sync_ver == 0) { ip_vs_sync_conn_v0(net, cp); return; } @@ -650,7 +650,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_TEMPLATE) { int pkts = atomic_add_return(1, &cp->in_pkts); - if (pkts % sysctl_ip_vs_sync_threshold[1] != 1) + if (pkts % ipvs->sysctl_sync_threshold[1] != 1) return; } goto sloop; @@ -724,6 +724,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, { struct ip_vs_dest *dest; struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(param); @@ -794,7 +795,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, if (opt) memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]); cp->state = state; cp->old_state = cp->state; /* From f6340ee0c6b9498ec918a7bb2f44e20abb8b2833 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:44:59 +0100 Subject: [PATCH 51/80] IPVS: netns, defense work timer. This patch makes defense work timer per name-space, A net ptr had to be added to the ipvs struct, since it's needed by defense_work_handler. [ horms@verge.net.au: Use cancel_delayed_work_sync() instead of cancel_rearming_delayed_work(). Found during merge conflict resoliution ] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- include/net/netns/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_conn.c | 5 +++-- net/netfilter/ipvs/ip_vs_core.c | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 20 +++++++++----------- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index af9acf44e40a..fbe660f95873 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -877,7 +877,7 @@ extern const char * ip_vs_state_name(__u16 proto, int state); extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp); extern int ip_vs_check_template(struct ip_vs_conn *ct); -extern void ip_vs_random_dropentry(void); +extern void ip_vs_random_dropentry(struct net *net); extern int ip_vs_conn_init(void); extern void ip_vs_conn_cleanup(void); diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index c4b1abf258e4..41332619142c 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -71,6 +71,7 @@ struct netns_ipvs { int num_services; /* no of virtual services */ /* 1/rate drop and drop-entry variables */ + struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; atomic_t dropentry; @@ -129,6 +130,8 @@ struct netns_ipvs { /* multicast interface name */ char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + /* net name space ptr */ + struct net *net; /* Needed by timer routines */ }; #endif /* IP_VS_H_ */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 5ba205a4d79c..28bdaf7c02f4 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1138,7 +1138,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) +void ip_vs_random_dropentry(struct net *net) { int idx; struct ip_vs_conn *cp; @@ -1158,7 +1158,8 @@ void ip_vs_random_dropentry(void) if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - + if (!ip_vs_conn_net_eq(cp, net)) + continue; if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a7c59a722af3..bdda346a4f30 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1884,6 +1884,7 @@ static int __net_init __ip_vs_init(struct net *net) pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 183ac18bded5..6a963d44df48 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -217,18 +217,16 @@ static void update_defense_level(struct netns_ipvs *ipvs) * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); static void defense_work_handler(struct work_struct *work) { - struct netns_ipvs *ipvs = net_ipvs(&init_net); + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + ip_vs_random_dropentry(ipvs->net); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } int @@ -3564,6 +3562,9 @@ int __net_init __ip_vs_control_init(struct net *net) goto err_reg; ip_vs_new_estimator(net, ipvs->tot_stats); ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); return 0; err_reg: @@ -3588,6 +3589,8 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); free_percpu(ipvs->cpustats); kfree(ipvs->tot_stats); } @@ -3631,9 +3634,6 @@ int __init ip_vs_control_init(void) goto err_net; } - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - LeaveFunction(2); return 0; @@ -3648,8 +3648,6 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); ip_vs_trash_cleanup(); - cancel_delayed_work_sync(&defense_work); - cancel_work_sync(&defense_work.work); unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); From f2431e6e9255461eb1476340a89ad32ad4b38b03 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:45:00 +0100 Subject: [PATCH 52/80] IPVS: netns, trash handling trash list per namspace, and reordering of some params in dst struct. [ horms@verge.net.au: Use cancel_delayed_work_sync() instead of cancel_rearming_delayed_work(). Found during merge conflict resoliution ] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++-- include/net/netns/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 23 +++++++++++------------ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index fbe660f95873..b23bea62f708 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -662,8 +662,8 @@ struct ip_vs_dest { struct list_head d_list; /* for table with all the dests */ u16 af; /* address family */ - union nf_inet_addr addr; /* IP address of the server */ __be16 port; /* port number of the server */ + union nf_inet_addr addr; /* IP address of the server */ volatile unsigned flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ @@ -690,8 +690,8 @@ struct ip_vs_dest { /* for virtual service */ struct ip_vs_service *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ - union nf_inet_addr vaddr; /* virtual IP address */ __be16 vport; /* virtual port number */ + union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ }; diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 41332619142c..67ca1cf55af8 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -82,6 +82,9 @@ struct netns_ipvs { rwlock_t rs_lock; /* real services table */ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ struct lock_class_key ctl_key; /* ctl_mutex debuging */ + /* Trash for destinations */ + struct list_head dest_trash; + /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; struct ctl_table *sysctl_tbl; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6a963d44df48..442edf4be644 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -254,11 +254,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - /* * FTP & NULL virtual service counters */ @@ -650,11 +645,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, @@ -703,11 +699,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. */ -static void ip_vs_trash_cleanup(void) +static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); @@ -1021,7 +1018,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); + list_add(&dest->n_list, &ipvs->dest_trash); atomic_inc(&dest->refcnt); } } @@ -3503,6 +3500,8 @@ int __net_init __ip_vs_control_init(struct net *net) for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_LIST_HEAD(&ipvs->rs_table[idx]); + INIT_LIST_HEAD(&ipvs->dest_trash); + /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); if (ipvs->tot_stats == NULL) { @@ -3584,13 +3583,14 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) if (!net_eq(net, &init_net)) /* netns not enabled yet */ return; + ip_vs_trash_cleanup(net); ip_vs_kill_estimator(net, ipvs->tot_stats); + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); - cancel_delayed_work_sync(&ipvs->defense_work); - cancel_work_sync(&ipvs->defense_work.work); free_percpu(ipvs->cpustats); kfree(ipvs->tot_stats); } @@ -3647,7 +3647,6 @@ int __init ip_vs_control_init(void) void ip_vs_control_cleanup(void) { EnterFunction(2); - ip_vs_trash_cleanup(); unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); From 763f8d0ed4f1ce38b35cc0e05482b7799b82789b Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:45:01 +0100 Subject: [PATCH 53/80] IPVS: netns, svc counters moved in ip_vs_ctl,c Last two global vars to be moved, ip_vs_ftpsvc_counter and ip_vs_nullsvc_counter. [horms@verge.net.au: removed whitespace-change-only hunk] Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/netns/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 21 +++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h index 67ca1cf55af8..259ebac904bf 100644 --- a/include/net/netns/ip_vs.h +++ b/include/net/netns/ip_vs.h @@ -84,6 +84,9 @@ struct netns_ipvs { struct lock_class_key ctl_key; /* ctl_mutex debuging */ /* Trash for destinations */ struct list_head dest_trash; + /* Service counters */ + atomic_t ftpsvc_counter; + atomic_t nullsvc_counter; /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 442edf4be644..65f5de405ad2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -254,12 +254,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - /* * Returns hash value for virtual service @@ -409,6 +403,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; + struct netns_ipvs *ipvs = net_ipvs(net); read_lock(&__ip_vs_svc_lock); @@ -427,7 +422,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, if (svc == NULL && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) + && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { /* * Check if ftp service entry exists, the packet @@ -437,7 +432,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, } if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { + && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ @@ -1173,9 +1168,9 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, /* Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); + atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); + atomic_inc(&ipvs->nullsvc_counter); ip_vs_new_estimator(net, &svc->stats); @@ -1359,9 +1354,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) * Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); + atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); + atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it @@ -3501,6 +3496,8 @@ int __net_init __ip_vs_control_init(struct net *net) INIT_LIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); From 4a98480bccc2f5998c5564d254392635b9aa04c2 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:45:02 +0100 Subject: [PATCH 54/80] IPVS: netns, misc init_net removal in core. init_net removed in __ip_vs_addr_is_local_v6, and got net as param. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 6 ++++-- net/netfilter/ipvs/ip_vs_ctl.c | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index bdda346a4f30..9e10c7a0720a 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -499,6 +499,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd) { + struct net *net; struct netns_ipvs *ipvs; __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; @@ -511,18 +512,19 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_service_put(svc); return NF_DROP; } + net = skb_net(skb); #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; else #endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - ipvs = net_ipvs(skb_net(skb)); + ipvs = net_ipvs(net); if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 65f5de405ad2..edf2b6dee972 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -71,7 +71,8 @@ int ip_vs_get_debug_level(void) #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +static int __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) { struct rt6_info *rt; struct flowi fl = { @@ -80,7 +81,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) .fl6_src = { .s6_addr32 = {0, 0, 0, 0} }, }; - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl); if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) return 1; @@ -810,12 +811,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(&init_net, udest->addr.ip); + atype = inet_addr_type(svc->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } From c6d2d445d8dee04cde47eb4021636399a4239e9f Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 3 Jan 2011 14:45:03 +0100 Subject: [PATCH 55/80] IPVS: netns, final patch enabling network name space. all init_net removed, (except for some alloc related that needs to be there) Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_app.c | 3 --- net/netfilter/ipvs/ip_vs_conn.c | 5 ----- net/netfilter/ipvs/ip_vs_core.c | 4 ---- net/netfilter/ipvs/ip_vs_ctl.c | 7 +------ net/netfilter/ipvs/ip_vs_est.c | 3 --- net/netfilter/ipvs/ip_vs_ftp.c | 6 ------ net/netfilter/ipvs/ip_vs_sync.c | 5 ----- 7 files changed, 1 insertion(+), 32 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 286f46594e0e..5c48ffb60c28 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -582,9 +582,6 @@ static int __net_init __ip_vs_app_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; - INIT_LIST_HEAD(&ipvs->app_list); __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 28bdaf7c02f4..83233fe24a08 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1234,8 +1234,6 @@ int __net_init __ip_vs_conn_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; atomic_set(&ipvs->conn_count, 0); proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); @@ -1245,9 +1243,6 @@ int __net_init __ip_vs_conn_init(struct net *net) static void __net_exit __ip_vs_conn_cleanup(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; - /* flush all the connection entries first */ ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 9e10c7a0720a..f36a84f33efb 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1877,10 +1877,6 @@ static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; - if (!net_eq(net, &init_net)) { - pr_err("The final patch for enabling netns is missing\n"); - return -EPERM; - } ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) { pr_err("%s(): no memory.\n", __func__); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index edf2b6dee972..09ca2ce2f2b7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2617,6 +2617,7 @@ static struct genl_family ip_vs_genl_family = { .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ }; /* Policy used for first-level command attributes */ @@ -3483,9 +3484,6 @@ int __net_init __ip_vs_control_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); struct ctl_table *tbl; - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; - atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); @@ -3578,9 +3576,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; - ip_vs_trash_cleanup(net); ip_vs_kill_estimator(net, ipvs->tot_stats); cancel_delayed_work_sync(&ipvs->defense_work); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index d13616b138cd..f560a05c965a 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -203,9 +203,6 @@ static int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; - INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 6a04f9ab9d0d..6b5dd6ddaae9 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -413,9 +413,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net) int i, ret; struct ip_vs_app *app = &ip_vs_ftp; - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; - ret = register_ip_vs_app(net, app); if (ret) return ret; @@ -442,9 +439,6 @@ static void __ip_vs_ftp_exit(struct net *net) { struct ip_vs_app *app = &ip_vs_ftp; - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; - unregister_ip_vs_app(net, app); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b1780562c42b..d1adf988eb08 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1659,9 +1659,6 @@ static int __net_init __ip_vs_sync_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return -EPERM; - INIT_LIST_HEAD(&ipvs->sync_queue); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); @@ -1674,8 +1671,6 @@ static int __net_init __ip_vs_sync_init(struct net *net) static void __ip_vs_sync_cleanup(struct net *net) { - if (!net_eq(net, &init_net)) /* netns not enabled yet */ - return; stop_sync_thread(net, IP_VS_STATE_MASTER); stop_sync_thread(net, IP_VS_STATE_BACKUP); } From 5df15196a2bbf16ca4c6a797ec00ff36d0d5c179 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sun, 26 Dec 2010 10:22:22 +0100 Subject: [PATCH 56/80] netfilter: xt_comment: drop unneeded unsigned qualifier Since a string is stored, and not something like a MAC address that would rely on (un)signedness, drop the qualifier. Signed-off-by: Jan Engelhardt Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_comment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfilter/xt_comment.h b/include/linux/netfilter/xt_comment.h index eacfedc6b5d0..0ea5e79f5bd7 100644 --- a/include/linux/netfilter/xt_comment.h +++ b/include/linux/netfilter/xt_comment.h @@ -4,7 +4,7 @@ #define XT_MAX_COMMENT_LEN 256 struct xt_comment_info { - unsigned char comment[XT_MAX_COMMENT_LEN]; + char comment[XT_MAX_COMMENT_LEN]; }; #endif /* XT_COMMENT_H */ From b017900aac4a158b9bf7ffdcb8a369a91115b3e4 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Dec 2010 09:46:26 +0100 Subject: [PATCH 57/80] netfilter: xt_conntrack: support matching on port ranges Add a new revision 3 that contains port ranges for all of origsrc, origdst, replsrc and repldst. The high ports are appended to the original v2 data structure to allow sharing most of the code with v1 and v2. Use of the revision specific port matching function is made dependant on par->match->revision. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_conntrack.h | 15 ++++++ net/netfilter/xt_conntrack.c | 75 +++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h index 54f47a2f6152..74b904d8f99c 100644 --- a/include/linux/netfilter/xt_conntrack.h +++ b/include/linux/netfilter/xt_conntrack.h @@ -58,4 +58,19 @@ struct xt_conntrack_mtinfo2 { __u16 state_mask, status_mask; }; +struct xt_conntrack_mtinfo3 { + union nf_inet_addr origsrc_addr, origsrc_mask; + union nf_inet_addr origdst_addr, origdst_mask; + union nf_inet_addr replsrc_addr, replsrc_mask; + union nf_inet_addr repldst_addr, repldst_mask; + __u32 expires_min, expires_max; + __u16 l4proto; + __u16 origsrc_port, origdst_port; + __u16 replsrc_port, repldst_port; + __u16 match_flags, invert_flags; + __u16 state_mask, status_mask; + __u16 origsrc_port_high, origdst_port_high; + __u16 replsrc_port_high, repldst_port_high; +}; + #endif /*_XT_CONNTRACK_H*/ diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index e536710ad916..4ef1b63ad73f 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -112,6 +112,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, return true; } +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. */ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + !port_match(info->origsrc_port, info->origsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + !port_match(info->origdst_port, info->origdst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + !port_match(info->replsrc_port, info->replsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + !port_match(info->repldst_port, info->repldst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + static bool conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 state_mask, u16 status_mask) @@ -170,8 +218,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; - if (!ct_proto_port_check(info, ct)) - return false; + if (par->match->revision != 3) { + if (!ct_proto_port_check(info, ct)) + return false; + } else { + if (!ct_proto_port_check_v3(par->matchinfo, ct)) + return false; + } if ((info->match_flags & XT_CONNTRACK_STATUS) && (!!(status_mask & ct->status) ^ @@ -207,6 +260,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) return conntrack_mt(skb, par, info->state_mask, info->status_mask); } +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; @@ -244,6 +305,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, + { + .name = "conntrack", + .revision = 3, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo3), + .match = conntrack_mt_v3, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init conntrack_mt_init(void) From 255d0dc34068a976550ce555e153c0bfcfec7cc6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Dec 2010 18:35:15 +0100 Subject: [PATCH 58/80] netfilter: x_table: speedup compat operations One iptables invocation with 135000 rules takes 35 seconds of cpu time on a recent server, using a 32bit distro and a 64bit kernel. We eventually trigger NMI/RCU watchdog. INFO: rcu_sched_state detected stall on CPU 3 (t=6000 jiffies) COMPAT mode has quadratic behavior and consume 16 bytes of memory per rule. Switch the xt_compat algos to use an array instead of list, and use a binary search to locate an offset in the sorted array. This halves memory need (8 bytes per rule), and removes quadratic behavior [ O(N*N) -> O(N*log2(N)) ] Time of iptables goes from 35 s to 150 ms. Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 3 +- net/bridge/netfilter/ebtables.c | 1 + net/ipv4/netfilter/arp_tables.c | 2 + net/ipv4/netfilter/ip_tables.c | 2 + net/ipv6/netfilter/ip6_tables.c | 2 + net/netfilter/x_tables.c | 82 +++++++++++++++++------------- 6 files changed, 57 insertions(+), 35 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 742bec051440..0f04d985b41c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -611,8 +611,9 @@ struct _compat_xt_align { extern void xt_compat_lock(u_int8_t af); extern void xt_compat_unlock(u_int8_t af); -extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta); +extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); extern void xt_compat_flush_offsets(u_int8_t af); +extern void xt_compat_init_offsets(u_int8_t af, unsigned int number); extern int xt_compat_calc_jump(u_int8_t af, unsigned int offset); extern int xt_compat_match_offset(const struct xt_match *match); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 16df0532d4b9..5f1825df9dca 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1764,6 +1764,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; + xt_compat_init_offsets(AF_INET, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3fac340a28d5..47e5178b998b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -883,6 +883,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1350,6 +1351,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a846d633b3b6..c5a75d70970f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1080,6 +1080,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1681,6 +1682,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 455582384ece..0c9973a657fb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1093,6 +1093,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1696,6 +1697,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 80463507420e..ee5de3af510a 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -38,9 +38,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) struct compat_delta { - struct compat_delta *next; - unsigned int offset; - int delta; + unsigned int offset; /* offset in kernel */ + int delta; /* delta in 32bit user land */ }; struct xt_af { @@ -49,7 +48,9 @@ struct xt_af { struct list_head target; #ifdef CONFIG_COMPAT struct mutex compat_mutex; - struct compat_delta *compat_offsets; + struct compat_delta *compat_tab; + unsigned int number; /* number of slots in compat_tab[] */ + unsigned int cur; /* number of used slots in compat_tab[] */ #endif }; @@ -414,54 +415,67 @@ int xt_check_match(struct xt_mtchk_param *par, EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT -int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { - struct compat_delta *tmp; + struct xt_af *xp = &xt[af]; - tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - tmp->offset = offset; - tmp->delta = delta; - - if (xt[af].compat_offsets) { - tmp->next = xt[af].compat_offsets->next; - xt[af].compat_offsets->next = tmp; - } else { - xt[af].compat_offsets = tmp; - tmp->next = NULL; + if (!xp->compat_tab) { + if (!xp->number) + return -EINVAL; + xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); + if (!xp->compat_tab) + return -ENOMEM; + xp->cur = 0; } + + if (xp->cur >= xp->number) + return -EINVAL; + + if (xp->cur) + delta += xp->compat_tab[xp->cur - 1].delta; + xp->compat_tab[xp->cur].offset = offset; + xp->compat_tab[xp->cur].delta = delta; + xp->cur++; return 0; } EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { - struct compat_delta *tmp, *next; - - if (xt[af].compat_offsets) { - for (tmp = xt[af].compat_offsets; tmp; tmp = next) { - next = tmp->next; - kfree(tmp); - } - xt[af].compat_offsets = NULL; + if (xt[af].compat_tab) { + vfree(xt[af].compat_tab); + xt[af].compat_tab = NULL; + xt[af].number = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); int xt_compat_calc_jump(u_int8_t af, unsigned int offset) { - struct compat_delta *tmp; - int delta; + struct compat_delta *tmp = xt[af].compat_tab; + int mid, left = 0, right = xt[af].cur - 1; - for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next) - if (tmp->offset < offset) - delta += tmp->delta; - return delta; + while (left <= right) { + mid = (left + right) >> 1; + if (offset > tmp[mid].offset) + left = mid + 1; + else if (offset < tmp[mid].offset) + right = mid - 1; + else + return mid ? tmp[mid - 1].delta : 0; + } + WARN_ON_ONCE(1); + return 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ + xt[af].number = number; + xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); + int xt_compat_match_offset(const struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; @@ -1337,7 +1351,7 @@ static int __init xt_init(void) mutex_init(&xt[i].mutex); #ifdef CONFIG_COMPAT mutex_init(&xt[i].compat_mutex); - xt[i].compat_offsets = NULL; + xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); From 6faee60a4e82075853a437831768cc9e2e563e4e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 Dec 2010 15:57:47 +0100 Subject: [PATCH 59/80] netfilter: ebt_ip6: allow matching on ipv6-icmp types/codes To avoid adding a new match revision icmp type/code are stored in the sport/dport area. Signed-off-by: Florian Westphal Reviewed-by: Holger Eitzenberger Reviewed-by: Bart De Schuymer Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebt_ip6.h | 15 ++++++-- net/bridge/netfilter/ebt_ip6.c | 46 +++++++++++++++++------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter_bridge/ebt_ip6.h b/include/linux/netfilter_bridge/ebt_ip6.h index e5de98701519..22af18a3c16b 100644 --- a/include/linux/netfilter_bridge/ebt_ip6.h +++ b/include/linux/netfilter_bridge/ebt_ip6.h @@ -18,8 +18,11 @@ #define EBT_IP6_PROTO 0x08 #define EBT_IP6_SPORT 0x10 #define EBT_IP6_DPORT 0x20 +#define EBT_IP6_ICMP6 0x40 + #define EBT_IP6_MASK (EBT_IP6_SOURCE | EBT_IP6_DEST | EBT_IP6_TCLASS |\ - EBT_IP6_PROTO | EBT_IP6_SPORT | EBT_IP6_DPORT) + EBT_IP6_PROTO | EBT_IP6_SPORT | EBT_IP6_DPORT | \ + EBT_IP6_ICMP6) #define EBT_IP6_MATCH "ip6" /* the same values are used for the invflags */ @@ -32,8 +35,14 @@ struct ebt_ip6_info { uint8_t protocol; uint8_t bitmask; uint8_t invflags; - uint16_t sport[2]; - uint16_t dport[2]; + union { + uint16_t sport[2]; + uint8_t icmpv6_type[2]; + }; + union { + uint16_t dport[2]; + uint8_t icmpv6_code[2]; + }; }; #endif diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 50a46afc2bcc..2ed0056a39a8 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -22,9 +22,15 @@ #include #include -struct tcpudphdr { - __be16 src; - __be16 dst; +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; }; static bool @@ -33,8 +39,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_ip6_info *info = par->matchinfo; const struct ipv6hdr *ih6; struct ipv6hdr _ip6h; - const struct tcpudphdr *pptr; - struct tcpudphdr _ports; + const union pkthdr *pptr; + union pkthdr _pkthdr; ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) @@ -56,26 +62,34 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) return false; - if (!(info->bitmask & EBT_IP6_DPORT) && - !(info->bitmask & EBT_IP6_SPORT)) + if (!(info->bitmask & ( EBT_IP6_DPORT | + EBT_IP6_SPORT | EBT_IP6_ICMP6))) return true; - pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports), - &_ports); + + /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ + pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr), + &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP6_DPORT) { - u32 dst = ntohs(pptr->dst); + u16 dst = ntohs(pptr->tcpudphdr.dst); if (FWINV(dst < info->dport[0] || dst > info->dport[1], EBT_IP6_DPORT)) return false; } if (info->bitmask & EBT_IP6_SPORT) { - u32 src = ntohs(pptr->src); + u16 src = ntohs(pptr->tcpudphdr.src); if (FWINV(src < info->sport[0] || src > info->sport[1], EBT_IP6_SPORT)) return false; } - return true; + if ((info->bitmask & EBT_IP6_ICMP6) && + FWINV(pptr->icmphdr.type < info->icmpv6_type[0] || + pptr->icmphdr.type > info->icmpv6_type[1] || + pptr->icmphdr.code < info->icmpv6_code[0] || + pptr->icmphdr.code > info->icmpv6_code[1], + EBT_IP6_ICMP6)) + return false; } return true; } @@ -103,6 +117,14 @@ static int ebt_ip6_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; + if (info->bitmask & EBT_IP6_ICMP6) { + if ((info->invflags & EBT_IP6_PROTO) || + info->protocol != IPPROTO_ICMPV6) + return -EINVAL; + if (info->icmpv6_type[0] > info->icmpv6_type[1] || + info->icmpv6_code[0] > info->icmpv6_code[1]) + return -EINVAL; + } return 0; } From c7066f70d9610df0b9406cc635fc09e86136e714 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 14 Jan 2011 13:36:42 +0100 Subject: [PATCH 60/80] netfilter: fix Kconfig dependencies Fix dependencies of netfilter realm match: it depends on NET_CLS_ROUTE, which itself depends on NET_SCHED; this dependency is missing from netfilter. Since matching on realms is also useful without having NET_SCHED enabled and the option really only controls whether the tclassid member is included in route and dst entries, rename the config option to IP_ROUTE_CLASSID and move it outside of traffic scheduling context to get rid of the NET_SCHED dependeny. Reported-by: Vladis Kletnieks Signed-off-by: Patrick McHardy --- include/net/dst.h | 2 +- include/net/ip_fib.h | 6 +++--- net/ipv4/Kconfig | 4 +++- net/ipv4/fib_rules.c | 10 +++++----- net/ipv4/fib_semantics.c | 14 +++++++------- net/ipv4/ip_input.c | 2 +- net/ipv4/route.c | 26 +++++++++++++------------- net/netfilter/Kconfig | 2 +- net/sched/Kconfig | 5 +---- net/sched/cls_flow.c | 2 +- net/sched/em_meta.c | 2 +- 11 files changed, 37 insertions(+), 38 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index a5bd72646d65..6baba836ad8b 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -72,7 +72,7 @@ struct dst_entry { u32 metrics[RTAX_MAX]; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else __u32 __pad2; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 07bdb5e9e8ac..65d1fcdbc63b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -55,7 +55,7 @@ struct fib_nh { int nh_weight; int nh_power; #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif int nh_oif; @@ -201,7 +201,7 @@ static inline int fib_lookup(struct net *net, const struct flowi *flp, extern int __net_init fib4_rules_init(struct net *net); extern void __net_exit fib4_rules_exit(struct net *net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID extern u32 fib_rules_tclass(struct fib_result *res); #endif @@ -235,7 +235,7 @@ extern struct fib_table *fib_hash_table(u32 id); static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9e95d7fb6d5a..dcb2e18f6f8e 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -140,6 +140,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -655,4 +658,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7b..9cefe72029cf 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,12 +41,12 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 fib_rules_tclass(struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; @@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3e0da3ef6116..a72c62d03106 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -200,7 +200,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -422,7 +422,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +476,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -783,7 +783,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -796,7 +796,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1006,7 +1006,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1031,7 +1031,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb7..d7b2b0987a3b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 66610ea3c87b..f70ae1bccb8a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -511,7 +511,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -564,14 +564,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -585,7 +585,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -1784,7 +1784,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1811,7 +1811,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) rt->dst.dev->mtu > 576) rt->dst.metrics[RTAX_MTU-1] = 576; } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else @@ -1827,7 +1827,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif @@ -1883,7 +1883,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -2202,7 +2202,7 @@ out: return err; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -2820,7 +2820,7 @@ static int rt_fill_info(struct net *net, } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif @@ -3245,9 +3245,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3263,7 +3263,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1534f2b44caf..1b79353a5d29 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -886,7 +886,7 @@ config NETFILTER_XT_MATCH_RATEEST config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' depends on NETFILTER_ADVANCED - select NET_CLS_ROUTE + select IP_ROUTE_CLASSID help This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a36270a994d7..4b753ef70bb7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -243,7 +243,7 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - select NET_CLS_ROUTE + select IP_ROUTE_CLASSID select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -252,9 +252,6 @@ config NET_CLS_ROUTE4 To compile this code as a module, choose M here: the module will be called cls_route. -config NET_CLS_ROUTE - bool - config NET_CLS_FW tristate "Netfilter mark (FW)" select NET_CLS diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5b271a18bc3a..a3b293d22c66 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -276,7 +276,7 @@ static u32 flow_get_nfct_proto_dst(struct sk_buff *skb) static u32 flow_get_rtclassid(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (skb_dst(skb)) return skb_dst(skb)->tclassid; #endif diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 34da5e29ea1a..0d66e58f26d8 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -255,7 +255,7 @@ META_COLLECTOR(int_rtclassid) if (unlikely(skb_dst(skb) == NULL)) *err = -1; else -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; From d862a6622e9db508d4b28cc7c5bc28bd548cc24e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 14 Jan 2011 15:45:56 +0100 Subject: [PATCH 61/80] netfilter: nf_conntrack: use is_vmalloc_addr() Use is_vmalloc_addr() in nf_ct_free_hashtable() and get rid of the vmalloc flags to indicate that a hash table has been allocated using vmalloc(). Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 4 ++-- include/net/netns/conntrack.h | 2 -- include/net/netns/ipv4.h | 1 - net/ipv4/netfilter/nf_nat_core.c | 6 ++---- net/netfilter/nf_conntrack_core.c | 26 +++++++++----------------- net/netfilter/nf_conntrack_expect.c | 9 +++------ net/netfilter/nf_conntrack_helper.c | 10 +++------- 7 files changed, 19 insertions(+), 39 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2bc344c98215..d0d13378991e 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -202,9 +202,9 @@ extern void nf_ct_l3proto_module_put(unsigned short l3proto); * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ -extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls); +extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); -extern void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size); +extern void nf_ct_free_hashtable(void *hash, unsigned int size); extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index d4958d4c6574..5cf8a8c141aa 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -28,8 +28,6 @@ struct netns_ct { struct ctl_table_header *acct_sysctl_header; struct ctl_table_header *event_sysctl_header; #endif - int hash_vmalloc; - int expect_vmalloc; char *slabname; }; #endif diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d68c3f121774..e2e2ef57eca2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -43,7 +43,6 @@ struct netns_ipv4 { struct xt_table *nat_table; struct hlist_head *nat_bysource; unsigned int nat_htable_size; - int nat_vmalloced; #endif int sysctl_icmp_echo_ignore_all; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index eb55835a02c3..6972ceee99c6 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -682,8 +682,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -705,8 +704,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e95ac42ef673..dc2ff2cd0a7e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1202,9 +1202,9 @@ static int kill_all(struct nf_conn *i, void *data) return 1; } -void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) +void nf_ct_free_hashtable(void *hash, unsigned int size) { - if (vmalloced) + if (is_vmalloc_addr(hash)) vfree(hash); else free_pages((unsigned long)hash, @@ -1271,8 +1271,7 @@ static void nf_conntrack_cleanup_net(struct net *net) goto i_see_dead_people; } - nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - net->ct.htable_size); + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); @@ -1301,21 +1300,18 @@ void nf_conntrack_cleanup(struct net *net) } } -void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; size_t sz; - *vmalloced = 0; - BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); if (!hash) { - *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); @@ -1331,7 +1327,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) { - int i, bucket, vmalloced, old_vmalloced; + int i, bucket; unsigned int hashsize, old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; @@ -1348,7 +1344,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hashsize) return -EINVAL; - hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1); + hash = nf_ct_alloc_hashtable(&hashsize, 1); if (!hash) return -ENOMEM; @@ -1370,15 +1366,13 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) } } old_size = init_net.ct.htable_size; - old_vmalloced = init_net.ct.hash_vmalloc; old_hash = init_net.ct.hash; init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash_vmalloc = vmalloced; init_net.ct.hash = hash; spin_unlock_bh(&nf_conntrack_lock); - nf_ct_free_hashtable(old_hash, old_vmalloced, old_size); + nf_ct_free_hashtable(old_hash, old_size); return 0; } EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); @@ -1491,8 +1485,7 @@ static int nf_conntrack_init_net(struct net *net) } net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, - &net->ct.hash_vmalloc, 1); + net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); if (!net->ct.hash) { ret = -ENOMEM; printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); @@ -1515,8 +1508,7 @@ static int nf_conntrack_init_net(struct net *net) err_acct: nf_conntrack_expect_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - net->ct.htable_size); + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); err_hash: kmem_cache_destroy(net->ct.nf_conntrack_cachep); err_cache: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4a9ed23180df..cd1e8e0970f2 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -639,8 +639,7 @@ int nf_conntrack_expect_init(struct net *net) } net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &net->ct.expect_vmalloc, 0); + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (net->ct.expect_hash == NULL) goto err1; @@ -662,8 +661,7 @@ int nf_conntrack_expect_init(struct net *net) if (net_eq(net, &init_net)) kmem_cache_destroy(nf_ct_expect_cachep); err2: - nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, - nf_ct_expect_hsize); + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); err1: return err; } @@ -675,6 +673,5 @@ void nf_conntrack_expect_fini(struct net *net) rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); } - nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, - nf_ct_expect_hsize); + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 767bbe98a0f0..1bdfea357955 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -33,7 +33,6 @@ static DEFINE_MUTEX(nf_ct_helper_mutex); static struct hlist_head *nf_ct_helper_hash __read_mostly; static unsigned int nf_ct_helper_hsize __read_mostly; static unsigned int nf_ct_helper_count __read_mostly; -static int nf_ct_helper_vmalloc; /* Stupid hash, but collision free for the default registrations of the @@ -267,8 +266,7 @@ int nf_conntrack_helper_init(void) int err; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, - &nf_ct_helper_vmalloc, 0); + nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; @@ -279,14 +277,12 @@ int nf_conntrack_helper_init(void) return 0; err1: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, - nf_ct_helper_hsize); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); return err; } void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, - nf_ct_helper_hsize); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); } From 43f393caec0362abe03c72799d3f342af3973070 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sun, 16 Jan 2011 18:10:28 +0100 Subject: [PATCH 62/80] netfilter: audit target to record accepted/dropped packets This patch adds a new netfilter target which creates audit records for packets traversing a certain chain. It can be used to record packets which are rejected administraively as follows: -N AUDIT_DROP -A AUDIT_DROP -j AUDIT --type DROP -A AUDIT_DROP -j DROP a rule which would typically drop or reject a packet would then invoke the new chain to record packets before dropping them. -j AUDIT_DROP The module is protocol independant and works for iptables, ip6tables and ebtables. The following information is logged: - netfilter hook - packet length - incomming/outgoing interface - MAC src/dst/proto for ethernet packets - src/dst/protocol address for IPv4/IPv6 - src/dst port for TCP/UDP/UDPLITE - icmp type/code Cc: Patrick McHardy Cc: Eric Paris Cc: Al Viro Signed-off-by: Thomas Graf Signed-off-by: Patrick McHardy --- include/linux/audit.h | 1 + include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_AUDIT.h | 30 +++++ net/netfilter/Kconfig | 10 ++ net/netfilter/Makefile | 1 + net/netfilter/xt_AUDIT.c | 204 +++++++++++++++++++++++++++++ 6 files changed, 247 insertions(+) create mode 100644 include/linux/netfilter/xt_AUDIT.h create mode 100644 net/netfilter/xt_AUDIT.c diff --git a/include/linux/audit.h b/include/linux/audit.h index 8b5c0620abf9..ae227dfcf9c6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,6 +103,7 @@ #define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ #define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ +#define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 9d40effe7ca7..9f11fbc377e2 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -9,6 +9,7 @@ header-y += nfnetlink_conntrack.h header-y += nfnetlink_log.h header-y += nfnetlink_queue.h header-y += x_tables.h +header-y += xt_AUDIT.h header-y += xt_CHECKSUM.h header-y += xt_CLASSIFY.h header-y += xt_CONNMARK.h diff --git a/include/linux/netfilter/xt_AUDIT.h b/include/linux/netfilter/xt_AUDIT.h new file mode 100644 index 000000000000..38751d2ea52b --- /dev/null +++ b/include/linux/netfilter/xt_AUDIT.h @@ -0,0 +1,30 @@ +/* + * Header file for iptables xt_AUDIT target + * + * (C) 2010-2011 Thomas Graf + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _XT_AUDIT_TARGET_H +#define _XT_AUDIT_TARGET_H + +#include + +enum { + XT_AUDIT_TYPE_ACCEPT = 0, + XT_AUDIT_TYPE_DROP, + XT_AUDIT_TYPE_REJECT, + __XT_AUDIT_TYPE_MAX, +}; + +#define XT_AUDIT_TYPE_MAX (__XT_AUDIT_TYPE_MAX - 1) + +struct xt_audit_info { + __u8 type; /* XT_AUDIT_TYPE_* */ +}; + +#endif /* _XT_AUDIT_TARGET_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1b79353a5d29..93918f022555 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -326,6 +326,16 @@ config NETFILTER_XT_CONNMARK comment "Xtables targets" +config NETFILTER_XT_TARGET_AUDIT + tristate "AUDIT target support" + depends on AUDIT + depends on NETFILTER_ADVANCED + ---help--- + This option adds a 'AUDIT' target, which can be used to create + audit records for packets dropped/accepted. + + To compileit as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" depends on IP_NF_MANGLE || IP6_NF_MANGLE diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 441050f31111..401d574bdfd2 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 000000000000..81802d27346e --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,204 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf "); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, + unsigned int proto, unsigned int offset) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + const __be16 *pptr; + __be16 _ports[2]; + + pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); + if (pptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " sport=%hu dport=%hu", + ntohs(pptr[0]), ntohs(pptr[1])); + } + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: { + const u8 *iptr; + u8 _ih[2]; + + iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); + if (iptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", + iptr[0], iptr[1]); + + } + break; + } +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", + &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + + if (ntohs(ih->frag_off) & IP_OFFSET) { + audit_log_format(ab, " frag=1"); + return; + } + + audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + int offset; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + nexthdr = ih->nexthdr; + offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), + &nexthdr); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + if (offset) + audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (ab == NULL) + goto errout; + + audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", + info->type, par->hooknum, skb->len, + par->in ? par->in->name : "?", + par->out ? par->out->name : "?"); + + if (skb->mark) + audit_log_format(ab, " mark=%#x", skb->mark); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (par->family == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case __constant_htons(ETH_P_IP): + audit_ip4(ab, skb); + break; + + case __constant_htons(ETH_P_IPV6): + audit_ip6(ab, skb); + break; + } + } + } + + switch (par->family) { + case NFPROTO_IPV4: + audit_ip4(ab, skb); + break; + + case NFPROTO_IPV6: + audit_ip6(ab, skb); + break; + } + + audit_log_end(ab); + +errout: + return XT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + + if (info->type > XT_AUDIT_TYPE_MAX) { + pr_info("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); + return -ERANGE; + } + + return 0; +} + +static struct xt_target audit_tg_reg __read_mostly = { + .name = "AUDIT", + .family = NFPROTO_UNSPEC, + .target = audit_tg, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, +}; + +static int __init audit_tg_init(void) +{ + return xt_register_target(&audit_tg_reg); +} + +static void __exit audit_tg_exit(void) +{ + xt_unregister_target(&audit_tg_reg); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); From fbabf31e4d482149b5e2704eb0287cf9117bdcf3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sun, 16 Jan 2011 18:12:59 +0100 Subject: [PATCH 63/80] netfilter: create audit records for x_tables replaces The setsockopt() syscall to replace tables is already recorded in the audit logs. This patch stores additional information such as table name and netfilter protocol. Cc: Patrick McHardy Cc: Eric Paris Cc: Al Viro Signed-off-by: Thomas Graf Signed-off-by: Patrick McHardy --- include/linux/audit.h | 1 + net/netfilter/x_tables.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/audit.h b/include/linux/audit.h index ae227dfcf9c6..32b5c62a5042 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -104,6 +104,7 @@ #define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ #define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ +#define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ee5de3af510a..fbc2b72091e0 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -834,6 +835,21 @@ xt_replace_table(struct xt_table *table, */ local_bh_enable(); +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + table->name, table->af, + private->number); + audit_log_end(ab); + } + } +#endif + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); From f1e231a356f90a67f8547c2881a62c92084683c6 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 18 Jan 2011 06:30:13 +0100 Subject: [PATCH 64/80] netfilter: xtables: add missing aliases for autoloading via iptables Signed-off-by: Jan Engelhardt --- net/netfilter/xt_IDLETIMER.c | 2 ++ net/netfilter/xt_LED.c | 2 ++ net/netfilter/xt_cpu.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index be1f22e13545..3bdd443aaf15 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -313,3 +313,5 @@ MODULE_AUTHOR("Timo Teras "); MODULE_AUTHOR("Luciano Coelho "); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index a4140509eea1..993de2ba89d3 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -31,6 +31,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Adam Nielsen "); MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED"); static LIST_HEAD(xt_led_triggers); static DEFINE_MUTEX(xt_led_mutex); diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c index b39db8a5cbae..c7a2e5466bc4 100644 --- a/net/netfilter/xt_cpu.c +++ b/net/netfilter/xt_cpu.c @@ -22,6 +22,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Dumazet "); MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu"); static int cpu_mt_check(const struct xt_mtchk_param *par) { From ae9d67aff60af59548b6c7d1a74febea09660122 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 18 Jan 2011 06:48:12 +0100 Subject: [PATCH 65/80] audit: export symbol for use with xt_AUDIT When xt_AUDIT is built as a module, modpost reports a problem. MODPOST 322 modules ERROR: "audit_enabled" [net/netfilter/x_tables.ko] undefined! WARNING: modpost: Found 1 section mismatch(es). Cc: Thomas Graf Signed-off-by: Jan Engelhardt --- kernel/audit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/audit.c b/kernel/audit.c index 77770a034d59..5842f65bedcb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -74,6 +74,8 @@ static int audit_initialized; int audit_enabled; int audit_ever_enabled; +EXPORT_SYMBOL_GPL(audit_enabled); + /* Default state when kernel boots without any parameters. */ static int audit_default; From 1cc34c30be0e27d4ba8c1ce04a8a4f46c927d121 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 18 Jan 2011 01:36:57 +0100 Subject: [PATCH 66/80] netfilter: xt_connlimit: use hotdrop jump mark Signed-off-by: Richard Weinberger Signed-off-by: Jan Engelhardt --- net/netfilter/xt_connlimit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 5c5b6b921b84..452bc16af56c 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -204,11 +204,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) &info->mask, par->family); spin_unlock_bh(&info->data->lock); - if (connections < 0) { + if (connections < 0) /* kmalloc failed, drop it entirely */ - par->hotdrop = true; - return false; - } + goto hotdrop; return (connections > info->limit) ^ info->inverse; From 0260c1dccc6a1018f8cf2c4778dffb47fc5d1c4c Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 18 Jan 2011 07:33:09 +0100 Subject: [PATCH 67/80] netfilter: xtables: use __uXX guarded types for userspace exports Signed-off-by: Jan Engelhardt --- include/linux/netfilter_bridge/ebt_802_3.h | 24 +++++++++---------- include/linux/netfilter_bridge/ebt_among.h | 2 +- include/linux/netfilter_bridge/ebt_arp.h | 4 ++-- include/linux/netfilter_bridge/ebt_ip.h | 12 +++++----- include/linux/netfilter_bridge/ebt_ip6.h | 16 ++++++------- include/linux/netfilter_bridge/ebt_limit.h | 8 +++---- include/linux/netfilter_bridge/ebt_log.h | 6 ++--- include/linux/netfilter_bridge/ebt_mark_m.h | 4 ++-- include/linux/netfilter_bridge/ebt_nflog.h | 10 ++++---- include/linux/netfilter_bridge/ebt_pkttype.h | 4 ++-- include/linux/netfilter_bridge/ebt_stp.h | 24 +++++++++---------- include/linux/netfilter_bridge/ebt_ulog.h | 2 +- include/linux/netfilter_bridge/ebt_vlan.h | 8 +++---- include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 14 +++++------ include/linux/netfilter_ipv4/ipt_ECN.h | 6 ++--- include/linux/netfilter_ipv4/ipt_SAME.h | 6 ++--- include/linux/netfilter_ipv4/ipt_TTL.h | 4 ++-- include/linux/netfilter_ipv4/ipt_addrtype.h | 14 +++++------ include/linux/netfilter_ipv4/ipt_ah.h | 4 ++-- include/linux/netfilter_ipv4/ipt_ecn.h | 8 +++---- include/linux/netfilter_ipv4/ipt_ttl.h | 4 ++-- include/linux/netfilter_ipv6/ip6t_HL.h | 4 ++-- include/linux/netfilter_ipv6/ip6t_REJECT.h | 2 +- include/linux/netfilter_ipv6/ip6t_ah.h | 8 +++---- include/linux/netfilter_ipv6/ip6t_frag.h | 8 +++---- include/linux/netfilter_ipv6/ip6t_hl.h | 4 ++-- .../linux/netfilter_ipv6/ip6t_ipv6header.h | 6 ++--- include/linux/netfilter_ipv6/ip6t_mh.h | 4 ++-- include/linux/netfilter_ipv6/ip6t_opts.h | 10 ++++---- include/linux/netfilter_ipv6/ip6t_rt.h | 12 +++++----- 30 files changed, 121 insertions(+), 121 deletions(-) diff --git a/include/linux/netfilter_bridge/ebt_802_3.h b/include/linux/netfilter_bridge/ebt_802_3.h index c73ef0b18bdc..c427764f4444 100644 --- a/include/linux/netfilter_bridge/ebt_802_3.h +++ b/include/linux/netfilter_bridge/ebt_802_3.h @@ -24,24 +24,24 @@ /* ui has one byte ctrl, ni has two */ struct hdr_ui { - uint8_t dsap; - uint8_t ssap; - uint8_t ctrl; - uint8_t orig[3]; + __u8 dsap; + __u8 ssap; + __u8 ctrl; + __u8 orig[3]; __be16 type; }; struct hdr_ni { - uint8_t dsap; - uint8_t ssap; + __u8 dsap; + __u8 ssap; __be16 ctrl; - uint8_t orig[3]; + __u8 orig[3]; __be16 type; }; struct ebt_802_3_hdr { - uint8_t daddr[6]; - uint8_t saddr[6]; + __u8 daddr[6]; + __u8 saddr[6]; __be16 len; union { struct hdr_ui ui; @@ -59,10 +59,10 @@ static inline struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb) #endif struct ebt_802_3_info { - uint8_t sap; + __u8 sap; __be16 type; - uint8_t bitmask; - uint8_t invflags; + __u8 bitmask; + __u8 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_among.h b/include/linux/netfilter_bridge/ebt_among.h index 0009558609a7..686c9619dbc0 100644 --- a/include/linux/netfilter_bridge/ebt_among.h +++ b/include/linux/netfilter_bridge/ebt_among.h @@ -30,7 +30,7 @@ */ struct ebt_mac_wormhash_tuple { - uint32_t cmp[2]; + __u32 cmp[2]; __be32 ip; }; diff --git a/include/linux/netfilter_bridge/ebt_arp.h b/include/linux/netfilter_bridge/ebt_arp.h index cbf4843b6b0f..e62b5af95869 100644 --- a/include/linux/netfilter_bridge/ebt_arp.h +++ b/include/linux/netfilter_bridge/ebt_arp.h @@ -27,8 +27,8 @@ struct ebt_arp_info unsigned char smmsk[ETH_ALEN]; unsigned char dmaddr[ETH_ALEN]; unsigned char dmmsk[ETH_ALEN]; - uint8_t bitmask; - uint8_t invflags; + __u8 bitmask; + __u8 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ip.h b/include/linux/netfilter_bridge/ebt_ip.h index 6a708fb92241..d99de58da2c7 100644 --- a/include/linux/netfilter_bridge/ebt_ip.h +++ b/include/linux/netfilter_bridge/ebt_ip.h @@ -31,12 +31,12 @@ struct ebt_ip_info { __be32 daddr; __be32 smsk; __be32 dmsk; - uint8_t tos; - uint8_t protocol; - uint8_t bitmask; - uint8_t invflags; - uint16_t sport[2]; - uint16_t dport[2]; + __u8 tos; + __u8 protocol; + __u8 bitmask; + __u8 invflags; + __u16 sport[2]; + __u16 dport[2]; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ip6.h b/include/linux/netfilter_bridge/ebt_ip6.h index 22af18a3c16b..998e9d5a6b60 100644 --- a/include/linux/netfilter_bridge/ebt_ip6.h +++ b/include/linux/netfilter_bridge/ebt_ip6.h @@ -31,17 +31,17 @@ struct ebt_ip6_info { struct in6_addr daddr; struct in6_addr smsk; struct in6_addr dmsk; - uint8_t tclass; - uint8_t protocol; - uint8_t bitmask; - uint8_t invflags; + __u8 tclass; + __u8 protocol; + __u8 bitmask; + __u8 invflags; union { - uint16_t sport[2]; - uint8_t icmpv6_type[2]; + __u16 sport[2]; + __u8 icmpv6_type[2]; }; union { - uint16_t dport[2]; - uint8_t icmpv6_code[2]; + __u16 dport[2]; + __u8 icmpv6_code[2]; }; }; diff --git a/include/linux/netfilter_bridge/ebt_limit.h b/include/linux/netfilter_bridge/ebt_limit.h index 4bf76b751676..721d51ffa513 100644 --- a/include/linux/netfilter_bridge/ebt_limit.h +++ b/include/linux/netfilter_bridge/ebt_limit.h @@ -10,13 +10,13 @@ seconds, or one every 59 hours. */ struct ebt_limit_info { - u_int32_t avg; /* Average secs between packets * scale */ - u_int32_t burst; /* Period multiplier for upper limit. */ + __u32 avg; /* Average secs between packets * scale */ + __u32 burst; /* Period multiplier for upper limit. */ /* Used internally by the kernel */ unsigned long prev; - u_int32_t credit; - u_int32_t credit_cap, cost; + __u32 credit; + __u32 credit_cap, cost; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_log.h b/include/linux/netfilter_bridge/ebt_log.h index cc2cdfb764bc..564beb4946ea 100644 --- a/include/linux/netfilter_bridge/ebt_log.h +++ b/include/linux/netfilter_bridge/ebt_log.h @@ -10,9 +10,9 @@ #define EBT_LOG_WATCHER "log" struct ebt_log_info { - uint8_t loglevel; - uint8_t prefix[EBT_LOG_PREFIX_SIZE]; - uint32_t bitmask; + __u8 loglevel; + __u8 prefix[EBT_LOG_PREFIX_SIZE]; + __u32 bitmask; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_mark_m.h b/include/linux/netfilter_bridge/ebt_mark_m.h index 9ceb10ec0ed6..97b96c4b8db4 100644 --- a/include/linux/netfilter_bridge/ebt_mark_m.h +++ b/include/linux/netfilter_bridge/ebt_mark_m.h @@ -6,8 +6,8 @@ #define EBT_MARK_MASK (EBT_MARK_AND | EBT_MARK_OR) struct ebt_mark_m_info { unsigned long mark, mask; - uint8_t invert; - uint8_t bitmask; + __u8 invert; + __u8 bitmask; }; #define EBT_MARK_MATCH "mark_m" diff --git a/include/linux/netfilter_bridge/ebt_nflog.h b/include/linux/netfilter_bridge/ebt_nflog.h index 052817849b83..477315bc3537 100644 --- a/include/linux/netfilter_bridge/ebt_nflog.h +++ b/include/linux/netfilter_bridge/ebt_nflog.h @@ -10,11 +10,11 @@ #define EBT_NFLOG_DEFAULT_THRESHOLD 1 struct ebt_nflog_info { - u_int32_t len; - u_int16_t group; - u_int16_t threshold; - u_int16_t flags; - u_int16_t pad; + __u32 len; + __u16 group; + __u16 threshold; + __u16 flags; + __u16 pad; char prefix[EBT_NFLOG_PREFIX_SIZE]; }; diff --git a/include/linux/netfilter_bridge/ebt_pkttype.h b/include/linux/netfilter_bridge/ebt_pkttype.h index 51a799840931..7c0fb0fdcf14 100644 --- a/include/linux/netfilter_bridge/ebt_pkttype.h +++ b/include/linux/netfilter_bridge/ebt_pkttype.h @@ -2,8 +2,8 @@ #define __LINUX_BRIDGE_EBT_PKTTYPE_H struct ebt_pkttype_info { - uint8_t pkt_type; - uint8_t invert; + __u8 pkt_type; + __u8 invert; }; #define EBT_PKTTYPE_MATCH "pkttype" diff --git a/include/linux/netfilter_bridge/ebt_stp.h b/include/linux/netfilter_bridge/ebt_stp.h index e503a0aa2728..13a0bd49a92a 100644 --- a/include/linux/netfilter_bridge/ebt_stp.h +++ b/include/linux/netfilter_bridge/ebt_stp.h @@ -21,24 +21,24 @@ #define EBT_STP_MATCH "stp" struct ebt_stp_config_info { - uint8_t flags; - uint16_t root_priol, root_priou; + __u8 flags; + __u16 root_priol, root_priou; char root_addr[6], root_addrmsk[6]; - uint32_t root_costl, root_costu; - uint16_t sender_priol, sender_priou; + __u32 root_costl, root_costu; + __u16 sender_priol, sender_priou; char sender_addr[6], sender_addrmsk[6]; - uint16_t portl, portu; - uint16_t msg_agel, msg_ageu; - uint16_t max_agel, max_ageu; - uint16_t hello_timel, hello_timeu; - uint16_t forward_delayl, forward_delayu; + __u16 portl, portu; + __u16 msg_agel, msg_ageu; + __u16 max_agel, max_ageu; + __u16 hello_timel, hello_timeu; + __u16 forward_delayl, forward_delayu; }; struct ebt_stp_info { - uint8_t type; + __u8 type; struct ebt_stp_config_info config; - uint16_t bitmask; - uint16_t invflags; + __u16 bitmask; + __u16 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ulog.h b/include/linux/netfilter_bridge/ebt_ulog.h index b677e2671541..de35a51a7e46 100644 --- a/include/linux/netfilter_bridge/ebt_ulog.h +++ b/include/linux/netfilter_bridge/ebt_ulog.h @@ -10,7 +10,7 @@ #define EBT_ULOG_VERSION 1 struct ebt_ulog_info { - uint32_t nlgroup; + __u32 nlgroup; unsigned int cprange; unsigned int qthreshold; char prefix[EBT_ULOG_PREFIX_LEN]; diff --git a/include/linux/netfilter_bridge/ebt_vlan.h b/include/linux/netfilter_bridge/ebt_vlan.h index 1d98be4031e7..48dffc1dad36 100644 --- a/include/linux/netfilter_bridge/ebt_vlan.h +++ b/include/linux/netfilter_bridge/ebt_vlan.h @@ -8,12 +8,12 @@ #define EBT_VLAN_MATCH "vlan" struct ebt_vlan_info { - uint16_t id; /* VLAN ID {1-4095} */ - uint8_t prio; /* VLAN User Priority {0-7} */ + __u16 id; /* VLAN ID {1-4095} */ + __u8 prio; /* VLAN User Priority {0-7} */ __be16 encap; /* VLAN Encapsulated frame code {0-65535} */ - uint8_t bitmask; /* Args bitmask bit 1=1 - ID arg, + __u8 bitmask; /* Args bitmask bit 1=1 - ID arg, bit 2=1 User-Priority arg, bit 3=1 encap*/ - uint8_t invflags; /* Inverse bitmask bit 1=1 - inversed ID arg, + __u8 invflags; /* Inverse bitmask bit 1=1 - inversed ID arg, bit 2=1 - inversed Pirority arg */ }; diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h index e5a3687c8a72..3114f06939ef 100644 --- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -17,15 +17,15 @@ struct clusterip_config; struct ipt_clusterip_tgt_info { - u_int32_t flags; + __u32 flags; /* only relevant for new ones */ - u_int8_t clustermac[6]; - u_int16_t num_total_nodes; - u_int16_t num_local_nodes; - u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; - u_int32_t hash_mode; - u_int32_t hash_initval; + __u8 clustermac[6]; + __u16 num_total_nodes; + __u16 num_local_nodes; + __u16 local_nodes[CLUSTERIP_MAX_NODES]; + __u32 hash_mode; + __u32 hash_initval; /* Used internally by the kernel */ struct clusterip_config *config; diff --git a/include/linux/netfilter_ipv4/ipt_ECN.h b/include/linux/netfilter_ipv4/ipt_ECN.h index 7ca45918ab8e..c6e3e01b75e0 100644 --- a/include/linux/netfilter_ipv4/ipt_ECN.h +++ b/include/linux/netfilter_ipv4/ipt_ECN.h @@ -19,11 +19,11 @@ #define IPT_ECN_OP_MASK 0xce struct ipt_ECN_info { - u_int8_t operation; /* bitset of operations */ - u_int8_t ip_ect; /* ECT codepoint of IPv4 header, pre-shifted */ + __u8 operation; /* bitset of operations */ + __u8 ip_ect; /* ECT codepoint of IPv4 header, pre-shifted */ union { struct { - u_int8_t ece:1, cwr:1; /* TCP ECT bits */ + __u8 ece:1, cwr:1; /* TCP ECT bits */ } tcp; } proto; }; diff --git a/include/linux/netfilter_ipv4/ipt_SAME.h b/include/linux/netfilter_ipv4/ipt_SAME.h index 2529660c5b38..fa0ebeca5d95 100644 --- a/include/linux/netfilter_ipv4/ipt_SAME.h +++ b/include/linux/netfilter_ipv4/ipt_SAME.h @@ -7,9 +7,9 @@ struct ipt_same_info { unsigned char info; - u_int32_t rangesize; - u_int32_t ipnum; - u_int32_t *iparray; + __u32 rangesize; + __u32 ipnum; + __u32 *iparray; /* hangs off end. */ struct nf_nat_range range[IPT_SAME_MAX_RANGE]; diff --git a/include/linux/netfilter_ipv4/ipt_TTL.h b/include/linux/netfilter_ipv4/ipt_TTL.h index ee6611edc112..f6250e422d5e 100644 --- a/include/linux/netfilter_ipv4/ipt_TTL.h +++ b/include/linux/netfilter_ipv4/ipt_TTL.h @@ -13,8 +13,8 @@ enum { #define IPT_TTL_MAXMODE IPT_TTL_DEC struct ipt_TTL_info { - u_int8_t mode; - u_int8_t ttl; + __u8 mode; + __u8 ttl; }; diff --git a/include/linux/netfilter_ipv4/ipt_addrtype.h b/include/linux/netfilter_ipv4/ipt_addrtype.h index 446de6aef983..f29c3cfcc240 100644 --- a/include/linux/netfilter_ipv4/ipt_addrtype.h +++ b/include/linux/netfilter_ipv4/ipt_addrtype.h @@ -9,17 +9,17 @@ enum { }; struct ipt_addrtype_info_v1 { - u_int16_t source; /* source-type mask */ - u_int16_t dest; /* dest-type mask */ - u_int32_t flags; + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 flags; }; /* revision 0 */ struct ipt_addrtype_info { - u_int16_t source; /* source-type mask */ - u_int16_t dest; /* dest-type mask */ - u_int32_t invert_source; - u_int32_t invert_dest; + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 invert_source; + __u32 invert_dest; }; #endif diff --git a/include/linux/netfilter_ipv4/ipt_ah.h b/include/linux/netfilter_ipv4/ipt_ah.h index 2e555b4d05e3..8fea283ee62a 100644 --- a/include/linux/netfilter_ipv4/ipt_ah.h +++ b/include/linux/netfilter_ipv4/ipt_ah.h @@ -2,8 +2,8 @@ #define _IPT_AH_H struct ipt_ah { - u_int32_t spis[2]; /* Security Parameter Index */ - u_int8_t invflags; /* Inverse flags */ + __u32 spis[2]; /* Security Parameter Index */ + __u8 invflags; /* Inverse flags */ }; diff --git a/include/linux/netfilter_ipv4/ipt_ecn.h b/include/linux/netfilter_ipv4/ipt_ecn.h index 9945baa4ccd7..78b98aa8784d 100644 --- a/include/linux/netfilter_ipv4/ipt_ecn.h +++ b/include/linux/netfilter_ipv4/ipt_ecn.h @@ -20,12 +20,12 @@ /* match info */ struct ipt_ecn_info { - u_int8_t operation; - u_int8_t invert; - u_int8_t ip_ect; + __u8 operation; + __u8 invert; + __u8 ip_ect; union { struct { - u_int8_t ect; + __u8 ect; } tcp; } proto; }; diff --git a/include/linux/netfilter_ipv4/ipt_ttl.h b/include/linux/netfilter_ipv4/ipt_ttl.h index ee24fd86a3aa..93d9a06689a3 100644 --- a/include/linux/netfilter_ipv4/ipt_ttl.h +++ b/include/linux/netfilter_ipv4/ipt_ttl.h @@ -13,8 +13,8 @@ enum { struct ipt_ttl_info { - u_int8_t mode; - u_int8_t ttl; + __u8 mode; + __u8 ttl; }; diff --git a/include/linux/netfilter_ipv6/ip6t_HL.h b/include/linux/netfilter_ipv6/ip6t_HL.h index afb7813d45ab..81cdaf0480e3 100644 --- a/include/linux/netfilter_ipv6/ip6t_HL.h +++ b/include/linux/netfilter_ipv6/ip6t_HL.h @@ -14,8 +14,8 @@ enum { #define IP6T_HL_MAXMODE IP6T_HL_DEC struct ip6t_HL_info { - u_int8_t mode; - u_int8_t hop_limit; + __u8 mode; + __u8 hop_limit; }; diff --git a/include/linux/netfilter_ipv6/ip6t_REJECT.h b/include/linux/netfilter_ipv6/ip6t_REJECT.h index 6be6504162bb..b999aa4e5969 100644 --- a/include/linux/netfilter_ipv6/ip6t_REJECT.h +++ b/include/linux/netfilter_ipv6/ip6t_REJECT.h @@ -12,7 +12,7 @@ enum ip6t_reject_with { }; struct ip6t_reject_info { - u_int32_t with; /* reject type */ + __u32 with; /* reject type */ }; #endif /*_IP6T_REJECT_H*/ diff --git a/include/linux/netfilter_ipv6/ip6t_ah.h b/include/linux/netfilter_ipv6/ip6t_ah.h index 17a745cfb2c7..a602c165edd1 100644 --- a/include/linux/netfilter_ipv6/ip6t_ah.h +++ b/include/linux/netfilter_ipv6/ip6t_ah.h @@ -2,10 +2,10 @@ #define _IP6T_AH_H struct ip6t_ah { - u_int32_t spis[2]; /* Security Parameter Index */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t hdrres; /* Test of the Reserved Filed */ - u_int8_t invflags; /* Inverse flags */ + __u32 spis[2]; /* Security Parameter Index */ + __u32 hdrlen; /* Header Length */ + __u8 hdrres; /* Test of the Reserved Filed */ + __u8 invflags; /* Inverse flags */ }; #define IP6T_AH_SPI 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_frag.h b/include/linux/netfilter_ipv6/ip6t_frag.h index 3724d0850920..538b31ef5e3d 100644 --- a/include/linux/netfilter_ipv6/ip6t_frag.h +++ b/include/linux/netfilter_ipv6/ip6t_frag.h @@ -2,10 +2,10 @@ #define _IP6T_FRAG_H struct ip6t_frag { - u_int32_t ids[2]; /* Security Parameter Index */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ + __u32 ids[2]; /* Security Parameter Index */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ }; #define IP6T_FRAG_IDS 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_hl.h b/include/linux/netfilter_ipv6/ip6t_hl.h index 5ef91b8319a8..c6fddcb971da 100644 --- a/include/linux/netfilter_ipv6/ip6t_hl.h +++ b/include/linux/netfilter_ipv6/ip6t_hl.h @@ -14,8 +14,8 @@ enum { struct ip6t_hl_info { - u_int8_t mode; - u_int8_t hop_limit; + __u8 mode; + __u8 hop_limit; }; diff --git a/include/linux/netfilter_ipv6/ip6t_ipv6header.h b/include/linux/netfilter_ipv6/ip6t_ipv6header.h index 01dfd445596a..73d53bd3ff62 100644 --- a/include/linux/netfilter_ipv6/ip6t_ipv6header.h +++ b/include/linux/netfilter_ipv6/ip6t_ipv6header.h @@ -9,9 +9,9 @@ on whether they contain certain headers */ #define __IPV6HEADER_H struct ip6t_ipv6header_info { - u_int8_t matchflags; - u_int8_t invflags; - u_int8_t modeflag; + __u8 matchflags; + __u8 invflags; + __u8 modeflag; }; #define MASK_HOPOPTS 128 diff --git a/include/linux/netfilter_ipv6/ip6t_mh.h b/include/linux/netfilter_ipv6/ip6t_mh.h index 18549bca2d1f..98c8cf685eea 100644 --- a/include/linux/netfilter_ipv6/ip6t_mh.h +++ b/include/linux/netfilter_ipv6/ip6t_mh.h @@ -3,8 +3,8 @@ /* MH matching stuff */ struct ip6t_mh { - u_int8_t types[2]; /* MH type range */ - u_int8_t invflags; /* Inverse flags */ + __u8 types[2]; /* MH type range */ + __u8 invflags; /* Inverse flags */ }; /* Values for "invflags" field in struct ip6t_mh. */ diff --git a/include/linux/netfilter_ipv6/ip6t_opts.h b/include/linux/netfilter_ipv6/ip6t_opts.h index 62d89bcd9f9c..405d309cd741 100644 --- a/include/linux/netfilter_ipv6/ip6t_opts.h +++ b/include/linux/netfilter_ipv6/ip6t_opts.h @@ -4,11 +4,11 @@ #define IP6T_OPTS_OPTSNR 16 struct ip6t_opts { - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ - u_int16_t opts[IP6T_OPTS_OPTSNR]; /* opts */ - u_int8_t optsnr; /* Nr of OPts */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ + __u16 opts[IP6T_OPTS_OPTSNR]; /* opts */ + __u8 optsnr; /* Nr of OPts */ }; #define IP6T_OPTS_LEN 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_rt.h b/include/linux/netfilter_ipv6/ip6t_rt.h index ab91bfd2cd00..e8dad20acd37 100644 --- a/include/linux/netfilter_ipv6/ip6t_rt.h +++ b/include/linux/netfilter_ipv6/ip6t_rt.h @@ -6,13 +6,13 @@ #define IP6T_RT_HOPS 16 struct ip6t_rt { - u_int32_t rt_type; /* Routing Type */ - u_int32_t segsleft[2]; /* Segments Left */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ + __u32 rt_type; /* Routing Type */ + __u32 segsleft[2]; /* Segments Left */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ struct in6_addr addrs[IP6T_RT_HOPS]; /* Hops */ - u_int8_t addrnr; /* Nr of Addresses */ + __u8 addrnr; /* Nr of Addresses */ }; #define IP6T_RT_TYP 0x01 From 0b8ad876275c74e4bfb6ec3150793f3c0ecfcee2 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 18 Jan 2011 11:23:06 +0100 Subject: [PATCH 68/80] netfilter: xtables: add missing header files to export list Signed-off-by: Jan Engelhardt --- include/linux/netfilter/Kbuild | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 9f11fbc377e2..fc4e0aa805a2 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -56,6 +56,8 @@ header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h header-y += xt_sctp.h +header-y += xt_secmark.h +header-y += xt_socket.h header-y += xt_state.h header-y += xt_statistic.h header-y += xt_string.h From a7c2f4d7daf9bbea362763fa7353b1862a2487ad Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 18 Jan 2011 15:02:48 +0100 Subject: [PATCH 69/80] netfilter: nf_nat: fix conversion to non-atomic bit ops My previous patch (netfilter: nf_nat: don't use atomic bit operation) made a mistake when converting atomic_set to a normal bit 'or'. IPS_*_BIT should be replaced with IPS_*. Signed-off-by: Changli Gao Cc: Tim Gardner Cc: Eric Dumazet Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_nat_core.h | 4 ++-- net/ipv4/netfilter/nf_nat_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 5aec85c29979..3dc7b98effeb 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -21,9 +21,9 @@ static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { if (manip == IP_NAT_MANIP_SRC) - return ct->status & IPS_SRC_NAT_DONE_BIT; + return ct->status & IPS_SRC_NAT_DONE; else - return ct->status & IPS_DST_NAT_DONE_BIT; + return ct->status & IPS_DST_NAT_DONE; } struct nlattr; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 6972ceee99c6..3002c0492fb0 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -323,9 +323,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - ct->status |= IPS_DST_NAT_DONE_BIT; + ct->status |= IPS_DST_NAT_DONE; else - ct->status |= IPS_SRC_NAT_DONE_BIT; + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } From 45eec34195853e918518231dcefaca1ea4ebacfc Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 18 Jan 2011 15:08:13 +0100 Subject: [PATCH 70/80] netfilter: nf_conntrack: remove an atomic bit operation As this ct won't be seen by the others, we don't need to set the IPS_CONFIRMED_BIT in atomic way. Signed-off-by: Changli Gao Cc: Tim Gardner Cc: Eric Dumazet Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dc2ff2cd0a7e..f47ac67e1bfe 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -486,7 +486,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); + ct->status |= IPS_CONFIRMED; /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers From 5f2cafe73671d865af88494159f3e8c1b322e1c5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jan 2011 15:18:08 +0100 Subject: [PATCH 71/80] netfilter: Kconfig: NFQUEUE is useless without NETFILTER_NETLINK_QUEUE NFLOG already does the same thing for NETFILTER_NETLINK_LOG. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 93918f022555..e2480bddbfd5 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -487,6 +487,7 @@ config NETFILTER_XT_TARGET_NFLOG config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_QUEUE help This target replaced the old obsolete QUEUE target. From f15850861860636c905b33a9a5be3dcbc2b0d56a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jan 2011 15:27:28 +0100 Subject: [PATCH 72/80] netfilter: nfnetlink_queue: return error number to caller instead of returning -1 on error, return an error number to allow the caller to handle some errors differently. ECANCELED is used to indicate that the hook is going away and should be ignored. A followup patch will introduce more 'ignore this hook' conditions, (depending on queue settings) and will move kfree_skb responsibility to the caller. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 6 +++-- net/netfilter/nf_queue.c | 44 +++++++++++++++++++++++---------- net/netfilter/nfnetlink_queue.c | 22 +++++++++++------ 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e69d537362c7..91d66d2f8cd9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -179,9 +179,11 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_BITS)) + ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_BITS); + if (ret == -ECANCELED) goto next_hook; + ret = 0; } rcu_read_unlock(); return ret; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 1876f7411561..ad25c7e726bc 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -125,7 +125,7 @@ static int __nf_queue(struct sk_buff *skb, int (*okfn)(struct sk_buff *), unsigned int queuenum) { - int status; + int status = -ENOENT; struct nf_queue_entry *entry = NULL; #ifdef CONFIG_BRIDGE_NETFILTER struct net_device *physindev; @@ -146,8 +146,10 @@ static int __nf_queue(struct sk_buff *skb, goto err_unlock; entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); - if (!entry) + if (!entry) { + status = -ENOMEM; goto err_unlock; + } *entry = (struct nf_queue_entry) { .skb = skb, @@ -163,9 +165,8 @@ static int __nf_queue(struct sk_buff *skb, if (!try_module_get(entry->elem->owner)) { rcu_read_unlock(); kfree(entry); - return 0; + return -ECANCELED; } - /* Bump dev refs so they don't vanish while packet is out */ if (indev) dev_hold(indev); @@ -192,14 +193,14 @@ static int __nf_queue(struct sk_buff *skb, goto err; } - return 1; + return 0; err_unlock: rcu_read_unlock(); err: kfree_skb(skb); kfree(entry); - return 1; + return status; } int nf_queue(struct sk_buff *skb, @@ -211,6 +212,8 @@ int nf_queue(struct sk_buff *skb, unsigned int queuenum) { struct sk_buff *segs; + int err; + unsigned int queued; if (!skb_is_gso(skb)) return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, @@ -227,19 +230,32 @@ int nf_queue(struct sk_buff *skb, segs = skb_gso_segment(skb, 0); kfree_skb(skb); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to mean + * 'ignore this hook'. + */ if (IS_ERR(segs)) - return 1; + return -EINVAL; + queued = 0; + err = 0; do { struct sk_buff *nskb = segs->next; segs->next = NULL; - if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn, - queuenum)) + if (err == 0) + err = __nf_queue(segs, elem, pf, hook, indev, + outdev, okfn, queuenum); + if (err == 0) + queued++; + else kfree_skb(segs); segs = nskb; } while (segs); - return 1; + + if (unlikely(err && queued)) + err = 0; + return err; } void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) @@ -247,6 +263,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) struct sk_buff *skb = entry->skb; struct list_head *elem = &entry->elem->list; const struct nf_afinfo *afinfo; + int err; rcu_read_lock(); @@ -280,9 +297,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - if (!__nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_BITS)) + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_BITS); + if (err == -ECANCELED) goto next_hook; break; case NF_STOLEN: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 68e67d19724d..b83123f12b42 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -387,25 +387,31 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { struct sk_buff *nskb; struct nfqnl_instance *queue; - int err; + int err = -ENOBUFS; /* rcu_read_lock()ed by nf_hook_slow() */ queue = instance_lookup(queuenum); - if (!queue) + if (!queue) { + err = -ESRCH; goto err_out; + } - if (queue->copy_mode == NFQNL_COPY_NONE) + if (queue->copy_mode == NFQNL_COPY_NONE) { + err = -EINVAL; goto err_out; + } nskb = nfqnl_build_packet_message(queue, entry); - if (nskb == NULL) + if (nskb == NULL) { + err = -ENOMEM; goto err_out; - + } spin_lock_bh(&queue->lock); - if (!queue->peer_pid) + if (!queue->peer_pid) { + err = -EINVAL; goto err_out_free_nskb; - + } if (queue->queue_total >= queue->queue_maxlen) { queue->queue_dropped++; if (net_ratelimit()) @@ -432,7 +438,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) err_out_unlock: spin_unlock_bh(&queue->lock); err_out: - return -1; + return err; } static int From 06cdb6349c1f3fd439398dbc04ce4c696f0a41ab Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jan 2011 15:28:38 +0100 Subject: [PATCH 73/80] netfilter: nfnetlink_queue: do not free skb on error Move free responsibility from nf_queue to caller. This enables more flexible error handling; we can now accept the skb instead of freeing it. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 7 +++++-- net/netfilter/nf_queue.c | 17 ++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 91d66d2f8cd9..0c5b796ef527 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -181,8 +181,11 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS); - if (ret == -ECANCELED) - goto next_hook; + if (ret < 0) { + if (ret == -ECANCELED) + goto next_hook; + kfree_skb(skb); + } ret = 0; } rcu_read_unlock(); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ad25c7e726bc..5c4b730a2e68 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -163,9 +163,8 @@ static int __nf_queue(struct sk_buff *skb, /* If it's going away, ignore hook. */ if (!try_module_get(entry->elem->owner)) { - rcu_read_unlock(); - kfree(entry); - return -ECANCELED; + status = -ECANCELED; + goto err_unlock; } /* Bump dev refs so they don't vanish while packet is out */ if (indev) @@ -198,7 +197,6 @@ static int __nf_queue(struct sk_buff *skb, err_unlock: rcu_read_unlock(); err: - kfree_skb(skb); kfree(entry); return status; } @@ -229,7 +227,6 @@ int nf_queue(struct sk_buff *skb, } segs = skb_gso_segment(skb, 0); - kfree_skb(skb); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ECANCELED to mean * 'ignore this hook'. @@ -253,8 +250,11 @@ int nf_queue(struct sk_buff *skb, segs = nskb; } while (segs); + /* also free orig skb if only some segments were queued */ if (unlikely(err && queued)) err = 0; + if (err == 0) + kfree_skb(skb); return err; } @@ -300,8 +300,11 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) err = __nf_queue(skb, elem, entry->pf, entry->hook, entry->indev, entry->outdev, entry->okfn, verdict >> NF_VERDICT_BITS); - if (err == -ECANCELED) - goto next_hook; + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + kfree_skb(skb); + } break; case NF_STOLEN: default: From f615df76ed862b7d3927ec5f55b805ca19be29d9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jan 2011 15:52:14 +0100 Subject: [PATCH 74/80] netfilter: reduce NF_VERDICT_MASK to 0xff NF_VERDICT_MASK is currently 0xffff. This is because the upper 16 bits are used to store errno (for NF_DROP) or the queue number (NF_QUEUE verdict). As there are up to 0xffff different queues available, there is no more room to store additional flags. At the moment there are only 6 different verdicts, i.e. we can reduce NF_VERDICT_MASK to 0xff to allow storing additional flags in the 0xff00 space. NF_VERDICT_BITS would then be reduced to 8, but because the value is exported to userspace, this might cause breakage; e.g.: e.g. 'queuenr = (1 << NF_VERDICT_BITS) | NF_QUEUE' would now break. Thus, remove NF_VERDICT_BITS usage in the kernel and move the old value to the 'userspace compat' section. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 20 +++++++++++++++----- net/netfilter/core.c | 4 ++-- net/netfilter/nf_queue.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0ab7ca787b22..78b73cc10216 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -24,16 +24,19 @@ #define NF_MAX_VERDICT NF_STOP /* we overload the higher bits for encoding auxiliary data such as the queue - * number. Not nice, but better than additional function arguments. */ -#define NF_VERDICT_MASK 0x0000ffff -#define NF_VERDICT_BITS 16 + * number or errno values. Not nice, but better than additional function + * arguments. */ +#define NF_VERDICT_MASK 0x000000ff +/* extra verdict flags have mask 0x0000ff00 */ + +/* queue number (NF_QUEUE) or errno (NF_DROP) */ #define NF_VERDICT_QMASK 0xffff0000 #define NF_VERDICT_QBITS 16 -#define NF_QUEUE_NR(x) ((((x) << NF_VERDICT_BITS) & NF_VERDICT_QMASK) | NF_QUEUE) +#define NF_QUEUE_NR(x) ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE) -#define NF_DROP_ERR(x) (((-x) << NF_VERDICT_BITS) | NF_DROP) +#define NF_DROP_ERR(x) (((-x) << 16) | NF_DROP) /* only for userspace compatibility */ #ifndef __KERNEL__ @@ -41,6 +44,9 @@ <= 0x2000 is used for protocol-flags. */ #define NFC_UNKNOWN 0x4000 #define NFC_ALTERED 0x8000 + +/* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */ +#define NF_VERDICT_BITS 16 #endif enum nf_inet_hooks { @@ -72,6 +78,10 @@ union nf_inet_addr { #ifdef __KERNEL__ #ifdef CONFIG_NETFILTER +static inline int NF_DROP_GETERR(int verdict) +{ + return -(verdict >> NF_VERDICT_QBITS); +} static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0c5b796ef527..4d88e45b978e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -175,12 +175,12 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { kfree_skb(skb); - ret = -(verdict >> NF_VERDICT_BITS); + ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_BITS); + verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ECANCELED) goto next_hook; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5c4b730a2e68..ce1150d4a3f2 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -299,7 +299,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) case NF_QUEUE: err = __nf_queue(skb, elem, entry->pf, entry->hook, entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_BITS); + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; From 94b27cc36123069966616670c3653cd6873babe9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jan 2011 16:08:30 +0100 Subject: [PATCH 75/80] netfilter: allow NFQUEUE bypass if no listener is available If an skb is to be NF_QUEUE'd, but no program has opened the queue, the packet is dropped. This adds a v2 target revision of xt_NFQUEUE that allows packets to continue through the ruleset instead. Because the actual queueing happens outside of the target context, the 'bypass' flag has to be communicated back to the netfilter core. Unfortunately the only choice to do this without adding a new function argument is to use the target function return value (i.e. the verdict). In the NF_QUEUE case, the upper 16bit already contain the queue number to use. The previous patch reduced NF_VERDICT_MASK to 0xff, i.e. we now have extra room for a new flag. If a hook issued a NF_QUEUE verdict, then the netfilter core will continue packet processing if the queueing hook returns -ESRCH (== "this queue does not exist") and the new NF_VERDICT_FLAG_QUEUE_BYPASS flag is set in the verdict value. Note: If the queue exists, but userspace does not consume packets fast enough, the skb will still be dropped. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 1 + include/linux/netfilter/xt_NFQUEUE.h | 6 ++++++ net/netfilter/core.c | 3 +++ net/netfilter/nf_queue.c | 7 ++++++- net/netfilter/xt_NFQUEUE.c | 28 +++++++++++++++++++++++++--- 5 files changed, 41 insertions(+), 4 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 78b73cc10216..eeec00abb664 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -29,6 +29,7 @@ #define NF_VERDICT_MASK 0x000000ff /* extra verdict flags have mask 0x0000ff00 */ +#define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000 /* queue number (NF_QUEUE) or errno (NF_DROP) */ #define NF_VERDICT_QMASK 0xffff0000 diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h index 2584f4a777de..9eafdbbb401c 100644 --- a/include/linux/netfilter/xt_NFQUEUE.h +++ b/include/linux/netfilter/xt_NFQUEUE.h @@ -20,4 +20,10 @@ struct xt_NFQ_info_v1 { __u16 queues_total; }; +struct xt_NFQ_info_v2 { + __u16 queuenum; + __u16 queues_total; + __u16 bypass; +}; + #endif /* _XT_NFQ_TARGET_H */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 4d88e45b978e..1e00bf7d27c5 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -184,6 +184,9 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, if (ret < 0) { if (ret == -ECANCELED) goto next_hook; + if (ret == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; kfree_skb(skb); } ret = 0; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ce1150d4a3f2..5ab22e2bbd7d 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -138,8 +138,10 @@ static int __nf_queue(struct sk_buff *skb, rcu_read_lock(); qh = rcu_dereference(queue_handler[pf]); - if (!qh) + if (!qh) { + status = -ESRCH; goto err_unlock; + } afinfo = nf_get_afinfo(pf); if (!afinfo) @@ -303,6 +305,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (err < 0) { if (err == -ECANCELED) goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; kfree_skb(skb); } break; diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 39627706aac6..d4f4b5d66b20 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -83,9 +83,20 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) return NF_QUEUE_NR(queue); } -static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_NFQ_info_v1 *info = par->targinfo; + const struct xt_NFQ_info_v2 *info = par->targinfo; + unsigned int ret = nfqueue_tg_v1(skb, par); + + if (info->bypass) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; u32 maxid; if (unlikely(!rnd_inited)) { @@ -102,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) info->queues_total, maxid); return -ERANGE; } + if (par->target->revision == 2 && info->bypass > 1) + return -EINVAL; return 0; } @@ -117,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .name = "NFQUEUE", .revision = 1, .family = NFPROTO_UNSPEC, - .checkentry = nfqueue_tg_v1_check, + .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v1, .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v2, + .targetsize = sizeof(struct xt_NFQ_info_v2), + .me = THIS_MODULE, + }, }; static int __init nfqueue_tg_init(void) From 94d117a1c78df38abdea0c09ef00c205b923b567 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Jan 2011 16:27:56 +0100 Subject: [PATCH 76/80] netfilter: ipt_CLUSTERIP: remove "no conntrack!" When a packet is meant to be handled by another node of the cluster, silently drop it instead of flooding kernel log. Note : INVALID packets are also dropped without notice. Signed-off-by: Eric Dumazet Acked-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a4897655..403ca57f6011 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ From 93557f53e1fbd9e2b6574ab0a9b5852628fde9e3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 18 Jan 2011 18:12:24 +0100 Subject: [PATCH 77/80] netfilter: nf_conntrack: nf_conntrack snmp helper Adding support for SNMP broadcast connection tracking. The SNMP broadcast requests are now paired with the SNMP responses. Thus allowing using SNMP broadcasts with firewall enabled. Please refer to the following conversation: http://marc.info/?l=netfilter-devel&m=125992205006600&w=2 Patrick McHardy wrote: > > The best solution would be to add generic broadcast tracking, the > > use of expectations for this is a bit of abuse. > > The second best choice I guess would be to move the help() function > > to a shared module and generalize it so it can be used for both. This patch implements the "second best choice". Since the netbios-ns conntrack module uses the same helper functionality as the snmp, only one helper function is added for both snmp and netbios-ns modules into the new object - nf_conntrack_broadcast. Signed-off-by: Jiri Olsa Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_snmp.h | 9 +++ include/net/netfilter/nf_conntrack_helper.h | 6 ++ net/ipv4/netfilter/Kconfig | 3 +- net/ipv4/netfilter/nf_nat_snmp_basic.c | 9 ++- net/netfilter/Kconfig | 19 +++++ net/netfilter/Makefile | 2 + net/netfilter/nf_conntrack_broadcast.c | 82 +++++++++++++++++++++ net/netfilter/nf_conntrack_netbios_ns.c | 74 +++---------------- net/netfilter/nf_conntrack_snmp.c | 77 +++++++++++++++++++ 9 files changed, 211 insertions(+), 70 deletions(-) create mode 100644 include/linux/netfilter/nf_conntrack_snmp.h create mode 100644 net/netfilter/nf_conntrack_broadcast.c create mode 100644 net/netfilter/nf_conntrack_snmp.c diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h new file mode 100644 index 000000000000..064bc63a5346 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_snmp.h @@ -0,0 +1,9 @@ +#ifndef _NF_CONNTRACK_SNMP_H +#define _NF_CONNTRACK_SNMP_H + +extern int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); + +#endif /* _NF_CONNTRACK_SNMP_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 32c305dbdab6..f1c1311adc2c 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -63,4 +63,10 @@ static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) extern int nf_conntrack_helper_init(void); extern void nf_conntrack_helper_fini(void); +extern int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout); + #endif /*_NF_CONNTRACK_HELPER_H*/ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5f..f926a310075d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a56..8812a02078ab 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris "); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e2480bddbfd5..939b504604c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -185,9 +185,13 @@ config NF_CONNTRACK_IRC To compile it as a module, choose M here. If unsure, say N. +config NF_CONNTRACK_BROADCAST + tristate + config NF_CONNTRACK_NETBIOS_NS tristate "NetBIOS name service protocol support" depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST help NetBIOS name service requests are sent as broadcast messages from an unprivileged port and responded to with unicast messages to the @@ -204,6 +208,21 @@ config NF_CONNTRACK_NETBIOS_NS To compile it as a module, choose M here. If unsure, say N. +config NF_CONNTRACK_SNMP + tristate "SNMP service protocol support" + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST + help + SNMP service requests are sent as broadcast messages from an + unprivileged port and responded to with unicast messages to the + same port. This make them hard to firewall properly because connection + tracking doesn't deal with broadcasts. This helper tracks locally + originating SNMP service requests and the corresponding + responses. It relies on correct IP address configuration, specifically + netmask and broadcast address. + + To compile it as a module, choose M here. If unsure, say N. + config NF_CONNTRACK_PPTP tristate "PPtP protocol support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 401d574bdfd2..2c2628de9d3f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -28,7 +28,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 000000000000..4e99cca61612 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + * broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout) +{ + struct nf_conntrack_expect *exp; + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct nf_conn_help *help = nfct_help(ct); + __be32 mask = 0; + + /* we're only interested in locally generated packets */ + if (skb->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + goto out; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + exp->mask.src.u3.ip = mask; + exp->mask.src.u.udp.port = htons(0xFFFF); + + exp->expectfn = NULL; + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->class = NF_CT_EXPECT_CLASS_DEFAULT; + exp->helper = NULL; + + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); + + nf_ct_refresh(ct, skb, timeout * HZ); +out: + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index aadde018a072..4c8f30a3d6d2 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -18,14 +18,7 @@ #include #include #include -#include -#include -#include -#include #include -#include -#include -#include #include #include @@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns"); MODULE_ALIAS_NFCT_HELPER("netbios_ns"); static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, 0400); +module_param(timeout, uint, S_IRUSR); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -static int help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) -{ - struct nf_conntrack_expect *exp; - struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - struct in_device *in_dev; - __be32 mask = 0; - - /* we're only interested in locally generated packets */ - if (skb->sk == NULL) - goto out; - if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) - goto out; - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - goto out; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->dst.dev); - if (in_dev != NULL) { - for_primary_ifa(in_dev) { - if (ifa->ifa_broadcast == iph->daddr) { - mask = ifa->ifa_mask; - break; - } - } endfor_ifa(in_dev); - } - rcu_read_unlock(); - - if (mask == 0) - goto out; - - exp = nf_ct_expect_alloc(ct); - if (exp == NULL) - goto out; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = htons(NMBD_PORT); - - exp->mask.src.u3.ip = mask; - exp->mask.src.u.udp.port = htons(0xFFFF); - - exp->expectfn = NULL; - exp->flags = NF_CT_EXPECT_PERMANENT; - exp->class = NF_CT_EXPECT_CLASS_DEFAULT; - exp->helper = NULL; - - nf_ct_expect_related(exp); - nf_ct_expect_put(exp); - - nf_ct_refresh(ct, skb, timeout * HZ); -out: - return NF_ACCEPT; -} - static struct nf_conntrack_expect_policy exp_policy = { .max_expected = 1, }; +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); +} + static struct nf_conntrack_helper helper __read_mostly = { .name = "netbios-ns", - .tuple.src.l3num = AF_INET, + .tuple.src.l3num = NFPROTO_IPV4, .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, .me = THIS_MODULE, - .help = help, + .help = netbios_ns_help, .expect_policy = &exp_policy, }; diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 000000000000..6e545e26289e --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,77 @@ +/* + * SNMP service broadcast connection tracking helper + * + * (c) 2011 Jiri Olsa + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include + +#include +#include +#include + +#define SNMP_PORT 161 + +MODULE_AUTHOR("Jiri Olsa "); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + typeof(nf_nat_snmp_hook) nf_nat_snmp; + + nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + + nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); + if (nf_nat_snmp && ct->status & IPS_NAT_MASK) + return nf_nat_snmp(skb, protoff, ct, ctinfo); + + return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "snmp", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = snmp_conntrack_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); From a992ca2a0498edd22a88ac8c41570f536de29c9e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 19 Jan 2011 16:00:07 +0100 Subject: [PATCH 78/80] netfilter: nf_conntrack_tstamp: add flow-based timestamp extension This patch adds flow-based timestamping for conntracks. This conntrack extension is disabled by default. Basically, we use two 64-bits variables to store the creation timestamp once the conntrack has been confirmed and the other to store the deletion time. This extension is disabled by default, to enable it, you have to: echo 1 > /proc/sys/net/netfilter/nf_conntrack_timestamp This patch allows to save memory for user-space flow-based loogers such as ulogd2. In short, ulogd2 does not need to keep a hashtable with the conntrack in user-space to know when they were created and destroyed, instead we use the kernel timestamp. If we want to have a sane IPFIX implementation in user-space, this nanosecs resolution timestamps are also useful. Other custom user-space applications can benefit from this via libnetfilter_conntrack. This patch modifies the /proc output to display the delta time in seconds since the flow start. You can also obtain the flow-start date by means of the conntrack-tools. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_conntrack.h | 9 ++ include/net/netfilter/nf_conntrack_extend.h | 4 + .../net/netfilter/nf_conntrack_timestamp.h | 53 ++++++++ include/net/netns/conntrack.h | 2 + net/netfilter/Kconfig | 11 ++ net/netfilter/Makefile | 1 + net/netfilter/nf_conntrack_core.c | 26 ++++ net/netfilter/nf_conntrack_netlink.c | 46 ++++++- net/netfilter/nf_conntrack_standalone.c | 41 ++++++ net/netfilter/nf_conntrack_timestamp.c | 120 ++++++++++++++++++ 10 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 include/net/netfilter/nf_conntrack_timestamp.h create mode 100644 net/netfilter/nf_conntrack_timestamp.c diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 19711e3ffd42..debf1aefd753 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -42,6 +42,7 @@ enum ctattr_type { CTA_SECMARK, /* obsolete */ CTA_ZONE, CTA_SECCTX, + CTA_TIMESTAMP, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -127,6 +128,14 @@ enum ctattr_counters { }; #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) +enum ctattr_tstamp { + CTA_TIMESTAMP_UNSPEC, + CTA_TIMESTAMP_START, + CTA_TIMESTAMP_STOP, + __CTA_TIMESTAMP_MAX +}; +#define CTA_TIMESTAMP_MAX (__CTA_TIMESTAMP_MAX - 1) + enum ctattr_nat { CTA_NAT_UNSPEC, CTA_NAT_MINIP, diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 1a9f96db3798..2dcf31703acb 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -16,6 +16,9 @@ enum nf_ct_ext_id { #endif #ifdef CONFIG_NF_CONNTRACK_ZONES NF_CT_EXT_ZONE, +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + NF_CT_EXT_TSTAMP, #endif NF_CT_EXT_NUM, }; @@ -25,6 +28,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone +#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h new file mode 100644 index 000000000000..f17dcb664e29 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_timestamp.h @@ -0,0 +1,53 @@ +#ifndef _NF_CONNTRACK_TSTAMP_H +#define _NF_CONNTRACK_TSTAMP_H + +#include +#include +#include +#include +#include + +struct nf_conn_tstamp { + u_int64_t start; + u_int64_t stop; +}; + +static inline +struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP); +#else + return NULL; +#endif +} + +static inline +struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + struct net *net = nf_ct_net(ct); + + if (!net->ct.sysctl_tstamp) + return NULL; + + return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp); +#else + return NULL; +#endif +}; + +static inline bool nf_ct_tstamp_enabled(struct net *net) +{ + return net->ct.sysctl_tstamp != 0; +} + +static inline void nf_ct_set_tstamp(struct net *net, bool enable) +{ + net->ct.sysctl_tstamp = enable; +} + +extern int nf_conntrack_tstamp_init(struct net *net); +extern void nf_conntrack_tstamp_fini(struct net *net); + +#endif /* _NF_CONNTRACK_TSTAMP_H */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 5cf8a8c141aa..341eb089349e 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -21,11 +21,13 @@ struct netns_ct { int sysctl_events; unsigned int sysctl_events_retry_timeout; int sysctl_acct; + int sysctl_tstamp; int sysctl_checksum; unsigned int sysctl_log_invalid; /* Log invalid packets */ #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; struct ctl_table_header *acct_sysctl_header; + struct ctl_table_header *tstamp_sysctl_header; struct ctl_table_header *event_sysctl_header; #endif char *slabname; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 939b504604c2..faf7412ea453 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS If unsure, say `N'. +config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timestamping. + This allows you to store the flow start-time and to obtain + the flow-stop time (once it has been destroyed) via Connection + tracking events. + + If unsure, say `N'. + config NF_CT_PROTO_DCCP tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' depends on EXPERIMENTAL diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 2c2628de9d3f..9ae6878a85b1 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,7 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o obj-$(CONFIG_NETFILTER) = netfilter.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f47ac67e1bfe..1909311c392a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); static void death_by_timeout(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); if (!test_bit(IPS_DYING_BIT, &ct->status) && unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { @@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; + struct nf_conn_tstamp *tstamp; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; @@ -488,6 +495,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) atomic_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + tstamp->start = ktime_to_ns(skb->tstamp); + } /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers * guarantee that no other CPU can find the conntrack before the above @@ -746,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, @@ -1186,6 +1202,11 @@ struct __nf_ct_flush_report { static int kill_report(struct nf_conn *i, void *data) { struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(i); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); /* If we fail to deliver the event, death_by_timeout() will retry */ if (nf_conntrack_event_report(IPCT_DESTROY, i, @@ -1497,6 +1518,9 @@ static int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_acct_init(net); if (ret < 0) goto err_acct; + ret = nf_conntrack_tstamp_init(net); + if (ret < 0) + goto err_tstamp; ret = nf_conntrack_ecache_init(net); if (ret < 0) goto err_ecache; @@ -1504,6 +1528,8 @@ static int nf_conntrack_init_net(struct net *net) return 0; err_ecache: + nf_conntrack_tstamp_fini(net); +err_tstamp: nf_conntrack_acct_fini(net); err_acct: nf_conntrack_expect_fini(net); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9eabaa6f28a8..715d56c85475 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef CONFIG_NF_NAT_NEEDED #include #include @@ -230,6 +231,33 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, return -1; } +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_count; + const struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (!tstamp) + return 0; + + nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)); + if (tstamp->stop != 0) { + NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)); + } + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + #ifdef CONFIG_NF_CONNTRACK_MARK static inline int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) @@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_timeout(skb, ct) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || @@ -470,6 +499,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct) #endif } +static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + return 0; + return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else + return 0; +#endif +} + static inline size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { @@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + ctnetlink_counters_size(ct) + + ctnetlink_timestamp_size(ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ @@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_DESTROY)) { if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; } else { if (ctnetlink_dump_timeout(skb, ct) < 0) @@ -1360,6 +1403,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 8257bf643593..69107fd78d3e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -29,6 +29,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -46,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; unsigned int bucket; + u_int64_t time_now; }; static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) @@ -96,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ct_iter_state *st = seq->private; + + st->time_now = ktime_to_ns(ktime_get_real()); rcu_read_lock(); return ct_get_idx(seq, *pos); } @@ -135,6 +140,39 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static u_int64_t ct_delta_time(u_int64_t time_now, const struct nf_conn *ct) +{ + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + u_int64_t delta_time = time_now - tstamp->start; + return delta_time > 0 ? div_s64(delta_time, NSEC_PER_SEC) : 0; + } + return -1; +} + +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + struct ct_iter_state *st = s->private; + u_int64_t delta_time; + + delta_time = ct_delta_time(st->time_now, ct); + if (delta_time < 0) + return 0; + + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -203,6 +241,9 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif + if (ct_show_delta_time(s, ct)) + goto release; + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) goto release; diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 000000000000..af7dd31af0a1 --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,120 @@ +/* + * (C) 2010 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include +#include +#include +#include + +#include +#include +#include + +static int nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { + { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { + .len = sizeof(struct nf_conn_tstamp), + .align = __alignof__(struct nf_conn_tstamp), + .id = NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_tstamp; + + net->ct.tstamp_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.tstamp_sysctl_header) { + printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.tstamp_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_init(struct net *net) +{ + int ret; + + net->ct.sysctl_tstamp = nf_ct_tstamp; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&tstamp_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_tstamp: Unable to register " + "extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_tstamp_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_tstamp_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +} From f5c88f56b35599ab9ff2d3398e0153e4cd4a4c82 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 19 Jan 2011 19:10:49 +0100 Subject: [PATCH 79/80] netfilter: nf_conntrack: fix lifetime display for disabled connections When no tstamp extension exists, ct_delta_time() returns -1, which is then assigned to an u64 and tested for negative values to decide whether to display the lifetime. This obviously doesn't work, use a s64 and merge the two minor functions into one. Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_standalone.c | 31 +++++++++++-------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 69107fd78d3e..0ae142825881 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -141,29 +141,24 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP -static u_int64_t ct_delta_time(u_int64_t time_now, const struct nf_conn *ct) -{ - struct nf_conn_tstamp *tstamp; - - tstamp = nf_conn_tstamp_find(ct); - if (tstamp) { - u_int64_t delta_time = time_now - tstamp->start; - return delta_time > 0 ? div_s64(delta_time, NSEC_PER_SEC) : 0; - } - return -1; -} - static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { struct ct_iter_state *st = s->private; - u_int64_t delta_time; + struct nf_conn_tstamp *tstamp; + s64 delta_time; - delta_time = ct_delta_time(st->time_now, ct); - if (delta_time < 0) - return 0; + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + delta_time = st->time_now - tstamp->start; + if (delta_time > 0) + delta_time = div_s64(delta_time, NSEC_PER_SEC); + else + delta_time = 0; - return seq_printf(s, "delta-time=%llu ", - (unsigned long long)delta_time); + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); + } + return 0; } #else static inline int From 5d8449286456659cdd0998e62d80df2d9e77e9e3 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 20 Jan 2011 08:48:15 +0100 Subject: [PATCH 80/80] netfilter: xtables: remove extraneous header that slipped in Commit 0b8ad87 (netfilter: xtables: add missing header files to export list) erroneously added this. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index fc4e0aa805a2..89c0d1e20d72 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -56,7 +56,6 @@ header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h header-y += xt_sctp.h -header-y += xt_secmark.h header-y += xt_socket.h header-y += xt_state.h header-y += xt_statistic.h