From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 25 Aug 2016 23:08:11 +0800
Subject: [PATCH 01/53] netfilter: gre: Use consistent GRE_* macros instead of
 ones defined by netfilter.

There are already some GRE_* macros in kernel, so it is unnecessary
to define these macros. And remove some useless macros

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../linux/netfilter/nf_conntrack_proto_gre.h  | 22 ++-----------------
 include/uapi/linux/if_tunnel.h                |  1 +
 net/ipv4/netfilter/nf_nat_proto_gre.c         |  4 ++--
 net/netfilter/nf_conntrack_proto_gre.c        |  4 ++--
 4 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h
index df78dc2b5524..0189747f2691 100644
--- a/include/linux/netfilter/nf_conntrack_proto_gre.h
+++ b/include/linux/netfilter/nf_conntrack_proto_gre.h
@@ -1,29 +1,11 @@
 #ifndef _CONNTRACK_PROTO_GRE_H
 #define _CONNTRACK_PROTO_GRE_H
 #include <asm/byteorder.h>
+#include <net/gre.h>
+#include <net/pptp.h>
 
 /* GRE PROTOCOL HEADER */
 
-/* GRE Version field */
-#define GRE_VERSION_1701	0x0
-#define GRE_VERSION_PPTP	0x1
-
-/* GRE Protocol field */
-#define GRE_PROTOCOL_PPTP	0x880B
-
-/* GRE Flags */
-#define GRE_FLAG_C		0x80
-#define GRE_FLAG_R		0x40
-#define GRE_FLAG_K		0x20
-#define GRE_FLAG_S		0x10
-#define GRE_FLAG_A		0x80
-
-#define GRE_IS_C(f)	((f)&GRE_FLAG_C)
-#define GRE_IS_R(f)	((f)&GRE_FLAG_R)
-#define GRE_IS_K(f)	((f)&GRE_FLAG_K)
-#define GRE_IS_S(f)	((f)&GRE_FLAG_S)
-#define GRE_IS_A(f)	((f)&GRE_FLAG_A)
-
 /* GRE is a mess: Four different standards */
 struct gre_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 9865c8caedde..fb7337d6b985 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -39,6 +39,7 @@
 #define GRE_IS_REC(f)		((f) & GRE_REC)
 #define GRE_IS_ACK(f)		((f) & GRE_ACK)
 
+#define GRE_VERSION_0		__cpu_to_be16(0x0000)
 #define GRE_VERSION_1		__cpu_to_be16(0x0001)
 #define GRE_PROTO_PPP		__cpu_to_be16(0x880b)
 #define GRE_PPTP_KEY_MASK	__cpu_to_be32(0xffff)
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 9414923f1e15..93198d71dbb6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -104,11 +104,11 @@ gre_manip_pkt(struct sk_buff *skb,
 	if (maniptype != NF_NAT_MANIP_DST)
 		return true;
 	switch (greh->version) {
-	case GRE_VERSION_1701:
+	case ntohs(GRE_VERSION_0):
 		/* We do not currently NAT any GREv0 packets.
 		 * Try to behave like "nf_nat_proto_unknown" */
 		break;
-	case GRE_VERSION_PPTP:
+	case ntohs(GRE_VERSION_1):
 		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
 		pgreh->call_id = tuple->dst.u.gre.key;
 		break;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a96451a7af20..deb239a014e4 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -200,7 +200,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 
 	/* first only delinearize old RFC1701 GRE header */
 	grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
-	if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+	if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) {
 		/* try to behave like "nf_conntrack_proto_generic" */
 		tuple->src.u.all = 0;
 		tuple->dst.u.all = 0;
@@ -212,7 +212,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	if (!pgrehdr)
 		return true;
 
-	if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
+	if (grehdr->protocol != GRE_PROTO_PPP) {
 		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
 		return false;
 	}

From c579a9e7d58f66030a144c7a33cc9bdf827a4b6d Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 25 Aug 2016 23:08:47 +0800
Subject: [PATCH 02/53] netfilter: gre: Use consistent GRE and PTTP header
 structure instead of the ones defined by netfilter

There are two existing strutures which defines the GRE and PPTP header.
So use these two structures instead of the ones defined by netfilter to
keep consitent with other codes.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../linux/netfilter/nf_conntrack_proto_gre.h  | 42 -------------------
 net/ipv4/netfilter/nf_nat_proto_gre.c         | 13 +++---
 net/netfilter/nf_conntrack_proto_gre.c        | 12 +++---
 3 files changed, 13 insertions(+), 54 deletions(-)

diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h
index 0189747f2691..dee0acd0dd31 100644
--- a/include/linux/netfilter/nf_conntrack_proto_gre.h
+++ b/include/linux/netfilter/nf_conntrack_proto_gre.h
@@ -4,48 +4,6 @@
 #include <net/gre.h>
 #include <net/pptp.h>
 
-/* GRE PROTOCOL HEADER */
-
-/* GRE is a mess: Four different standards */
-struct gre_hdr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u16	rec:3,
-		srr:1,
-		seq:1,
-		key:1,
-		routing:1,
-		csum:1,
-		version:3,
-		reserved:4,
-		ack:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	__u16	csum:1,
-		routing:1,
-		key:1,
-		seq:1,
-		srr:1,
-		rec:3,
-		ack:1,
-		reserved:4,
-		version:3;
-#else
-#error "Adjust your <asm/byteorder.h> defines"
-#endif
-	__be16	protocol;
-};
-
-/* modified GRE header for PPTP */
-struct gre_hdr_pptp {
-	__u8   flags;		/* bitfield */
-	__u8   version;		/* should be GRE_VERSION_PPTP */
-	__be16 protocol;	/* should be GRE_PROTOCOL_PPTP */
-	__be16 payload_len;	/* size of ppp payload, not inc. gre header */
-	__be16 call_id;		/* peer's call_id for this session */
-	__be32 seq;		/* sequence number.  Present if S==1 */
-	__be32 ack;		/* seq number of highest packet received by */
-				/*  sender in this session */
-};
-
 struct nf_ct_gre {
 	unsigned int stream_timeout;
 	unsigned int timeout;
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 93198d71dbb6..edf05002d674 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb,
 	      const struct nf_conntrack_tuple *tuple,
 	      enum nf_nat_manip_type maniptype)
 {
-	const struct gre_hdr *greh;
-	struct gre_hdr_pptp *pgreh;
+	const struct gre_base_hdr *greh;
+	struct pptp_gre_header *pgreh;
 
 	/* pgreh includes two optional 32bit fields which are not required
 	 * to be there.  That's where the magic '8' comes from */
@@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb,
 		return false;
 
 	greh = (void *)skb->data + hdroff;
-	pgreh = (struct gre_hdr_pptp *)greh;
+	pgreh = (struct pptp_gre_header *)greh;
 
 	/* we only have destination manip of a packet, since 'source key'
 	 * is not present in the packet itself */
 	if (maniptype != NF_NAT_MANIP_DST)
 		return true;
-	switch (greh->version) {
-	case ntohs(GRE_VERSION_0):
+
+	switch (greh->flags & GRE_VERSION) {
+	case GRE_VERSION_0:
 		/* We do not currently NAT any GREv0 packets.
 		 * Try to behave like "nf_nat_proto_unknown" */
 		break;
-	case ntohs(GRE_VERSION_1):
+	case GRE_VERSION_1:
 		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
 		pgreh->call_id = tuple->dst.u.gre.key;
 		break;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index deb239a014e4..9a715f88b2f1 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
 static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			     struct net *net, struct nf_conntrack_tuple *tuple)
 {
-	const struct gre_hdr_pptp *pgrehdr;
-	struct gre_hdr_pptp _pgrehdr;
+	const struct pptp_gre_header *pgrehdr;
+	struct pptp_gre_header _pgrehdr;
 	__be16 srckey;
-	const struct gre_hdr *grehdr;
-	struct gre_hdr _grehdr;
+	const struct gre_base_hdr *grehdr;
+	struct gre_base_hdr _grehdr;
 
 	/* first only delinearize old RFC1701 GRE header */
 	grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
-	if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) {
+	if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) {
 		/* try to behave like "nf_conntrack_proto_generic" */
 		tuple->src.u.all = 0;
 		tuple->dst.u.all = 0;
@@ -213,7 +213,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 		return true;
 
 	if (grehdr->protocol != GRE_PROTO_PPP) {
-		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+		pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol));
 		return false;
 	}
 

From 68cb9fe47ea661bffd48c8ca35790be26935e1c5 Mon Sep 17 00:00:00 2001
From: Marco Angaroni <marcoangaroni@gmail.com>
Date: Tue, 30 Aug 2016 18:48:19 +0200
Subject: [PATCH 03/53] netfilter: nf_ct_sip: correct parsing of continuation
 lines in SIP headers

Current parsing methods for SIP headers do not properly manage
continuation lines: in case of Call-ID header the first character of
Call-ID header value is truncated. As a result IPVS SIP persistence
engine hashes over a call-id that is not exactly the one present in
the originale message.

Example: "Call-ID: \r\n abcdeABCDE1234"
results in extracted call-id equal to "bcdeABCDE1234".

In above example Call-ID is represented as a string in C language.
Obviously in real message the first bytes after colon (":") are
"20 0d 0a 20".

Proposed fix is in nf_conntrack_sip module.
Since sip_follow_continuation() function walks past the leading
spaces or tabs of the continuation line, sip_skip_whitespace()
should simply return the ouput of sip_follow_continuation().
Otherwise another iteration of the for loop is done and dptr
is incremented by one pointing to the second character of the
first word in the header.

Below is an extract of relevant SIP ABNF syntax.

Call-ID  =  ( "Call-ID" / "i" ) HCOLON callid
callid   =  word [ "@" word ]

HCOLON  =  *( SP / HTAB ) ":" SWS
SWS     =  [LWS] ; sep whitespace
LWS     =  [*WSP CRLF] 1*WSP ; linear whitespace
WSP     =  SP / HTAB
word    =  1*(alphanum / "-" / "." / "!" / "%" / "*" /
           "_" / "+" / "`" / "'" / "~" /
           "(" / ")" / "<" / ">" /
           ":" / "\" / DQUOTE /
           "/" / "[" / "]" / "?" /
           "{" / "}" )

Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 7d77217de6a3..251a9a44d189 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -334,8 +334,7 @@ static const char *sip_skip_whitespace(const char *dptr, const char *limit)
 		if (*dptr != '\r' && *dptr != '\n')
 			break;
 		dptr = sip_follow_continuation(dptr, limit);
-		if (dptr == NULL)
-			return NULL;
+		break;
 	}
 	return dptr;
 }

From f0608ceaa79d99d24e97517f9a9a0fed2b9698b4 Mon Sep 17 00:00:00 2001
From: Marco Angaroni <marcoangaroni@gmail.com>
Date: Tue, 30 Aug 2016 18:48:24 +0200
Subject: [PATCH 04/53] netfilter: nf_ct_sip: correct allowed characters in
 Call-ID SIP header

Current parsing methods for SIP header Call-ID do not check correctly all
characters allowed by RFC 3261. In particular "," character is allowed
instead of "'" character. As a result Call-ID headers like the following
are discarded by IPVS SIP persistence engine.

Call-ID: -.!%*_+`'~()<>:\"/[]?{}

Above example is composed using all non-alphanumeric characters listed
in RFC 3261 for Call-ID header syntax.

Proposed fix is in nf_conntrack_sip module; function iswordc() checks this
range: (c >= '(' && c <= '/') which includes these characters: ()*+,-./
They are all allowed except ",". Instead "'" is not included in the list.

Below is an extract of relevant SIP ABNF syntax.

Call-ID  =  ( "Call-ID" / "i" ) HCOLON callid
callid   =  word [ "@" word ]

HCOLON  =  *( SP / HTAB ) ":" SWS
SWS     =  [LWS] ; sep whitespace
LWS     =  [*WSP CRLF] 1*WSP ; linear whitespace
WSP     =  SP / HTAB
word    =  1*(alphanum / "-" / "." / "!" / "%" / "*" /
           "_" / "+" / "`" / "'" / "~" /
           "(" / ")" / "<" / ">" /
           ":" / "\" / DQUOTE /
           "/" / "[" / "]" / "?" /
           "{" / "}" )

Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 251a9a44d189..d8035351aff5 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 static int iswordc(const char c)
 {
 	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
-	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' ||
 	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
-	    c == '{' || c == '}' || c == '~')
+	    c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') ||
+	    c == '\'')
 		return 1;
 	return 0;
 }

From 723eb299de62ce75dbd31e7ee25d45887c32a602 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 1 Sep 2016 18:58:29 +0800
Subject: [PATCH 05/53] netfilter: ftp: Remove the useless dlen==0 condition
 check in find_pattern

The caller function "help" has already make sure the datalen could not be zero
before invoke find_pattern as a parameter by the following codes

        if (dataoff >= skb->len) {
                pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
                         skb->len);
                return NF_ACCEPT;
        }
        datalen = skb->len - dataoff;

And the latter codes "ends_in_nl = (fb_ptr[datalen - 1] == '\n');" use datalen
directly without checking if it is zero.

So it is unneccessary to check it in find_pattern too.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_ftp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index b6934b5edf7a..d49a2d410813 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen,
 	size_t i = plen;
 
 	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
-	if (dlen == 0)
-		return 0;
 
 	if (dlen <= plen) {
 		/* Short packet: try for partial? */

From ddb075b0cdbca140d6c6503db9cc9990d3461308 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 1 Sep 2016 18:59:02 +0800
Subject: [PATCH 06/53] netfilter: ftp: Remove the useless code

There are some debug code which are commented out in find_pattern by #if 0.
Now remove them.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_ftp.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index d49a2d410813..e3ed20060878 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -309,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen,
 		else return 0;
 	}
 
-	if (strncasecmp(data, pattern, plen) != 0) {
-#if 0
-		size_t i;
-
-		pr_debug("ftp: string mismatch\n");
-		for (i = 0; i < plen; i++) {
-			pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
-				 i, data[i], data[i],
-				 pattern[i], pattern[i]);
-		}
-#endif
+	if (strncasecmp(data, pattern, plen) != 0)
 		return 0;
-	}
 
 	pr_debug("Pattern matches!\n");
 	/* Now we've found the constant string, try to skip

From 0d9932b2875f568d679f2af33ce610da3903ac11 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Fri, 2 Sep 2016 15:05:57 +0200
Subject: [PATCH 07/53] netfilter: nft_numgen: rename until attribute by
 modulus

The _until_ attribute is renamed to _modulus_ as the behaviour is similar to
other expresions with number limits (ex. nft_hash).

Renaming is possible because there isn't a kernel release yet with these
changes.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nft_numgen.c               | 30 ++++++++++++------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 28ce01d79707..24161e25576d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1126,13 +1126,13 @@ enum nft_trace_types {
  * enum nft_ng_attributes - nf_tables number generator expression netlink attributes
  *
  * @NFTA_NG_DREG: destination register (NLA_U32)
- * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32)
+ * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
 	NFTA_NG_DREG,
-	NFTA_NG_UNTIL,
+	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
 	__NFTA_NG_MAX
 };
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 294745ecb0fc..f51a3ede3932 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -21,7 +21,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
 
 struct nft_ng_inc {
 	enum nft_registers      dreg:8;
-	u32			until;
+	u32			modulus;
 	atomic_t		counter;
 };
 
@@ -34,7 +34,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 
 	do {
 		oval = atomic_read(&priv->counter);
-		nval = (oval + 1 < priv->until) ? oval + 1 : 0;
+		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
 	memcpy(&regs->data[priv->dreg], &priv->counter, sizeof(u32));
@@ -42,7 +42,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_DREG]		= { .type = NLA_U32 },
-	[NFTA_NG_UNTIL]		= { .type = NLA_U32 },
+	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
 };
 
@@ -52,8 +52,8 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
-	if (priv->until == 0)
+	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+	if (priv->modulus == 0)
 		return -ERANGE;
 
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
@@ -64,11 +64,11 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
-		       u32 until, enum nft_ng_types type)
+		       u32 modulus, enum nft_ng_types type)
 {
 	if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
 		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until)))
+	if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus)))
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
 		goto nla_put_failure;
@@ -83,12 +83,12 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL);
 }
 
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
-	u32			until;
+	u32			modulus;
 };
 
 static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -99,7 +99,7 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
 	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
 
 	regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
-						  priv->until);
+						  priv->modulus);
 }
 
 static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -108,8 +108,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
-	if (priv->until == 0)
+	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+	if (priv->modulus == 0)
 		return -ERANGE;
 
 	prandom_init_once(&nft_numgen_prandom_state);
@@ -124,7 +124,7 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM);
 }
 
 static struct nft_expr_type nft_ng_type;
@@ -149,8 +149,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
 	u32 type;
 
-	if (!tb[NFTA_NG_DREG]	||
-	    !tb[NFTA_NG_UNTIL]	||
+	if (!tb[NFTA_NG_DREG]	 ||
+	    !tb[NFTA_NG_MODULUS] ||
 	    !tb[NFTA_NG_TYPE])
 		return ERR_PTR(-EINVAL);
 

From db6d857b819a00627a3bd911f49ee3156766bba8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 2 Sep 2016 21:00:58 +0200
Subject: [PATCH 08/53] netfilter: nft_quota: fix overquota logic

Use xor to decide to break further rule evaluation or not, since the
existing logic doesn't achieve the expected inversion.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_quota.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 6eafbf987ed9..92b6ff16dbb3 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -33,7 +33,7 @@ static void nft_quota_eval(const struct nft_expr *expr,
 {
 	struct nft_quota *priv = nft_expr_priv(expr);
 
-	if (nft_quota(priv, pkt) < 0 && !priv->invert)
+	if ((nft_quota(priv, pkt) < 0) ^ priv->invert)
 		regs->verdict.code = NFT_BREAK;
 }
 

From 22609b43b194917dce2188ae9a78bc40a14e67b5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 2 Sep 2016 21:00:59 +0200
Subject: [PATCH 09/53] netfilter: nft_quota: introduce nft_overquota()

This is patch renames the existing function to nft_overquota() and make
it return a boolean that tells us if we have exceeded our byte quota.
Just a cleanup.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_quota.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 92b6ff16dbb3..c00104c07095 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -21,10 +21,10 @@ struct nft_quota {
 	atomic64_t	remain;
 };
 
-static inline long nft_quota(struct nft_quota *priv,
-			     const struct nft_pktinfo *pkt)
+static inline bool nft_overquota(struct nft_quota *priv,
+				 const struct nft_pktinfo *pkt)
 {
-	return atomic64_sub_return(pkt->skb->len, &priv->remain);
+	return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
 }
 
 static void nft_quota_eval(const struct nft_expr *expr,
@@ -33,7 +33,7 @@ static void nft_quota_eval(const struct nft_expr *expr,
 {
 	struct nft_quota *priv = nft_expr_priv(expr);
 
-	if ((nft_quota(priv, pkt) < 0) ^ priv->invert)
+	if (nft_overquota(priv, pkt) ^ priv->invert)
 		regs->verdict.code = NFT_BREAK;
 }
 

From 1bcabc81ee94c0a65989128258f8c1d3e1c1b0ea Mon Sep 17 00:00:00 2001
From: Marco Angaroni <marcoangaroni@gmail.com>
Date: Tue, 30 Aug 2016 18:52:22 +0200
Subject: [PATCH 10/53] netfilter: nf_ct_sip: allow tab character in SIP
 headers

Current parsing methods for SIP headers do not allow the presence of
tab characters between header name and header value. As a result Call-ID
SIP headers like the following are discarded by IPVS SIP persistence
engine:

"Call-ID\t: mycallid@abcde"
"Call-ID:\tmycallid@abcde"

In above examples Call-IDs are represented as strings in C language.
Obviously in real message we have byte "09" before/after colon (":").

Proposed fix is in nf_conntrack_sip module.
Function sip_skip_whitespace() should skip tabs in addition to spaces,
since in SIP grammar whitespace (WSP) corresponds to space or tab.

Below is an extract of relevant SIP ABNF syntax.

Call-ID  =  ( "Call-ID" / "i" ) HCOLON callid
callid   =  word [ "@" word ]

HCOLON  =  *( SP / HTAB ) ":" SWS
SWS     =  [LWS] ; sep whitespace
LWS     =  [*WSP CRLF] 1*WSP ; linear whitespace
WSP     =  SP / HTAB
word    =  1*(alphanum / "-" / "." / "!" / "%" / "*" /
           "_" / "+" / "`" / "'" / "~" /
           "(" / ")" / "<" / ">" /
           ":" / "\" / DQUOTE /
           "/" / "[" / "]" / "?" /
           "{" / "}" )

Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index d8035351aff5..621b81c7bddc 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -330,7 +330,7 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit)
 static const char *sip_skip_whitespace(const char *dptr, const char *limit)
 {
 	for (; dptr < limit; dptr++) {
-		if (*dptr == ' ')
+		if (*dptr == ' ' || *dptr == '\t')
 			continue;
 		if (*dptr != '\r' && *dptr != '\n')
 			break;

From fe01111d23810c0cf6830ce5af1c14c6d3df6dc5 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Tue, 6 Sep 2016 22:33:37 +0800
Subject: [PATCH 11/53] netfilter: nft_queue: check the validation of
 queues_total and queuenum

Although the validation of queues_total and queuenum is checked in nft
utility, but user can add nft rules via nfnetlink, so it is necessary
to check the validation at the nft_queue expr init routine too.

Tested by run ./nft-test.py any/queue.t:
  any/queue.t: 6 unit tests, 0 error, 0 warning

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_queue.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 61d216eb7917..d16d59959ff6 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -65,6 +65,7 @@ static int nft_queue_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_queue *priv = nft_expr_priv(expr);
+	u32 maxid;
 
 	if (tb[NFTA_QUEUE_NUM] == NULL)
 		return -EINVAL;
@@ -74,6 +75,16 @@ static int nft_queue_init(const struct nft_ctx *ctx,
 
 	if (tb[NFTA_QUEUE_TOTAL] != NULL)
 		priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
+	else
+		priv->queues_total = 1;
+
+	if (priv->queues_total == 0)
+		return -EINVAL;
+
+	maxid = priv->queues_total - 1 + priv->queuenum;
+	if (maxid > U16_MAX)
+		return -ERANGE;
+
 	if (tb[NFTA_QUEUE_FLAGS] != NULL) {
 		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
 		if (priv->flags & ~NFT_QUEUE_FLAG_MASK)

From 3c15b8e112d7351b2586c649a759318548523364 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Tue, 6 Sep 2016 22:35:47 +0800
Subject: [PATCH 12/53] netfilter: nf_conntrack: remove unused ctl_table_path
 member in nf_conntrack_l3proto

After commit adf0516845bc ("netfilter: remove ip_conntrack* sysctl
compat code"), ctl_table_path member in struct nf_conntrack_l3proto{}
is not used anymore, remove it.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index cdc920b4c4c2..8992e4229da9 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -63,10 +63,6 @@ struct nf_conntrack_l3proto {
 
 	size_t nla_size;
 
-#ifdef CONFIG_SYSCTL
-	const char		*ctl_table_path;
-#endif /* CONFIG_SYSCTL */
-
 	/* Init l3proto pernet data */
 	int (*init_net)(struct net *net);
 

From 70ca767ea1b2748f45e96192400e515dddbe517c Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 6 Sep 2016 08:44:19 +0200
Subject: [PATCH 13/53] netfilter: nft_hash: Add hash offset value

Add support to pass through an offset to the hash value. With this
feature, the sysadmin is able to generate a hash with a given
offset value.

Example:

	meta mark set jhash ip saddr mod 2 seed 0xabcd offset 100

This option generates marks according to the source address from 100 to
101.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_hash.c                 | 17 +++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 24161e25576d..8c653bbd1ead 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -731,6 +731,7 @@ enum nft_meta_keys {
  * @NFTA_HASH_LEN: source data length (NLA_U32)
  * @NFTA_HASH_MODULUS: modulus value (NLA_U32)
  * @NFTA_HASH_SEED: seed value (NLA_U32)
+ * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32)
  */
 enum nft_hash_attributes {
 	NFTA_HASH_UNSPEC,
@@ -739,6 +740,7 @@ enum nft_hash_attributes {
 	NFTA_HASH_LEN,
 	NFTA_HASH_MODULUS,
 	NFTA_HASH_SEED,
+	NFTA_HASH_OFFSET,
 	__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 764251d31e46..bd12f7a801c2 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -23,6 +23,7 @@ struct nft_hash {
 	u8			len;
 	u32			modulus;
 	u32			seed;
+	u32			offset;
 };
 
 static void nft_hash_eval(const struct nft_expr *expr,
@@ -31,10 +32,10 @@ static void nft_hash_eval(const struct nft_expr *expr,
 {
 	struct nft_hash *priv = nft_expr_priv(expr);
 	const void *data = &regs->data[priv->sreg];
+	u32 h;
 
-	regs->data[priv->dreg] =
-		reciprocal_scale(jhash(data, priv->len, priv->seed),
-				 priv->modulus);
+	h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+	regs->data[priv->dreg] = h + priv->offset;
 }
 
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
@@ -59,6 +60,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	    !tb[NFTA_HASH_MODULUS])
 		return -EINVAL;
 
+	if (tb[NFTA_HASH_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
+
 	priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
 	priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
 
@@ -72,6 +76,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	if (priv->modulus <= 1)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < U32_MAX)
+		return -EOVERFLOW;
+
 	priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
 
 	return nft_validate_register_load(priv->sreg, len) &&
@@ -94,7 +101,9 @@ static int nft_hash_dump(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
 		goto nla_put_failure;
-
+	if (priv->offset != 0)
+		if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
+			goto nla_put_failure;
 	return 0;
 
 nla_put_failure:

From dbd2be0646e3239022630c426cbceefa15714bca Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Sep 2016 12:22:18 +0200
Subject: [PATCH 14/53] netfilter: nft_dynset: allow to invert match criteria

The dynset expression matches if we can fit a new entry into the set.
If there is no room for it, then it breaks the rule evaluation.

This patch introduces the inversion flag so you can add rules to
explicitly drop packets that don't fit into the set. For example:

 # nft filter input flow table xyz size 4 { ip saddr timeout 120s counter } overflow drop

This is useful to provide a replacement for connlimit.

For the rule above, every new entry uses the IPv4 address as key in the
set, this entry gets a timeout of 120 seconds that gets refresh on every
packet seen. If we get new flow and our set already contains 4 entries
already, then this packet is dropped.

You can already express this in positive logic, assuming default policy
to drop:

 # nft filter input flow table xyz size 4 { ip saddr timeout 10s counter } accept

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  6 ++++++
 net/netfilter/nft_dynset.c               | 20 +++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 8c653bbd1ead..bc0eb6a1066d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -575,6 +575,10 @@ enum nft_dynset_ops {
 	NFT_DYNSET_OP_UPDATE,
 };
 
+enum nft_dynset_flags {
+	NFT_DYNSET_F_INV	= (1 << 0),
+};
+
 /**
  * enum nft_dynset_attributes - dynset expression attributes
  *
@@ -585,6 +589,7 @@ enum nft_dynset_ops {
  * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32)
  * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64)
  * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_DYNSET_FLAGS: flags (NLA_U32)
  */
 enum nft_dynset_attributes {
 	NFTA_DYNSET_UNSPEC,
@@ -596,6 +601,7 @@ enum nft_dynset_attributes {
 	NFTA_DYNSET_TIMEOUT,
 	NFTA_DYNSET_EXPR,
 	NFTA_DYNSET_PAD,
+	NFTA_DYNSET_FLAGS,
 	__NFTA_DYNSET_MAX,
 };
 #define NFTA_DYNSET_MAX		(__NFTA_DYNSET_MAX - 1)
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 0af26699bf04..e3b83c31da2e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -22,6 +22,7 @@ struct nft_dynset {
 	enum nft_dynset_ops		op:8;
 	enum nft_registers		sreg_key:8;
 	enum nft_registers		sreg_data:8;
+	bool				invert;
 	u64				timeout;
 	struct nft_expr			*expr;
 	struct nft_set_binding		binding;
@@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 
 		if (sexpr != NULL)
 			sexpr->ops->eval(sexpr, regs, pkt);
+
+		if (priv->invert)
+			regs->verdict.code = NFT_BREAK;
 		return;
 	}
 out:
-	regs->verdict.code = NFT_BREAK;
+	if (!priv->invert)
+		regs->verdict.code = NFT_BREAK;
 }
 
 static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
@@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
 	[NFTA_DYNSET_SREG_DATA]	= { .type = NLA_U32 },
 	[NFTA_DYNSET_TIMEOUT]	= { .type = NLA_U64 },
 	[NFTA_DYNSET_EXPR]	= { .type = NLA_NESTED },
+	[NFTA_DYNSET_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	    tb[NFTA_DYNSET_SREG_KEY] == NULL)
 		return -EINVAL;
 
+	if (tb[NFTA_DYNSET_FLAGS]) {
+		u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
+
+		if (flags & ~NFT_DYNSET_F_INV)
+			return -EINVAL;
+		if (flags & NFT_DYNSET_F_INV)
+			priv->invert = true;
+	}
+
 	set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
 				   genmask);
 	if (IS_ERR(set)) {
@@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
 static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_dynset *priv = nft_expr_priv(expr);
+	u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
 
 	if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
 		goto nla_put_failure;
@@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:

From beac5afa2d78605b70f40cf5ab5601ab10659c7f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 12:42:49 +0200
Subject: [PATCH 15/53] netfilter: nf_tables: ensure proper initialization of
 nft_pktinfo fields

This patch introduces nft_set_pktinfo_unspec() that ensures proper
initialization all of pktinfo fields for non-IP traffic. This is used
by the bridge, netdev and arp families.

This new function relies on nft_set_pktinfo_proto_unspec() to set a new
tprot_set field that indicates if transport protocol information is
available. Remain fields are zeroed.

The meta expression has been also updated to check to tprot_set in first
place given that zero is a valid tprot value. Even a handcrafted packet
may come with the IPPROTO_RAW (255) protocol number so we can't rely on
this value as tprot unset.

Reported-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       | 18 ++++++++++++++++++
 include/net/netfilter/nf_tables_ipv4.h  |  1 +
 include/net/netfilter/nf_tables_ipv6.h  |  1 +
 net/bridge/netfilter/nf_tables_bridge.c |  6 +++---
 net/ipv4/netfilter/nf_tables_arp.c      |  2 +-
 net/netfilter/nf_tables_netdev.c        |  4 +++-
 net/netfilter/nft_meta.c                |  2 ++
 7 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8972468bc94b..a7a7cebc8d07 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -19,6 +19,7 @@ struct nft_pktinfo {
 	const struct net_device		*out;
 	u8				pf;
 	u8				hook;
+	bool				tprot_set;
 	u8				tprot;
 	/* for x_tables compatibility */
 	struct xt_action_param		xt;
@@ -36,6 +37,23 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 	pkt->pf = pkt->xt.family = state->pf;
 }
 
+static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
+						struct sk_buff *skb)
+{
+	pkt->tprot_set = false;
+	pkt->tprot = 0;
+	pkt->xt.thoff = 0;
+	pkt->xt.fragoff = 0;
+}
+
+static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt,
+					  struct sk_buff *skb,
+					  const struct nf_hook_state *state)
+{
+	nft_set_pktinfo(pkt, skb, state);
+	nft_set_pktinfo_proto_unspec(pkt, skb);
+}
+
 /**
  * 	struct nft_verdict - nf_tables verdict
  *
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
index ca6ef6bf775e..af952f7843ee 100644
--- a/include/net/netfilter/nf_tables_ipv4.h
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -14,6 +14,7 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
 	nft_set_pktinfo(pkt, skb, state);
 
 	ip = ip_hdr(pkt->skb);
+	pkt->tprot_set = true;
 	pkt->tprot = ip->protocol;
 	pkt->xt.thoff = ip_hdrlen(pkt->skb);
 	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index 8ad39a6a5fe1..6aeee47b1b5e 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -19,6 +19,7 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	if (protohdr < 0)
 		return -1;
 
+	pkt->tprot_set = true;
 	pkt->tprot = protohdr;
 	pkt->xt.thoff = thoff;
 	pkt->xt.fragoff = frag_off;
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index a78c4e2826e5..29899887163e 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -71,7 +71,7 @@ static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
 	if (nft_bridge_iphdr_validate(skb))
 		nft_set_pktinfo_ipv4(pkt, skb, state);
 	else
-		nft_set_pktinfo(pkt, skb, state);
+		nft_set_pktinfo_unspec(pkt, skb, state);
 }
 
 static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
@@ -83,7 +83,7 @@ static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	    nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
 		return;
 #endif
-	nft_set_pktinfo(pkt, skb, state);
+	nft_set_pktinfo_unspec(pkt, skb, state);
 }
 
 static unsigned int
@@ -101,7 +101,7 @@ nft_do_chain_bridge(void *priv,
 		nft_bridge_set_pktinfo_ipv6(&pkt, skb, state);
 		break;
 	default:
-		nft_set_pktinfo(&pkt, skb, state);
+		nft_set_pktinfo_unspec(&pkt, skb, state);
 		break;
 	}
 
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index cd84d4295a20..058c034be376 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_unspec(&pkt, skb, state);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 5eefe4a355c6..8de502b0c37b 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -41,6 +41,7 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
 	else if (len < thoff)
 		return;
 
+	pkt->tprot_set = true;
 	pkt->tprot = iph->protocol;
 	pkt->xt.thoff = thoff;
 	pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
@@ -74,6 +75,7 @@ __nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	if (protohdr < 0)
                 return;
 
+	pkt->tprot_set = true;
 	pkt->tprot = protohdr;
 	pkt->xt.thoff = thoff;
 	pkt->xt.fragoff = frag_off;
@@ -102,7 +104,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 		nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
 		break;
 	default:
-		nft_set_pktinfo(&pkt, skb, state);
+		nft_set_pktinfo_unspec(&pkt, skb, state);
 		break;
 	}
 
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 2863f3493038..14264edf2d77 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -52,6 +52,8 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = pkt->pf;
 		break;
 	case NFT_META_L4PROTO:
+		if (!pkt->tprot_set)
+			goto err;
 		*dest = pkt->tprot;
 		break;
 	case NFT_META_PRIORITY:

From 8df9e32e7ed2978f90cce780ce6a27513044158a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 12:42:50 +0200
Subject: [PATCH 16/53] netfilter: nf_tables_ipv6: setup pktinfo transport
 field on failure to parse

Make sure the pktinfo protocol fields are initialized if this fails to
parse the transport header.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_ipv6.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index 6aeee47b1b5e..1e0ffd5aea47 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -15,9 +15,10 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	nft_set_pktinfo(pkt, skb, state);
 
 	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
-	/* If malformed, drop it */
-	if (protohdr < 0)
+	if (protohdr < 0) {
+		nft_set_pktinfo_proto_unspec(pkt, skb);
 		return -1;
+	}
 
 	pkt->tprot_set = true;
 	pkt->tprot = protohdr;

From ddc8b6027ad08d145a6d7a6a6abc00e43f315bd1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 12:42:51 +0200
Subject: [PATCH 17/53] netfilter: introduce nft_set_pktinfo_{ipv4,
 ipv6}_validate()

These functions are extracted from the netdev family, they initialize
the pktinfo structure and validate that the IPv4 and IPv6 headers are
well-formed given that these functions are called from a path where
layer 3 sanitization did not happen yet.

These functions are placed in include/net/netfilter/nf_tables_ipv{4,6}.h
so they can be reused by a follow up patch to use them from the bridge
family too.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_ipv4.h | 42 ++++++++++++++
 include/net/netfilter/nf_tables_ipv6.h | 49 ++++++++++++++++
 net/netfilter/nf_tables_netdev.c       | 79 +-------------------------
 3 files changed, 93 insertions(+), 77 deletions(-)

diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
index af952f7843ee..968f00b82fb5 100644
--- a/include/net/netfilter/nf_tables_ipv4.h
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -20,6 +20,48 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
 	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
 }
 
+static inline int
+__nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
+				struct sk_buff *skb,
+				const struct nf_hook_state *state)
+{
+	struct iphdr *iph, _iph;
+	u32 len, thoff;
+
+	iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
+				 &_iph);
+	if (!iph)
+		return -1;
+
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 || iph->version != 4)
+		return -1;
+
+	len = ntohs(iph->tot_len);
+	thoff = iph->ihl * 4;
+	if (skb->len < len)
+		return -1;
+	else if (len < thoff)
+		return -1;
+
+	pkt->tprot_set = true;
+	pkt->tprot = iph->protocol;
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
+
+	return 0;
+}
+
+static inline void
+nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
+			      struct sk_buff *skb,
+			      const struct nf_hook_state *state)
+{
+	nft_set_pktinfo(pkt, skb, state);
+	if (__nft_set_pktinfo_ipv4_validate(pkt, skb, state) < 0)
+		nft_set_pktinfo_proto_unspec(pkt, skb);
+}
+
 extern struct nft_af_info nft_af_ipv4;
 
 #endif
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index 1e0ffd5aea47..39b7b717b540 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -28,6 +28,55 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	return 0;
 }
 
+static inline int
+__nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
+				struct sk_buff *skb,
+				const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *ip6h, _ip6h;
+	unsigned int thoff = 0;
+	unsigned short frag_off;
+	int protohdr;
+	u32 pkt_len;
+
+	ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
+				  &_ip6h);
+	if (!ip6h)
+		return -1;
+
+	if (ip6h->version != 6)
+		return -1;
+
+	pkt_len = ntohs(ip6h->payload_len);
+	if (pkt_len + sizeof(*ip6h) > skb->len)
+		return -1;
+
+	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+	if (protohdr < 0)
+		return -1;
+
+	pkt->tprot_set = true;
+	pkt->tprot = protohdr;
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = frag_off;
+
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+static inline void
+nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
+			      struct sk_buff *skb,
+			      const struct nf_hook_state *state)
+{
+	nft_set_pktinfo(pkt, skb, state);
+	if (__nft_set_pktinfo_ipv6_validate(pkt, skb, state) < 0)
+		nft_set_pktinfo_proto_unspec(pkt, skb);
+}
+
 extern struct nft_af_info nft_af_ipv6;
 
 #endif
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 8de502b0c37b..3e5475a833a5 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -15,81 +15,6 @@
 #include <net/netfilter/nf_tables_ipv4.h>
 #include <net/netfilter/nf_tables_ipv6.h>
 
-static inline void
-nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
-			    struct sk_buff *skb,
-			    const struct nf_hook_state *state)
-{
-	struct iphdr *iph, _iph;
-	u32 len, thoff;
-
-	nft_set_pktinfo(pkt, skb, state);
-
-	iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
-				 &_iph);
-	if (!iph)
-		return;
-
-	iph = ip_hdr(skb);
-	if (iph->ihl < 5 || iph->version != 4)
-		return;
-
-	len = ntohs(iph->tot_len);
-	thoff = iph->ihl * 4;
-	if (skb->len < len)
-		return;
-	else if (len < thoff)
-		return;
-
-	pkt->tprot_set = true;
-	pkt->tprot = iph->protocol;
-	pkt->xt.thoff = thoff;
-	pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
-}
-
-static inline void
-__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
-			      struct sk_buff *skb,
-			      const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	struct ipv6hdr *ip6h, _ip6h;
-	unsigned int thoff = 0;
-	unsigned short frag_off;
-	int protohdr;
-	u32 pkt_len;
-
-	ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
-				  &_ip6h);
-	if (!ip6h)
-		return;
-
-	if (ip6h->version != 6)
-		return;
-
-	pkt_len = ntohs(ip6h->payload_len);
-	if (pkt_len + sizeof(*ip6h) > skb->len)
-		return;
-
-	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
-	if (protohdr < 0)
-                return;
-
-	pkt->tprot_set = true;
-	pkt->tprot = protohdr;
-	pkt->xt.thoff = thoff;
-	pkt->xt.fragoff = frag_off;
-#endif
-}
-
-static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
-					       struct sk_buff *skb,
-					       const struct nf_hook_state *state)
-{
-	nft_set_pktinfo(pkt, skb, state);
-	__nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
-}
-
 static unsigned int
 nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 		    const struct nf_hook_state *state)
@@ -98,10 +23,10 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
+		nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
 		break;
 	case htons(ETH_P_IPV6):
-		nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
+		nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
 		break;
 	default:
 		nft_set_pktinfo_unspec(&pkt, skb, state);

From 10151d7b03e23afce76a59f717f2616a10ddef86 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 12:42:52 +0200
Subject: [PATCH 18/53] netfilter: nf_tables_bridge: use nft_set_pktinfo_ipv{4,
 6}_validate

Consolidate pktinfo setup and validation by using the new generic
functions so we converge to the netdev family codebase.

We only need a linear IPv4 and IPv6 header from the reject expression,
so move nft_bridge_iphdr_validate() and nft_bridge_ip6hdr_validate()
to net/bridge/netfilter/nft_reject_bridge.c.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_bridge.h |  7 ---
 net/bridge/netfilter/nf_tables_bridge.c  | 72 +-----------------------
 net/bridge/netfilter/nft_reject_bridge.c | 44 ++++++++++++++-
 3 files changed, 45 insertions(+), 78 deletions(-)
 delete mode 100644 include/net/netfilter/nf_tables_bridge.h

diff --git a/include/net/netfilter/nf_tables_bridge.h b/include/net/netfilter/nf_tables_bridge.h
deleted file mode 100644
index 511fb79f6dad..000000000000
--- a/include/net/netfilter/nf_tables_bridge.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _NET_NF_TABLES_BRIDGE_H
-#define _NET_NF_TABLES_BRIDGE_H
-
-int nft_bridge_iphdr_validate(struct sk_buff *skb);
-int nft_bridge_ip6hdr_validate(struct sk_buff *skb);
-
-#endif /* _NET_NF_TABLES_BRIDGE_H */
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 29899887163e..06f0f81456a0 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -13,79 +13,11 @@
 #include <linux/module.h>
 #include <linux/netfilter_bridge.h>
 #include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_bridge.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <net/netfilter/nf_tables_ipv4.h>
 #include <net/netfilter/nf_tables_ipv6.h>
 
-int nft_bridge_iphdr_validate(struct sk_buff *skb)
-{
-	struct iphdr *iph;
-	u32 len;
-
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		return 0;
-
-	iph = ip_hdr(skb);
-	if (iph->ihl < 5 || iph->version != 4)
-		return 0;
-
-	len = ntohs(iph->tot_len);
-	if (skb->len < len)
-		return 0;
-	else if (len < (iph->ihl*4))
-		return 0;
-
-	if (!pskb_may_pull(skb, iph->ihl*4))
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate);
-
-int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
-{
-	struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return 0;
-
-	hdr = ipv6_hdr(skb);
-	if (hdr->version != 6)
-		return 0;
-
-	pkt_len = ntohs(hdr->payload_len);
-	if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
-
-static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
-					       struct sk_buff *skb,
-					       const struct nf_hook_state *state)
-{
-	if (nft_bridge_iphdr_validate(skb))
-		nft_set_pktinfo_ipv4(pkt, skb, state);
-	else
-		nft_set_pktinfo_unspec(pkt, skb, state);
-}
-
-static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
-					       struct sk_buff *skb,
-					       const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	if (nft_bridge_ip6hdr_validate(skb) &&
-	    nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
-		return;
-#endif
-	nft_set_pktinfo_unspec(pkt, skb, state);
-}
-
 static unsigned int
 nft_do_chain_bridge(void *priv,
 		    struct sk_buff *skb,
@@ -95,10 +27,10 @@ nft_do_chain_bridge(void *priv,
 
 	switch (eth_hdr(skb)->h_proto) {
 	case htons(ETH_P_IP):
-		nft_bridge_set_pktinfo_ipv4(&pkt, skb, state);
+		nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
 		break;
 	case htons(ETH_P_IPV6):
-		nft_bridge_set_pktinfo_ipv6(&pkt, skb, state);
+		nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
 		break;
 	default:
 		nft_set_pktinfo_unspec(&pkt, skb, state);
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 0b77ffbc27d6..4b3df6b0e3b9 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -14,7 +14,6 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nft_reject.h>
-#include <net/netfilter/nf_tables_bridge.h>
 #include <net/netfilter/ipv4/nf_reject.h>
 #include <net/netfilter/ipv6/nf_reject.h>
 #include <linux/ip.h>
@@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
 	skb_pull(nskb, ETH_HLEN);
 }
 
+static int nft_bridge_iphdr_validate(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	u32 len;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		return 0;
+
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 || iph->version != 4)
+		return 0;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len)
+		return 0;
+	else if (len < (iph->ihl*4))
+		return 0;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		return 0;
+
+	return 1;
+}
+
 /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
  * or the bridge port (NF_BRIDGE PREROUTING).
  */
@@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
 	br_forward(br_port_get_rcu(dev), nskb, false, true);
 }
 
+static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
+{
+	struct ipv6hdr *hdr;
+	u32 pkt_len;
+
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		return 0;
+
+	hdr = ipv6_hdr(skb);
+	if (hdr->version != 6)
+		return 0;
+
+	pkt_len = ntohs(hdr->payload_len);
+	if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+		return 0;
+
+	return 1;
+}
+
 static void nft_reject_br_send_v6_tcp_reset(struct net *net,
 					    struct sk_buff *oldskb,
 					    const struct net_device *dev,

From 71212c9b04eba76faa4dca26ccd1552d6bb300c1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 12:42:53 +0200
Subject: [PATCH 19/53] netfilter: nf_tables: don't drop IPv6 packets that
 cannot parse transport

This is overly conservative and not flexible at all, so better let them
go through and let the filtering policy decide what to do with them. We
use skb_header_pointer() all over the place so we would just fail to
match when trying to access fields from malformed traffic.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_ipv6.h    | 6 ++----
 net/ipv6/netfilter/nf_tables_ipv6.c       | 4 +---
 net/ipv6/netfilter/nft_chain_route_ipv6.c | 4 +---
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index 39b7b717b540..d150b5066201 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -4,7 +4,7 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/ipv6.h>
 
-static inline int
+static inline void
 nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 		     struct sk_buff *skb,
 		     const struct nf_hook_state *state)
@@ -17,15 +17,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
 	if (protohdr < 0) {
 		nft_set_pktinfo_proto_unspec(pkt, skb);
-		return -1;
+		return;
 	}
 
 	pkt->tprot_set = true;
 	pkt->tprot = protohdr;
 	pkt->xt.thoff = thoff;
 	pkt->xt.fragoff = frag_off;
-
-	return 0;
 }
 
 static inline int
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 30b22f4dff55..05d05926962a 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 {
 	struct nft_pktinfo pkt;
 
-	/* malformed packet, drop it */
-	if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
-		return NF_DROP;
+	nft_set_pktinfo_ipv6(&pkt, skb, state);
 
 	return nft_do_chain(&pkt, priv);
 }
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 71d995ff3108..01eb0f658366 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -32,9 +32,7 @@ static unsigned int nf_route_table_hook(void *priv,
 	u_int8_t hop_limit;
 	u32 mark, flowlabel;
 
-	/* malformed packet, drop it */
-	if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
-		return NF_DROP;
+	nft_set_pktinfo_ipv6(&pkt, skb, state);
 
 	/* save source/dest address, mark, hoplimit, flowlabel, priority */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));

From cf71c03edf10076f05a0b678fc9c8f8e6c6e24e4 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Fri, 9 Sep 2016 14:01:26 +0200
Subject: [PATCH 20/53] netfilter: nf_conntrack: simplify
 __nf_ct_try_assign_helper() return logic

Instead of several goto's just to return the result, simply return it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_helper.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index b989b81ac156..4ffe388a9a1e 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -189,7 +189,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	struct nf_conntrack_helper *helper = NULL;
 	struct nf_conn_help *help;
 	struct net *net = nf_ct_net(ct);
-	int ret = 0;
 
 	/* We already got a helper explicitly attached. The function
 	 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -223,15 +222,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	if (helper == NULL) {
 		if (help)
 			RCU_INIT_POINTER(help->helper, NULL);
-		goto out;
+		return 0;
 	}
 
 	if (help == NULL) {
 		help = nf_ct_helper_ext_add(ct, helper, flags);
-		if (help == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (help == NULL)
+			return -ENOMEM;
 	} else {
 		/* We only allow helper re-assignment of the same sort since
 		 * we cannot reallocate the helper extension area.
@@ -240,13 +237,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 
 		if (tmp && tmp->help != helper->help) {
 			RCU_INIT_POINTER(help->helper, NULL);
-			goto out;
+			return 0;
 		}
 	}
 
 	rcu_assign_pointer(help->helper, helper);
-out:
-	return ret;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
 

From 4e6577de71803142d01e374cf15664af0388799a Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Fri, 9 Sep 2016 23:25:09 +0800
Subject: [PATCH 21/53] netfilter: Add the missed return value check of
 register_netdevice_notifier

There are some codes of netfilter module which did not check the return
value of register_netdevice_notifier. Add the checks now.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_netdev.c | 18 +++++++++++++-----
 net/netfilter/nfnetlink_queue.c  |  9 ++++++++-
 net/netfilter/xt_TEE.c           |  8 +++++++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 3e5475a833a5..38a3e8385042 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -151,12 +151,20 @@ static int __init nf_tables_netdev_init(void)
 
 	nft_register_chain_type(&nft_filter_chain_netdev);
 	ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
-	if (ret < 0) {
-		nft_unregister_chain_type(&nft_filter_chain_netdev);
-		return ret;
-	}
-	register_netdevice_notifier(&nf_tables_netdev_notifier);
+	if (ret)
+		goto err1;
+
+	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
+	if (ret)
+		goto err2;
+
 	return 0;
+
+err2:
+	unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+err1:
+	nft_unregister_chain_type(&nft_filter_chain_netdev);
+	return ret;
 }
 
 static void __exit nf_tables_netdev_exit(void)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f49f45081acb..808da34f94cd 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1522,9 +1522,16 @@ static int __init nfnetlink_queue_init(void)
 		goto cleanup_netlink_notifier;
 	}
 
-	register_netdevice_notifier(&nfqnl_dev_notifier);
+	status = register_netdevice_notifier(&nfqnl_dev_notifier);
+	if (status < 0) {
+		pr_err("nf_queue: failed to register netdevice notifier\n");
+		goto cleanup_netlink_subsys;
+	}
+
 	return status;
 
+cleanup_netlink_subsys:
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 	unregister_pernet_subsys(&nfnl_queue_net_ops);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 6e57a3966dc5..0471db4032c5 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 
 	if (info->oif[0]) {
+		int ret;
+
 		if (info->oif[sizeof(info->oif)-1] != '\0')
 			return -EINVAL;
 
@@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 		priv->notifier.notifier_call = tee_netdev_event;
 		info->priv    = priv;
 
-		register_netdevice_notifier(&priv->notifier);
+		ret = register_netdevice_notifier(&priv->notifier);
+		if (ret) {
+			kfree(priv);
+			return ret;
+		}
 	} else
 		info->priv = NULL;
 

From 23d07508d25cea9984ee068538b4e86932b015c2 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Sat, 10 Sep 2016 10:04:30 +0800
Subject: [PATCH 22/53] netfilter: Add the missed return value check of
 nft_register_chain_type

There are some codes of netfilter module which did not check the return
value of nft_register_chain_type. Add the checks now.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nf_tables_bridge.c | 18 +++++++++++++-----
 net/ipv4/netfilter/nf_tables_arp.c      |  5 ++++-
 net/ipv4/netfilter/nf_tables_ipv4.c     |  5 ++++-
 net/ipv6/netfilter/nf_tables_ipv6.c     |  5 ++++-
 net/netfilter/nf_tables_inet.c          |  5 ++++-
 net/netfilter/nf_tables_netdev.c        |  5 ++++-
 6 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 06f0f81456a0..97afdc0744e6 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -139,12 +139,20 @@ static int __init nf_tables_bridge_init(void)
 	int ret;
 
 	nf_register_afinfo(&nf_br_afinfo);
-	nft_register_chain_type(&filter_bridge);
+	ret = nft_register_chain_type(&filter_bridge);
+	if (ret < 0)
+		goto err1;
+
 	ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
-	if (ret < 0) {
-		nft_unregister_chain_type(&filter_bridge);
-		nf_unregister_afinfo(&nf_br_afinfo);
-	}
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	nft_unregister_chain_type(&filter_bridge);
+err1:
+	nf_unregister_afinfo(&nf_br_afinfo);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 058c034be376..805c8ddfe860 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&filter_arp);
+	ret = nft_register_chain_type(&filter_arp);
+	if (ret < 0)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_arp_net_ops);
 	if (ret < 0)
 		nft_unregister_chain_type(&filter_arp);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index e44ba3b12fbb..2840a29b2e04 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&filter_ipv4);
+	ret = nft_register_chain_type(&filter_ipv4);
+	if (ret < 0)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
 	if (ret < 0)
 		nft_unregister_chain_type(&filter_ipv4);
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 05d05926962a..d6e4ba5de916 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -100,7 +100,10 @@ static int __init nf_tables_ipv6_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&filter_ipv6);
+	ret = nft_register_chain_type(&filter_ipv6);
+	if (ret < 0)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
 	if (ret < 0)
 		nft_unregister_chain_type(&filter_ipv6);
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 6b5f76295d3d..f713cc205669 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&filter_inet);
+	ret = nft_register_chain_type(&filter_inet);
+	if (ret < 0)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_inet_net_ops);
 	if (ret < 0)
 		nft_unregister_chain_type(&filter_inet);
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 38a3e8385042..9e2ae424b640 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -149,7 +149,10 @@ static int __init nf_tables_netdev_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&nft_filter_chain_netdev);
+	ret = nft_register_chain_type(&nft_filter_chain_netdev);
+	if (ret)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
 	if (ret)
 		goto err1;

From 6bd14303a908833e10ac731a3308eba938305269 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sun, 11 Sep 2016 22:05:27 +0800
Subject: [PATCH 23/53] netfilter: nf_queue: get rid of dependency on
 IP6_NF_IPTABLES

hash_v6 is used by both nftables and ip6tables, so depend on
IP6_NF_IPTABLES is not properly.

Actually, it only parses ipv6hdr and computes a hash value, so
even if IPV6 is disabled, there's no side effect too, remove it.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 0dbce55437f2..cc8a11f7e306 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -54,7 +54,6 @@ static inline u32 hash_v4(const struct sk_buff *skb, u32 jhash_initval)
 			(__force u32)iph->saddr, iph->protocol, jhash_initval);
 }
 
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval)
 {
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -77,7 +76,6 @@ static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval)
 
 	return jhash_3words(a, b, c, jhash_initval);
 }
-#endif
 
 static inline u32
 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
@@ -85,10 +83,8 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
 {
 	if (family == NFPROTO_IPV4)
 		queue += ((u64) hash_v4(skb, jhash_initval) * queues_total) >> 32;
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	else if (family == NFPROTO_IPV6)
 		queue += ((u64) hash_v6(skb, jhash_initval) * queues_total) >> 32;
-#endif
 
 	return queue;
 }

From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 11 Sep 2016 22:55:53 +0200
Subject: [PATCH 24/53] netfilter: conntrack: remove packet hotpath stats

These counters sit in hot path and do show up in perf, this is especially
true for 'found' and 'searched' which get incremented for every packet
processed.

Information like

searched=212030105
new=623431
found=333613
delete=623327

does not seem too helpful nowadays:

- on busy systems found and searched will overflow every few hours
(these are 32bit integers), other more busy ones every few days.

- for debugging there are better methods, such as iptables' trace target,
the conntrack log sysctls.  Nowadays we also have perf tool.

This removes packet path stat counters except those that
are expected to be 0 (or close to 0) on a normal system, e.g.
'insert_failed' (race happened) or 'invalid' (proto tracker rejects).

The insert stat is retained for the ctnetlink case.
The found stat is retained for the tuple-is-taken check when NAT has to
determine if it needs to pick a different source address.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h      |  4 ----
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |  8 ++++----
 net/netfilter/nf_conntrack_core.c                  | 14 ++------------
 net/netfilter/nf_conntrack_netlink.c               |  6 +-----
 net/netfilter/nf_conntrack_standalone.c            |  8 ++++----
 5 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 275505792664..1d1ef4e20512 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -4,13 +4,9 @@
 #include <uapi/linux/netfilter/nf_conntrack_common.h>
 
 struct ip_conntrack_stat {
-	unsigned int searched;
 	unsigned int found;
-	unsigned int new;
 	unsigned int invalid;
 	unsigned int ignore;
-	unsigned int delete;
-	unsigned int delete_list;
 	unsigned int insert;
 	unsigned int insert_failed;
 	unsigned int drop;
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 9df789709abe..6deb8867c5fc 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -231,13 +231,13 @@ enum ctattr_secctx {
 
 enum ctattr_stats_cpu {
 	CTA_STATS_UNSPEC,
-	CTA_STATS_SEARCHED,
+	CTA_STATS_SEARCHED,	/* no longer used */
 	CTA_STATS_FOUND,
-	CTA_STATS_NEW,
+	CTA_STATS_NEW,		/* no longer used */
 	CTA_STATS_INVALID,
 	CTA_STATS_IGNORE,
-	CTA_STATS_DELETE,
-	CTA_STATS_DELETE_LIST,
+	CTA_STATS_DELETE,	/* no longer used */
+	CTA_STATS_DELETE_LIST,	/* no longer used */
 	CTA_STATS_INSERT,
 	CTA_STATS_INSERT_FAILED,
 	CTA_STATS_DROP,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac1db4019d5c..8d1ddb9b63ed 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -379,7 +379,6 @@ static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
 	struct nf_conn *ct = (struct nf_conn *)nfct;
-	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_l4proto *l4proto;
 
 	pr_debug("destroy_conntrack(%p)\n", ct);
@@ -406,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
-	NF_CT_STAT_INC(net, delete);
 	local_bh_enable();
 
 	if (ct->master)
@@ -438,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
 
 	nf_ct_add_to_dying_list(ct);
 
-	NF_CT_STAT_INC(net, delete_list);
 	local_bh_enable();
 }
 
@@ -529,11 +526,8 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 		if (nf_ct_is_dying(ct))
 			continue;
 
-		if (nf_ct_key_equal(h, tuple, zone, net)) {
-			NF_CT_STAT_INC_ATOMIC(net, found);
+		if (nf_ct_key_equal(h, tuple, zone, net))
 			return h;
-		}
-		NF_CT_STAT_INC_ATOMIC(net, searched);
 	}
 	/*
 	 * if the nulls value we got at the end of this lookup is
@@ -798,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
 	nf_conntrack_double_unlock(hash, reply_hash);
-	NF_CT_STAT_INC(net, insert);
 	local_bh_enable();
 
 	help = nfct_help(ct);
@@ -857,7 +850,6 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			rcu_read_unlock();
 			return 1;
 		}
-		NF_CT_STAT_INC_ATOMIC(net, searched);
 	}
 
 	if (get_nulls_value(n) != hash) {
@@ -1177,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		}
 		spin_unlock(&nf_conntrack_expect_lock);
 	}
-	if (!exp) {
+	if (!exp)
 		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
-		NF_CT_STAT_INC(net, new);
-	}
 
 	/* Now it is inserted into the unconfirmed list, bump refcount */
 	nf_conntrack_get(&ct->ct_general);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c052b712c49f..27540455dc62 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1984,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
 	nfmsg->version      = NFNETLINK_V0;
 	nfmsg->res_id	    = htons(cpu);
 
-	if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
-	    nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
-	    nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+	if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
 	    nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
 	    nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
-	    nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
-	    nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
 	    nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
 	    nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
 				htonl(st->insert_failed)) ||
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 3d9a316a3c77..7d52f8401afd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -352,13 +352,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
 			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
 		   nr_conntracks,
-		   st->searched,
+		   0,
 		   st->found,
-		   st->new,
+		   0,
 		   st->invalid,
 		   st->ignore,
-		   st->delete,
-		   st->delete_list,
+		   0,
+		   0,
 		   st->insert,
 		   st->insert_failed,
 		   st->drop,

From 2e917d602acd9e3e8c6e4c43b213c8929d986503 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Mon, 12 Sep 2016 22:21:36 +0800
Subject: [PATCH 25/53] netfilter: nft_numgen: fix race between num generate
 and store it

After we generate a new number, we still use the priv->counter and
store it to the dreg. This is not correct, another cpu may already
change it to a new number. So we must use the generated number, not
the priv->counter itself.

Fixes: 91dbc6be0a62 ("netfilter: nf_tables: add number generator expression")
Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_numgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index f51a3ede3932..f173ebec30a7 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -37,7 +37,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
-	memcpy(&regs->data[priv->dreg], &priv->counter, sizeof(u32));
+	regs->data[priv->dreg] = nval;
 }
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {

From 14e2dee0996f51e0ff0d868497c7e1b90f012665 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 13 Sep 2016 10:21:46 +0200
Subject: [PATCH 26/53] netfilter: nft_hash: fix hash overflow validation

The overflow validation in the init() function establishes that the
maximum value that the hash could reach is less than U32_MAX, which is
likely to be true.

The fix detects the overflow when the maximum hash value is less than
the offset itself.

Fixes: 70ca767ea1b2 ("netfilter: nft_hash: Add hash offset value")
Reported-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index bd12f7a801c2..09473b415b95 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -76,7 +76,7 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	if (priv->modulus <= 1)
 		return -ERANGE;
 
-	if (priv->offset + priv->modulus - 1 < U32_MAX)
+	if (priv->offset + priv->modulus - 1 < priv->offset)
 		return -EOVERFLOW;
 
 	priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));

From 2b03bf732488a3c2e920afe22c03b82cb8477e28 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 13 Sep 2016 13:49:53 +0200
Subject: [PATCH 27/53] netfilter: nft_numgen: add number generation offset

Add support of an offset value for incremental counter and random. With
this option the sysadmin is able to start the counter to a certain value
and then apply the generated number.

Example:

	meta mark set numgen inc mod 2 offset 100

This will generate marks with the serie 100, 101, 100, 101, ...

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_numgen.c               | 32 +++++++++++++++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index bc0eb6a1066d..bcfb892ff148 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1136,12 +1136,14 @@ enum nft_trace_types {
  * @NFTA_NG_DREG: destination register (NLA_U32)
  * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
+ * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
 	NFTA_NG_DREG,
 	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
+	NFTA_NG_OFFSET,
 	__NFTA_NG_MAX
 };
 #define NFTA_NG_MAX	(__NFTA_NG_MAX - 1)
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index f173ebec30a7..55bc5ab78d4a 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -23,6 +23,7 @@ struct nft_ng_inc {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	atomic_t		counter;
+	u32			offset;
 };
 
 static void nft_ng_inc_eval(const struct nft_expr *expr,
@@ -37,13 +38,14 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
-	regs->data[priv->dreg] = nval;
+	regs->data[priv->dreg] = nval + priv->offset;
 }
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_DREG]		= { .type = NLA_U32 },
 	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
+	[NFTA_NG_OFFSET]	= { .type = NLA_U32 },
 };
 
 static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -52,10 +54,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_inc *priv = nft_expr_priv(expr);
 
+	if (tb[NFTA_NG_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
 	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
 	if (priv->modulus == 0)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < priv->offset)
+		return -EOVERFLOW;
+
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
 	atomic_set(&priv->counter, 0);
 
@@ -64,7 +72,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
-		       u32 modulus, enum nft_ng_types type)
+		       u32 modulus, enum nft_ng_types type, u32 offset)
 {
 	if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
 		goto nla_put_failure;
@@ -72,6 +80,8 @@ static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset)))
+		goto nla_put_failure;
 
 	return 0;
 
@@ -83,12 +93,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL,
+			   priv->offset);
 }
 
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
+	u32			offset;
 };
 
 static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -97,9 +109,10 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+	u32 val;
 
-	regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
-						  priv->modulus);
+	val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
+	regs->data[priv->dreg] = val + priv->offset;
 }
 
 static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -108,10 +121,16 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 
+	if (tb[NFTA_NG_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
 	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
 	if (priv->modulus == 0)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < priv->offset)
+		return -EOVERFLOW;
+
 	prandom_init_once(&nft_numgen_prandom_state);
 
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
@@ -124,7 +143,8 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM,
+			   priv->offset);
 }
 
 static struct nft_expr_type nft_ng_type;

From 36b701fae12ac763a568037e4e7c96b5727a8b3e Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Wed, 14 Sep 2016 15:00:02 +0200
Subject: [PATCH 28/53] netfilter: nf_tables: validate maximum value of u32
 netlink attributes

Fetch value and validate u32 netlink attribute. This validation is
usually required when the u32 netlink attributes are being stored in a
field whose size is smaller.

This patch revisits 4da449ae1df9 ("netfilter: nft_exthdr: Add size check
on u8 nft_exthdr attributes").

Fixes: 96518518cc41 ("netfilter: add nftables")
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  1 +
 net/netfilter/nf_tables_api.c     | 25 +++++++++++++++++++++++++
 net/netfilter/nft_bitwise.c       |  8 +++++++-
 net/netfilter/nft_byteorder.c     | 15 +++++++++++++--
 net/netfilter/nft_cmp.c           |  3 +++
 net/netfilter/nft_exthdr.c        | 12 +++++++-----
 net/netfilter/nft_immediate.c     |  4 ++++
 7 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index a7a7cebc8d07..5031e072567b 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -145,6 +145,7 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
 	return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
 }
 
+unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest);
 unsigned int nft_parse_register(const struct nlattr *attr);
 int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bd9715e5ff26..b70d3ea1430e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4409,6 +4409,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 	return 0;
 }
 
+/**
+ *	nft_parse_u32_check - fetch u32 attribute and check for maximum value
+ *
+ *	@attr: netlink attribute to fetch value from
+ *	@max: maximum value to be stored in dest
+ *	@dest: pointer to the variable
+ *
+ *	Parse, check and store a given u32 netlink attribute into variable.
+ *	This function returns -ERANGE if the value goes over maximum value.
+ *	Otherwise a 0 is returned and the attribute value is stored in the
+ *	destination variable.
+ */
+unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
+{
+	int val;
+
+	val = ntohl(nla_get_be32(attr));
+	if (val > max)
+		return -ERANGE;
+
+	*dest = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_u32_check);
+
 /**
  *	nft_parse_register - parse a register value from a netlink attribute
  *
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index d71cc18fa35d..31c15ed2e5fc 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 {
 	struct nft_bitwise *priv = nft_expr_priv(expr);
 	struct nft_data_desc d1, d2;
+	u32 len;
 	int err;
 
 	if (tb[NFTA_BITWISE_SREG] == NULL ||
@@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 	    tb[NFTA_BITWISE_XOR] == NULL)
 		return -EINVAL;
 
-	priv->len  = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	priv->len = len;
+
 	priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
 	err = nft_validate_register_load(priv->sreg, priv->len);
 	if (err < 0)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index b78c28ba465f..ee63d981268d 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
 			      const struct nlattr * const tb[])
 {
 	struct nft_byteorder *priv = nft_expr_priv(expr);
+	u32 size, len;
 	int err;
 
 	if (tb[NFTA_BYTEORDER_SREG] == NULL ||
@@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
 		return -EINVAL;
 	}
 
-	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+	err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size);
+	if (err < 0)
+		return err;
+
+	priv->size = size;
+
 	switch (priv->size) {
 	case 2:
 	case 4:
@@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
 	}
 
 	priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
-	priv->len  = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	priv->len = len;
+
 	err = nft_validate_register_load(priv->sreg, priv->len);
 	if (err < 0)
 		return err;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index e25b35d70e4d..2e53739812b1 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	if (err < 0)
 		return err;
 
+	if (desc.len > U8_MAX)
+		return -ERANGE;
+
 	priv->op  = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
 	priv->len = desc.len;
 	return 0;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 82c264e40278..a84cf3d66056 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
-	u32 offset, len;
+	u32 offset, len, err;
 
 	if (tb[NFTA_EXTHDR_DREG] == NULL ||
 	    tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 	    tb[NFTA_EXTHDR_LEN] == NULL)
 		return -EINVAL;
 
-	offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
-	len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
+	if (err < 0)
+		return err;
 
-	if (offset > U8_MAX || len > U8_MAX)
-		return -ERANGE;
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
 
 	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
 	priv->offset = offset;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index db3b746858e3..d17018ff54e6 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
 			    tb[NFTA_IMMEDIATE_DATA]);
 	if (err < 0)
 		return err;
+
+	if (desc.len > U8_MAX)
+		return -ERANGE;
+
 	priv->dlen = desc.len;
 
 	priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);

From 8061bb54436c19fd16b7c734a69ff60bac26e3e9 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Wed, 14 Sep 2016 23:41:46 +0800
Subject: [PATCH 29/53] netfilter: nft_queue: add _SREG_QNUM attr to select the
 queue number

Currently, the user can specify the queue numbers by _QUEUE_NUM and
_QUEUE_TOTAL attributes, this is enough in most situations.

But acctually, it is not very flexible, for example:
  tcp dport 80 mapped to queue0
  tcp dport 81 mapped to queue1
  tcp dport 82 mapped to queue2
In order to do this thing, we must add 3 nft rules, and more
mapping meant more rules ...

So take one register to select the queue number, then we can add one
simple rule to mapping queues, maybe like this:
  queue num tcp dport map { 80:0, 81:1, 82:2 ... }

Florian Westphal also proposed wider usage scenarios:
  queue num jhash ip saddr . ip daddr mod ...
  queue num meta cpu ...
  queue num meta mark ...

The last point is how to load a queue number from sreg, although we can
use *(u16*)&regs->data[reg] to load the queue number, just like nat expr
to load its l4port do.

But we will cooperate with hash expr, meta cpu, meta mark expr and so on.
They all store the result to u32 type, so cast it to u16 pointer and
dereference it will generate wrong result in the big endian system.

So just keep it simple, we treat queue number as u32 type, although u16
type is already enough.

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/netfilter/nft_queue.c                | 102 ++++++++++++++++++++---
 2 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index bcfb892ff148..1cf41dd838b2 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -894,12 +894,14 @@ enum nft_log_attributes {
  * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16)
  * @NFTA_QUEUE_TOTAL: number of queues to load balance packets on (NLA_U16)
  * @NFTA_QUEUE_FLAGS: various flags (NLA_U16)
+ * @NFTA_QUEUE_SREG_QNUM: source register of queue number (NLA_U32: nft_registers)
  */
 enum nft_queue_attributes {
 	NFTA_QUEUE_UNSPEC,
 	NFTA_QUEUE_NUM,
 	NFTA_QUEUE_TOTAL,
 	NFTA_QUEUE_FLAGS,
+	NFTA_QUEUE_SREG_QNUM,
 	__NFTA_QUEUE_MAX
 };
 #define NFTA_QUEUE_MAX		(__NFTA_QUEUE_MAX - 1)
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index d16d59959ff6..393d359a1889 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -22,9 +22,10 @@
 static u32 jhash_initval __read_mostly;
 
 struct nft_queue {
-	u16	queuenum;
-	u16	queues_total;
-	u16	flags;
+	enum nft_registers	sreg_qnum:8;
+	u16			queuenum;
+	u16			queues_total;
+	u16			flags;
 };
 
 static void nft_queue_eval(const struct nft_expr *expr,
@@ -54,26 +55,39 @@ static void nft_queue_eval(const struct nft_expr *expr,
 	regs->verdict.code = ret;
 }
 
+static void nft_queue_sreg_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_queue *priv = nft_expr_priv(expr);
+	u32 queue, ret;
+
+	queue = regs->data[priv->sreg_qnum];
+
+	ret = NF_QUEUE_NR(queue);
+	if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+	regs->verdict.code = ret;
+}
+
 static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
 	[NFTA_QUEUE_NUM]	= { .type = NLA_U16 },
 	[NFTA_QUEUE_TOTAL]	= { .type = NLA_U16 },
 	[NFTA_QUEUE_FLAGS]	= { .type = NLA_U16 },
+	[NFTA_QUEUE_SREG_QNUM]	= { .type = NLA_U32 },
 };
 
 static int nft_queue_init(const struct nft_ctx *ctx,
-			   const struct nft_expr *expr,
-			   const struct nlattr * const tb[])
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
 {
 	struct nft_queue *priv = nft_expr_priv(expr);
 	u32 maxid;
 
-	if (tb[NFTA_QUEUE_NUM] == NULL)
-		return -EINVAL;
-
-	init_hashrandom(&jhash_initval);
 	priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
 
-	if (tb[NFTA_QUEUE_TOTAL] != NULL)
+	if (tb[NFTA_QUEUE_TOTAL])
 		priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
 	else
 		priv->queues_total = 1;
@@ -85,7 +99,7 @@ static int nft_queue_init(const struct nft_ctx *ctx,
 	if (maxid > U16_MAX)
 		return -ERANGE;
 
-	if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+	if (tb[NFTA_QUEUE_FLAGS]) {
 		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
 		if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
 			return -EINVAL;
@@ -93,6 +107,29 @@ static int nft_queue_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
+static int nft_queue_sreg_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_queue *priv = nft_expr_priv(expr);
+	int err;
+
+	priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
+	err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_QUEUE_FLAGS]) {
+		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+		if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+			return -EINVAL;
+		if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT)
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_queue *priv = nft_expr_priv(expr);
@@ -108,6 +145,21 @@ static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+static int
+nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_queue *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) ||
+	    nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_queue_type;
 static const struct nft_expr_ops nft_queue_ops = {
 	.type		= &nft_queue_type,
@@ -117,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = {
 	.dump		= nft_queue_dump,
 };
 
+static const struct nft_expr_ops nft_queue_sreg_ops = {
+	.type		= &nft_queue_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+	.eval		= nft_queue_sreg_eval,
+	.init		= nft_queue_sreg_init,
+	.dump		= nft_queue_sreg_dump,
+};
+
+static const struct nft_expr_ops *
+nft_queue_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM])
+		return ERR_PTR(-EINVAL);
+
+	init_hashrandom(&jhash_initval);
+
+	if (tb[NFTA_QUEUE_NUM])
+		return &nft_queue_ops;
+
+	if (tb[NFTA_QUEUE_SREG_QNUM])
+		return &nft_queue_sreg_ops;
+
+	return ERR_PTR(-EINVAL);
+}
+
 static struct nft_expr_type nft_queue_type __read_mostly = {
 	.name		= "queue",
-	.ops		= &nft_queue_ops,
+	.select_ops	= &nft_queue_select_ops,
 	.policy		= nft_queue_policy,
 	.maxattr	= NFTA_QUEUE_MAX,
 	.owner		= THIS_MODULE,

From 2462f3f4a7e079192b78f36900c34f18dad824a7 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Thu, 15 Sep 2016 20:50:16 +0800
Subject: [PATCH 30/53] netfilter: nf_queue: improve queue range support for
 bridge family

After commit ac2863445686 ("netfilter: bridge: add nf_afinfo to enable
queuing to userspace"), we can queue packets to the user space in bridge
family. But when the user specify the queue range, packets will be only
delivered to the first queue num. Because in nfqueue_hash, we only support
ipv4 and ipv6 family. Now add support for bridge family too.

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 56 ++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index cc8a11f7e306..903dca050fb6 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -41,22 +41,19 @@ static inline void init_hashrandom(u32 *jhash_initval)
 		*jhash_initval = prandom_u32();
 }
 
-static inline u32 hash_v4(const struct sk_buff *skb, u32 jhash_initval)
+static inline u32 hash_v4(const struct iphdr *iph, u32 initval)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-
 	/* packets in either direction go into same queue */
 	if ((__force u32)iph->saddr < (__force u32)iph->daddr)
 		return jhash_3words((__force u32)iph->saddr,
-			(__force u32)iph->daddr, iph->protocol, jhash_initval);
+			(__force u32)iph->daddr, iph->protocol, initval);
 
 	return jhash_3words((__force u32)iph->daddr,
-			(__force u32)iph->saddr, iph->protocol, jhash_initval);
+			(__force u32)iph->saddr, iph->protocol, initval);
 }
 
-static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval)
+static inline u32 hash_v6(const struct ipv6hdr *ip6h, u32 initval)
 {
-	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	u32 a, b, c;
 
 	if ((__force u32)ip6h->saddr.s6_addr32[3] <
@@ -74,17 +71,50 @@ static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval)
 	else
 		c = (__force u32) ip6h->daddr.s6_addr32[1];
 
-	return jhash_3words(a, b, c, jhash_initval);
+	return jhash_3words(a, b, c, initval);
+}
+
+static inline u32 hash_bridge(const struct sk_buff *skb, u32 initval)
+{
+	struct ipv6hdr *ip6h, _ip6h;
+	struct iphdr *iph, _iph;
+
+	switch (eth_hdr(skb)->h_proto) {
+	case htons(ETH_P_IP):
+		iph = skb_header_pointer(skb, skb_network_offset(skb),
+					 sizeof(*iph), &_iph);
+		if (iph)
+			return hash_v4(iph, initval);
+		break;
+	case htons(ETH_P_IPV6):
+		ip6h = skb_header_pointer(skb, skb_network_offset(skb),
+					  sizeof(*ip6h), &_ip6h);
+		if (ip6h)
+			return hash_v6(ip6h, initval);
+		break;
+	}
+
+	return 0;
 }
 
 static inline u32
 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
-	     u32 jhash_initval)
+	     u32 initval)
 {
-	if (family == NFPROTO_IPV4)
-		queue += ((u64) hash_v4(skb, jhash_initval) * queues_total) >> 32;
-	else if (family == NFPROTO_IPV6)
-		queue += ((u64) hash_v6(skb, jhash_initval) * queues_total) >> 32;
+	switch (family) {
+	case NFPROTO_IPV4:
+		queue += reciprocal_scale(hash_v4(ip_hdr(skb), initval),
+					  queues_total);
+		break;
+	case NFPROTO_IPV6:
+		queue += reciprocal_scale(hash_v6(ipv6_hdr(skb), initval),
+					  queues_total);
+		break;
+	case NFPROTO_BRIDGE:
+		queue += reciprocal_scale(hash_bridge(skb, initval),
+					  queues_total);
+		break;
+	}
 
 	return queue;
 }

From 8dc3c2b86bb16e8f345b80a8af69696e9a7edb65 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Thu, 15 Sep 2016 21:29:08 +0800
Subject: [PATCH 31/53] netfilter: nf_tables: improve nft payload fast eval

There's an off-by-one issue in nft_payload_fast_eval, skb_tail_pointer
and ptr + priv->len all point to the last valid address plus 1. So if
they are equal, we can still fetch the valid data. It's unnecessary to
fall back to nft_payload_eval.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index fb8b5892b5ff..36ba4e55d84e 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -98,7 +98,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 
 	ptr += priv->offset;
 
-	if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+	if (unlikely(ptr + priv->len > skb_tail_pointer(skb)))
 		return false;
 
 	*dest = 0;

From a20877b5edec4d2b62560b5245199af04846476c Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sat, 17 Sep 2016 14:31:20 +0800
Subject: [PATCH 32/53] netfilter: nf_tables: check tprot_set first when we use
 xt.thoff

pkt->xt.thoff is not always set properly, but we use it without any check.
For payload expr, it will cause wrong results. For nftrace, we may notify
the wrong network or transport header to the user space, furthermore,
input the following nft rules, warning message will be printed out:
  # nft add rule arp filter output meta nftrace set 1

  WARNING: CPU: 0 PID: 13428 at net/netfilter/nf_tables_trace.c:263
  nft_trace_notify+0x4a3/0x5e0 [nf_tables]
  Call Trace:
  [<ffffffff813d58ae>] dump_stack+0x63/0x85
  [<ffffffff810a4c0b>] __warn+0xcb/0xf0
  [<ffffffff810a4d3d>] warn_slowpath_null+0x1d/0x20
  [<ffffffffa0589703>] nft_trace_notify+0x4a3/0x5e0 [nf_tables]
  [ ... ]
  [<ffffffffa05690a8>] nft_do_chain_arp+0x78/0x90 [nf_tables_arp]
  [<ffffffff816f4aa2>] nf_iterate+0x62/0x80
  [<ffffffff816f4b33>] nf_hook_slow+0x73/0xd0
  [<ffffffff81732bbf>] arp_xmit+0x8f/0xb0
  [ ... ]
  [<ffffffff81732d36>] arp_solicit+0x106/0x2c0

So before we use pkt->xt.thoff, check the tprot_set first.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c  |  5 ++++-
 net/netfilter/nf_tables_trace.c | 20 +++++++++++---------
 net/netfilter/nft_payload.c     |  4 ++++
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 36ba4e55d84e..67259cefef06 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -93,8 +93,11 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 
 	if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
 		ptr = skb_network_header(skb);
-	else
+	else {
+		if (!pkt->tprot_set)
+			return false;
 		ptr = skb_network_header(skb) + pkt->xt.thoff;
+	}
 
 	ptr += priv->offset;
 
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 39eb1cc62e91..696fe8f6f2f2 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -113,20 +113,22 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
 				  const struct nft_pktinfo *pkt)
 {
 	const struct sk_buff *skb = pkt->skb;
-	unsigned int len = min_t(unsigned int,
-				 pkt->xt.thoff - skb_network_offset(skb),
-				 NFT_TRACETYPE_NETWORK_HSIZE);
 	int off = skb_network_offset(skb);
+	unsigned int len, nh_end;
 
+	nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+	len = min_t(unsigned int, nh_end - skb_network_offset(skb),
+		    NFT_TRACETYPE_NETWORK_HSIZE);
 	if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
 		return -1;
 
-	len = min_t(unsigned int, skb->len - pkt->xt.thoff,
-		    NFT_TRACETYPE_TRANSPORT_HSIZE);
-
-	if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
-			      pkt->xt.thoff, len))
-		return -1;
+	if (pkt->tprot_set) {
+		len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+			    NFT_TRACETYPE_TRANSPORT_HSIZE);
+		if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
+				      pkt->xt.thoff, len))
+			return -1;
+	}
 
 	if (!skb_mac_header_was_set(skb))
 		return 0;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 12cd4bf16d17..b2f88617611a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -92,6 +92,8 @@ static void nft_payload_eval(const struct nft_expr *expr,
 		offset = skb_network_offset(skb);
 		break;
 	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		if (!pkt->tprot_set)
+			goto err;
 		offset = pkt->xt.thoff;
 		break;
 	default:
@@ -184,6 +186,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
 		offset = skb_network_offset(skb);
 		break;
 	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		if (!pkt->tprot_set)
+			goto err;
 		offset = pkt->xt.thoff;
 		break;
 	default:

From 7bdc66242de7f9cbe8dbb01757042dd18744d800 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Sun, 18 Sep 2016 10:52:25 +0800
Subject: [PATCH 33/53] netfilter: Enhance the codes used to get random once

There are some codes which are used to get one random once in netfilter.
We could use net_get_random_once to simplify these codes.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_RATEEST.c   | 6 +-----
 net/netfilter/xt_connlimit.c | 8 +-------
 net/netfilter/xt_recent.c    | 7 ++-----
 3 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 515131f9e021..dbd6c4a12b97 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex);
 #define RATEEST_HSIZE	16
 static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
 static unsigned int jhash_rnd __read_mostly;
-static bool rnd_inited __read_mostly;
 
 static unsigned int xt_rateest_hash(const char *name)
 {
@@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	} cfg;
 	int ret;
 
-	if (unlikely(!rnd_inited)) {
-		get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
-		rnd_inited = true;
-	}
+	net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
 	est = xt_rateest_lookup(info->name);
 	if (est) {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 99bbc829868d..b6dc322593a3 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -366,14 +366,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 	unsigned int i;
 	int ret;
 
-	if (unlikely(!connlimit_rnd)) {
-		u_int32_t rand;
+	net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
 
-		do {
-			get_random_bytes(&rand, sizeof(rand));
-		} while (!rand);
-		cmpxchg(&connlimit_rnd, 0, rand);
-	}
 	ret = nf_ct_l3proto_try_module_get(par->family);
 	if (ret < 0) {
 		pr_info("cannot load conntrack support for "
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d725a27743a1..e3b7a09b103e 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops;
 #endif
 
 static u_int32_t hash_rnd __read_mostly;
-static bool hash_rnd_inited __read_mostly;
 
 static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
 {
@@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
 	int ret = -EINVAL;
 	size_t sz;
 
-	if (unlikely(!hash_rnd_inited)) {
-		get_random_bytes(&hash_rnd, sizeof(hash_rnd));
-		hash_rnd_inited = true;
-	}
+	net_get_random_once(&hash_rnd, sizeof(hash_rnd));
+
 	if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
 		pr_info("Unsupported user space flags (%08x)\n",
 			info->check_set);

From b9d80f83bf8c3485ae53a4f3a715363d764bb0e4 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Tue, 20 Sep 2016 10:31:04 +0800
Subject: [PATCH 34/53] netfilter: xt_helper: Use sizeof(variable) instead of
 literal number

It's better to use sizeof(info->name)-1 as index to force set the string
tail instead of literal number '29'.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 9f4ab00c8050..805c9f64a04c 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -65,7 +65,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
 			par->family);
 		return ret;
 	}
-	info->name[29] = '\0';
+	info->name[sizeof(info->name) - 1] = '\0';
 	return 0;
 }
 

From 4004d5c374dabcbce201e16442e4596b764cc60b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 20 Sep 2016 18:22:46 +0200
Subject: [PATCH 35/53] netfilter: nft_lookup: remove superfluous element found
 check

We already checked for !found just a bit before:

        if (!found) {
                regs->verdict.code = NFT_BREAK;
                return;
        }

        if (found && set->flags & NFT_SET_MAP)
            ^^^^^

So this redundant check can just go away.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_lookup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index e164325d1bc0..8166b6994cc7 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -43,7 +43,7 @@ static void nft_lookup_eval(const struct nft_expr *expr,
 		return;
 	}
 
-	if (found && set->flags & NFT_SET_MAP)
+	if (set->flags & NFT_SET_MAP)
 		nft_data_copy(&regs->data[priv->dreg],
 			      nft_set_ext_data(ext), set->dlen);
 

From 50f4c7b73f831a53fa9ddeb9bdf4cfb5b23d3aa7 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Wed, 7 Sep 2016 10:40:24 +0800
Subject: [PATCH 36/53] netfilter: xt_TCPMSS: Refactor the codes to decrease
 one condition check and more readable

The origin codes perform two condition checks with dst_mtu(skb_dst(skb))
and in_mtu. And the last statement is "min(dst_mtu(skb_dst(skb)),
in_mtu) - minlen". It may let reader think about how about the result.
Would it be negative.

Now assign the result of min(dst_mtu(skb_dst(skb)), in_mtu) to a new
variable, then only perform one condition check, and it is more readable.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_TCPMSS.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e118397254af..872db2d0e2a9 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -110,18 +110,14 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
 		struct net *net = par->net;
 		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
+		unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
 
-		if (dst_mtu(skb_dst(skb)) <= minlen) {
+		if (min_mtu <= minlen) {
 			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    dst_mtu(skb_dst(skb)));
+					    min_mtu);
 			return -1;
 		}
-		if (in_mtu <= minlen) {
-			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    in_mtu);
-			return -1;
-		}
-		newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
+		newmss = min_mtu - minlen;
 	} else
 		newmss = info->mss;
 

From c5136b15ea364124299c8a9ba96b300e96061e3a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Sep 2016 11:35:01 -0400
Subject: [PATCH 37/53] netfilter: bridge: add and use br_nf_hook_thresh

This replaces the last uses of NF_HOOK_THRESH().
Followup patch will remove it and rename nf_hook_thresh.

The reason is that inet (non-bridge) netfilter no longer invokes the
hooks from hooks, so we do no longer need the thresh value to skip hooks
with a lower priority.

The bridge netfilter however may need to do this. br_nf_hook_thresh is a
wrapper that is supposed to do this, i.e. only call hooks with a
priority that exceeds NF_BR_PRI_BRNF.

It's used only in the recursion cases of br_netfilter.  It invokes
nf_hook_slow while holding an rcu read-side critical section to make a
future cleanup simpler.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/br_netfilter.h |  6 +++
 net/bridge/br_netfilter_hooks.c      | 60 +++++++++++++++++++++++-----
 net/bridge/br_netfilter_ipv6.c       | 12 +++---
 3 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index e8d1448425a7..0b0c35c37125 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -15,6 +15,12 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
 
 void nf_bridge_update_protocol(struct sk_buff *skb);
 
+int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk,
+		      struct sk_buff *skb, struct net_device *indev,
+		      struct net_device *outdev,
+		      int (*okfn)(struct net *, struct sock *,
+				  struct sk_buff *));
+
 static inline struct nf_bridge_info *
 nf_bridge_info_get(const struct sk_buff *skb)
 {
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 77e7f69bf80d..6029af47377d 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -30,6 +30,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
 #include <linux/in_route.h>
+#include <linux/rculist.h>
 #include <linux/inetdevice.h>
 
 #include <net/ip.h>
@@ -395,11 +396,10 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
 				skb->dev = nf_bridge->physindev;
 				nf_bridge_update_protocol(skb);
 				nf_bridge_push_encap_header(skb);
-				NF_HOOK_THRESH(NFPROTO_BRIDGE,
-					       NF_BR_PRE_ROUTING,
-					       net, sk, skb, skb->dev, NULL,
-					       br_nf_pre_routing_finish_bridge,
-					       1);
+				br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+						  net, sk, skb, skb->dev,
+						  NULL,
+						  br_nf_pre_routing_finish);
 				return 0;
 			}
 			ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -417,10 +417,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
 	skb->dev = nf_bridge->physindev;
 	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
-	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
-		       skb->dev, NULL,
-		       br_handle_frame_finish, 1);
-
+	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
+			  br_handle_frame_finish);
 	return 0;
 }
 
@@ -992,6 +990,50 @@ static struct notifier_block brnf_notifier __read_mostly = {
 	.notifier_call = brnf_device_event,
 };
 
+/* recursively invokes nf_hook_slow (again), skipping already-called
+ * hooks (< NF_BR_PRI_BRNF).
+ *
+ * Called with rcu read lock held.
+ */
+int br_nf_hook_thresh(unsigned int hook, struct net *net,
+		      struct sock *sk, struct sk_buff *skb,
+		      struct net_device *indev,
+		      struct net_device *outdev,
+		      int (*okfn)(struct net *, struct sock *,
+				  struct sk_buff *))
+{
+	struct nf_hook_ops *elem;
+	struct nf_hook_state state;
+	struct list_head *head;
+	int ret;
+
+	head = &net->nf.hooks[NFPROTO_BRIDGE][hook];
+
+	list_for_each_entry_rcu(elem, head, list) {
+		struct nf_hook_ops *next;
+
+		next = list_entry_rcu(list_next_rcu(&elem->list),
+				      struct nf_hook_ops, list);
+		if (next->priority <= NF_BR_PRI_BRNF)
+			continue;
+	}
+
+	if (&elem->list == head)
+		return okfn(net, sk, skb);
+
+	/* We may already have this, but read-locks nest anyway */
+	rcu_read_lock();
+	nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1,
+			   NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
+
+	ret = nf_hook_slow(skb, &state);
+	rcu_read_unlock();
+	if (ret == 1)
+		ret = okfn(net, sk, skb);
+
+	return ret;
+}
+
 #ifdef CONFIG_SYSCTL
 static
 int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5e59a8457e7b..5989661c659f 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 			skb->dev = nf_bridge->physindev;
 			nf_bridge_update_protocol(skb);
 			nf_bridge_push_encap_header(skb);
-			NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
-				       net, sk, skb, skb->dev, NULL,
-				       br_nf_pre_routing_finish_bridge,
-				       1);
+			br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+					  net, sk, skb, skb->dev, NULL,
+					  br_nf_pre_routing_finish_bridge);
 			return 0;
 		}
 		ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 	skb->dev = nf_bridge->physindev;
 	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
-	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
-		       skb->dev, NULL,
-		       br_handle_frame_finish, 1);
+	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
+			  skb->dev, NULL, br_handle_frame_finish);
 
 	return 0;
 }

From fe72926b792e52ab00abfa81a201805bfb2247d6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Sep 2016 11:35:02 -0400
Subject: [PATCH 38/53] netfilter: call nf_hook_state_init with rcu_read_lock
 held

This makes things simpler because we can store the head of the list
in the nf_state structure without worrying about concurrent add/delete
of hook elements from the list.

A future commit will make use of this to implement a simpler
linked-list.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 8 +++++++-
 include/linux/netfilter_ingress.h | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..ad444f0b4ed0 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -174,10 +174,16 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 
 	if (!list_empty(hook_list)) {
 		struct nf_hook_state state;
+		int ret;
 
+		/* We may already have this, but read-locks nest anyway */
+		rcu_read_lock();
 		nf_hook_state_init(&state, hook_list, hook, thresh,
 				   pf, indev, outdev, sk, net, okfn);
-		return nf_hook_slow(skb, &state);
+
+		ret = nf_hook_slow(skb, &state);
+		rcu_read_unlock();
+		return ret;
 	}
 	return 1;
 }
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 5fcd375ef175..6965ba09eba7 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -14,6 +14,7 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
 	return !list_empty(&skb->dev->nf_hooks_ingress);
 }
 
+/* caller must hold rcu_read_lock */
 static inline int nf_hook_ingress(struct sk_buff *skb)
 {
 	struct nf_hook_state state;

From 2c1e2703ff812ccaa42a4bc8a25803955e342b85 Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Wed, 21 Sep 2016 11:35:03 -0400
Subject: [PATCH 39/53] netfilter: call nf_hook_ingress with rcu_read_lock

This commit ensures that the rcu read-side lock is held while the
ingress hook is called.  This ensures that a call to nf_hook_slow (and
ultimately nf_ingress) will be read protected.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/core/dev.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 34b5322bc081..064919425b7d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4040,12 +4040,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 {
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (nf_hook_ingress_active(skb)) {
+		int ingress_retval;
+
 		if (*pt_prev) {
 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 			*pt_prev = NULL;
 		}
 
-		return nf_hook_ingress(skb);
+		rcu_read_lock();
+		ingress_retval = nf_hook_ingress(skb);
+		rcu_read_unlock();
+		return ingress_retval;
 	}
 #endif /* CONFIG_NETFILTER_INGRESS */
 	return 0;

From e2361cb90a0327bdab34d01d1a7b9dbd67c31e60 Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Wed, 21 Sep 2016 11:35:04 -0400
Subject: [PATCH 40/53] netfilter: Remove explicit rcu_read_lock in
 nf_hook_slow

All of the callers of nf_hook_slow already hold the rcu_read_lock, so this
cleanup removes the recursive call.  This is just a cleanup, as the locking
code gracefully handles this situation.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_redirect.c            | 2 +-
 net/bridge/netfilter/ebtables.c                | 2 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 2 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +-
 net/netfilter/core.c                           | 6 +-----
 net/netfilter/nf_conntrack_core.c              | 2 +-
 net/netfilter/nf_conntrack_h323_main.c         | 2 +-
 net/netfilter/nf_conntrack_helper.c            | 2 +-
 net/netfilter/nfnetlink_cthelper.c             | 2 +-
 net/netfilter/nfnetlink_log.c                  | 8 ++++++--
 net/netfilter/nfnetlink_queue.c                | 2 +-
 net/netfilter/xt_helper.c                      | 2 +-
 14 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 203964997a51..2e7c4f974340 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -24,7 +24,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		return EBT_DROP;
 
 	if (par->hooknum != NF_BR_BROUTING)
-		/* rcu_read_lock()ed by nf_hook_slow */
+		/* rcu_read_lock()ed by nf_hook_thresh */
 		ether_addr_copy(eth_hdr(skb)->h_dest,
 				br_port_get_rcu(par->in)->br->dev->dev_addr);
 	else
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index cceac5bb658f..dd7133216c9c 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
 		return 1;
 	if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
 		return 1;
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	if (in && (p = br_port_get_rcu(in)) != NULL &&
 	    NF_INVF(e, EBT_ILOGICALIN,
 		    ebt_dev_check(e->logical_in, p->br->dev)))
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 870aebda2932..713c09a74b90 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -110,7 +110,7 @@ static unsigned int ipv4_helper(void *priv,
 	if (!help)
 		return NF_ACCEPT;
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 	if (!helper)
 		return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b5904bc2614..d075b3cf2400 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 		return -NF_ACCEPT;
 	}
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 1aa5848764a7..963ee3848675 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -115,7 +115,7 @@ static unsigned int ipv6_helper(void *priv,
 	help = nfct_help(ct);
 	if (!help)
 		return NF_ACCEPT;
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 	if (!helper)
 		return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 660bc10c7a9c..f5a61bc3ec2b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	}
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..c8faf8102394 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -291,16 +291,13 @@ unsigned int nf_iterate(struct list_head *head,
 
 
 /* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
+ * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. */
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
 {
 	struct nf_hook_ops *elem;
 	unsigned int verdict;
 	int ret = 0;
 
-	/* We may already have this, but read-locks nest anyway */
-	rcu_read_lock();
-
 	elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
 next_hook:
 	verdict = nf_iterate(state->hook_list, skb, state, &elem);
@@ -321,7 +318,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
 			kfree_skb(skb);
 		}
 	}
-	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL(nf_hook_slow);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8d1ddb9b63ed..c94ec197845c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1275,7 +1275,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		skb->nfct = NULL;
 	}
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	l3proto = __nf_ct_l3proto_find(pf);
 	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
 				   &dataoff, &protonum);
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5c0db5c64734..f65d93639d12 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net,
 	const struct nf_afinfo *afinfo;
 	int ret = 0;
 
-	/* rcu_read_lock()ed by nf_hook_slow() */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	afinfo = nf_get_afinfo(family);
 	if (!afinfo)
 		return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 4ffe388a9a1e..336e21559e01 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -346,7 +346,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
 	/* Called from the helper function, this call never fails */
 	help = nfct_help(ct);
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 
 	nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index e924e95fcc7f..3b79f34b5095 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
 	if (help == NULL)
 		return NF_DROP;
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 	if (helper == NULL)
 		return NF_DROP;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6577db524ef6..eb086a192c5a 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log,
 			if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
 					 htonl(indev->ifindex)) ||
 			/* this is the bridge group "brX" */
-			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+			/* rcu_read_lock()ed by nf_hook_thresh or
+			 * nf_log_packet.
+			 */
 			    nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
 					 htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
 				goto nla_put_failure;
@@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log,
 			if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
 					 htonl(outdev->ifindex)) ||
 			/* this is the bridge group "brX" */
-			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+			/* rcu_read_lock()ed by nf_hook_thresh or
+			 * nf_log_packet.
+			 */
 			    nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
 					 htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
 				goto nla_put_failure;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 808da34f94cd..7caa8b082c41 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	struct net *net = entry->state.net;
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
-	/* rcu_read_lock()ed by nf_hook_slow() */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	queue = instance_lookup(q, queuenum);
 	if (!queue)
 		return -ESRCH;
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 805c9f64a04c..f679dd4c272a 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (!master_help)
 		return ret;
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(master_help->helper);
 	if (!helper)
 		return ret;

From d4bb5caa9cc1a802ba25f605b24b5640c025806b Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Wed, 21 Sep 2016 11:35:05 -0400
Subject: [PATCH 41/53] netfilter: Only allow sane values in
 nf_register_net_hook

This commit adds an upfront check for sane values to be passed when
registering a netfilter hook.  This will be used in a future patch for a
simplified hook list traversal.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index c8faf8102394..67b74287535d 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -89,6 +89,11 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	struct nf_hook_entry *entry;
 	struct nf_hook_ops *elem;
 
+	if (reg->pf == NFPROTO_NETDEV &&
+	    (reg->hooknum != NF_NETDEV_INGRESS ||
+	     !reg->dev || dev_net(reg->dev) != net))
+		return -EINVAL;
+
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return -ENOMEM;

From 54f17bbc52f71e2d313721046627c383d6c5c7da Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Wed, 21 Sep 2016 11:35:06 -0400
Subject: [PATCH 42/53] netfilter: nf_queue: whitespace cleanup

A future patch will modify the hook drop and outfn functions.  This will
cause the line lengths to take up too much space.  This is simply a
readability change.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 903dca050fb6..8fe85b98b5c8 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -22,10 +22,10 @@ struct nf_queue_entry {
 
 /* Packet queuing */
 struct nf_queue_handler {
-	int			(*outfn)(struct nf_queue_entry *entry,
-					 unsigned int queuenum);
-	void			(*nf_hook_drop)(struct net *net,
-						struct nf_hook_ops *ops);
+	int		(*outfn)(struct nf_queue_entry *entry,
+				 unsigned int queuenum);
+	void		(*nf_hook_drop)(struct net *net,
+					struct nf_hook_ops *ops);
 };
 
 void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);

From e3b37f11e6e4e6b6f02cc762f182ce233d2c1c9d Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Wed, 21 Sep 2016 11:35:07 -0400
Subject: [PATCH 43/53] netfilter: replace list_head with single linked list

The netfilter hook list never uses the prev pointer, and so can be trimmed to
be a simple singly-linked list.

In addition to having a more light weight structure for hook traversal,
struct net becomes 5568 bytes (down from 6400) and struct net_device becomes
2176 bytes (down from 2240).

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h         |   2 +-
 include/linux/netfilter.h         |  63 +++++++------
 include/linux/netfilter_ingress.h |  17 ++--
 include/net/netfilter/nf_queue.h  |   3 +-
 include/net/netns/netfilter.h     |   2 +-
 net/bridge/br_netfilter_hooks.c   |  19 ++--
 net/netfilter/core.c              | 145 +++++++++++++++++++-----------
 net/netfilter/nf_internals.h      |  10 +--
 net/netfilter/nf_queue.c          |  18 ++--
 net/netfilter/nfnetlink_queue.c   |   8 +-
 10 files changed, 169 insertions(+), 118 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 67bb978470dc..41f49f5ab62a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1783,7 +1783,7 @@ struct net_device {
 #endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
-	struct list_head	nf_hooks_ingress;
+	struct nf_hook_entry __rcu *nf_hooks_ingress;
 #endif
 
 	unsigned char		broadcast[MAX_ADDR_LEN];
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index ad444f0b4ed0..44e20dac98a9 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -55,12 +55,34 @@ struct nf_hook_state {
 	struct net_device *out;
 	struct sock *sk;
 	struct net *net;
-	struct list_head *hook_list;
+	struct nf_hook_entry __rcu *hook_entries;
 	int (*okfn)(struct net *, struct sock *, struct sk_buff *);
 };
 
+typedef unsigned int nf_hookfn(void *priv,
+			       struct sk_buff *skb,
+			       const struct nf_hook_state *state);
+struct nf_hook_ops {
+	struct list_head	list;
+
+	/* User fills in from here down. */
+	nf_hookfn		*hook;
+	struct net_device	*dev;
+	void			*priv;
+	u_int8_t		pf;
+	unsigned int		hooknum;
+	/* Hooks are ordered in ascending priority. */
+	int			priority;
+};
+
+struct nf_hook_entry {
+	struct nf_hook_entry __rcu	*next;
+	struct nf_hook_ops		ops;
+	const struct nf_hook_ops	*orig_ops;
+};
+
 static inline void nf_hook_state_init(struct nf_hook_state *p,
-				      struct list_head *hook_list,
+				      struct nf_hook_entry *hook_entry,
 				      unsigned int hook,
 				      int thresh, u_int8_t pf,
 				      struct net_device *indev,
@@ -76,26 +98,11 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 	p->out = outdev;
 	p->sk = sk;
 	p->net = net;
-	p->hook_list = hook_list;
+	RCU_INIT_POINTER(p->hook_entries, hook_entry);
 	p->okfn = okfn;
 }
 
-typedef unsigned int nf_hookfn(void *priv,
-			       struct sk_buff *skb,
-			       const struct nf_hook_state *state);
 
-struct nf_hook_ops {
-	struct list_head 	list;
-
-	/* User fills in from here down. */
-	nf_hookfn		*hook;
-	struct net_device	*dev;
-	void			*priv;
-	u_int8_t		pf;
-	unsigned int		hooknum;
-	/* Hooks are ordered in ascending priority. */
-	int			priority;
-};
 
 struct nf_sockopt_ops {
 	struct list_head list;
@@ -161,7 +168,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct net *, struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	struct list_head *hook_list;
+	struct nf_hook_entry *hook_head;
+	int ret = 1;
 
 #ifdef HAVE_JUMP_LABEL
 	if (__builtin_constant_p(pf) &&
@@ -170,22 +178,19 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 		return 1;
 #endif
 
-	hook_list = &net->nf.hooks[pf][hook];
-
-	if (!list_empty(hook_list)) {
+	rcu_read_lock();
+	hook_head = rcu_dereference(net->nf.hooks[pf][hook]);
+	if (hook_head) {
 		struct nf_hook_state state;
-		int ret;
 
-		/* We may already have this, but read-locks nest anyway */
-		rcu_read_lock();
-		nf_hook_state_init(&state, hook_list, hook, thresh,
+		nf_hook_state_init(&state, hook_head, hook, thresh,
 				   pf, indev, outdev, sk, net, okfn);
 
 		ret = nf_hook_slow(skb, &state);
-		rcu_read_unlock();
-		return ret;
 	}
-	return 1;
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 6965ba09eba7..33e37fb41d5d 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -11,23 +11,30 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
 	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
 		return false;
 #endif
-	return !list_empty(&skb->dev->nf_hooks_ingress);
+	return rcu_access_pointer(skb->dev->nf_hooks_ingress);
 }
 
 /* caller must hold rcu_read_lock */
 static inline int nf_hook_ingress(struct sk_buff *skb)
 {
+	struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress);
 	struct nf_hook_state state;
 
-	nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress,
-			   NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV,
-			   skb->dev, NULL, NULL, dev_net(skb->dev), NULL);
+	/* Must recheck the ingress hook head, in the event it became NULL
+	 * after the check in nf_hook_ingress_active evaluated to true.
+	 */
+	if (unlikely(!e))
+		return 0;
+
+	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN,
+			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
+			   dev_net(skb->dev), NULL);
 	return nf_hook_slow(skb, &state);
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
 {
-	INIT_LIST_HEAD(&dev->nf_hooks_ingress);
+	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
 }
 #else /* CONFIG_NETFILTER_INGRESS */
 static inline int nf_hook_ingress_active(struct sk_buff *skb)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 8fe85b98b5c8..2280cfe86c56 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -11,7 +11,6 @@ struct nf_queue_entry {
 	struct sk_buff		*skb;
 	unsigned int		id;
 
-	struct nf_hook_ops	*elem;
 	struct nf_hook_state	state;
 	u16			size; /* sizeof(entry) + saved route keys */
 
@@ -25,7 +24,7 @@ struct nf_queue_handler {
 	int		(*outfn)(struct nf_queue_entry *entry,
 				 unsigned int queuenum);
 	void		(*nf_hook_drop)(struct net *net,
-					struct nf_hook_ops *ops);
+					const struct nf_hook_entry *hooks);
 };
 
 void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 36d723579af2..58487b1cc99a 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -16,6 +16,6 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
-	struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+	struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 };
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 6029af47377d..2fe9345c1407 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1002,28 +1002,21 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
 		      int (*okfn)(struct net *, struct sock *,
 				  struct sk_buff *))
 {
-	struct nf_hook_ops *elem;
+	struct nf_hook_entry *elem;
 	struct nf_hook_state state;
-	struct list_head *head;
 	int ret;
 
-	head = &net->nf.hooks[NFPROTO_BRIDGE][hook];
+	elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
 
-	list_for_each_entry_rcu(elem, head, list) {
-		struct nf_hook_ops *next;
+	while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
+		elem = rcu_dereference(elem->next);
 
-		next = list_entry_rcu(list_next_rcu(&elem->list),
-				      struct nf_hook_ops, list);
-		if (next->priority <= NF_BR_PRI_BRNF)
-			continue;
-	}
-
-	if (&elem->list == head)
+	if (!elem)
 		return okfn(net, sk, skb);
 
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
-	nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1,
+	nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1,
 			   NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
 
 	ret = nf_hook_slow(skb, &state);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 67b74287535d..72fc514ec676 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -22,6 +22,7 @@
 #include <linux/proc_fs.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/rcupdate.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 
@@ -61,33 +62,50 @@ EXPORT_SYMBOL(nf_hooks_needed);
 #endif
 
 static DEFINE_MUTEX(nf_hook_mutex);
+#define nf_entry_dereference(e) \
+	rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
 
-static struct list_head *nf_find_hook_list(struct net *net,
-					   const struct nf_hook_ops *reg)
+static struct nf_hook_entry *nf_hook_entry_head(struct net *net,
+						const struct nf_hook_ops *reg)
 {
-	struct list_head *hook_list = NULL;
+	struct nf_hook_entry *hook_head = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+		hook_head = nf_entry_dereference(net->nf.hooks[reg->pf]
+						 [reg->hooknum]);
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
 		if (reg->dev && dev_net(reg->dev) == net)
-			hook_list = &reg->dev->nf_hooks_ingress;
+			hook_head =
+				nf_entry_dereference(
+					reg->dev->nf_hooks_ingress);
 #endif
 	}
-	return hook_list;
+	return hook_head;
 }
 
-struct nf_hook_entry {
-	const struct nf_hook_ops	*orig_ops;
-	struct nf_hook_ops		ops;
-};
+/* must hold nf_hook_mutex */
+static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg,
+			      struct nf_hook_entry *entry)
+{
+	switch (reg->pf) {
+	case NFPROTO_NETDEV:
+		/* We already checked in nf_register_net_hook() that this is
+		 * used from ingress.
+		 */
+		rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry);
+		break;
+	default:
+		rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum],
+				   entry);
+		break;
+	}
+}
 
 int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *hook_list;
+	struct nf_hook_entry *hooks_entry;
 	struct nf_hook_entry *entry;
-	struct nf_hook_ops *elem;
 
 	if (reg->pf == NFPROTO_NETDEV &&
 	    (reg->hooknum != NF_NETDEV_INGRESS ||
@@ -100,19 +118,30 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 
 	entry->orig_ops	= reg;
 	entry->ops	= *reg;
-
-	hook_list = nf_find_hook_list(net, reg);
-	if (!hook_list) {
-		kfree(entry);
-		return -ENOENT;
-	}
+	entry->next	= NULL;
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, hook_list, list) {
-		if (reg->priority < elem->priority)
-			break;
+	hooks_entry = nf_hook_entry_head(net, reg);
+
+	if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) {
+		/* This is the case where we need to insert at the head */
+		entry->next = hooks_entry;
+		hooks_entry = NULL;
 	}
-	list_add_rcu(&entry->ops.list, elem->list.prev);
+
+	while (hooks_entry &&
+		reg->priority >= hooks_entry->orig_ops->priority &&
+		nf_entry_dereference(hooks_entry->next)) {
+		hooks_entry = nf_entry_dereference(hooks_entry->next);
+	}
+
+	if (hooks_entry) {
+		entry->next = nf_entry_dereference(hooks_entry->next);
+		rcu_assign_pointer(hooks_entry->next, entry);
+	} else {
+		nf_set_hooks_head(net, reg, entry);
+	}
+
 	mutex_unlock(&nf_hook_mutex);
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -127,24 +156,33 @@ EXPORT_SYMBOL(nf_register_net_hook);
 
 void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *hook_list;
-	struct nf_hook_entry *entry;
-	struct nf_hook_ops *elem;
-
-	hook_list = nf_find_hook_list(net, reg);
-	if (!hook_list)
-		return;
+	struct nf_hook_entry *hooks_entry;
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, hook_list, list) {
-		entry = container_of(elem, struct nf_hook_entry, ops);
-		if (entry->orig_ops == reg) {
-			list_del_rcu(&entry->ops.list);
-			break;
-		}
+	hooks_entry = nf_hook_entry_head(net, reg);
+	if (hooks_entry->orig_ops == reg) {
+		nf_set_hooks_head(net, reg,
+				  nf_entry_dereference(hooks_entry->next));
+		goto unlock;
 	}
+	while (hooks_entry && nf_entry_dereference(hooks_entry->next)) {
+		struct nf_hook_entry *next =
+			nf_entry_dereference(hooks_entry->next);
+		struct nf_hook_entry *nnext;
+
+		if (next->orig_ops != reg) {
+			hooks_entry = next;
+			continue;
+		}
+		nnext = nf_entry_dereference(next->next);
+		rcu_assign_pointer(hooks_entry->next, nnext);
+		hooks_entry = next;
+		break;
+	}
+
+unlock:
 	mutex_unlock(&nf_hook_mutex);
-	if (&elem->list == hook_list) {
+	if (!hooks_entry) {
 		WARN(1, "nf_unregister_net_hook: hook not found!\n");
 		return;
 	}
@@ -156,10 +194,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
-	nf_queue_nf_hook_drop(net, &entry->ops);
+	nf_queue_nf_hook_drop(net, hooks_entry);
 	/* other cpu might still process nfqueue verdict that used reg */
 	synchronize_net();
-	kfree(entry);
+	kfree(hooks_entry);
 }
 EXPORT_SYMBOL(nf_unregister_net_hook);
 
@@ -258,10 +296,9 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
-unsigned int nf_iterate(struct list_head *head,
-			struct sk_buff *skb,
+unsigned int nf_iterate(struct sk_buff *skb,
 			struct nf_hook_state *state,
-			struct nf_hook_ops **elemp)
+			struct nf_hook_entry **entryp)
 {
 	unsigned int verdict;
 
@@ -269,20 +306,23 @@ unsigned int nf_iterate(struct list_head *head,
 	 * The caller must not block between calls to this
 	 * function because of risk of continuing from deleted element.
 	 */
-	list_for_each_entry_continue_rcu((*elemp), head, list) {
-		if (state->thresh > (*elemp)->priority)
+	while (*entryp) {
+		if (state->thresh > (*entryp)->ops.priority) {
+			*entryp = rcu_dereference((*entryp)->next);
 			continue;
+		}
 
 		/* Optimization: we don't need to hold module
 		   reference here, since function can't sleep. --RR */
 repeat:
-		verdict = (*elemp)->hook((*elemp)->priv, skb, state);
+		verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
 		if (verdict != NF_ACCEPT) {
 #ifdef CONFIG_NETFILTER_DEBUG
 			if (unlikely((verdict & NF_VERDICT_MASK)
 							> NF_MAX_VERDICT)) {
 				NFDEBUG("Evil return from %p(%u).\n",
-					(*elemp)->hook, state->hook);
+					(*entryp)->ops.hook, state->hook);
+				*entryp = rcu_dereference((*entryp)->next);
 				continue;
 			}
 #endif
@@ -290,6 +330,7 @@ unsigned int nf_iterate(struct list_head *head,
 				return verdict;
 			goto repeat;
 		}
+		*entryp = rcu_dereference((*entryp)->next);
 	}
 	return NF_ACCEPT;
 }
@@ -299,13 +340,13 @@ unsigned int nf_iterate(struct list_head *head,
  * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. */
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
 {
-	struct nf_hook_ops *elem;
+	struct nf_hook_entry *entry;
 	unsigned int verdict;
 	int ret = 0;
 
-	elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
+	entry = rcu_dereference(state->hook_entries);
 next_hook:
-	verdict = nf_iterate(state->hook_list, skb, state, &elem);
+	verdict = nf_iterate(skb, state, &entry);
 	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
 		ret = 1;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
@@ -314,8 +355,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
 		if (ret == 0)
 			ret = -EPERM;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
-		int err = nf_queue(skb, elem, state,
-				   verdict >> NF_VERDICT_QBITS);
+		int err;
+
+		RCU_INIT_POINTER(state->hook_entries, entry);
+		err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
 		if (err < 0) {
 			if (err == -ESRCH &&
 			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -442,7 +485,7 @@ static int __net_init netfilter_net_init(struct net *net)
 
 	for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
 		for (h = 0; h < NF_MAX_HOOKS; h++)
-			INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+			RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
 	}
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 065522564ac6..e0adb5959342 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -13,13 +13,13 @@
 
 
 /* core.c */
-unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
-			struct nf_hook_state *state, struct nf_hook_ops **elemp);
+unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state,
+			struct nf_hook_entry **entryp);
 
 /* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
-	     struct nf_hook_state *state, unsigned int queuenum);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+	     unsigned int queuenum);
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry);
 int __init netfilter_queue_init(void);
 
 /* nf_log.c */
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b19ad20a705c..96964a0070e1 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
 
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
 {
 	const struct nf_queue_handler *qh;
 
 	rcu_read_lock();
 	qh = rcu_dereference(net->nf.queue_handler);
 	if (qh)
-		qh->nf_hook_drop(net, ops);
+		qh->nf_hook_drop(net, entry);
 	rcu_read_unlock();
 }
 
@@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
  * through nf_reinject().
  */
 int nf_queue(struct sk_buff *skb,
-	     struct nf_hook_ops *elem,
 	     struct nf_hook_state *state,
 	     unsigned int queuenum)
 {
@@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb,
 
 	*entry = (struct nf_queue_entry) {
 		.skb	= skb,
-		.elem	= elem,
 		.state	= *state,
 		.size	= sizeof(*entry) + afinfo->route_key_size,
 	};
@@ -165,11 +163,15 @@ int nf_queue(struct sk_buff *skb,
 
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
+	struct nf_hook_entry *hook_entry;
 	struct sk_buff *skb = entry->skb;
-	struct nf_hook_ops *elem = entry->elem;
 	const struct nf_afinfo *afinfo;
+	struct nf_hook_ops *elem;
 	int err;
 
+	hook_entry = rcu_dereference(entry->state.hook_entries);
+	elem = &hook_entry->ops;
+
 	nf_queue_entry_release_refs(entry);
 
 	/* Continue traversal iff userspace said ok... */
@@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 
 	if (verdict == NF_ACCEPT) {
 	next_hook:
-		verdict = nf_iterate(entry->state.hook_list,
-				     skb, &entry->state, &elem);
+		verdict = nf_iterate(skb, &entry->state, &hook_entry);
 	}
 
 	switch (verdict & NF_VERDICT_MASK) {
@@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 		local_bh_enable();
 		break;
 	case NF_QUEUE:
-		err = nf_queue(skb, elem, &entry->state,
+		RCU_INIT_POINTER(entry->state.hook_entries, hook_entry);
+		err = nf_queue(skb, &entry->state,
 			       verdict >> NF_VERDICT_QBITS);
 		if (err < 0) {
 			if (err == -ESRCH &&
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 7caa8b082c41..af832c526048 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = {
 	.notifier_call	= nfqnl_rcv_dev_event,
 };
 
-static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
 {
-	return entry->elem == (struct nf_hook_ops *)ops_ptr;
+	return rcu_access_pointer(entry->state.hook_entries) ==
+		(struct nf_hook_entry *)entry_ptr;
 }
 
-static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+static void nfqnl_nf_hook_drop(struct net *net,
+			       const struct nf_hook_entry *hook)
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 	int i;

From 8d11350f5f33378efc5f905bee325f3e76d6bcca Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 22 Sep 2016 14:53:53 +0800
Subject: [PATCH 44/53] netfilter: seqadj: Fix the wrong ack adjust for the RST
 packet without ack

It is valid that the TCP RST packet which does not set ack flag, and bytes
of ack number are zero. But current seqadj codes would adjust the "0" ack
to invalid ack number. Actually seqadj need to check the ack flag before
adjust it for these RST packets.

The following is my test case

client is 10.26.98.245, and add one iptable rule:
iptables  -I INPUT -p tcp --sport 12345 -m connbytes --connbytes 2:
--connbytes-dir reply --connbytes-mode packets -j REJECT --reject-with
tcp-reset
This iptables rule could generate on TCP RST without ack flag.

server:10.172.135.55
Enable the synproxy with seqadjust by the following iptables rules
iptables -t raw -A PREROUTING -i eth0 -p tcp -d 10.172.135.55 --dport 12345
-m tcp --syn -j CT --notrack

iptables -A INPUT -i eth0 -p tcp -d 10.172.135.55 --dport 12345 -m conntrack
--ctstate INVALID,UNTRACKED -j SYNPROXY --sack-perm --timestamp --wscale 7
--mss 1460
iptables -A OUTPUT -o eth0 -p tcp -s 10.172.135.55 --sport 12345 -m conntrack
--ctstate INVALID,UNTRACKED -m tcp --tcp-flags SYN,RST,ACK SYN,ACK -j ACCEPT

The following is my test result.

1. packet trace on client
root@routers:/tmp# tcpdump -i eth0 tcp port 12345 -n
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [S], seq 3695959829,
win 29200, options [mss 1460,sackOK,TS val 452367884 ecr 0,nop,wscale 7],
length 0
IP 10.172.135.55.12345 > 10.26.98.245.45154: Flags [S.], seq 546723266,
ack 3695959830, win 0, options [mss 1460,sackOK,TS val 15643479 ecr 452367884,
nop,wscale 7], length 0
IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [.], ack 1, win 229,
options [nop,nop,TS val 452367885 ecr 15643479], length 0
IP 10.172.135.55.12345 > 10.26.98.245.45154: Flags [.], ack 1, win 226,
options [nop,nop,TS val 15643479 ecr 452367885], length 0
IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [R], seq 3695959830,
win 0, length 0

2. seqadj log on server
[62873.867319] Adjusting sequence number from 602341895->546723267,
ack from 3695959830->3695959830
[62873.867644] Adjusting sequence number from 602341895->546723267,
ack from 3695959830->3695959830
[62873.869040] Adjusting sequence number from 3695959830->3695959830,
ack from 0->55618628

To summarize, it is clear that the seqadj codes adjust the 0 ack when receive
one TCP RST packet without ack.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_seqadj.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dff0f0cc59e4..ef7063eced7c 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
 	s32 seqoff, ackoff;
 	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
 	struct nf_ct_seqadj *this_way, *other_way;
-	int res;
+	int res = 1;
 
 	this_way  = &seqadj->seq[dir];
 	other_way = &seqadj->seq[!dir];
@@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
 	else
 		seqoff = this_way->offset_before;
 
+	newseq = htonl(ntohl(tcph->seq) + seqoff);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+	pr_debug("Adjusting sequence number from %u->%u\n",
+		 ntohl(tcph->seq), ntohl(newseq));
+	tcph->seq = newseq;
+
+	if (!tcph->ack)
+		goto out;
+
 	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
 		  other_way->correction_pos))
 		ackoff = other_way->offset_after;
 	else
 		ackoff = other_way->offset_before;
 
-	newseq = htonl(ntohl(tcph->seq) + seqoff);
 	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
 	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
 				 false);
-
-	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+	pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n",
 		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
 		 ntohl(newack));
-
-	tcph->seq = newseq;
 	tcph->ack_seq = newack;
 
 	res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+out:
 	spin_unlock_bh(&ct->lock);
 
 	return res;

From d767ff2c84f19be1aa403762f34eebbb403caf6d Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Thu, 22 Sep 2016 22:28:51 +0800
Subject: [PATCH 45/53] netfilter: nft_ct: unnecessary to require dir when use
 ct l3proto/protocol

Currently, if the user want to match ct l3proto, we must specify the
direction, for example:
  # nft add rule filter input ct original l3proto ipv4
                                 ^^^^^^^^
Otherwise, error message will be reported:
  # nft add rule filter input ct l3proto ipv4
  nft add rule filter input ct l3proto ipv4
  <cmdline>:1:1-38: Error: Could not process rule: Invalid argument
  add rule filter input ct l3proto ipv4
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Actually, there's no need to require NFTA_CT_DIRECTION attr, because
ct l3proto and protocol are unrelated to direction.

And for compatibility, even if the user specify the NFTA_CT_DIRECTION
attr, do not report error, just skip it.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_ct.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 51e180f2a003..825fbbc62f48 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -128,15 +128,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 		memcpy(dest, &count, sizeof(count));
 		return;
 	}
+	case NFT_CT_L3PROTOCOL:
+		*dest = nf_ct_l3num(ct);
+		return;
+	case NFT_CT_PROTOCOL:
+		*dest = nf_ct_protonum(ct);
+		return;
 	default:
 		break;
 	}
 
 	tuple = &ct->tuplehash[priv->dir].tuple;
 	switch (priv->key) {
-	case NFT_CT_L3PROTOCOL:
-		*dest = nf_ct_l3num(ct);
-		return;
 	case NFT_CT_SRC:
 		memcpy(dest, tuple->src.u3.all,
 		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
@@ -145,9 +148,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 		memcpy(dest, tuple->dst.u3.all,
 		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
 		return;
-	case NFT_CT_PROTOCOL:
-		*dest = nf_ct_protonum(ct);
-		return;
 	case NFT_CT_PROTO_SRC:
 		*dest = (__force __u16)tuple->src.u.all;
 		return;
@@ -283,8 +283,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 
 	case NFT_CT_L3PROTOCOL:
 	case NFT_CT_PROTOCOL:
-		if (tb[NFTA_CT_DIRECTION] == NULL)
-			return -EINVAL;
+		/* For compatibility, do not report error if NFTA_CT_DIRECTION
+		 * attribute is specified.
+		 */
 		len = sizeof(u8);
 		break;
 	case NFT_CT_SRC:
@@ -432,8 +433,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 
 	switch (priv->key) {
-	case NFT_CT_L3PROTOCOL:
-	case NFT_CT_PROTOCOL:
 	case NFT_CT_SRC:
 	case NFT_CT_DST:
 	case NFT_CT_PROTO_SRC:

From 7bfdde7045ad54d9fdccac70baffd094d9de73f8 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Thu, 22 Sep 2016 22:28:52 +0800
Subject: [PATCH 46/53] netfilter: nft_ct: report error if mark and dir
 specified simultaneously

NFT_CT_MARK is unrelated to direction, so if NFTA_CT_DIRECTION attr is
specified, report EINVAL to the userspace. This validation check was
already done at nft_ct_get_init, but we missed it in nft_ct_set_init.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_ct.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 825fbbc62f48..d7b0d171172a 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -364,6 +364,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 	switch (priv->key) {
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	case NFT_CT_MARK:
+		if (tb[NFTA_CT_DIRECTION])
+			return -EINVAL;
 		len = FIELD_SIZEOF(struct nf_conn, mark);
 		break;
 #endif

From 0dc60a4546fefc6dc9f54abf60beeeb3501726fa Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Thu, 22 Sep 2016 12:42:46 -0400
Subject: [PATCH 47/53] netfilter: xt_hashlimit: Prepare for revision 2

I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the
end of the names.

Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Joshua Hunt <johunt@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_hashlimit.c | 61 ++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 178696852bde..e93d9e0a3f35 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops;
+static const struct file_operations dl_file_ops_v1;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-			 u_int8_t family)
+static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
+			    u_int8_t family)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
@@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		&dl_file_ops, hinfo);
+		&dl_file_ops_v1, hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
    (slowest userspace tool allows), which means
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
 
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
  * credit_cap is used as a counter that tells us how many times we can
@@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 static u32 user2credits(u32 user)
 {
 	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
 		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+		return (user / XT_HASHLIMIT_SCALE) *\
+					HZ * CREDITS_PER_JIFFY_v1;
 
-	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+	return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE;
 }
 
 static u32 user2credits_byte(u32 user)
@@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 			return;
 		}
 	} else {
-		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
 		cap = dh->rateinfo.credit_cap;
 	}
 	if (dh->rateinfo.credit > cap)
@@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
@@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return false;
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 {
 	struct net *net = par->net;
 	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -701,7 +702,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 	mutex_lock(&hashlimit_mutex);
 	info->hinfo = htable_find_get(net, info->name, par->family);
 	if (info->hinfo == NULL) {
-		ret = htable_create(net, info, par->family);
+		ret = htable_create_v1(net, info, par->family);
 		if (ret < 0) {
 			mutex_unlock(&hashlimit_mutex);
 			return ret;
@@ -711,7 +712,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 
@@ -723,10 +724,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 1,
 		.family         = NFPROTO_IPV4,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
-		.checkentry     = hashlimit_mt_check,
-		.destroy        = hashlimit_mt_destroy,
+		.checkentry     = hashlimit_mt_check_v1,
+		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -734,10 +735,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 1,
 		.family         = NFPROTO_IPV6,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
-		.checkentry     = hashlimit_mt_check,
-		.destroy        = hashlimit_mt_destroy,
+		.checkentry     = hashlimit_mt_check_v1,
+		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
 #endif
@@ -786,8 +787,8 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
-				   struct seq_file *s)
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
 {
 	const struct xt_hashlimit_htable *ht = s->private;
 
@@ -825,7 +826,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 	return seq_has_overflowed(s);
 }
 
-static int dl_seq_show(struct seq_file *s, void *v)
+static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
 	struct xt_hashlimit_htable *htable = s->private;
 	unsigned int *bucket = (unsigned int *)v;
@@ -833,22 +834,22 @@ static int dl_seq_show(struct seq_file *s, void *v)
 
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
-			if (dl_seq_real_show(ent, htable->family, s))
+			if (dl_seq_real_show_v1(ent, htable->family, s))
 				return -1;
 	}
 	return 0;
 }
 
-static const struct seq_operations dl_seq_ops = {
+static const struct seq_operations dl_seq_ops_v1 = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
 	.stop  = dl_seq_stop,
-	.show  = dl_seq_show
+	.show  = dl_seq_show_v1
 };
 
-static int dl_proc_open(struct inode *inode, struct file *file)
+static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
-	int ret = seq_open(file, &dl_seq_ops);
+	int ret = seq_open(file, &dl_seq_ops_v1);
 
 	if (!ret) {
 		struct seq_file *sf = file->private_data;
@@ -857,9 +858,9 @@ static int dl_proc_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static const struct file_operations dl_file_ops = {
+static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
-	.open    = dl_proc_open,
+	.open    = dl_proc_open_v1,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release

From 11d5f15723c9f39d7c131d0149d024c17dbef676 Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Thu, 22 Sep 2016 12:43:44 -0400
Subject: [PATCH 48/53] netfilter: xt_hashlimit: Create revision 2 to support
 higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Joshua Hunt <johunt@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  23 ++
 net/netfilter/xt_hashlimit.c                | 332 ++++++++++++++++----
 2 files changed, 286 insertions(+), 69 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db90372f09c..3efc0ca18345 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE 10000
+#define XT_HASHLIMIT_SCALE_v2 1000000llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
 	__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+	__u64 avg;		/* Average secs between packets * scale */
+	__u64 burst;		/* Period multiplier for upper limit. */
+	__u32 mode;		/* bitmask of XT_HASHLIMIT_HASH_* */
+
+	/* user specified */
+	__u32 size;		/* how many buckets */
+	__u32 max;		/* max number of entries */
+	__u32 gc_interval;	/* gc interval */
+	__u32 expire;		/* when do entries expire? */
+
+	__u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
 	char name[IFNAMSIZ];
 	struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+	char name[NAME_MAX];
+	struct hashlimit_cfg2 cfg;
+
+	/* Used internally by the kernel */
+	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index e93d9e0a3f35..44a095ecc7b7 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
-		u_int32_t credit;
-		u_int32_t credit_cap, cost;
+		u_int64_t credit;
+		u_int64_t credit_cap, cost;
 	} rateinfo;
 	struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
 	u_int8_t family;
 	bool rnd_initialized;
 
-	struct hashlimit_cfg1 cfg;	/* config */
+	struct hashlimit_cfg2 cfg;	/* config */
 
 	/* used internally */
 	spinlock_t lock;		/* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
 	struct hlist_head hash[0];	/* hashtable itself */
 };
 
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+	if (revision == 1) {
+		struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+		to->mode = cfg->mode;
+		to->avg = cfg->avg;
+		to->burst = cfg->burst;
+		to->size = cfg->size;
+		to->max = cfg->max;
+		to->gc_interval = cfg->gc_interval;
+		to->expire = cfg->expire;
+		to->srcmask = cfg->srcmask;
+		to->dstmask = cfg->dstmask;
+	} else if (revision == 2) {
+		memcpy(to, from, sizeof(struct hashlimit_cfg2));
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static DEFINE_MUTEX(hashlimit_mutex);	/* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-			    u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+			 const char *name, u_int8_t family,
+			 struct xt_hashlimit_htable **out_hinfo,
+			 int revision)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
-	unsigned int size;
-	unsigned int i;
+	unsigned int size, i;
+	int ret;
 
-	if (minfo->cfg.size) {
-		size = minfo->cfg.size;
+	if (cfg->size) {
+		size = cfg->size;
 	} else {
 		size = (totalram_pages << PAGE_SHIFT) / 16384 /
 		       sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	                sizeof(struct list_head) * size);
 	if (hinfo == NULL)
 		return -ENOMEM;
-	minfo->hinfo = hinfo;
+	*out_hinfo = hinfo;
 
 	/* copy match config into hashtable config */
-	memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+	ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+
+	if (ret)
+		return ret;
+
 	hinfo->cfg.size = size;
 	if (hinfo->cfg.max == 0)
 		hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+	hinfo->name = kstrdup(name, GFP_KERNEL);
 	if (!hinfo->name) {
 		vfree(hinfo);
 		return -ENOMEM;
 	}
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde = proc_create_data(minfo->name, 0,
+	hinfo->pde = proc_create_data(name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		&dl_file_ops_v1, hinfo);
+		(revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
+		hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -399,6 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
 #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,8 +441,11 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
 #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
 
+#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
 #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
@@ -425,15 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 }
 
 /* Precision saver. */
-static u32 user2credits(u32 user)
+static u64 user2credits(u64 user, int revision)
 {
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
-		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) *\
-					HZ * CREDITS_PER_JIFFY_v1;
+	if (revision == 1) {
+		/* If multiplying would overflow... */
+		if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
+			/* Divide first. */
+			return (user / XT_HASHLIMIT_SCALE) *\
+						HZ * CREDITS_PER_JIFFY_v1;
 
-	return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE;
+		return (user * HZ * CREDITS_PER_JIFFY_v1) \
+						/ XT_HASHLIMIT_SCALE;
+	} else {
+		if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+			return (user / XT_HASHLIMIT_SCALE_v2) *\
+						HZ * CREDITS_PER_JIFFY;
+
+		return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+	}
 }
 
 static u32 user2credits_byte(u32 user)
@@ -443,10 +488,11 @@ static u32 user2credits_byte(u32 user)
 	return (u32) (us >> 32);
 }
 
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+			    u32 mode, int revision)
 {
 	unsigned long delta = now - dh->rateinfo.prev;
-	u32 cap;
+	u64 cap, cpj;
 
 	if (delta == 0)
 		return;
@@ -454,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 	dh->rateinfo.prev = now;
 
 	if (mode & XT_HASHLIMIT_BYTES) {
-		u32 tmp = dh->rateinfo.credit;
+		u64 tmp = dh->rateinfo.credit;
 		dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
 		cap = CREDITS_PER_JIFFY_BYTES * HZ;
 		if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -462,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 			return;
 		}
 	} else {
-		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
+		cpj = (revision == 1) ?
+			CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+		dh->rateinfo.credit += delta * cpj;
 		cap = dh->rateinfo.credit_cap;
 	}
 	if (dh->rateinfo.credit > cap)
@@ -470,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 }
 
 static void rateinfo_init(struct dsthash_ent *dh,
-			  struct xt_hashlimit_htable *hinfo)
+			  struct xt_hashlimit_htable *hinfo, int revision)
 {
 	dh->rateinfo.prev = jiffies;
 	if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -479,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
 		dh->rateinfo.credit_cap = hinfo->cfg.burst;
 	} else {
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-						   hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+						   hinfo->cfg.burst, revision);
+		dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
 		dh->rateinfo.credit_cap = dh->rateinfo.credit;
 	}
 }
@@ -604,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
+		    struct xt_hashlimit_htable *hinfo,
+		    const struct hashlimit_cfg2 *cfg, int revision)
 {
-	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
-	struct xt_hashlimit_htable *hinfo = info->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
 	struct dsthash_dst dst;
 	bool race = false;
-	u32 cost;
+	u64 cost;
 
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
@@ -627,18 +675,18 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		} else if (race) {
 			/* Already got an entry, update expiration timeout */
 			dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_recalc(dh, now, hinfo->cfg.mode);
+			rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 		} else {
 			dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_init(dh, hinfo);
+			rateinfo_init(dh, hinfo, revision);
 		}
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now, hinfo->cfg.mode);
+		rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 	}
 
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+	if (cfg->mode & XT_HASHLIMIT_BYTES)
 		cost = hashlimit_byte_cost(skb->len, dh);
 	else
 		cost = dh->rateinfo.cost;
@@ -648,70 +696,126 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		dh->rateinfo.credit -= cost;
 		spin_unlock(&dh->lock);
 		rcu_read_unlock_bh();
-		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+		return !(cfg->mode & XT_HASHLIMIT_INVERT);
 	}
 
 	spin_unlock(&dh->lock);
 	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
-	return info->cfg.mode & XT_HASHLIMIT_INVERT;
+	return cfg->mode & XT_HASHLIMIT_INVERT;
 
  hotdrop:
 	par->hotdrop = true;
 	return false;
 }
 
-static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+static bool
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	struct net *net = par->net;
-	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+	struct hashlimit_cfg2 cfg = {};
 	int ret;
 
-	if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
-		return -EINVAL;
-	if (info->name[sizeof(info->name)-1] != '\0')
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+
+	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+}
+
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
+				     struct xt_hashlimit_htable **hinfo,
+				     struct hashlimit_cfg2 *cfg,
+				     const char *name, int revision)
+{
+	struct net *net = par->net;
+	int ret;
+
+	if (cfg->gc_interval == 0 || cfg->expire == 0)
 		return -EINVAL;
 	if (par->family == NFPROTO_IPV4) {
-		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+		if (cfg->srcmask > 32 || cfg->dstmask > 32)
 			return -EINVAL;
 	} else {
-		if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+		if (cfg->srcmask > 128 || cfg->dstmask > 128)
 			return -EINVAL;
 	}
 
-	if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+	if (cfg->mode & ~XT_HASHLIMIT_ALL) {
 		pr_info("Unknown mode mask %X, kernel too old?\n",
-						info->cfg.mode);
+						cfg->mode);
 		return -EINVAL;
 	}
 
 	/* Check for overflow. */
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
-		if (user2credits_byte(info->cfg.avg) == 0) {
-			pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+	if (cfg->mode & XT_HASHLIMIT_BYTES) {
+		if (user2credits_byte(cfg->avg) == 0) {
+			pr_info("overflow, rate too high: %llu\n", cfg->avg);
 			return -EINVAL;
 		}
-	} else if (info->cfg.burst == 0 ||
-		    user2credits(info->cfg.avg * info->cfg.burst) <
-		    user2credits(info->cfg.avg)) {
-			pr_info("overflow, try lower: %u/%u\n",
-				info->cfg.avg, info->cfg.burst);
+	} else if (cfg->burst == 0 ||
+		    user2credits(cfg->avg * cfg->burst, revision) <
+		    user2credits(cfg->avg, revision)) {
+			pr_info("overflow, try lower: %llu/%llu\n",
+				cfg->avg, cfg->burst);
 			return -ERANGE;
 	}
 
 	mutex_lock(&hashlimit_mutex);
-	info->hinfo = htable_find_get(net, info->name, par->family);
-	if (info->hinfo == NULL) {
-		ret = htable_create_v1(net, info, par->family);
+	*hinfo = htable_find_get(net, name, par->family);
+	if (*hinfo == NULL) {
+		ret = htable_create(net, cfg, name, par->family,
+				    hinfo, revision);
 		if (ret < 0) {
 			mutex_unlock(&hashlimit_mutex);
 			return ret;
 		}
 	}
 	mutex_unlock(&hashlimit_mutex);
+
 	return 0;
 }
 
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct hashlimit_cfg2 cfg = {};
+	int ret;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_check_common(par, &info->hinfo,
+					 &cfg, info->name, 1);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
+					 info->name, 2);
+}
+
 static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -719,6 +823,13 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 	htable_put(info->hinfo);
 }
 
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	htable_put(info->hinfo);
+}
+
 static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name           = "hashlimit",
@@ -730,6 +841,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV4,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	{
 		.name           = "hashlimit",
@@ -741,6 +862,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV6,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #endif
 };
 
@@ -787,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
-			       struct seq_file *s)
+static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
+			 struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
-
-	spin_lock(&ent->lock);
-	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies, ht->cfg.mode);
-
 	switch (family) {
 	case NFPROTO_IPV4:
-		seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip.src,
 			   ntohs(ent->dst.src_port),
@@ -809,7 +934,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 		break;
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	case NFPROTO_IPV6:
-		seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip6.src,
 			   ntohs(ent->dst.src_port),
@@ -822,6 +947,34 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 	default:
 		BUG();
 	}
+}
+
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
+
+	dl_seq_print(ent, family, s);
+
+	spin_unlock(&ent->lock);
+	return seq_has_overflowed(s);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+			    struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+	dl_seq_print(ent, family, s);
+
 	spin_unlock(&ent->lock);
 	return seq_has_overflowed(s);
 }
@@ -840,6 +993,20 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket = (unsigned int *)v;
+	struct dsthash_ent *ent;
+
+	if (!hlist_empty(&htable->hash[*bucket])) {
+		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+			if (dl_seq_real_show(ent, htable->family, s))
+				return -1;
+	}
+	return 0;
+}
+
 static const struct seq_operations dl_seq_ops_v1 = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
@@ -847,6 +1014,13 @@ static const struct seq_operations dl_seq_ops_v1 = {
 	.show  = dl_seq_show_v1
 };
 
+static const struct seq_operations dl_seq_ops = {
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show
+};
+
 static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
 	int ret = seq_open(file, &dl_seq_ops_v1);
@@ -858,6 +1032,18 @@ static int dl_proc_open_v1(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &dl_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = PDE_DATA(inode);
+	}
+	return ret;
+}
+
 static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v1,
@@ -866,6 +1052,14 @@ static const struct file_operations dl_file_ops_v1 = {
 	.release = seq_release
 };
 
+static const struct file_operations dl_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
 static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);

From 58e207e4983d7acea39b7fbec9343d8a6d218a18 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 22 Sep 2016 23:49:17 +0200
Subject: [PATCH 49/53] netfilter: evict stale entries when user reads
 /proc/net/nf_conntrack

Fabian reports a possible conntrack memory leak (could not reproduce so
far), however, one minor issue can be easily resolved:

> cat /proc/net/nf_conntrack | wc -l = 5
> 4 minutes required to clean up the table.

We should not report those timed-out entries to the user in first place.
And instead of just skipping those timed-out entries while iterating over
the table we can also zap them (we already do this during ctnetlink
walks, but I forgot about the /proc interface).

Fixes: f330a7fdbe16 ("netfilter: conntrack: get rid of conntrack timer")
Reported-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_standalone.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 7d52f8401afd..5f446cd9f3fd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
 		return 0;
 
+	if (nf_ct_should_gc(ct)) {
+		nf_ct_kill(ct);
+		goto release;
+	}
+
 	/* we only want to print DIR_ORIGINAL */
 	if (NF_CT_DIRECTION(hash))
 		goto release;

From 7a682575ad4829b4de3e672a6ad5f73a05826b82 Mon Sep 17 00:00:00 2001
From: KOVACS Krisztian <hidden@balabit.com>
Date: Fri, 23 Sep 2016 11:27:42 +0200
Subject: [PATCH 50/53] netfilter: xt_socket: fix transparent match for IPv6
 request sockets

The introduction of TCP_NEW_SYN_RECV state, and the addition of request
sockets to the ehash table seems to have broken the --transparent option
of the socket match for IPv6 (around commit a9407000).

Now that the socket lookup finds the TCP_NEW_SYN_RECV socket instead of the
listener, the --transparent option tries to match on the no_srccheck flag
of the request socket.

Unfortunately, that flag was only set for IPv4 sockets in tcp_v4_init_req()
by copying the transparent flag of the listener socket. This effectively
causes '-m socket --transparent' not match on the ACK packet sent by the
client in a TCP handshake.

Based on the suggestion from Eric Dumazet, this change moves the code
initializing no_srccheck to tcp_conn_request(), rendering the above
scenario working again.

Fixes: a940700003 ("netfilter: xt_socket: prepare for TCP_NEW_SYN_RECV support")
Signed-off-by: Alex Badics <alex.badics@balabit.com>
Signed-off-by: KOVACS Krisztian <hidden@balabit.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/tcp_input.c | 1 +
 net/ipv4/tcp_ipv4.c  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8cd02c0b056c..f3a9f3c2c8d8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6269,6 +6269,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
+	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
 
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
 	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a75bf48d7950..13b05adf9d3e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1196,7 +1196,6 @@ static void tcp_v4_init_req(struct request_sock *req,
 
 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
-	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
 	ireq->opt = tcp_v4_save_options(skb);
 }
 

From 0f3cd9b3697708c86a825ae3cedabf7be6fd3e72 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Sep 2016 15:23:33 +0200
Subject: [PATCH 51/53] netfilter: nf_tables: add range expression

Inverse ranges != [a,b] are not currently possible because rules are
composites of && operations, and we need to express this:

	data < a || data > b

This patch adds a new range expression. Positive ranges can be already
through two cmp expressions:

	cmp(sreg, data, >=)
	cmp(sreg, data, <=)

This new range expression provides an alternative way to express this.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h   |   3 +
 include/uapi/linux/netfilter/nf_tables.h |  29 +++++
 net/netfilter/Makefile                   |   3 +-
 net/netfilter/nf_tables_core.c           |   7 +-
 net/netfilter/nft_range.c                | 138 +++++++++++++++++++++++
 5 files changed, 178 insertions(+), 2 deletions(-)
 create mode 100644 net/netfilter/nft_range.c

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index a9060dd99db7..00f4f6b1b1ba 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -28,6 +28,9 @@ extern const struct nft_expr_ops nft_cmp_fast_ops;
 int nft_cmp_module_init(void);
 void nft_cmp_module_exit(void);
 
+int nft_range_module_init(void);
+void nft_range_module_exit(void);
+
 int nft_lookup_module_init(void);
 void nft_lookup_module_exit(void);
 
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 1cf41dd838b2..c6c4477c136b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -546,6 +546,35 @@ enum nft_cmp_attributes {
 };
 #define NFTA_CMP_MAX		(__NFTA_CMP_MAX - 1)
 
+/**
+ * enum nft_range_ops - nf_tables range operator
+ *
+ * @NFT_RANGE_EQ: equal
+ * @NFT_RANGE_NEQ: not equal
+ */
+enum nft_range_ops {
+	NFT_RANGE_EQ,
+	NFT_RANGE_NEQ,
+};
+
+/**
+ * enum nft_range_attributes - nf_tables range expression netlink attributes
+ *
+ * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers)
+ * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes)
+ * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_range_attributes {
+	NFTA_RANGE_UNSPEC,
+	NFTA_RANGE_SREG,
+	NFTA_RANGE_OP,
+	NFTA_RANGE_FROM_DATA,
+	NFTA_RANGE_TO_DATA,
+	__NFTA_RANGE_MAX
+};
+#define NFTA_RANGE_MAX		(__NFTA_RANGE_MAX - 1)
+
 enum nft_lookup_flags {
 	NFT_LOOKUP_F_INV = (1 << 0),
 };
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0c8581100ac6..c23c3c84416f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 
 # nf_tables
 nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
-nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o
 nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+nf_tables-objs += nft_lookup.o nft_dynset.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 67259cefef06..7c94ce0080d5 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -263,8 +263,13 @@ int __init nf_tables_core_module_init(void)
 	if (err < 0)
 		goto err7;
 
-	return 0;
+	err = nft_range_module_init();
+	if (err < 0)
+		goto err8;
 
+	return 0;
+err8:
+	nft_dynset_module_exit();
 err7:
 	nft_payload_module_exit();
 err6:
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
new file mode 100644
index 000000000000..c6d5358482d1
--- /dev/null
+++ b/net/netfilter/nft_range.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_range_expr {
+	struct nft_data		data_from;
+	struct nft_data		data_to;
+	enum nft_registers	sreg:8;
+	u8			len;
+	enum nft_range_ops	op:8;
+};
+
+static void nft_range_eval(const struct nft_expr *expr,
+			 struct nft_regs *regs,
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_range_expr *priv = nft_expr_priv(expr);
+	bool mismatch;
+	int d1, d2;
+
+	d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len);
+	d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len);
+	switch (priv->op) {
+	case NFT_RANGE_EQ:
+		mismatch = (d1 < 0 || d2 > 0);
+		break;
+	case NFT_RANGE_NEQ:
+		mismatch = (d1 >= 0 && d2 <= 0);
+		break;
+	}
+
+	if (mismatch)
+		regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
+	[NFTA_RANGE_SREG]		= { .type = NLA_U32 },
+	[NFTA_RANGE_OP]			= { .type = NLA_U32 },
+	[NFTA_RANGE_FROM_DATA]		= { .type = NLA_NESTED },
+	[NFTA_RANGE_TO_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_range_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc_from, desc_to;
+	int err;
+
+	err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
+			    &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+	if (err < 0)
+		return err;
+
+	err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
+			    &desc_to, tb[NFTA_RANGE_TO_DATA]);
+	if (err < 0)
+		goto err1;
+
+	if (desc_from.len != desc_to.len) {
+		err = -EINVAL;
+		goto err2;
+	}
+
+	priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
+	err = nft_validate_register_load(priv->sreg, desc_from.len);
+	if (err < 0)
+		goto err2;
+
+	priv->op  = ntohl(nla_get_be32(tb[NFTA_RANGE_OP]));
+	priv->len = desc_from.len;
+	return 0;
+err2:
+	nft_data_uninit(&priv->data_to, desc_to.type);
+err1:
+	nft_data_uninit(&priv->data_from, desc_from.type);
+	return err;
+}
+
+static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_range_expr *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from,
+			  NFT_DATA_VALUE, priv->len) < 0 ||
+	    nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_range_type;
+static const struct nft_expr_ops nft_range_ops = {
+	.type		= &nft_range_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
+	.eval		= nft_range_eval,
+	.init		= nft_range_init,
+	.dump		= nft_range_dump,
+};
+
+static struct nft_expr_type nft_range_type __read_mostly = {
+	.name		= "range",
+	.ops		= &nft_range_ops,
+	.policy		= nft_range_policy,
+	.maxattr	= NFTA_RANGE_MAX,
+	.owner		= THIS_MODULE,
+};
+
+int __init nft_range_module_init(void)
+{
+	return nft_register_expr(&nft_range_type);
+}
+
+void nft_range_module_exit(void)
+{
+	nft_unregister_expr(&nft_range_type);
+}

From ff107d27761ff4b644c82c209e004ec9c8fbbc22 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sun, 25 Sep 2016 16:35:56 +0800
Subject: [PATCH 52/53] netfilter: nft_log: complete NFTA_LOG_FLAGS attr
 support

NFTA_LOG_FLAGS attribute is already supported, but the related
NF_LOG_XXX flags are not exposed to the userspace. So we cannot
explicitly enable log flags to log uid, tcp sequence, ip options
and so on, i.e. such rule "nft add rule filter output log uid"
is not supported yet.

So move NF_LOG_XXX macro definitions to the uapi/../nf_log.h. In
order to keep consistent with other modules, change NF_LOG_MASK to
refer to all supported log flags. On the other hand, add a new
NF_LOG_DEFAULT_MASK to refer to the original default log flags.

Finally, if user specify the unsupported log flags or NFTA_LOG_GROUP
and NFTA_LOG_FLAGS are set at the same time, report EINVAL to the
userspace.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h        | 11 +++--------
 include/uapi/linux/netfilter/nf_log.h | 12 ++++++++++++
 net/bridge/netfilter/ebt_log.c        |  2 +-
 net/ipv4/netfilter/ip_tables.c        |  2 +-
 net/ipv4/netfilter/nf_log_arp.c       |  2 +-
 net/ipv4/netfilter/nf_log_ipv4.c      |  4 ++--
 net/ipv6/netfilter/ip6_tables.c       |  2 +-
 net/ipv6/netfilter/nf_log_ipv6.c      |  4 ++--
 net/netfilter/nf_tables_core.c        |  2 +-
 net/netfilter/nft_log.c               |  9 ++++++++-
 10 files changed, 32 insertions(+), 18 deletions(-)
 create mode 100644 include/uapi/linux/netfilter/nf_log.h

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index ee07dc8b0a7b..309cd267be4f 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -2,15 +2,10 @@
 #define _NF_LOG_H
 
 #include <linux/netfilter.h>
+#include <linux/netfilter/nf_log.h>
 
-/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will
- * disappear once iptables is replaced with pkttables.  Please DO NOT use them
- * for any new code! */
-#define NF_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
-#define NF_LOG_TCPOPT		0x02	/* Log TCP options */
-#define NF_LOG_IPOPT		0x04	/* Log IP options */
-#define NF_LOG_UID		0x08	/* Log UID owning local socket */
-#define NF_LOG_MASK		0x0f
+/* Log tcp sequence, tcp options, ip options and uid owning local socket */
+#define NF_LOG_DEFAULT_MASK	0x0f
 
 /* This flag indicates that copy_len field in nf_loginfo is set */
 #define NF_LOG_F_COPY_LEN	0x1
diff --git a/include/uapi/linux/netfilter/nf_log.h b/include/uapi/linux/netfilter/nf_log.h
new file mode 100644
index 000000000000..8be21e02387d
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_log.h
@@ -0,0 +1,12 @@
+#ifndef _NETFILTER_NF_LOG_H
+#define _NETFILTER_NF_LOG_H
+
+#define NF_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
+#define NF_LOG_TCPOPT		0x02	/* Log TCP options */
+#define NF_LOG_IPOPT		0x04	/* Log IP options */
+#define NF_LOG_UID		0x08	/* Log UID owning local socket */
+#define NF_LOG_NFLOG		0x10	/* Unsupported, don't reuse */
+#define NF_LOG_MACDECODE	0x20	/* Decode MAC header */
+#define NF_LOG_MASK		0x2f
+
+#endif /* _NETFILTER_NF_LOG_H */
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 152300d164ac..9a11086ba6ff 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	if (loginfo->type == NF_LOG_TYPE_LOG)
 		bitmask = loginfo->u.log.logflags;
 	else
-		bitmask = NF_LOG_MASK;
+		bitmask = NF_LOG_DEFAULT_MASK;
 
 	if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
 	   htons(ETH_P_IP)) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545a3373..7c00ce90adb8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = 4,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 8945c2653814..b24795e2ee6d 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 20f225593a8b..5b571e1b5f15 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
@@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 	else
-		logflags = NF_LOG_MASK;
+		logflags = NF_LOG_DEFAULT_MASK;
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 552fac2f390a..55aacea24396 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = LOGLEVEL_WARNING,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index c1bcf699a23d..f6aee2895fee 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
@@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 	else
-		logflags = NF_LOG_MASK;
+		logflags = NF_LOG_DEFAULT_MASK;
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 7c94ce0080d5..0dd5c695482f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = LOGLEVEL_WARNING,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 	        },
 	},
 };
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 24a73bb26e94..1b01404bb33f 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_LOG_LEVEL] != NULL &&
 	    tb[NFTA_LOG_GROUP] != NULL)
 		return -EINVAL;
-	if (tb[NFTA_LOG_GROUP] != NULL)
+	if (tb[NFTA_LOG_GROUP] != NULL) {
 		li->type = NF_LOG_TYPE_ULOG;
+		if (tb[NFTA_LOG_FLAGS] != NULL)
+			return -EINVAL;
+	}
 
 	nla = tb[NFTA_LOG_PREFIX];
 	if (nla != NULL) {
@@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		if (tb[NFTA_LOG_FLAGS] != NULL) {
 			li->u.log.logflags =
 				ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
+			if (li->u.log.logflags & ~NF_LOG_MASK) {
+				err = -EINVAL;
+				goto err1;
+			}
 		}
 		break;
 	case NF_LOG_TYPE_ULOG:

From 8cb2a7d5667ab9a9c2fdd356357b85b63b320901 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sun, 25 Sep 2016 16:47:05 +0800
Subject: [PATCH 53/53] netfilter: nf_log: get rid of XT_LOG_* macros

nf_log is used by both nftables and iptables, so use XT_LOG_XXX macros
here is not appropriate. Replace them with NF_LOG_XXX.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_log_ipv4.c |  6 +++---
 net/ipv6/netfilter/nf_log_ipv6.c | 14 +++++++-------
 net/netfilter/nf_log_common.c    |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 5b571e1b5f15..856648966f4c 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
 	if (ntohs(ih->frag_off) & IP_OFFSET)
 		nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
-	if ((logflags & XT_LOG_IPOPT) &&
+	if ((logflags & NF_LOG_IPOPT) &&
 	    ih->ihl * 4 > sizeof(struct iphdr)) {
 		const unsigned char *op;
 		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
@@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && !iphoff)
+	if ((logflags & NF_LOG_UID) && !iphoff)
 		nf_log_dump_sk_uid_gid(m, skb->sk);
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 
-	if (!(logflags & XT_LOG_MACDECODE))
+	if (!(logflags & NF_LOG_MACDECODE))
 		goto fallback;
 
 	switch (dev->type) {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index f6aee2895fee..57d86066a13b 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 		}
 
 		/* Max length: 48 "OPT (...) " */
-		if (logflags & XT_LOG_IPOPT)
+		if (logflags & NF_LOG_IPOPT)
 			nf_log_buf_add(m, "OPT ( ");
 
 		switch (currenthdr) {
@@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 		case IPPROTO_ROUTING:
 		case IPPROTO_HOPOPTS:
 			if (fragment) {
-				if (logflags & XT_LOG_IPOPT)
+				if (logflags & NF_LOG_IPOPT)
 					nf_log_buf_add(m, ")");
 				return;
 			}
@@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 			break;
 		/* Max Length */
 		case IPPROTO_AH:
-			if (logflags & XT_LOG_IPOPT) {
+			if (logflags & NF_LOG_IPOPT) {
 				struct ip_auth_hdr _ahdr;
 				const struct ip_auth_hdr *ah;
 
@@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 			hdrlen = (hp->hdrlen+2)<<2;
 			break;
 		case IPPROTO_ESP:
-			if (logflags & XT_LOG_IPOPT) {
+			if (logflags & NF_LOG_IPOPT) {
 				struct ip_esp_hdr _esph;
 				const struct ip_esp_hdr *eh;
 
@@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 			nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
 			return;
 		}
-		if (logflags & XT_LOG_IPOPT)
+		if (logflags & NF_LOG_IPOPT)
 			nf_log_buf_add(m, ") ");
 
 		currenthdr = hp->nexthdr;
@@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && recurse)
+	if ((logflags & NF_LOG_UID) && recurse)
 		nf_log_dump_sk_uid_gid(m, skb->sk);
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 
-	if (!(logflags & XT_LOG_MACDECODE))
+	if (!(logflags & NF_LOG_MACDECODE))
 		goto fallback;
 
 	switch (dev->type) {
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a5aa5967b8e1..119fe1cb1ea9 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
 	nf_log_buf_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-	if (logflags & XT_LOG_TCPSEQ) {
+	if (logflags & NF_LOG_TCPSEQ) {
 		nf_log_buf_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 	}
@@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
 	/* Max length: 11 "URGP=65535 " */
 	nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
-	if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+	if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
 		u_int8_t _opt[60 - sizeof(struct tcphdr)];
 		const u_int8_t *op;
 		unsigned int i;