linux/include/net/ip_fib.h

338 lines
8.3 KiB
C
Raw Normal View History

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the Forwarding Information Base.
*
* Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _NET_IP_FIB_H
#define _NET_IP_FIB_H
#include <net/flow.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <net/fib_rules.h>
#include <net/inetpeer.h>
#include <linux/percpu.h>
struct fib_config {
u8 fc_dst_len;
u8 fc_tos;
u8 fc_protocol;
u8 fc_scope;
u8 fc_type;
/* 3 bytes unused */
u32 fc_table;
__be32 fc_dst;
__be32 fc_gw;
int fc_oif;
u32 fc_flags;
u32 fc_priority;
__be32 fc_prefsrc;
struct nlattr *fc_mx;
struct rtnexthop *fc_mp;
int fc_mx_len;
int fc_mp_len;
u32 fc_flow;
u32 fc_nlflags;
struct nl_info fc_nlinfo;
};
struct fib_info;
struct rtable;
struct fib_nh_exception {
struct fib_nh_exception __rcu *fnhe_next;
int fnhe_genid;
__be32 fnhe_daddr;
u32 fnhe_pmtu;
__be32 fnhe_gw;
unsigned long fnhe_expires;
struct rtable __rcu *fnhe_rth_input;
struct rtable __rcu *fnhe_rth_output;
unsigned long fnhe_stamp;
};
struct fnhe_hash_bucket {
struct fib_nh_exception __rcu *chain;
};
#define FNHE_HASH_SIZE 2048
#define FNHE_RECLAIM_DEPTH 5
struct fib_nh {
struct net_device *nh_dev;
struct hlist_node nh_hash;
struct fib_info *nh_parent;
unsigned int nh_flags;
unsigned char nh_scope;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int nh_weight;
int nh_power;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 nh_tclassid;
#endif
int nh_oif;
__be32 nh_gw;
__be32 nh_saddr;
int nh_saddr_genid;
struct rtable __rcu * __percpu *nh_pcpu_rth_output;
struct rtable __rcu *nh_rth_input;
struct fnhe_hash_bucket *nh_exceptions;
};
/*
* This structure contains data shared by many of routes.
*/
struct fib_info {
struct hlist_node fib_hash;
struct hlist_node fib_lhash;
struct net *fib_net;
int fib_treeref;
atomic_t fib_clntref;
unsigned int fib_flags;
unsigned char fib_dead;
unsigned char fib_protocol;
unsigned char fib_scope;
ipv4: add a fib_type to fib_info commit d2d68ba9fe8 (ipv4: Cache input routes in fib_info nexthops.) introduced a regression for forwarding. This was hard to reproduce but the symptom was that packets were delivered to local host instead of being forwarded. David suggested to add fib_type to fib_info so that we dont inadvertently share same fib_info for different purposes. With help from Julian Anastasov who provided very helpful hints, reproduced here : <quote> Can it be a problem related to fib_info reuse from different routes. For example, when local IP address is created for subnet we have: broadcast 192.168.0.255 dev DEV proto kernel scope link src 192.168.0.1 192.168.0.0/24 dev DEV proto kernel scope link src 192.168.0.1 local 192.168.0.1 dev DEV proto kernel scope host src 192.168.0.1 The "dev DEV proto kernel scope link src 192.168.0.1" is a reused fib_info structure where we put cached routes. The result can be same fib_info for 192.168.0.255 and 192.168.0.0/24. RTN_BROADCAST is cached only for input routes. Incoming broadcast to 192.168.0.255 can be cached and can cause problems for traffic forwarded to 192.168.0.0/24. So, this patch should solve the problem because it separates the broadcast from unicast traffic. And the ip_route_input_slow caching will work for local and broadcast input routes (above routes 1 and 3) just because they differ in scope and use different fib_info. </quote> Many thanks to Chris Clayton for his patience and help. Reported-by: Chris Clayton <chris2553@googlemail.com> Bisected-by: Chris Clayton <chris2553@googlemail.com> Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Julian Anastasov <ja@ssi.bg> Tested-by: Chris Clayton <chris2553@googlemail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-10-04 09:25:26 +08:00
unsigned char fib_type;
__be32 fib_prefsrc;
u32 fib_priority;
u32 *fib_metrics;
#define fib_mtu fib_metrics[RTAX_MTU-1]
#define fib_window fib_metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_power;
#endif
fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-10-05 18:41:36 +08:00
struct rcu_head rcu;
struct fib_nh fib_nh[0];
#define fib_dev fib_nh[0].nh_dev
};
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule;
#endif
struct fib_table;
struct fib_result {
unsigned char prefixlen;
unsigned char nh_sel;
unsigned char type;
unsigned char scope;
u32 tclassid;
struct fib_info *fi;
struct fib_table *table;
struct list_head *fa_head;
};
struct fib_result_nl {
__be32 fl_addr; /* To be looked up*/
u32 fl_mark;
unsigned char fl_tos;
unsigned char fl_scope;
unsigned char tb_id_in;
unsigned char tb_id; /* Results */
unsigned char prefixlen;
unsigned char nh_sel;
unsigned char type;
unsigned char scope;
int err;
};
#ifdef CONFIG_IP_ROUTE_MULTIPATH
#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel])
#else /* CONFIG_IP_ROUTE_MULTIPATH */
#define FIB_RES_NH(res) ((res).fi->fib_nh[0])
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
#ifdef CONFIG_IP_MULTIPLE_TABLES
#define FIB_TABLE_HASHSZ 256
#else
#define FIB_TABLE_HASHSZ 2
#endif
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
#define FIB_RES_SADDR(net, res) \
((FIB_RES_NH(res).nh_saddr_genid == \
atomic_read(&(net)->ipv4.dev_addr_genid)) ? \
FIB_RES_NH(res).nh_saddr : \
fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw)
#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev)
#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif)
#define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \
FIB_RES_SADDR(net, res))
struct fib_table {
struct hlist_node tb_hlist;
u32 tb_id;
int tb_default;
int tb_num_default;
unsigned long tb_data[0];
};
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags);
int fib_table_insert(struct fib_table *, struct fib_config *);
int fib_table_delete(struct fib_table *, struct fib_config *);
int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
struct netlink_callback *cb);
int fib_table_flush(struct fib_table *table);
void fib_free_table(struct fib_table *tb);
#ifndef CONFIG_IP_MULTIPLE_TABLES
#define TABLE_LOCAL_INDEX 0
#define TABLE_MAIN_INDEX 1
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct hlist_head *ptr;
ptr = id == RT_TABLE_LOCAL ?
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}
static inline struct fib_table *fib_new_table(struct net *net, u32 id)
{
return fib_get_table(net, id);
}
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res)
{
struct fib_table *table;
table = fib_get_table(net, RT_TABLE_LOCAL);
fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-10-05 18:41:36 +08:00
if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
return 0;
table = fib_get_table(net, RT_TABLE_MAIN);
fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-10-05 18:41:36 +08:00
if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
return 0;
return -ENETUNREACH;
}
#else /* CONFIG_IP_MULTIPLE_TABLES */
int __net_init fib4_rules_init(struct net *net);
void __net_exit fib4_rules_exit(struct net *net);
struct fib_table *fib_new_table(struct net *net, u32 id);
struct fib_table *fib_get_table(struct net *net, u32 id);
int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res);
static inline int fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res)
{
if (!net->ipv4.fib_has_custom_rules) {
res->tclassid = 0;
if (net->ipv4.fib_local &&
!fib_table_lookup(net->ipv4.fib_local, flp, res,
FIB_LOOKUP_NOREF))
return 0;
if (net->ipv4.fib_main &&
!fib_table_lookup(net->ipv4.fib_main, flp, res,
FIB_LOOKUP_NOREF))
return 0;
if (net->ipv4.fib_default &&
!fib_table_lookup(net->ipv4.fib_default, flp, res,
FIB_LOOKUP_NOREF))
return 0;
return -ENETUNREACH;
}
return __fib_lookup(net, flp, res);
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
/* Exported by fib_frontend.c */
extern const struct nla_policy rtm_ipv4_policy[];
void ip_fib_init(void);
__be32 fib_compute_spec_dst(struct sk_buff *skb);
int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
u8 tos, int oif, struct net_device *dev,
struct in_device *idev, u32 *itag);
void fib_select_default(struct fib_result *res);
#ifdef CONFIG_IP_ROUTE_CLASSID
static inline int fib_num_tclassid_users(struct net *net)
{
return net->ipv4.fib_num_tclassid_users;
}
#else
static inline int fib_num_tclassid_users(struct net *net)
{
return 0;
}
#endif
/* Exported by fib_semantics.c */
int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, int force);
int fib_sync_down_addr(struct net *net, __be32 local);
int fib_sync_up(struct net_device *dev);
void fib_select_multipath(struct fib_result *res);
/* Exported by fib_trie.c */
void fib_trie_init(void);
struct fib_table *fib_trie_table(u32 id);
static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
u32 rtag;
#endif
*itag = FIB_RES_NH(*res).nh_tclassid<<16;
#ifdef CONFIG_IP_MULTIPLE_TABLES
rtag = res->tclassid;
if (*itag == 0)
*itag = (rtag<<16);
*itag |= (rtag>>16);
#endif
#endif
}
void free_fib_info(struct fib_info *fi);
static inline void fib_info_put(struct fib_info *fi)
{
if (atomic_dec_and_test(&fi->fib_clntref))
free_fib_info(fi);
}
#ifdef CONFIG_PROC_FS
int __net_init fib_proc_init(struct net *net);
void __net_exit fib_proc_exit(struct net *net);
#else
static inline int fib_proc_init(struct net *net)
{
return 0;
}
static inline void fib_proc_exit(struct net *net)
{
}
#endif
#endif /* _NET_FIB_H */