2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004 Topspin Communications. All rights reserved.
|
2005-08-11 14:03:10 +08:00
|
|
|
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
|
|
|
|
* Copyright (c) 2004 Voltaire, Inc. All rights reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* This software is available to you under a choice of one of two
|
|
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
|
|
* General Public License (GPL) Version 2, available from the file
|
|
|
|
* COPYING in the main directory of this source tree, or the
|
|
|
|
* OpenIB.org BSD license below:
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or
|
|
|
|
* without modification, are permitted provided that the following
|
|
|
|
* conditions are met:
|
|
|
|
*
|
|
|
|
* - Redistributions of source code must retain the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer.
|
|
|
|
*
|
|
|
|
* - Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials
|
|
|
|
* provided with the distribution.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
* SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "ipoib.h"
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
2006-04-11 00:43:58 +08:00
|
|
|
#include <linux/kernel.h>
|
2008-03-12 22:51:03 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/if_arp.h> /* For ARPHRD_xxx */
|
|
|
|
|
|
|
|
#include <linux/ip.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
|
2005-12-27 12:43:12 +08:00
|
|
|
#include <net/dst.h>
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
MODULE_AUTHOR("Roland Dreier");
|
|
|
|
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
|
|
|
|
MODULE_LICENSE("Dual BSD/GPL");
|
|
|
|
|
2006-04-11 00:43:58 +08:00
|
|
|
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
|
|
|
|
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
|
|
|
|
|
|
|
|
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
|
|
|
|
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
|
|
|
|
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
|
|
|
|
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
|
|
|
|
|
2008-07-15 14:48:48 +08:00
|
|
|
static int lro;
|
|
|
|
module_param(lro, bool, 0444);
|
|
|
|
MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)");
|
|
|
|
|
|
|
|
static int lro_max_aggr = IPOIB_LRO_MAX_AGGR;
|
|
|
|
module_param(lro_max_aggr, int, 0644);
|
|
|
|
MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
|
|
|
|
"(default = 64)");
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/* Debug verbosity knob, only built when IPoIB debugging is configured. */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif
|
|
|
|
|
2005-11-08 02:33:11 +08:00
|
|
|
struct ipoib_path_iter {
|
|
|
|
struct net_device *dev;
|
|
|
|
struct ipoib_path path;
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * 20-byte IPv4 broadcast hardware address.
 * NOTE(review): presumably 4 leading bytes followed by a 16-byte
 * multicast GID, matching the "ha + 4" GID offset used elsewhere in
 * this file -- confirm against the IPoIB address layout.
 */
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,
	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,
	0xff, 0xff, 0xff, 0xff
};
|
|
|
|
|
|
|
|
struct workqueue_struct *ipoib_workqueue;
|
|
|
|
|
2006-08-22 07:40:12 +08:00
|
|
|
struct ib_sa_client ipoib_sa_client;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void ipoib_add_one(struct ib_device *device);
|
|
|
|
static void ipoib_remove_one(struct ib_device *device);
|
|
|
|
|
|
|
|
static struct ib_client ipoib_client = {
|
|
|
|
.name = "ipoib",
|
|
|
|
.add = ipoib_add_one,
|
|
|
|
.remove = ipoib_remove_one
|
|
|
|
};
|
|
|
|
|
|
|
|
int ipoib_open(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
|
|
|
ipoib_dbg(priv, "bringing up interface\n");
|
|
|
|
|
|
|
|
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
|
|
|
|
|
|
|
|
if (ipoib_pkey_dev_delay_open(dev))
|
|
|
|
return 0;
|
|
|
|
|
2008-11-13 02:24:36 +08:00
|
|
|
napi_enable(&priv->napi);
|
|
|
|
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
if (ipoib_ib_dev_open(dev)) {
|
|
|
|
napi_disable(&priv->napi);
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-30 02:55:58 +08:00
|
|
|
if (ipoib_ib_dev_up(dev)) {
|
2007-05-19 23:51:54 +08:00
|
|
|
ipoib_ib_dev_stop(dev, 1);
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
napi_disable(&priv->napi);
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
2005-11-30 02:55:58 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
|
|
|
|
struct ipoib_dev_priv *cpriv;
|
|
|
|
|
|
|
|
/* Bring up any child interfaces too */
|
2006-01-14 06:51:39 +08:00
|
|
|
mutex_lock(&priv->vlan_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
list_for_each_entry(cpriv, &priv->child_intfs, list) {
|
|
|
|
int flags;
|
|
|
|
|
|
|
|
flags = cpriv->dev->flags;
|
|
|
|
if (flags & IFF_UP)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
dev_change_flags(cpriv->dev, flags | IFF_UP);
|
|
|
|
}
|
2006-01-14 06:51:39 +08:00
|
|
|
mutex_unlock(&priv->vlan_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
netif_start_queue(dev);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ipoib_stop(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
|
|
|
ipoib_dbg(priv, "stopping interface\n");
|
|
|
|
|
|
|
|
clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
napi_disable(&priv->napi);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
netif_stop_queue(dev);
|
|
|
|
|
2008-08-20 06:01:32 +08:00
|
|
|
ipoib_ib_dev_down(dev, 0);
|
|
|
|
ipoib_ib_dev_stop(dev, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
|
|
|
|
struct ipoib_dev_priv *cpriv;
|
|
|
|
|
|
|
|
/* Bring down any child interfaces too */
|
2006-01-14 06:51:39 +08:00
|
|
|
mutex_lock(&priv->vlan_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
list_for_each_entry(cpriv, &priv->child_intfs, list) {
|
|
|
|
int flags;
|
|
|
|
|
|
|
|
flags = cpriv->dev->flags;
|
|
|
|
if (!(flags & IFF_UP))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
dev_change_flags(cpriv->dev, flags & ~IFF_UP);
|
|
|
|
}
|
2006-01-14 06:51:39 +08:00
|
|
|
mutex_unlock(&priv->vlan_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
2007-02-06 04:12:23 +08:00
|
|
|
/* dev->mtu > 2K ==> connected mode */
|
2007-12-22 05:08:23 +08:00
|
|
|
if (ipoib_cm_admin_enabled(dev)) {
|
|
|
|
if (new_mtu > ipoib_cm_max_mtu(dev))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2007-02-06 04:12:23 +08:00
|
|
|
if (new_mtu > priv->mcast_mtu)
|
|
|
|
ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
|
|
|
|
priv->mcast_mtu);
|
2007-12-22 05:08:23 +08:00
|
|
|
|
2007-02-06 04:12:23 +08:00
|
|
|
dev->mtu = new_mtu;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-04-24 02:55:45 +08:00
|
|
|
if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
priv->admin_mtu = new_mtu;
|
|
|
|
|
|
|
|
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct rb_node *n = priv->path_tree.rb_node;
|
|
|
|
struct ipoib_path *path;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
path = rb_entry(n, struct ipoib_path, rb_node);
|
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
ret = memcmp(gid, path->pathrec.dgid.raw,
|
2005-04-17 06:20:36 +08:00
|
|
|
sizeof (union ib_gid));
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
n = n->rb_left;
|
|
|
|
else if (ret > 0)
|
|
|
|
n = n->rb_right;
|
|
|
|
else
|
|
|
|
return path;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __path_add(struct net_device *dev, struct ipoib_path *path)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct rb_node **n = &priv->path_tree.rb_node;
|
|
|
|
struct rb_node *pn = NULL;
|
|
|
|
struct ipoib_path *tpath;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (*n) {
|
|
|
|
pn = *n;
|
|
|
|
tpath = rb_entry(pn, struct ipoib_path, rb_node);
|
|
|
|
|
|
|
|
ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
|
|
|
|
sizeof (union ib_gid));
|
|
|
|
if (ret < 0)
|
|
|
|
n = &pn->rb_left;
|
|
|
|
else if (ret > 0)
|
|
|
|
n = &pn->rb_right;
|
|
|
|
else
|
|
|
|
return -EEXIST;
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&path->rb_node, pn, n);
|
|
|
|
rb_insert_color(&path->rb_node, &priv->path_tree);
|
|
|
|
|
|
|
|
list_add_tail(&path->list, &priv->path_list);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void path_free(struct net_device *dev, struct ipoib_path *path)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_neigh *neigh, *tn;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
while ((skb = __skb_dequeue(&path->queue)))
|
|
|
|
dev_kfree_skb_irq(skb);
|
|
|
|
|
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
|
|
|
|
/*
|
|
|
|
* It's safe to call ipoib_put_ah() inside priv->lock
|
|
|
|
* here, because we know that path->ah will always
|
|
|
|
* hold one more reference, so ipoib_put_ah() will
|
|
|
|
* never do more than decrement the ref count.
|
|
|
|
*/
|
|
|
|
if (neigh->ah)
|
|
|
|
ipoib_put_ah(neigh->ah);
|
2006-04-05 00:59:40 +08:00
|
|
|
|
2006-11-16 20:16:47 +08:00
|
|
|
ipoib_neigh_free(dev, neigh);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
|
|
|
|
|
|
|
if (path->ah)
|
|
|
|
ipoib_put_ah(path->ah);
|
|
|
|
|
|
|
|
kfree(path);
|
|
|
|
}
|
|
|
|
|
2005-11-08 02:33:11 +08:00
|
|
|
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
|
|
|
|
|
|
|
|
struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_path_iter *iter;
|
|
|
|
|
|
|
|
iter = kmalloc(sizeof *iter, GFP_KERNEL);
|
|
|
|
if (!iter)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
iter->dev = dev;
|
|
|
|
memset(iter->path.pathrec.dgid.raw, 0, 16);
|
|
|
|
|
|
|
|
if (ipoib_path_iter_next(iter)) {
|
|
|
|
kfree(iter);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
int ipoib_path_iter_next(struct ipoib_path_iter *iter)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
|
|
|
|
struct rb_node *n;
|
|
|
|
struct ipoib_path *path;
|
|
|
|
int ret = 1;
|
|
|
|
|
|
|
|
spin_lock_irq(&priv->lock);
|
|
|
|
|
|
|
|
n = rb_first(&priv->path_tree);
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
path = rb_entry(n, struct ipoib_path, rb_node);
|
|
|
|
|
|
|
|
if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
|
|
|
|
sizeof (union ib_gid)) < 0) {
|
|
|
|
iter->path = *path;
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = rb_next(n);
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_irq(&priv->lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
|
|
|
|
struct ipoib_path *path)
|
|
|
|
{
|
|
|
|
*path = iter->path;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
|
|
|
|
|
2008-07-15 14:48:49 +08:00
|
|
|
void ipoib_mark_paths_invalid(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_path *path, *tp;
|
|
|
|
|
|
|
|
spin_lock_irq(&priv->lock);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(path, tp, &priv->path_list, list) {
|
2008-10-30 03:52:50 +08:00
|
|
|
ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
|
2008-07-15 14:48:49 +08:00
|
|
|
be16_to_cpu(path->pathrec.dlid),
|
2008-10-29 13:37:22 +08:00
|
|
|
path->pathrec.dgid.raw);
|
2008-07-15 14:48:49 +08:00
|
|
|
path->valid = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_irq(&priv->lock);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
void ipoib_flush_paths(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_path *path, *tp;
|
|
|
|
LIST_HEAD(remove_list);
|
2008-10-01 01:36:21 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
netif_tx_lock_bh(dev);
|
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-17 12:09:26 +08:00
|
|
|
list_splice_init(&priv->path_list, &remove_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
list_for_each_entry(path, &remove_list, list)
|
|
|
|
rb_erase(&path->rb_node, &priv->path_tree);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(path, tp, &remove_list, list) {
|
|
|
|
if (path->query)
|
|
|
|
ib_sa_cancel_query(path->query_id, path->query);
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
|
|
|
netif_tx_unlock_bh(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
wait_for_completion(&path->done);
|
|
|
|
path_free(dev, path);
|
2008-10-01 01:36:21 +08:00
|
|
|
netif_tx_lock_bh(dev);
|
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-10-01 01:36:21 +08:00
|
|
|
|
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
|
|
|
netif_tx_unlock_bh(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void path_rec_completion(int status,
|
|
|
|
struct ib_sa_path_rec *pathrec,
|
|
|
|
void *path_ptr)
|
|
|
|
{
|
|
|
|
struct ipoib_path *path = path_ptr;
|
|
|
|
struct net_device *dev = path->dev;
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_ah *ah = NULL;
|
2008-09-26 06:26:15 +08:00
|
|
|
struct ipoib_ah *old_ah = NULL;
|
2007-03-23 05:40:16 +08:00
|
|
|
struct ipoib_neigh *neigh, *tn;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sk_buff_head skqueue;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
unsigned long flags;
|
|
|
|
|
2007-02-27 04:57:08 +08:00
|
|
|
if (!status)
|
2008-10-30 03:52:50 +08:00
|
|
|
ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
|
2008-10-29 13:37:22 +08:00
|
|
|
be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
|
2005-04-17 06:20:36 +08:00
|
|
|
else
|
2008-10-30 03:52:50 +08:00
|
|
|
ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
|
2008-10-29 13:37:22 +08:00
|
|
|
status, path->pathrec.dgid.raw);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
skb_queue_head_init(&skqueue);
|
|
|
|
|
|
|
|
if (!status) {
|
2007-04-06 02:50:11 +08:00
|
|
|
struct ib_ah_attr av;
|
|
|
|
|
|
|
|
if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
|
|
|
|
ah = ipoib_create_ah(dev, priv->pd, &av);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
|
|
|
|
|
|
|
if (ah) {
|
|
|
|
path->pathrec = *pathrec;
|
|
|
|
|
2008-09-26 06:26:15 +08:00
|
|
|
old_ah = path->ah;
|
|
|
|
path->ah = ah;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
|
|
|
|
ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
|
|
|
|
|
|
|
|
while ((skb = __skb_dequeue(&path->queue)))
|
|
|
|
__skb_queue_tail(&skqueue, skb);
|
|
|
|
|
2007-03-23 05:40:16 +08:00
|
|
|
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
|
2008-07-15 14:48:49 +08:00
|
|
|
if (neigh->ah) {
|
|
|
|
WARN_ON(neigh->ah != old_ah);
|
|
|
|
/*
|
|
|
|
* Dropping the ah reference inside
|
|
|
|
* priv->lock is safe here, because we
|
|
|
|
* will hold one more reference from
|
|
|
|
* the original value of path->ah (ie
|
|
|
|
* old_ah).
|
|
|
|
*/
|
|
|
|
ipoib_put_ah(neigh->ah);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
kref_get(&path->ah->ref);
|
|
|
|
neigh->ah = path->ah;
|
2006-07-19 22:44:37 +08:00
|
|
|
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
|
|
|
|
sizeof(union ib_gid));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-06 04:12:23 +08:00
|
|
|
if (ipoib_cm_enabled(dev, neigh->neighbour)) {
|
|
|
|
if (!ipoib_cm_get(neigh))
|
|
|
|
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
|
|
|
|
path,
|
|
|
|
neigh));
|
|
|
|
if (!ipoib_cm_get(neigh)) {
|
|
|
|
list_del(&neigh->list);
|
|
|
|
if (neigh->ah)
|
|
|
|
ipoib_put_ah(neigh->ah);
|
|
|
|
ipoib_neigh_free(dev, neigh);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
while ((skb = __skb_dequeue(&neigh->queue)))
|
|
|
|
__skb_queue_tail(&skqueue, skb);
|
|
|
|
}
|
2008-07-15 14:48:49 +08:00
|
|
|
path->valid = 1;
|
2005-11-30 02:13:54 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-30 02:13:54 +08:00
|
|
|
path->query = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
complete(&path->done);
|
|
|
|
|
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
|
|
|
|
2008-07-15 14:48:49 +08:00
|
|
|
if (old_ah)
|
|
|
|
ipoib_put_ah(old_ah);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
while ((skb = __skb_dequeue(&skqueue))) {
|
|
|
|
skb->dev = dev;
|
|
|
|
if (dev_queue_xmit(skb))
|
|
|
|
ipoib_warn(priv, "dev_queue_xmit failed "
|
|
|
|
"to requeue packet\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_path *path;
|
|
|
|
|
2007-11-26 16:41:19 +08:00
|
|
|
if (!priv->broadcast)
|
|
|
|
return NULL;
|
|
|
|
|
2005-11-03 02:07:59 +08:00
|
|
|
path = kzalloc(sizeof *path, GFP_ATOMIC);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!path)
|
|
|
|
return NULL;
|
|
|
|
|
2005-11-03 02:07:59 +08:00
|
|
|
path->dev = dev;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
skb_queue_head_init(&path->queue);
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&path->neigh_list);
|
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
|
2007-10-24 10:57:54 +08:00
|
|
|
path->pathrec.sgid = priv->local_gid;
|
|
|
|
path->pathrec.pkey = cpu_to_be16(priv->pkey);
|
2007-08-03 03:21:31 +08:00
|
|
|
path->pathrec.numb_path = 1;
|
|
|
|
path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return path;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int path_rec_start(struct net_device *dev,
|
|
|
|
struct ipoib_path *path)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
2008-10-30 03:52:50 +08:00
|
|
|
ipoib_dbg(priv, "Start path record lookup for %pI6\n",
|
2008-10-29 13:37:22 +08:00
|
|
|
path->pathrec.dgid.raw);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-29 13:20:34 +08:00
|
|
|
init_completion(&path->done);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
path->query_id =
|
2006-08-22 07:40:12 +08:00
|
|
|
ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
|
2005-04-17 06:20:36 +08:00
|
|
|
&path->pathrec,
|
|
|
|
IB_SA_PATH_REC_DGID |
|
|
|
|
IB_SA_PATH_REC_SGID |
|
|
|
|
IB_SA_PATH_REC_NUMB_PATH |
|
2007-08-03 03:21:31 +08:00
|
|
|
IB_SA_PATH_REC_TRAFFIC_CLASS |
|
2005-04-17 06:20:36 +08:00
|
|
|
IB_SA_PATH_REC_PKEY,
|
|
|
|
1000, GFP_ATOMIC,
|
|
|
|
path_rec_completion,
|
|
|
|
path, &path->query);
|
|
|
|
if (path->query_id < 0) {
|
2008-07-23 05:18:34 +08:00
|
|
|
ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
|
2005-04-17 06:20:36 +08:00
|
|
|
path->query = NULL;
|
2008-11-13 02:24:38 +08:00
|
|
|
complete(&path->done);
|
2005-04-17 06:20:36 +08:00
|
|
|
return path->query_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_path *path;
|
|
|
|
struct ipoib_neigh *neigh;
|
2008-10-01 01:36:21 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-10 10:43:36 +08:00
|
|
|
neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!neigh) {
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
path = __path_find(dev, skb->dst->neighbour->ha + 4);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!path) {
|
2006-05-30 00:14:05 +08:00
|
|
|
path = path_rec_create(dev, skb->dst->neighbour->ha + 4);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!path)
|
2006-04-05 00:59:40 +08:00
|
|
|
goto err_path;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
__path_add(dev, path);
|
|
|
|
}
|
|
|
|
|
|
|
|
list_add_tail(&neigh->list, &path->neigh_list);
|
|
|
|
|
2006-01-18 01:22:05 +08:00
|
|
|
if (path->ah) {
|
2005-04-17 06:20:36 +08:00
|
|
|
kref_get(&path->ah->ref);
|
|
|
|
neigh->ah = path->ah;
|
2006-07-19 22:44:37 +08:00
|
|
|
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
|
|
|
|
sizeof(union ib_gid));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-06 04:12:23 +08:00
|
|
|
if (ipoib_cm_enabled(dev, neigh->neighbour)) {
|
|
|
|
if (!ipoib_cm_get(neigh))
|
|
|
|
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
|
|
|
|
if (!ipoib_cm_get(neigh)) {
|
|
|
|
list_del(&neigh->list);
|
|
|
|
if (neigh->ah)
|
|
|
|
ipoib_put_ah(neigh->ah);
|
|
|
|
ipoib_neigh_free(dev, neigh);
|
|
|
|
goto err_drop;
|
|
|
|
}
|
|
|
|
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
|
|
|
|
__skb_queue_tail(&neigh->queue, skb);
|
|
|
|
else {
|
|
|
|
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
|
|
|
|
skb_queue_len(&neigh->queue));
|
|
|
|
goto err_drop;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
neigh->ah = NULL;
|
|
|
|
|
|
|
|
if (!path->query && path_rec_start(dev, path))
|
2006-04-05 00:59:40 +08:00
|
|
|
goto err_list;
|
2006-11-16 20:16:47 +08:00
|
|
|
|
|
|
|
__skb_queue_tail(&neigh->queue, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
|
2006-04-05 00:59:40 +08:00
|
|
|
err_list:
|
2005-04-17 06:20:36 +08:00
|
|
|
list_del(&neigh->list);
|
|
|
|
|
2006-04-05 00:59:40 +08:00
|
|
|
err_path:
|
2006-11-16 20:16:47 +08:00
|
|
|
ipoib_neigh_free(dev, neigh);
|
2007-02-06 04:12:23 +08:00
|
|
|
err_drop:
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2005-09-29 10:56:57 +08:00
|
|
|
static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
|
|
|
|
|
|
|
|
/* Look up path record for unicasts */
|
|
|
|
if (skb->dst->neighbour->ha[4] != 0xff) {
|
|
|
|
neigh_add_path(skb, dev);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Add in the P_Key for multicasts */
|
|
|
|
skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
|
|
|
|
skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
|
2006-05-30 00:14:05 +08:00
|
|
|
ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
struct ipoib_pseudoheader *phdr)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_path *path;
|
2008-10-01 01:36:21 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
path = __path_find(dev, phdr->hwaddr + 4);
|
2008-07-15 14:48:49 +08:00
|
|
|
if (!path || !path->valid) {
|
|
|
|
if (!path)
|
|
|
|
path = path_rec_create(dev, phdr->hwaddr + 4);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (path) {
|
|
|
|
/* put pseudoheader back on for next time */
|
|
|
|
skb_push(skb, sizeof *phdr);
|
|
|
|
__skb_queue_tail(&path->queue, skb);
|
|
|
|
|
2008-11-13 02:24:39 +08:00
|
|
|
if (!path->query && path_rec_start(dev, path)) {
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
path_free(dev, path);
|
|
|
|
return;
|
|
|
|
} else
|
|
|
|
__path_add(dev, path);
|
|
|
|
} else {
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
}
|
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2006-01-18 01:22:05 +08:00
|
|
|
if (path->ah) {
|
2005-04-17 06:20:36 +08:00
|
|
|
ipoib_dbg(priv, "Send unicast ARP to %04x\n",
|
|
|
|
be16_to_cpu(path->pathrec.dlid));
|
|
|
|
|
2006-11-16 16:59:12 +08:00
|
|
|
ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
|
2005-04-17 06:20:36 +08:00
|
|
|
} else if ((path->query || !path_rec_start(dev, path)) &&
|
|
|
|
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
|
|
|
|
/* put pseudoheader back on for next time */
|
|
|
|
skb_push(skb, sizeof *phdr);
|
|
|
|
__skb_queue_tail(&path->queue, skb);
|
|
|
|
} else {
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
}
|
|
|
|
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
struct ipoib_neigh *neigh;
|
|
|
|
unsigned long flags;
|
|
|
|
|
2006-09-23 06:22:58 +08:00
|
|
|
if (likely(skb->dst && skb->dst->neighbour)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
|
2005-09-29 10:56:57 +08:00
|
|
|
ipoib_path_lookup(skb, dev);
|
2008-10-01 01:36:21 +08:00
|
|
|
return NETDEV_TX_OK;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
neigh = *to_ipoib_neigh(skb->dst->neighbour);
|
|
|
|
|
2008-01-17 23:03:45 +08:00
|
|
|
if (neigh->ah)
|
2007-10-10 10:43:37 +08:00
|
|
|
if (unlikely((memcmp(&neigh->dgid.raw,
|
2006-07-19 22:44:37 +08:00
|
|
|
skb->dst->neighbour->ha + 4,
|
2007-10-10 10:43:37 +08:00
|
|
|
sizeof(union ib_gid))) ||
|
|
|
|
(neigh->dev != dev))) {
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2006-07-19 22:44:37 +08:00
|
|
|
/*
|
|
|
|
* It's safe to call ipoib_put_ah() inside
|
|
|
|
* priv->lock here, because we know that
|
|
|
|
* path->ah will always hold one more reference,
|
|
|
|
* so ipoib_put_ah() will never do more than
|
|
|
|
* decrement the ref count.
|
|
|
|
*/
|
|
|
|
ipoib_put_ah(neigh->ah);
|
|
|
|
list_del(&neigh->list);
|
2006-11-16 20:16:47 +08:00
|
|
|
ipoib_neigh_free(dev, neigh);
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2006-07-19 22:44:37 +08:00
|
|
|
ipoib_path_lookup(skb, dev);
|
2008-10-01 01:36:21 +08:00
|
|
|
return NETDEV_TX_OK;
|
2006-07-19 22:44:37 +08:00
|
|
|
}
|
|
|
|
|
2008-01-17 23:03:45 +08:00
|
|
|
if (ipoib_cm_get(neigh)) {
|
|
|
|
if (ipoib_cm_up(neigh)) {
|
|
|
|
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
|
2008-10-01 01:36:21 +08:00
|
|
|
return NETDEV_TX_OK;
|
2008-01-17 23:03:45 +08:00
|
|
|
}
|
|
|
|
} else if (neigh->ah) {
|
2006-11-16 16:59:12 +08:00
|
|
|
ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
|
2008-10-01 01:36:21 +08:00
|
|
|
return NETDEV_TX_OK;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
__skb_queue_tail(&neigh->queue, skb);
|
2008-10-01 01:36:21 +08:00
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
struct ipoib_pseudoheader *phdr =
|
|
|
|
(struct ipoib_pseudoheader *) skb->data;
|
|
|
|
skb_pull(skb, sizeof *phdr);
|
|
|
|
|
|
|
|
if (phdr->hwaddr[4] == 0xff) {
|
|
|
|
/* Add in the P_Key for multicast*/
|
|
|
|
phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
|
|
|
|
phdr->hwaddr[9] = priv->pkey & 0xff;
|
|
|
|
|
2006-05-30 00:14:05 +08:00
|
|
|
ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
2005-07-29 04:17:26 +08:00
|
|
|
/* unicast GID -- should be ARP or RARP reply */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-07-29 04:17:26 +08:00
|
|
|
if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
|
|
|
|
(be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
|
2008-10-30 03:52:50 +08:00
|
|
|
ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->dst ? "neigh" : "dst",
|
2005-08-14 12:05:57 +08:00
|
|
|
be16_to_cpup((__be16 *) skb->data),
|
2006-11-16 16:59:12 +08:00
|
|
|
IPOIB_QPN(phdr->hwaddr),
|
2008-10-29 13:37:22 +08:00
|
|
|
phdr->hwaddr + 4);
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2008-10-01 01:36:21 +08:00
|
|
|
return NETDEV_TX_OK;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
unicast_arp_send(skb, dev, phdr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ipoib_timeout(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
2005-10-19 03:20:06 +08:00
|
|
|
ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
|
|
|
|
jiffies_to_msecs(jiffies - dev->trans_start));
|
|
|
|
ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
|
|
|
|
netif_queue_stopped(dev),
|
|
|
|
priv->tx_head, priv->tx_tail);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* XXX reset QP, etc. */
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ipoib_hard_header(struct sk_buff *skb,
|
|
|
|
struct net_device *dev,
|
|
|
|
unsigned short type,
|
2007-10-09 16:40:57 +08:00
|
|
|
const void *daddr, const void *saddr, unsigned len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipoib_header *header;
|
|
|
|
|
|
|
|
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
|
|
|
|
|
|
|
|
header->proto = htons(type);
|
|
|
|
header->reserved = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we don't have a neighbour structure, stuff the
|
|
|
|
* destination address onto the front of the skb so we can
|
|
|
|
* figure out where to send the packet later.
|
|
|
|
*/
|
2006-03-30 01:36:46 +08:00
|
|
|
if ((!skb->dst || !skb->dst->neighbour) && daddr) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ipoib_pseudoheader *phdr =
|
|
|
|
(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
|
|
|
|
memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ipoib_set_mcast_list(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
2006-03-24 01:52:51 +08:00
|
|
|
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
|
|
|
|
ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2005-08-25 05:41:51 +08:00
|
|
|
queue_work(ipoib_workqueue, &priv->restart_task);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-03-25 03:52:16 +08:00
|
|
|
static void ipoib_neigh_cleanup(struct neighbour *n)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipoib_neigh *neigh;
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(n->dev);
|
|
|
|
unsigned long flags;
|
|
|
|
struct ipoib_ah *ah = NULL;
|
|
|
|
|
2007-10-10 10:43:36 +08:00
|
|
|
neigh = *to_ipoib_neigh(n);
|
2008-01-29 18:57:56 +08:00
|
|
|
if (neigh)
|
2007-10-10 10:43:36 +08:00
|
|
|
priv = netdev_priv(neigh->dev);
|
2008-01-29 18:57:56 +08:00
|
|
|
else
|
2007-10-10 10:43:36 +08:00
|
|
|
return;
|
2005-04-17 06:20:36 +08:00
|
|
|
ipoib_dbg(priv,
|
2008-10-30 03:52:50 +08:00
|
|
|
"neigh_cleanup for %06x %pI6\n",
|
2006-11-16 16:59:12 +08:00
|
|
|
IPOIB_QPN(n->ha),
|
2008-10-29 13:37:22 +08:00
|
|
|
n->ha + 4);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
spin_lock_irqsave(&priv->lock, flags);
|
|
|
|
|
2007-10-10 10:43:36 +08:00
|
|
|
if (neigh->ah)
|
|
|
|
ah = neigh->ah;
|
|
|
|
list_del(&neigh->list);
|
|
|
|
ipoib_neigh_free(n->dev, neigh);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
spin_unlock_irqrestore(&priv->lock, flags);
|
|
|
|
|
|
|
|
if (ah)
|
|
|
|
ipoib_put_ah(ah);
|
|
|
|
}
|
|
|
|
|
2007-10-10 10:43:36 +08:00
|
|
|
struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
|
|
|
|
struct net_device *dev)
|
2006-04-05 00:59:40 +08:00
|
|
|
{
|
|
|
|
struct ipoib_neigh *neigh;
|
|
|
|
|
|
|
|
neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
|
|
|
|
if (!neigh)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
neigh->neighbour = neighbour;
|
2007-10-10 10:43:36 +08:00
|
|
|
neigh->dev = dev;
|
2006-04-05 00:59:40 +08:00
|
|
|
*to_ipoib_neigh(neighbour) = neigh;
|
2006-12-13 06:48:18 +08:00
|
|
|
skb_queue_head_init(&neigh->queue);
|
2007-02-06 04:12:23 +08:00
|
|
|
ipoib_cm_set(neigh, NULL);
|
2006-04-05 00:59:40 +08:00
|
|
|
|
|
|
|
return neigh;
|
|
|
|
}
|
|
|
|
|
2006-11-16 20:16:47 +08:00
|
|
|
void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
|
2006-04-05 00:59:40 +08:00
|
|
|
{
|
2006-11-16 20:16:47 +08:00
|
|
|
struct sk_buff *skb;
|
2006-04-05 00:59:40 +08:00
|
|
|
*to_ipoib_neigh(neigh->neighbour) = NULL;
|
2006-11-16 20:16:47 +08:00
|
|
|
while ((skb = __skb_dequeue(&neigh->queue))) {
|
2007-09-29 06:33:51 +08:00
|
|
|
++dev->stats.tx_dropped;
|
2006-11-16 20:16:47 +08:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
}
|
2007-02-06 04:12:23 +08:00
|
|
|
if (ipoib_cm_get(neigh))
|
|
|
|
ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
|
2006-04-05 00:59:40 +08:00
|
|
|
kfree(neigh);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
|
|
|
|
{
|
2007-03-25 03:52:16 +08:00
|
|
|
parms->neigh_cleanup = ipoib_neigh_cleanup;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
|
|
|
/* Allocate RX/TX "rings" to hold queued skbs */
|
2006-04-11 00:43:58 +08:00
|
|
|
priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
|
2005-04-17 06:20:36 +08:00
|
|
|
GFP_KERNEL);
|
|
|
|
if (!priv->rx_ring) {
|
|
|
|
printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
|
2006-04-11 00:43:58 +08:00
|
|
|
ca->name, ipoib_recvq_size);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-03-12 22:51:03 +08:00
|
|
|
priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!priv->tx_ring) {
|
|
|
|
printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
|
2006-04-11 00:43:58 +08:00
|
|
|
ca->name, ipoib_sendq_size);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out_rx_ring_cleanup;
|
|
|
|
}
|
2008-03-12 22:51:03 +08:00
|
|
|
memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-08-16 20:36:16 +08:00
|
|
|
/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (ipoib_ib_dev_init(dev, ca, port))
|
|
|
|
goto out_tx_ring_cleanup;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_tx_ring_cleanup:
|
2008-03-12 22:51:03 +08:00
|
|
|
vfree(priv->tx_ring);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
out_rx_ring_cleanup:
|
|
|
|
kfree(priv->rx_ring);
|
|
|
|
|
|
|
|
out:
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ipoib_dev_cleanup(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
|
|
|
|
|
2005-11-08 02:33:11 +08:00
|
|
|
ipoib_delete_debug_files(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Delete any child interfaces first */
|
|
|
|
list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
|
|
|
|
unregister_netdev(cpriv->dev);
|
|
|
|
ipoib_dev_cleanup(cpriv->dev);
|
|
|
|
free_netdev(cpriv->dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
ipoib_ib_dev_cleanup(dev);
|
|
|
|
|
2005-08-14 11:50:27 +08:00
|
|
|
kfree(priv->rx_ring);
|
2008-03-12 22:51:03 +08:00
|
|
|
vfree(priv->tx_ring);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-14 11:50:27 +08:00
|
|
|
priv->rx_ring = NULL;
|
|
|
|
priv->tx_ring = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-10-09 16:40:57 +08:00
|
|
|
static const struct header_ops ipoib_header_ops = {
|
|
|
|
.create = ipoib_hard_header,
|
|
|
|
};
|
|
|
|
|
2008-07-15 14:48:48 +08:00
|
|
|
/*
 * LRO get_skb_header callback: locate the IPv4 and TCP headers of @skb.
 *
 * On success returns 0 with *@iphdr and *@tcph pointing at the headers and
 * *@hdr_flags set to LRO_IPV4 | LRO_TCP.  Returns -1 when the packet is not
 * IPv4, its checksum has not been verified already, it is not TCP, or the
 * IP total length is too short for both headers.  @priv is unused.
 *
 * Side effects visible on failure: the network header offset is reset once
 * the first two checks pass, and the transport header and *@tcph are set
 * before the length check.
 */
static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
		       void **tcph, u64 *hdr_flags, void *priv)
{
	struct iphdr *iph;
	unsigned int ip_len;

	/*
	 * Only IPv4 with an already-verified checksum qualifies.  In the
	 * future a checksum-verifying path could allow devices that do not
	 * calculate checksums to use LRO as well.
	 */
	if (unlikely(skb->protocol != htons(ETH_P_IP)) ||
	    unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
		return -1;

	/* Check for non-TCP packet */
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	if (iph->protocol != IPPROTO_TCP)
		return -1;

	ip_len = ip_hdrlen(skb);
	skb_set_transport_header(skb, ip_len);
	*tcph = tcp_hdr(skb);

	/* check if IP header and TCP header are complete */
	if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb))
		return -1;

	*iphdr = iph;
	*hdr_flags = LRO_IPV4 | LRO_TCP;

	return 0;
}
|
|
|
|
|
|
|
|
static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
|
|
|
|
{
|
|
|
|
priv->lro.lro_mgr.max_aggr = lro_max_aggr;
|
|
|
|
priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS;
|
|
|
|
priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc;
|
|
|
|
priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
|
|
|
|
priv->lro.lro_mgr.features = LRO_F_NAPI;
|
|
|
|
priv->lro.lro_mgr.dev = priv->dev;
|
|
|
|
priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void ipoib_setup(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
|
|
|
|
2007-10-24 10:57:54 +08:00
|
|
|
dev->open = ipoib_open;
|
|
|
|
dev->stop = ipoib_stop;
|
|
|
|
dev->change_mtu = ipoib_change_mtu;
|
|
|
|
dev->hard_start_xmit = ipoib_start_xmit;
|
|
|
|
dev->tx_timeout = ipoib_timeout;
|
|
|
|
dev->header_ops = &ipoib_header_ops;
|
|
|
|
dev->set_multicast_list = ipoib_set_mcast_list;
|
|
|
|
dev->neigh_setup = ipoib_neigh_setup_dev;
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
|
2008-04-17 12:09:32 +08:00
|
|
|
ipoib_set_ethtool_ops(dev);
|
|
|
|
|
[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independant RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independant from the net
device itself.
The signature of the ->poll() call back goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in it's ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-04 07:41:36 +08:00
|
|
|
netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-24 10:57:54 +08:00
|
|
|
dev->watchdog_timeo = HZ;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-24 10:57:54 +08:00
|
|
|
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We add in INFINIBAND_ALEN to allow for the destination
|
|
|
|
* address "pseudoheader" for skbs without neighbour struct.
|
|
|
|
*/
|
2007-10-24 10:57:54 +08:00
|
|
|
dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
|
|
|
|
dev->addr_len = INFINIBAND_ALEN;
|
|
|
|
dev->type = ARPHRD_INFINIBAND;
|
|
|
|
dev->tx_queue_len = ipoib_sendq_size * 2;
|
2008-01-31 00:30:46 +08:00
|
|
|
dev->features = (NETIF_F_VLAN_CHALLENGED |
|
|
|
|
NETIF_F_HIGHDMA);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
|
|
|
|
|
|
|
|
netif_carrier_off(dev);
|
|
|
|
|
|
|
|
priv->dev = dev;
|
|
|
|
|
2008-07-15 14:48:48 +08:00
|
|
|
ipoib_lro_setup(priv);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock_init(&priv->lock);
|
|
|
|
|
2006-01-14 06:51:39 +08:00
|
|
|
mutex_init(&priv->vlan_mutex);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&priv->path_list);
|
|
|
|
INIT_LIST_HEAD(&priv->child_intfs);
|
|
|
|
INIT_LIST_HEAD(&priv->dead_ahs);
|
|
|
|
INIT_LIST_HEAD(&priv->multicast_list);
|
|
|
|
|
2007-05-19 23:51:54 +08:00
|
|
|
INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
|
2006-11-22 22:57:56 +08:00
|
|
|
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
|
2008-09-17 02:57:45 +08:00
|
|
|
INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
|
2008-07-15 14:48:49 +08:00
|
|
|
INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
|
|
|
|
INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
|
|
|
|
INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
|
2006-11-22 22:57:56 +08:00
|
|
|
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
|
|
|
|
INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
|
|
|
|
{
|
|
|
|
struct net_device *dev;
|
|
|
|
|
|
|
|
dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
|
|
|
|
ipoib_setup);
|
|
|
|
if (!dev)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return netdev_priv(dev);
|
|
|
|
}
|
|
|
|
|
2002-04-10 03:14:34 +08:00
|
|
|
static ssize_t show_pkey(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2002-04-10 03:14:34 +08:00
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return sprintf(buf, "0x%04x\n", priv->pkey);
|
|
|
|
}
|
2002-04-10 03:14:34 +08:00
|
|
|
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
IPoIB: Allow setting policy to ignore multicast groups
The kernel IB stack allows (through the RDMA CM) userspace
applications to join and use multicast groups from the IPoIB MGID
range. This allows multicast traffic to be handled directly from
userspace QPs, without going through the kernel stack, which gives
better performance for some applications.
However, to fully interoperate with IP multicast, such userspace
applications need to participate in IGMP reports and queries, or else
routers may not forward the multicast traffic to the system where the
application is running. The simplest way to do this is to share the
kernel IGMP implementation by using the IP_ADD_MEMBERSHIP option to
join multicast groups that are being handled directly in userspace.
However, in such cases, the actual multicast traffic should not also
be handled by the IPoIB interface, because that would burn resources
handling multicast packets that will just be discarded in the kernel.
To handle this, this patch adds lookup on the database used for IB
multicast group reference counting when IPoIB is joining multicast
groups, and if a multicast group is already handled by user space,
then the IPoIB kernel driver ignores the group. This is controlled by
a per-interface policy flag. When the flag is set, IPoIB will not
join and attach its QP to a multicast group which already has an entry
in the database; when the flag is cleared, IPoIB will behave as before
this change.
For each IPoIB interface, the /sys/class/net/$intf/umcast attribute
controls the policy flag. The default value is off/0.
Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
2007-10-08 16:13:00 +08:00
|
|
|
static ssize_t show_umcast(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
|
|
|
|
|
|
|
|
return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t set_umcast(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
|
|
|
|
unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
|
|
|
|
|
|
|
|
if (umcast_val > 0) {
|
|
|
|
set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
|
|
|
|
ipoib_warn(priv, "ignoring multicast groups joined directly "
|
|
|
|
"by userspace\n");
|
|
|
|
} else
|
|
|
|
clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
|
|
|
|
|
|
|
|
int ipoib_add_umcast_attr(struct net_device *dev)
|
|
|
|
{
|
|
|
|
return device_create_file(&dev->dev, &dev_attr_umcast);
|
|
|
|
}
|
|
|
|
|
2002-04-10 03:14:34 +08:00
|
|
|
/*
 * sysfs "create_child" store handler: create a child interface for the
 * P_Key written to the attribute.
 *
 * The input is parsed with "%i" (decimal, octal or hex) and must be in the
 * range 0..0xffff.  Returns @count on success, -EINVAL for malformed or
 * out-of-range input, or the error returned by ipoib_vlan_add().
 */
static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
/*
 * Writable by the owner only: creating interfaces is a privileged
 * operation and must not be open to every local user.
 */
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2002-04-10 03:14:34 +08:00
|
|
|
/*
 * sysfs "delete_child" store handler: remove the child interface for the
 * P_Key written to the attribute.
 *
 * The input is parsed with "%i" (decimal, octal or hex) and must be in the
 * range 0..0xffff; it is passed to ipoib_vlan_delete() unmodified.  Returns
 * @count on success, -EINVAL for malformed or out-of-range input, or the
 * error returned by ipoib_vlan_delete().
 */
static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
/*
 * Writable by the owner only: removing interfaces is a privileged
 * operation and must not be open to every local user.
 */
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
int ipoib_add_pkey_attr(struct net_device *dev)
|
|
|
|
{
|
2002-04-10 03:14:34 +08:00
|
|
|
return device_create_file(&dev->dev, &dev_attr_pkey);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-23 06:49:49 +08:00
|
|
|
int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
|
|
|
|
{
|
|
|
|
struct ib_device_attr *device_attr;
|
|
|
|
int result = -ENOMEM;
|
|
|
|
|
|
|
|
device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
|
|
|
|
if (!device_attr) {
|
|
|
|
printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
|
|
|
|
hca->name, sizeof *device_attr);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
result = ib_query_device(hca, device_attr);
|
|
|
|
if (result) {
|
|
|
|
printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
|
|
|
|
hca->name, result);
|
|
|
|
kfree(device_attr);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
priv->hca_caps = device_attr->device_cap_flags;
|
|
|
|
|
|
|
|
kfree(device_attr);
|
|
|
|
|
|
|
|
if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
|
|
|
|
set_bit(IPOIB_FLAG_CSUM, &priv->flags);
|
|
|
|
priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lro)
|
|
|
|
priv->dev->features |= NETIF_F_LRO;
|
|
|
|
|
|
|
|
if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
|
|
|
|
priv->dev->features |= NETIF_F_TSO;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Create, initialize and register the IPoIB net device for @port of @hca.
 *
 * @format: interface name template handed to ipoib_intf_alloc()
 * @hca:    IB device the port belongs to
 * @port:   port number
 *
 * Returns the registered net_device on success.  On failure everything set
 * up so far is torn down and an ERR_PTR() carrying the error of the step
 * that failed is returned; the return value is never NULL.
 *
 * Every failing step must leave a non-zero error in @result before jumping
 * to the cleanup labels: ERR_PTR(0) is NULL, which IS_ERR() in the caller
 * does not recognize as an error.
 * NOTE(review): relies on the called helpers returning negative errno
 * values on failure -- confirm for helpers defined outside this file.
 */
static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);

	result = ib_query_port(hca, port, &attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}
	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	result = ipoib_set_dev_features(priv, hca);
	if (result)
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	/* sysfs attributes; keep the error code of whichever one fails */
	result = ipoib_cm_add_mode_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_pkey_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_umcast_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_create_child);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_delete_child);
	if (result)
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_workqueue(ipoib_workqueue);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}
|
|
|
|
|
|
|
|
static void ipoib_add_one(struct ib_device *device)
|
|
|
|
{
|
|
|
|
struct list_head *dev_list;
|
|
|
|
struct net_device *dev;
|
|
|
|
struct ipoib_dev_priv *priv;
|
|
|
|
int s, e, p;
|
|
|
|
|
2006-08-04 05:02:42 +08:00
|
|
|
if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
|
|
|
|
return;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
|
|
|
|
if (!dev_list)
|
|
|
|
return;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(dev_list);
|
|
|
|
|
2006-08-04 05:02:42 +08:00
|
|
|
if (device->node_type == RDMA_NODE_IB_SWITCH) {
|
2005-04-17 06:20:36 +08:00
|
|
|
s = 0;
|
|
|
|
e = 0;
|
|
|
|
} else {
|
|
|
|
s = 1;
|
|
|
|
e = device->phys_port_cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (p = s; p <= e; ++p) {
|
|
|
|
dev = ipoib_add_port("ib%d", device, p);
|
|
|
|
if (!IS_ERR(dev)) {
|
|
|
|
priv = netdev_priv(dev);
|
|
|
|
list_add_tail(&priv->list, dev_list);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ib_set_client_data(device, &ipoib_client, dev_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ipoib_remove_one(struct ib_device *device)
|
|
|
|
{
|
|
|
|
struct ipoib_dev_priv *priv, *tmp;
|
|
|
|
struct list_head *dev_list;
|
|
|
|
|
2006-08-04 05:02:42 +08:00
|
|
|
if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
|
|
|
|
return;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
dev_list = ib_get_client_data(device, &ipoib_client);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(priv, tmp, dev_list, list) {
|
|
|
|
ib_unregister_event_handler(&priv->event_handler);
|
2008-08-20 06:01:32 +08:00
|
|
|
|
|
|
|
rtnl_lock();
|
|
|
|
dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
|
|
|
|
rtnl_unlock();
|
|
|
|
|
|
|
|
flush_workqueue(ipoib_workqueue);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
unregister_netdev(priv->dev);
|
|
|
|
ipoib_dev_cleanup(priv->dev);
|
|
|
|
free_netdev(priv->dev);
|
|
|
|
}
|
2005-09-02 00:19:02 +08:00
|
|
|
|
|
|
|
kfree(dev_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Module entry point.
 *
 * Normalizes the receive/send queue size module parameters, then
 * registers debugfs, creates the driver workqueue and registers the SA
 * and IB clients.  Returns 0 on success or a negative errno, undoing any
 * earlier step on failure.
 */
static int __init ipoib_init_module(void)
{
	int ret;

	/* Round each queue size up to a power of two, then clamp it. */
	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	/* The send queue floor is the larger of 2 * MAX_SEND_CQE and the minimum. */
	ipoib_sendq_size = max(ipoib_sendq_size, 2 * MAX_SEND_CQE);
	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * Small received packets are copied only from the linear data
	 * part of the SKB, so the copy threshold must never exceed the
	 * head size.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * A private workqueue is used mainly so that it can be flushed
	 * when devices are removed.  schedule_work() and
	 * flush_scheduled_work() are not usable here: unregister_netdev()
	 * and linkwatch_event both take the rtnl lock, so
	 * flush_scheduled_work() can deadlock during device removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ipoib_unregister_debugfs();
		return -ENOMEM;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (!ret)
		return 0;

	/* Client registration failed: unwind in reverse order. */
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);
	ipoib_unregister_debugfs();

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Module exit point: unregister the IB client and the SA client, remove
 * the debugfs entries, and finally destroy the driver workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);

	ipoib_unregister_debugfs();

	destroy_workqueue(ipoib_workqueue);
}
|
|
|
|
|
|
|
|
/* Hook the init and exit routines into the module loader. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
|