Merge branch 'vsock'

Andy King says:

====================
In an effort to improve the out-of-the-box experience with Linux kernels for
VMware users, VMware is working on readying the VM Sockets (VSOCK, formerly
VMCI Sockets) (vsock) kernel module for inclusion in the Linux kernel. The
purpose of this post is to acquire feedback on the vsock kernel module.

Unlike previous submissions, where the new socket family was entirely reliant
on VMware's VMCI PCI device (and thus VMware's hypervisor), VM Sockets is now
completely[1] separated out into two parts, each in its own module:

o Core socket code, which is transport-neutral and invokes transport
  callbacks to communicate with the hypervisor.  This is vsock.ko.
o A VMCI transport, which communicates over VMCI with the VMware hypervisor.
  This is vmw_vsock_vmci_transport.ko, and it registers with the core module
  as a transport.

This should provide a path to introducing additional transports, for example
virtio, with the ultimate goal being to make this new socket family
hypervisor-neutral.

[1] If Gerd tries it and determines this to be false (still), I'll ship him
    a keg of beer.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2013-02-10 20:10:10 -05:00
commit 5b815b52f6
16 changed files with 6038 additions and 1 deletions

View File

@ -178,7 +178,8 @@ struct ucred {
#define AF_CAIF 37 /* CAIF sockets */
#define AF_ALG 38 /* Algorithm sockets */
#define AF_NFC 39 /* NFC sockets */
#define AF_MAX 40 /* For now.. */
#define AF_VSOCK 40 /* vSockets */
#define AF_MAX 41 /* For now.. */
/* Protocol families, same as address families. */
#define PF_UNSPEC AF_UNSPEC
@ -221,6 +222,7 @@ struct ucred {
#define PF_CAIF AF_CAIF
#define PF_ALG AF_ALG
#define PF_NFC AF_NFC
#define PF_VSOCK AF_VSOCK
#define PF_MAX AF_MAX
/* Maximum queue length specifiable by listen. */

View File

@ -0,0 +1,171 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _VM_SOCKETS_H_
#define _VM_SOCKETS_H_
#if !defined(__KERNEL__)
#include <sys/socket.h>
#endif
/* Option name for STREAM socket buffer size. Use as the option name in
* setsockopt(2) or getsockopt(2) to set or get an unsigned long long that
* specifies the size of the buffer underlying a vSockets STREAM socket.
* The value is clamped to the MIN and MAX (presumably the values of the
* BUFFER_MIN_SIZE and BUFFER_MAX_SIZE options below).
*/
#define SO_VM_SOCKETS_BUFFER_SIZE 0
/* Option name for STREAM socket minimum buffer size. Use as the option name
* in setsockopt(2) or getsockopt(2) to set or get an unsigned long long that
* specifies the minimum size allowed for the buffer underlying a vSockets
* STREAM socket.
*/
#define SO_VM_SOCKETS_BUFFER_MIN_SIZE 1
/* Option name for STREAM socket maximum buffer size. Use as the option name
* in setsockopt(2) or getsockopt(2) to set or get an unsigned long long
* that specifies the maximum size allowed for the buffer underlying a
* vSockets STREAM socket.
*/
#define SO_VM_SOCKETS_BUFFER_MAX_SIZE 2
/* Option name for socket peer's host-specific VM ID. Use as the option name
* in getsockopt(2) to get a host-specific identifier for the peer endpoint's
* VM. The identifier is a signed integer.
* Only available for hypervisor endpoints.
*/
#define SO_VM_SOCKETS_PEER_HOST_VM_ID 3
/* Option name for socket's service label. Use as the option name in
* setsockopt(2) or getsockopt(2) to set or get the service label for a socket.
* The service label is a C-style NUL-terminated string. Only available for
* hypervisor endpoints.
*/
#define SO_VM_SOCKETS_SERVICE_LABEL 4
/* Option name for determining if a socket is trusted. Use as the option name
* in getsockopt(2) to determine if a socket is trusted. The value is a
* signed integer.
*/
#define SO_VM_SOCKETS_TRUSTED 5
/* Option name for STREAM socket connection timeout. Use as the option name
* in setsockopt(2) or getsockopt(2) to set or get the connection
* timeout for a STREAM socket.
* NOTE(review): the type of the option value is not stated in this header;
* confirm it against the option handling in af_vsock.c.
*/
#define SO_VM_SOCKETS_CONNECT_TIMEOUT 6
/* Option name for using non-blocking send/receive. Use as the option name
* for setsockopt(2) or getsockopt(2) to set or get the non-blocking
* transmit/receive flag for a STREAM socket. This flag determines whether
* send() and recv() can be called in non-blocking contexts for the given
* socket. The value is a signed integer.
*
* This option is only relevant to kernel endpoints, where descheduling the
* thread of execution is not allowed, for example, while holding a spinlock.
* It is not to be confused with conventional non-blocking socket operations.
*
* Only available for hypervisor endpoints.
*/
#define SO_VM_SOCKETS_NONBLOCK_TXRX 7
/* Wildcard context ID: the vSocket counterpart of INADDR_ANY. Valid in the
 * svm_cid field of sockaddr_vm, where it stands for the context ID of the
 * current endpoint.
 */
#define VMADDR_CID_ANY (-1U)
/* Wildcard port: valid in the svm_port field of sockaddr_vm and requests a
 * bind to any available port.
 */
#define VMADDR_PORT_ANY (-1U)
/* Destination CID that addresses the hypervisor. VMCI depends on this being
 * 0, but the value would be useful for other transports too.
 */
#define VMADDR_CID_HYPERVISOR 0
/* VMCI-specific CID that can be considered reserved. It is a legacy value
 * from an older release; even VMCI does not use it anymore.
 */
#define VMADDR_CID_RESERVED 1
/* Destination CID that addresses the host (any process other than the
 * hypervisor). VMCI depends on this being 2, but the value would be useful
 * for other transports too.
 */
#define VMADDR_CID_HOST 2
/* Value that denotes an invalid vSockets version. */
#define VM_SOCKETS_INVALID_VERSION (-1U)
/* A vSockets version packs three components into one value:
 * epoch (bits 31..24), major (bits 23..16) and minor (bits 15..0).
 */
/* Epoch: the first component, a single byte. */
#define VM_SOCKETS_VERSION_EPOCH(ver) (((ver) & 0xFF000000) >> 24)
/* Major: the second component, a single byte. Typically changes for every
 * major release of a product.
 */
#define VM_SOCKETS_VERSION_MAJOR(ver) (((ver) & 0x00FF0000) >> 16)
/* Minor: the third component, two bytes. */
#define VM_SOCKETS_VERSION_MINOR(ver) ((ver) & 0x0000FFFF)
/* Address structure for vSockets. The address family should be set to
* AF_VSOCK. The structure members should all align on their natural
* boundaries without resorting to compiler packing directives. The total
* size of this structure should be exactly the same as that of
* struct sockaddr.
*/
struct sockaddr_vm {
/* Address family. */
sa_family_t svm_family;
/* Reserved field. */
unsigned short svm_reserved1;
/* Port number; VMADDR_PORT_ANY binds to any available port. */
unsigned int svm_port;
/* Context ID; VMADDR_CID_ANY stands for the current endpoint. */
unsigned int svm_cid;
/* Padding sized so that the structure is as large as struct sockaddr. */
unsigned char svm_zero[sizeof(struct sockaddr) -
sizeof(sa_family_t) -
sizeof(unsigned short) -
sizeof(unsigned int) - sizeof(unsigned int)];
};
/* ioctl request code which, going by its name, retrieves the local context
* ID. NOTE(review): the handler is not visible in this header; confirm the
* argument it expects.
*/
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)
#if defined(__KERNEL__)
/* In-kernel accessor, declared only for kernel builds. */
int vm_sockets_get_local_cid(void);
#endif
#endif

View File

@ -217,6 +217,7 @@ source "net/dcb/Kconfig"
source "net/dns_resolver/Kconfig"
source "net/batman-adv/Kconfig"
source "net/openvswitch/Kconfig"
source "net/vmw_vsock/Kconfig"
config RPS
boolean

View File

@ -69,3 +69,4 @@ obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
obj-$(CONFIG_NFC) += nfc/
obj-$(CONFIG_OPENVSWITCH) += openvswitch/
obj-$(CONFIG_VSOCKETS) += vmw_vsock/

28
net/vmw_vsock/Kconfig Normal file
View File

@ -0,0 +1,28 @@
#
# Vsock protocol
#
config VSOCKETS
tristate "Virtual Socket protocol"
help
Virtual Socket Protocol is a socket protocol similar to TCP/IP
allowing communication between Virtual Machines and hypervisor
or host.
You should also select one or more hypervisor-specific transports
below.
To compile this driver as a module, choose M here: the module
will be called vsock. If unsure, say N.
config VMWARE_VMCI_VSOCKETS
tristate "VMware VMCI transport for Virtual Sockets"
depends on VSOCKETS && VMWARE_VMCI
help
This module implements a VMCI transport for Virtual Sockets.
Enable this transport if your Virtual Machine runs on a VMware
hypervisor.
To compile this driver as a module, choose M here: the module
will be called vmw_vsock_vmci_transport. If unsure, say N.

7
net/vmw_vsock/Makefile Normal file
View File

@ -0,0 +1,7 @@
obj-$(CONFIG_VSOCKETS) += vsock.o
obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
vsock-y += af_vsock.o vsock_addr.o
vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
vmci_transport_notify_qstate.o

2015
net/vmw_vsock/af_vsock.c Normal file

File diff suppressed because it is too large Load Diff

175
net/vmw_vsock/af_vsock.h Normal file
View File

@ -0,0 +1,175 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef __AF_VSOCK_H__
#define __AF_VSOCK_H__
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/vm_sockets.h>
#include "vsock_addr.h"
/* Last port number in the reserved range. */
#define LAST_RESERVED_PORT 1023
/* Convert a struct sock pointer into the struct vsock_sock that embeds it.
 * This relies on sk being the first member of struct vsock_sock. The
 * argument is parenthesised so that an expression argument is cast as a
 * whole rather than only its first operand.
 */
#define vsock_sk(__sk) ((struct vsock_sock *)(__sk))
/* Convert a struct vsock_sock pointer into a pointer to its embedded sock. */
#define sk_vsock(__vsk) (&(__vsk)->sk)
/* Per-socket state for an AF_VSOCK socket. The generic struct sock is
* embedded as the first member so that vsock_sk() can cast between the two.
*/
struct vsock_sock {
/* sk must be the first member. */
struct sock sk;
/* Local and remote addresses of this socket. */
struct sockaddr_vm local_addr;
struct sockaddr_vm remote_addr;
/* Links for the global tables of bound and connected sockets. */
struct list_head bound_table;
struct list_head connected_table;
/* Accessed without the socket lock held. This means it can never be
* modified outside of socket create or destruct.
*/
bool trusted;
bool cached_peer_allow_dgram; /* Dgram communication allowed to
* cached peer?
*/
u32 cached_peer; /* Context ID of last dgram destination check. */
const struct cred *owner;
/* Rest are SOCK_STREAM only. */
long connect_timeout;
/* Listening socket that this came from. */
struct sock *listener;
/* Used for pending list and accept queue during connection handshake.
* The listening socket is the head for both lists. Sockets created
* for connection requests are placed in the pending list until they
* are connected, at which point they are put in the accept queue list
* so they can be accepted in accept(). If accept() cannot accept the
* connection, it is marked as rejected so the cleanup function knows
* to clean up the socket.
*/
struct list_head pending_links;
struct list_head accept_queue;
bool rejected;
/* NOTE(review): presumably the delayed work run by vsock_pending_work();
* confirm in af_vsock.c.
*/
struct delayed_work dwork;
/* Shutdown flags reported for the peer; the notify code tests this
* against RCV_SHUTDOWN.
*/
u32 peer_shutdown;
bool sent_request;
bool ignore_connecting_rst;
/* Private to transport. */
void *trans;
};
s64 vsock_stream_has_data(struct vsock_sock *vsk);
s64 vsock_stream_has_space(struct vsock_sock *vsk);
void vsock_pending_work(struct work_struct *work);
struct sock *__vsock_create(struct net *net,
struct socket *sock,
struct sock *parent,
gfp_t priority, unsigned short type);
/**** TRANSPORT ****/
/* Scratch state handed to the transport's notify_recv_* callbacks. The two
* data words carry no meaning for the core; the transport defines them.
*/
struct vsock_transport_recv_notify_data {
u64 data1; /* Transport-defined. */
u64 data2; /* Transport-defined. */
/* NOTE(review): judging by the name, asks for a notification before the
* receiver blocks; confirm against the transport implementation.
*/
bool notify_on_block;
};
/* Scratch state handed to the transport's notify_send_* callbacks. The two
* data words carry no meaning for the core; the transport defines them.
*/
struct vsock_transport_send_notify_data {
u64 data1; /* Transport-defined. */
u64 data2; /* Transport-defined. */
};
/* Table of callbacks that a transport supplies. A pointer to such a table is
* what vsock_core_init() takes as its argument.
*/
struct vsock_transport {
/* Initialize/tear-down socket. */
/* NOTE(review): the second argument of init is presumably the parent
* socket, if any; confirm against the caller in af_vsock.c.
*/
int (*init)(struct vsock_sock *, struct vsock_sock *);
void (*destruct)(struct vsock_sock *);
void (*release)(struct vsock_sock *);
/* Connections. */
int (*connect)(struct vsock_sock *);
/* DGRAM. */
int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk,
struct msghdr *msg, size_t len, int flags);
int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
struct iovec *, size_t len);
bool (*dgram_allow)(u32 cid, u32 port);
/* STREAM. */
/* TODO: stream_bind() */
ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *,
size_t len, int flags);
ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *,
size_t len);
s64 (*stream_has_data)(struct vsock_sock *);
s64 (*stream_has_space)(struct vsock_sock *);
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
/* Notification. */
int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
/* Receive-side hooks; the names indicate the point in the receive path
* at which each one is meant to run.
*/
int (*notify_recv_init)(struct vsock_sock *, size_t,
struct vsock_transport_recv_notify_data *);
int (*notify_recv_pre_block)(struct vsock_sock *, size_t,
struct vsock_transport_recv_notify_data *);
int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t,
struct vsock_transport_recv_notify_data *);
int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t,
ssize_t, bool, struct vsock_transport_recv_notify_data *);
/* Send-side hooks, mirroring the receive-side ones. */
int (*notify_send_init)(struct vsock_sock *,
struct vsock_transport_send_notify_data *);
int (*notify_send_pre_block)(struct vsock_sock *,
struct vsock_transport_send_notify_data *);
int (*notify_send_pre_enqueue)(struct vsock_sock *,
struct vsock_transport_send_notify_data *);
int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
struct vsock_transport_send_notify_data *);
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
/* Buffer sizes. */
void (*set_buffer_size)(struct vsock_sock *, u64);
void (*set_min_buffer_size)(struct vsock_sock *, u64);
void (*set_max_buffer_size)(struct vsock_sock *, u64);
u64 (*get_buffer_size)(struct vsock_sock *);
u64 (*get_min_buffer_size)(struct vsock_sock *);
u64 (*get_max_buffer_size)(struct vsock_sock *);
/* Addressing. */
u32 (*get_local_cid)(void);
};
/**** CORE ****/
int vsock_core_init(const struct vsock_transport *t);
void vsock_core_exit(void);
/**** UTILS ****/
void vsock_release_pending(struct sock *pending);
void vsock_add_pending(struct sock *listener, struct sock *pending);
void vsock_remove_pending(struct sock *listener, struct sock *pending);
void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
void vsock_insert_connected(struct vsock_sock *vsk);
void vsock_remove_bound(struct vsock_sock *vsk);
void vsock_remove_connected(struct vsock_sock *vsk);
struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
struct sockaddr_vm *dst);
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
#endif /* __AF_VSOCK_H__ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,139 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _VMCI_TRANSPORT_H_
#define _VMCI_TRANSPORT_H_
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
#include "vsock_addr.h"
#include "af_vsock.h"
/* If the packet format changes in a release then this should change too. */
#define VMCI_TRANSPORT_PACKET_VERSION 1
/* The resource ID on which control packets are sent. */
#define VMCI_TRANSPORT_PACKET_RID 1
/* Protocol bit flags. NOTE(review): presumably the values carried in the
* proto field of struct vmci_transport_packet; confirm in vmci_transport.c.
*/
#define VSOCK_PROTO_INVALID 0
#define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0)
/* Mask of every protocol flag defined above. */
#define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY)
/* Fetch the VMCI transport state stored in a vsock_sock's private trans
* pointer.
*/
#define vmci_trans(_vsk) ((struct vmci_transport *)((_vsk)->trans))
/* Control packet types. The numeric values are spelled out explicitly so
 * that the numbering cannot shift by accident when entries are edited.
 */
enum vmci_transport_packet_type {
	VMCI_TRANSPORT_PACKET_TYPE_INVALID       = 0,
	VMCI_TRANSPORT_PACKET_TYPE_REQUEST       = 1,
	VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE     = 2,
	VMCI_TRANSPORT_PACKET_TYPE_OFFER         = 3,
	VMCI_TRANSPORT_PACKET_TYPE_ATTACH        = 4,
	VMCI_TRANSPORT_PACKET_TYPE_WROTE         = 5,
	VMCI_TRANSPORT_PACKET_TYPE_READ          = 6,
	VMCI_TRANSPORT_PACKET_TYPE_RST           = 7,
	VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN      = 8,
	VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE = 9,
	VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ  = 10,
	VMCI_TRANSPORT_PACKET_TYPE_REQUEST2      = 11,
	VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2    = 12,
	/* One past the last valid type. */
	VMCI_TRANSPORT_PACKET_TYPE_MAX           = 13
};
/* Queue position that an endpoint is waiting for; carried in the wait member
* of a control packet.
*/
struct vmci_transport_waiting_info {
/* Queue generation; the notify code increments its generation counters
* when a transfer wraps around the end of the queue.
*/
u64 generation;
/* Offset within the queue. */
u64 offset;
};
/* Control packet type for STREAM sockets. DGRAMs have no control packets nor
* special packet header for data packets, they are just raw VMCI DGRAM
* messages. For STREAMs, control packets are sent over the control channel
* while data is written and read directly from queue pairs with no packet
* format.
*/
struct vmci_transport_packet {
/* VMCI datagram header. */
struct vmci_datagram dg;
/* NOTE(review): presumably VMCI_TRANSPORT_PACKET_VERSION; confirm. */
u8 version;
/* NOTE(review): presumably an enum vmci_transport_packet_type value;
* confirm.
*/
u8 type;
u16 proto;
u32 src_port;
u32 dst_port;
u32 _reserved2;
/* Payload. NOTE(review): which member is valid presumably depends on
* type; the notify code reads u.wait from waiting-read and waiting-write
* packets.
*/
union {
u64 size;
u64 mode;
struct vmci_handle handle;
struct vmci_transport_waiting_info wait;
} u;
};
/* Per-socket state of the packet-based notification scheme, reached through
* the PKT_FIELD() accessor in vmci_transport_notify.c.
*/
struct vmci_transport_notify_pkt {
/* Flow-control window and its lower bound; both start at PAGE_SIZE. */
u64 write_notify_window;
u64 write_notify_min_window;
/* Set when the peer reports it is waiting to read or write; cleared once
* the matching WROTE or READ notification has been sent.
*/
bool peer_waiting_read;
bool peer_waiting_write;
/* Ensures the window is shrunk only once per peer wait. */
bool peer_waiting_write_detected;
/* Set after this endpoint has told the peer it is waiting, so the
* waiting packet is not sent again until the peer answers.
*/
bool sent_waiting_read;
bool sent_waiting_write;
/* Copies of the wait information received from the peer. */
struct vmci_transport_waiting_info peer_waiting_read_info;
struct vmci_transport_waiting_info peer_waiting_write_info;
/* Incremented when a transfer wraps around the end of the produce or
* consume queue.
*/
u64 produce_q_generation;
u64 consume_q_generation;
};
/* Reduced notification state. NOTE(review): presumably used by the
* queue-state notification variant (vmci_transport_notify_qstate.c); confirm.
*/
struct vmci_transport_notify_pkt_q_state {
u64 write_notify_window;
u64 write_notify_min_window;
bool peer_waiting_write;
bool peer_waiting_write_detected;
};
/* Notification state of a socket. The two members share storage, so only
* the one matching the notification scheme in use may be accessed.
*/
union vmci_transport_notify {
struct vmci_transport_notify_pkt pkt;
struct vmci_transport_notify_pkt_q_state pkt_q_state;
};
/* Our transport-specific data. It is reached from a vsock_sock through the
* vmci_trans() accessor.
*/
struct vmci_transport {
/* For DGRAMs. */
struct vmci_handle dg_handle;
/* For STREAMs. */
struct vmci_handle qp_handle;
/* Queue pair that carries the stream data. */
struct vmci_qp *qpair;
/* Sizes of the produce and consume queues. */
u64 produce_size;
u64 consume_size;
/* NOTE(review): presumably the values behind the buffer-size socket
* options; confirm in vmci_transport.c.
*/
u64 queue_pair_size;
u64 queue_pair_min_size;
u64 queue_pair_max_size;
/* NOTE(review): presumably VMCI event subscription IDs for peer attach
* and detach; confirm in vmci_transport.c.
*/
u32 attach_sub_id;
u32 detach_sub_id;
/* Notification state and the operations that act on it. */
union vmci_transport_notify notify;
struct vmci_transport_notify_ops *notify_ops;
};
int vmci_transport_register(void);
void vmci_transport_unregister(void);
int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
struct sockaddr_vm *src);
int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
struct sockaddr_vm *src);
int vmci_transport_send_wrote(struct sock *sk);
int vmci_transport_send_read(struct sock *sk);
int vmci_transport_send_waiting_write(struct sock *sk,
struct vmci_transport_waiting_info *wait);
int vmci_transport_send_waiting_read(struct sock *sk,
struct vmci_transport_waiting_info *wait);
#endif

View File

@ -0,0 +1,680 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>
#include "vmci_transport_notify.h"
/* Access a field of the packet-based notification state of a socket. */
#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
/* Decide whether a READ notification should be sent to a peer that has
* reported it is waiting to write.
*
* Returns false when the peer is not waiting to write; otherwise returns
* whether the free space in the consume queue exceeds the notify limit.
* Always returns true when VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined.
*
* With VSOCK_OPTIMIZATION_FLOW_CONTROL this also updates state: the first
* call after the peer starts waiting shrinks write_notify_window, and a
* positive result clears peer_waiting_write_detected.
*/
static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
bool retval;
u64 notify_limit;
if (!PKT_FIELD(vsk, peer_waiting_write))
return false;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
/* When the sender blocks, we take that as a sign that the sender is
* faster than the receiver. To reduce the transmit rate of the sender,
* we delay the sending of the read notification by decreasing the
* write_notify_window. The notification is delayed until the number of
* bytes used in the queue drops below the write_notify_window.
*/
if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
PKT_FIELD(vsk, peer_waiting_write_detected) = true;
/* Shrink the window by one page, never below the minimum window. */
if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
PKT_FIELD(vsk, write_notify_window) =
PKT_FIELD(vsk, write_notify_min_window);
} else {
PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
if (PKT_FIELD(vsk, write_notify_window) <
PKT_FIELD(vsk, write_notify_min_window))
PKT_FIELD(vsk, write_notify_window) =
PKT_FIELD(vsk, write_notify_min_window);
}
}
notify_limit = vmci_trans(vsk)->consume_size -
PKT_FIELD(vsk, write_notify_window);
#else
notify_limit = 0;
#endif
/* For now we ignore the wait information and just see if the free
* space exceeds the notify limit. Note that improving this function
* to be more intelligent will not require a protocol change and will
* retain compatibility between endpoints with mixed versions of this
* function.
*
* The notify_limit is used to delay notifications in the case where
* flow control is enabled. Below, the test is expressed in terms of
* free space in the queue:
*   if free_space > consume_size - write_notify_window then notify.
* An alternate way of expressing this is to use the data ready in the
* receive queue:
*   if write_notify_window > buffer_ready then notify,
* since free_space == consume_size - buffer_ready.
*/
retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
notify_limit;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
if (retval) {
/*
* Once we notify the peer, we reset the detected flag so the
* next wait will again cause a decrease in the window size.
*/
PKT_FIELD(vsk, peer_waiting_write_detected) = false;
}
#endif
return retval;
#else
return true;
#endif
}
/* Decide whether a WROTE notification should be sent to the peer.
 *
 * Returns true when the peer has reported it is waiting to read and the
 * produce queue holds data for it. Always returns true when
 * VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined.
 */
static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if !defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	return true;
#else
	/* The wait information sent by the peer is ignored for now; any data
	 * at all in the produce queue is enough. Making this smarter needs no
	 * protocol change and stays compatible with endpoints that run a
	 * different version of this function.
	 */
	if (PKT_FIELD(vsk, peer_waiting_read))
		return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;

	return false;
#endif
}
/* Handle a WAITING_READ control packet: remember that the peer is waiting
 * to read, keep its wait information, and send a WROTE notification right
 * away if data is already available. bottom_half picks the send variant
 * addressed by dst/src instead of the one that takes the socket.
 */
static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);
	bool notified;

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (!vmci_transport_notify_waiting_read(vsk))
		return;

	notified = bottom_half ?
		vmci_transport_send_wrote_bh(dst, src) > 0 :
		vmci_transport_send_wrote(sk) > 0;

	/* The peer stops counting as waiting once it has been told. */
	if (notified)
		PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
}
/* Handle a WAITING_WRITE control packet: remember that the peer is waiting
 * to write, keep its wait information, and send a READ notification right
 * away if vmci_transport_notify_waiting_write() asks for one. bottom_half
 * picks the send variant addressed by dst/src instead of the one that takes
 * the socket.
 */
static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);
	bool notified;

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (!vmci_transport_notify_waiting_write(vsk))
		return;

	notified = bottom_half ?
		vmci_transport_send_read_bh(dst, src) > 0 :
		vmci_transport_send_read(sk) > 0;

	/* The peer stops counting as waiting once it has been told. */
	if (notified)
		PKT_FIELD(vsk, peer_waiting_write) = false;
#endif
}
/* Handle a READ control packet: allow a new WAITING_WRITE to be sent later
 * and invoke the socket's write-space callback. pkt, bottom_half, dst and
 * src are not used.
 */
static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

	sk->sk_write_space(sk);
}
/* Tell the peer that this endpoint is waiting to read room_needed bytes.
*
* Returns true if a WAITING_READ packet was sent now, or had already been
* sent and not yet answered; false if the send failed. Always returns true
* when VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined.
*/
static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
struct vsock_sock *vsk;
struct vmci_transport_waiting_info waiting_info;
u64 tail;
u64 head;
u64 room_left;
bool ret;
vsk = vsock_sk(sk);
/* A waiting packet is already outstanding; do not send another. */
if (PKT_FIELD(vsk, sent_waiting_read))
return true;
/* Grow the notify window by one page, capped at the consume queue size. */
if (PKT_FIELD(vsk, write_notify_window) <
vmci_trans(vsk)->consume_size)
PKT_FIELD(vsk, write_notify_window) =
min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
vmci_trans(vsk)->consume_size);
vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
/* Bytes between the consume head and the end of the queue. */
room_left = vmci_trans(vsk)->consume_size - head;
if (room_needed >= room_left) {
/* The awaited position lies past the end of the queue, so it wraps
* into the next generation.
*/
waiting_info.offset = room_needed - room_left;
waiting_info.generation =
PKT_FIELD(vsk, consume_q_generation) + 1;
} else {
waiting_info.offset = head + room_needed;
waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
}
ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
if (ret)
PKT_FIELD(vsk, sent_waiting_read) = true;
return ret;
#else
return true;
#endif
}
/* Tell the peer that this endpoint is waiting for room_needed bytes of
* space to write.
*
* Returns true if a WAITING_WRITE packet was sent now, or had already been
* sent and not yet answered; false if the send failed. Always returns true
* when VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined.
*/
static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
struct vsock_sock *vsk;
struct vmci_transport_waiting_info waiting_info;
u64 tail;
u64 head;
u64 room_left;
bool ret;
vsk = vsock_sk(sk);
/* A waiting packet is already outstanding; do not send another. */
if (PKT_FIELD(vsk, sent_waiting_write))
return true;
vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
/* Bytes between the produce tail and the end of the queue. */
room_left = vmci_trans(vsk)->produce_size - tail;
/* NOTE(review): the generation used here is one lower than the
* corresponding branch of send_waiting_read(); presumably because the
* awaited position is one the peer's reader must reach. Confirm.
*/
if (room_needed + 1 >= room_left) {
/* Wraps around to current generation. */
waiting_info.offset = room_needed + 1 - room_left;
waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
} else {
waiting_info.offset = tail + room_needed + 1;
waiting_info.generation =
PKT_FIELD(vsk, produce_q_generation) - 1;
}
ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
if (ret)
PKT_FIELD(vsk, sent_waiting_write) = true;
return ret;
#else
return true;
#endif
}
/* Send a READ notification to the peer if
 * vmci_transport_notify_waiting_write() asks for one.
 *
 * The send is retried up to VMCI_TRANSPORT_MAX_DGRAM_RESENDS times and is
 * abandoned early if the peer has shut down its receive side.
 *
 * Returns 0 if no notification was needed, otherwise the result of the last
 * vmci_transport_send_read() call (negative on failure).
 */
static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	bool sent_read = false;
	unsigned int retries = 0;
	int err = 0;

	if (!vmci_transport_notify_waiting_write(vsk))
		return err;

	/* Notify the peer that we have read, retrying the send on failure up
	 * to our maximum value. XXX For now we just log the failure, but later
	 * we should schedule a work item to handle the resend until it
	 * succeeds. That would require keeping track of work items in the vsk
	 * and cleaning them up upon socket close.
	 */
	while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
	       !sent_read &&
	       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
		err = vmci_transport_send_read(sk);
		if (err >= 0)
			sent_read = true;

		retries++;
	}

	/* Hitting the retry limit is a failure only if the final attempt did
	 * not get through. A send that succeeds on the last permitted try must
	 * not be logged as an error and must still clear peer_waiting_write.
	 */
	if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read) {
		pr_err("%p unable to send read notify to peer\n", sk);
	} else {
		/* Braces keep the else branch well-formed even when the
		 * optimization below is compiled out.
		 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		PKT_FIELD(vsk, peer_waiting_write) = false;
#endif
	}

	return err;
}
/* Handle a WROTE control packet: allow a new WAITING_READ to be sent later
 * and invoke the socket's data-ready callback. pkt, bottom_half, dst and src
 * are not used.
 */
static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_read) = false;
#endif

	sk->sk_data_ready(sk, 0);
}
/* Reset the packet-based notification state of a socket: both notify
 * windows start at one page, every flag is cleared, and the generation
 * counters and stored peer wait information are zeroed.
 */
static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	/* Flow-control windows. */
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;

	/* What the peer has told us. */
	PKT_FIELD(vsk, peer_waiting_read) = false;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
	memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	/* What we have told the peer. */
	PKT_FIELD(vsk, sent_waiting_read) = false;
	PKT_FIELD(vsk, sent_waiting_write) = false;

	/* Queue wrap-around counters. */
	PKT_FIELD(vsk, produce_q_generation) = 0;
	PKT_FIELD(vsk, consume_q_generation) = 0;
}
/* Socket teardown hook for the packet-based notification scheme. It does
* nothing; the body is intentionally empty.
*/
static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}
/* Poll helper for readability.
 *
 * Sets *data_ready_now to whether vsock_stream_has_data() reports a non-zero
 * amount. When nothing is queued and the socket is connected, the peer is
 * asked to notify us once there is something to read.
 *
 * Returns 0 on success, or -1 if that request could not be sent (in which
 * case *data_ready_now is left untouched). target is not used.
 */
static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk)) {
		*data_ready_now = true;
		return 0;
	}

	/* Nothing in the queue right now: ask for a notification when data
	 * arrives.
	 */
	if (sk->sk_state == SS_CONNECTED && !send_waiting_read(sk, 1))
		return -1;

	*data_ready_now = false;
	return 0;
}
/* Poll helper for writability.
 *
 * Sets *space_avail_now to true when the produce queue has free space, and
 * to false when it is full. When vsock_stream_has_space() reports a negative
 * value, *space_avail_now is left untouched.
 *
 * Returns 0 on success, or -1 if the queue is full and the waiting-write
 * request could not be sent. target is not used.
 */
static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 free_space = vsock_stream_has_space(vsock_sk(sk));

	if (free_space > 0) {
		*space_avail_now = true;
	} else if (free_space == 0) {
		/* This is a connected socket that cannot currently send.
		 * Tell the peer we are waiting, but only because the queue is
		 * completely full: doing so in any other state ends in an
		 * endless WAITING_WRITE, READ, WAITING_WRITE, READ, ...
		 * exchange. A failure to send the notification is treated as
		 * a socket error and passed back through the mask.
		 */
		if (!send_waiting_write(sk, 1))
			return -1;

		*space_avail_now = false;
	}

	return 0;
}
/* Prepare the notify data for a receive operation that wants at least
* target bytes.
*
* Clears the saved queue indexes. With flow control enabled it also raises
* write_notify_min_window to target + 1 when that is larger, and sets
* data->notify_on_block if the current window had to be raised as well.
* Always returns 0.
*/
static int
vmci_transport_notify_pkt_recv_init(
struct sock *sk,
size_t target,
struct vmci_transport_recv_notify_data *data)
{
struct vsock_sock *vsk = vsock_sk(sk);
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
data->consume_head = 0;
data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
data->notify_on_block = false;
if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
PKT_FIELD(vsk, write_notify_min_window) = target + 1;
if (PKT_FIELD(vsk, write_notify_window) <
PKT_FIELD(vsk, write_notify_min_window)) {
/* If the current window is smaller than the new
* minimal window size, we need to reevaluate whether
* we need to notify the sender. If the number of ready
* bytes are smaller than the new window, we need to
* send a notification to the sender before we block.
*/
PKT_FIELD(vsk, write_notify_window) =
PKT_FIELD(vsk, write_notify_min_window);
data->notify_on_block = true;
}
}
#endif
#endif
return 0;
}
/* Hook run before a receiver blocks.
 *
 * Tells the peer that we are waiting for target bytes. With flow control
 * enabled it also sends the READ notification requested by recv_init and
 * clears that request once the send succeeds.
 *
 * Returns -EHOSTUNREACH if the waiting-read request could not be sent, the
 * result of the read notification when one was attempted, and 0 otherwise.
 */
static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	/* Let the peer know we are about to wait for data. */
	if (!send_waiting_read(sk, target))
		return -EHOSTUNREACH;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		int rc = vmci_transport_send_read_notification(sk);

		if (rc < 0)
			return rc;

		data->notify_on_block = false;
		return rc;
	}
#endif

	return 0;
}
/* Hook run just before data is dequeued.
 *
 * Snapshots the consume queue indexes into data so that the post-dequeue
 * hook can detect a wrap-around. The socket lock is held by the caller, so
 * at least the bytes that are ready should get copied. Always returns 0.
 * target is not used.
 */
static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}
/* Hook run after data has been dequeued.
 *
 * When data was actually read, bumps the consume queue generation if the
 * read reached the end of the queue, then sends a READ notification to the
 * peer if one is due.
 *
 * Returns 0 when nothing was read, otherwise the result of
 * vmci_transport_send_read_notification(). target is not used.
 */
static int
vmci_transport_notify_pkt_recv_post_dequeue(
				struct sock *sk,
				size_t target,
				ssize_t copied,
				bool data_read,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (!data_read)
		return 0;

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* A copy that reaches the end of the queue wraps around, which
	 * starts a new generation. This is safe because the socket lock is
	 * held across the two queue pair operations.
	 */
	if (copied >=
	    vmci_trans(vsk)->consume_size - data->consume_head)
		PKT_FIELD(vsk, consume_q_generation)++;
#endif

	return vmci_transport_send_read_notification(sk);
}
/* Reset the per-send notify scratch data.
 *
 * With the waiting-notify optimization compiled in, both recorded queue
 * indexes in @data are cleared; otherwise this is a no-op.  Always
 * returns 0.
 */
static int
vmci_transport_notify_pkt_send_init(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	data->produce_tail = 0;
	data->consume_head = 0;
#endif

	return 0;
}
/* Hook run before a sender blocks waiting for queue space.
 *
 * Tells the peer that we are waiting for room to write (requesting room
 * for 1 byte).  Returns 0 on success or -EHOSTUNREACH when the
 * notification could not be sent.
 */
static int
vmci_transport_notify_pkt_send_pre_block(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
	return send_waiting_write(sk, 1) ? 0 : -EHOSTUNREACH;
}
/* Hook run immediately before data is enqueued on the produce queue.
 *
 * When the waiting-notify optimization is compiled in, the queue pair's
 * current produce indexes are stored in @data->produce_tail and
 * @data->consume_head so that the post-enqueue hook can detect a
 * wrap-around of the queue.  Always returns 0.
 */
static int
vmci_transport_notify_pkt_send_pre_enqueue(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}
/* Hook run after @written bytes have been enqueued on the produce queue.
 *
 * Bumps the produce queue generation counter when the write reached or
 * passed the end of the queue, and, if the peer is waiting to read, sends
 * it a WROTE notification, retrying up to
 * VMCI_TRANSPORT_MAX_DGRAM_RESENDS times.
 *
 * Returns the result of the last send attempt (negative on failure), or 0
 * when no notification was needed.
 */
static int
vmci_transport_notify_pkt_send_post_enqueue(
	struct sock *sk,
	ssize_t written,
	struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	int retries = 0;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* Detect a wrap-around to maintain queue generation.  Note that this
	 * is safe since we hold the socket lock across the two queue pair
	 * operations.
	 */
	if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
		PKT_FIELD(vsk, produce_q_generation)++;
#endif

	if (vmci_transport_notify_waiting_read(vsk)) {
		/* Notify the peer that we have written, retrying the send on
		 * failure up to our maximum value.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}

		/* Only treat this as a failure when every attempt failed.  A
		 * send that succeeds on the final attempt also leaves
		 * retries == VMCI_TRANSPORT_MAX_DGRAM_RESENDS, so the retry
		 * count alone is not enough to tell the two cases apart.
		 */
		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS &&
		    !sent_wrote) {
			pr_err("%p unable to send wrote notify to peer\n", sk);
			return err;
		}

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		/* The peer has been told (or has shut down); it is no longer
		 * considered to be waiting for a read.
		 */
		PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
	}

	return err;
}
/* Dispatch a notification control packet to its handler.
 *
 * WROTE, READ, WAITING_WRITE and WAITING_READ packets are handled here;
 * any other packet type is left untouched.  If @pkt_processed is non-NULL
 * it is set to true when the packet was one of the handled types and to
 * false otherwise.
 */
static void
vmci_transport_notify_pkt_handle_pkt(
	struct sock *sk,
	struct vmci_transport_packet *pkt,
	bool bottom_half,
	struct sockaddr_vm *dst,
	struct sockaddr_vm *src, bool *pkt_processed)
{
	bool handled = true;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
		vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
						    dst, src);
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
		vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
						   dst, src);
		break;
	default:
		/* Not a notification packet this protocol knows about. */
		handled = false;
		break;
	}

	if (pkt_processed)
		*pkt_processed = handled;
}
/* Initialise the write-notify windows from the consume queue size.
 *
 * The minimum window is capped at the consume queue size and the current
 * window is set to the full consume queue size.
 */
static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* The minimum window may never exceed the size of the queue. */
	if (PKT_FIELD(vsk, write_notify_min_window) >
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
}
/* Initialise the write-notify windows from the consume queue size.
 *
 * Performs the same window setup as the process_request hook: the minimum
 * window is capped at the consume queue size and the current window is set
 * to the full consume queue size.
 */
static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* The minimum window may never exceed the size of the queue. */
	if (PKT_FIELD(vsk, write_notify_min_window) >
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
}
/* Socket control packet based operations. */
struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
vmci_transport_notify_pkt_socket_init,
vmci_transport_notify_pkt_socket_destruct,
vmci_transport_notify_pkt_poll_in,
vmci_transport_notify_pkt_poll_out,
vmci_transport_notify_pkt_handle_pkt,
vmci_transport_notify_pkt_recv_init,
vmci_transport_notify_pkt_recv_pre_block,
vmci_transport_notify_pkt_recv_pre_dequeue,
vmci_transport_notify_pkt_recv_post_dequeue,
vmci_transport_notify_pkt_send_init,
vmci_transport_notify_pkt_send_pre_block,
vmci_transport_notify_pkt_send_pre_enqueue,
vmci_transport_notify_pkt_send_post_enqueue,
vmci_transport_notify_pkt_process_request,
vmci_transport_notify_pkt_process_negotiate,
};

View File

@ -0,0 +1,83 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef __VMCI_TRANSPORT_NOTIFY_H__
#define __VMCI_TRANSPORT_NOTIFY_H__
#include <linux/types.h>
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
#include <linux/vm_sockets.h>
#include "vmci_transport.h"
/* Comment this out to compare with old protocol. */
#define VSOCK_OPTIMIZATION_WAITING_NOTIFY 1
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
/* Comment this out to remove flow control for "new" protocol */
#define VSOCK_OPTIMIZATION_FLOW_CONTROL 1
#endif
#define VMCI_TRANSPORT_MAX_DGRAM_RESENDS 10
/* Scratch state carried through the hooks of one receive operation
 * (recv_init, recv_pre_block, recv_pre_dequeue, recv_post_dequeue).
 */
struct vmci_transport_recv_notify_data {
/* Consume queue head index recorded by the pre-dequeue hook. */
u64 consume_head;
/* Consume queue tail index recorded by the pre-dequeue hook. */
u64 produce_tail;
/* Set by recv_init when the write-notify window had to be raised; tells
 * recv_pre_block to send a read notification before blocking.
 */
bool notify_on_block;
};
/* Scratch state carried through the hooks of one send operation
 * (send_init, send_pre_block, send_pre_enqueue, send_post_enqueue).
 */
struct vmci_transport_send_notify_data {
/* Produce queue head index recorded by the pre-enqueue hook. */
u64 consume_head;
/* Produce queue tail index recorded by the pre-enqueue hook. */
u64 produce_tail;
};
/* Socket notification callbacks.
 *
 * One table is provided per notification protocol.  The int-returning
 * hooks return 0 (or another non-negative value) on success and a negative
 * errno on failure.  NOTE(review): the call sites are outside this header;
 * the hook names suggest they bracket the blocking and queue operations of
 * the stream send/receive paths - confirm against the transport code.
 *
 * The member order is relied upon by positional initializers of this
 * struct, so do not reorder members without updating them.
 */
struct vmci_transport_notify_ops {
/* Set up / tear down the per-socket notify state. */
void (*socket_init) (struct sock *sk);
void (*socket_destruct) (struct vsock_sock *vsk);
/* Report through the bool out-parameter whether data is ready to read /
 * space is available to write right now.
 */
int (*poll_in) (struct sock *sk, size_t target,
bool *data_ready_now);
int (*poll_out) (struct sock *sk, size_t target,
bool *space_avail_now);
/* Handle a notification control packet; *pkt_processed (if non-NULL)
 * reports whether the packet type was recognised.
 */
void (*handle_notify_pkt) (struct sock *sk,
struct vmci_transport_packet *pkt,
bool bottom_half, struct sockaddr_vm *dst,
struct sockaddr_vm *src,
bool *pkt_processed);
/* Receive-side hooks; all share one recv notify data instance. */
int (*recv_init) (struct sock *sk, size_t target,
struct vmci_transport_recv_notify_data *data);
int (*recv_pre_block) (struct sock *sk, size_t target,
struct vmci_transport_recv_notify_data *data);
int (*recv_pre_dequeue) (struct sock *sk, size_t target,
struct vmci_transport_recv_notify_data *data);
int (*recv_post_dequeue) (struct sock *sk, size_t target,
ssize_t copied, bool data_read,
struct vmci_transport_recv_notify_data *data);
/* Send-side hooks; all share one send notify data instance. */
int (*send_init) (struct sock *sk,
struct vmci_transport_send_notify_data *data);
int (*send_pre_block) (struct sock *sk,
struct vmci_transport_send_notify_data *data);
int (*send_pre_enqueue) (struct sock *sk,
struct vmci_transport_send_notify_data *data);
int (*send_post_enqueue) (struct sock *sk, ssize_t written,
struct vmci_transport_send_notify_data *data);
/* Initialise the write-notify windows from the consume queue size.
 * NOTE(review): presumably run while handling connection request and
 * negotiate packets respectively - confirm against the callers.
 */
void (*process_request) (struct sock *sk);
void (*process_negotiate) (struct sock *sk);
};
/* The two available notification protocol implementations.  The packet
 * based table handles WROTE, READ, WAITING_WRITE and WAITING_READ control
 * packets; the queue-state table handles only WROTE and READ.
 */
extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops;
extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops;
#endif /* __VMCI_TRANSPORT_NOTIFY_H__ */

View File

@ -0,0 +1,438 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>
#include "vmci_transport_notify.h"
/* Access one field of the queue-state notify bookkeeping of @vsk, i.e.
 * the notify.pkt_q_state member reached through vmci_trans(vsk).
 */
#define PKT_FIELD(vsk, field_name) \
(vmci_trans(vsk)->notify.pkt_q_state.field_name)
/* Decide whether the peer should be sent a read notification now.
 *
 * Returns false immediately when the peer is not flagged as waiting to
 * write.  Otherwise the write-notify window is shrunk once per wait (by
 * PAGE_SIZE, but never below the minimum window) and the function returns
 * true when the free space in the consume queue exceeds
 * consume_size - write_notify_window.
 */
static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
	u64 free_space_threshold;
	bool should_notify;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

	/* A blocked sender is taken as a sign that it is faster than the
	 * receiver.  To slow it down, the read notification is delayed by
	 * decreasing write_notify_window: the notification is held back until
	 * the number of bytes used in the queue drops below that window.
	 * The window is decreased only once per detected wait.
	 */
	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;

		if (PKT_FIELD(vsk, write_notify_window) >= PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
					PKT_FIELD(vsk, write_notify_min_window);
		} else {
			/* Less than a page left: fall back to the minimum. */
			PKT_FIELD(vsk, write_notify_window) =
				PKT_FIELD(vsk, write_notify_min_window);
		}
	}

	/* The threshold delays notifications when flow control is enabled.
	 * The test below is expressed in terms of free space in the queue:
	 * notify if free_space > consume_size - write_notify_window.
	 * Equivalently, in terms of the data ready in the receive queue:
	 * notify if write_notify_window > buffer_ready, because
	 * free_space == consume_size - buffer_ready.
	 */
	free_space_threshold = vmci_trans(vsk)->consume_size -
			       PKT_FIELD(vsk, write_notify_window);

	should_notify =
		vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		free_space_threshold;

	/* Once the peer is notified, clear the detected flag so that the
	 * next wait shrinks the window again.
	 */
	if (should_notify)
		PKT_FIELD(vsk, peer_waiting_write_detected) = false;

	return should_notify;
}
/* Handle a READ notification from the peer by invoking the socket's
 * sk_write_space callback.  The packet and address arguments are not used.
 */
static void vmci_transport_handle_read(struct sock *sk,
				       struct vmci_transport_packet *pkt,
				       bool bottom_half,
				       struct sockaddr_vm *dst,
				       struct sockaddr_vm *src)
{
	sk->sk_write_space(sk);
}
/* Handle a WROTE notification from the peer by invoking the socket's
 * sk_data_ready callback with a length of 0.  The packet and address
 * arguments are not used.
 */
static void vmci_transport_handle_wrote(struct sock *sk,
					struct vmci_transport_packet *pkt,
					bool bottom_half,
					struct sockaddr_vm *dst,
					struct sockaddr_vm *src)
{
	sk->sk_data_ready(sk, 0);
}
/* Grow the write-notify window by one page, capped at the consume queue
 * size.  Does nothing when the window already covers the whole queue.
 */
static void vsock_block_update_write_window(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, write_notify_window) >=
	    vmci_trans(vsk)->consume_size)
		return;

	PKT_FIELD(vsk, write_notify_window) =
		min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
		    vmci_trans(vsk)->consume_size);
}
/* Send a READ notification to the peer if one is currently due.
 *
 * Whether a notification is due is decided by
 * vmci_transport_notify_waiting_write().  The send is retried on failure
 * up to VMCI_TRANSPORT_MAX_DGRAM_RESENDS times, and is not attempted once
 * the peer has shut down its receive side.
 *
 * Returns 0 when no notification was due, otherwise the result of the
 * last send attempt.  A total failure is only logged; the original author
 * notes that a work item should eventually take over the resend.
 */
static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	unsigned int attempts = 0;
	bool delivered = false;
	int err = 0;

	if (!vmci_transport_notify_waiting_write(vsk))
		return err;

	while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
	       !delivered &&
	       attempts < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
		err = vmci_transport_send_read(sk);
		delivered = err >= 0;
		attempts++;
	}

	if (!delivered && attempts >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
		pr_err("%p unable to send read notification to peer\n",
		       sk);
	else
		/* Delivered, or abandoned because the peer shut down. */
		PKT_FIELD(vsk, peer_waiting_write) = false;

	return err;
}
/* Put the queue-state notify bookkeeping of a new socket into its initial
 * state: both write-notify windows at one page and no peer wait recorded.
 */
static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
}
/* Return the queue-state notify bookkeeping of @vsk to the same values
 * socket_init establishes: both write-notify windows at one page and no
 * peer wait recorded.
 */
static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
}
/* Poll hook for readability.
 *
 * Sets *@data_ready_now to true when the stream has data.  Otherwise, on a
 * connected socket, the write-notify window is grown before
 * *@data_ready_now is set to false.  Always returns 0.
 */
static int vmci_transport_notify_pkt_poll_in(struct sock *sk,
					     size_t target,
					     bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk)) {
		*data_ready_now = true;
		return 0;
	}

	/* Nothing is queued, so we cannot read right now.  Widen the window
	 * so that we are notified once there is something to read.
	 */
	if (sk->sk_state == SS_CONNECTED)
		vsock_block_update_write_window(sk);

	*data_ready_now = false;
	return 0;
}
/* Poll hook for writability.
 *
 * Sets *@space_avail_now to true when vsock_stream_has_space() reports a
 * positive amount of free space in the produce queue, and to false
 * otherwise.  Always returns 0.
 *
 * The output is written on every path.  Previously a negative result from
 * vsock_stream_has_space() (presumably a queue pair error - confirm) left
 * *@space_avail_now untouched, so a caller that had not pre-initialised it
 * would read an indeterminate value.
 */
static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);

	/* Zero means a connected socket that cannot currently send data;
	 * a negative value is likewise reported as "no space".
	 */
	*space_avail_now = produce_q_free_space > 0;

	return 0;
}
/* Prepare the notify state for a receive that wants at least @target
 * bytes.
 *
 * Clears @data, then makes sure the minimum write-notify window is at
 * least @target + 1.  If the current window had to be raised to the new
 * minimum, @data->notify_on_block is set so that a read notification is
 * sent before the receiver blocks.  Always returns 0.
 */
static int
vmci_transport_notify_pkt_recv_init(
	struct sock *sk,
	size_t target,
	struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	data->notify_on_block = false;
	data->produce_tail = 0;
	data->consume_head = 0;

	/* The minimum window is already large enough for this target. */
	if (PKT_FIELD(vsk, write_notify_min_window) >= target + 1)
		return 0;

	PKT_FIELD(vsk, write_notify_min_window) = target + 1;

	if (PKT_FIELD(vsk, write_notify_window) >=
	    PKT_FIELD(vsk, write_notify_min_window))
		return 0;

	/* The current window is smaller than the new minimum, so whether the
	 * sender needs a notification must be re-evaluated: if fewer bytes
	 * are ready than the new window, the sender has to be notified
	 * before we block.
	 */
	PKT_FIELD(vsk, write_notify_window) =
		PKT_FIELD(vsk, write_notify_min_window);
	data->notify_on_block = true;

	return 0;
}
/* Hook run before a receiver blocks waiting for data.
 *
 * Grows the write-notify window and, when @data->notify_on_block is set,
 * sends the pending read notification and clears the flag.
 *
 * Returns 0 when nothing had to be sent, otherwise the result of
 * vmci_transport_send_read_notification() (negative on failure, in which
 * case the flag stays set).
 */
static int
vmci_transport_notify_pkt_recv_pre_block(
	struct sock *sk,
	size_t target,
	struct vmci_transport_recv_notify_data *data)
{
	int rc;

	vsock_block_update_write_window(sk);

	if (!data->notify_on_block)
		return 0;

	rc = vmci_transport_send_read_notification(sk);
	if (rc < 0)
		return rc;

	data->notify_on_block = false;
	return rc;
}
/* Hook run after a dequeue attempt on the consume queue.
 *
 * Nothing happens unless @data_read is true.  In that case the peer is
 * flagged as waiting to write when the queue's free space now equals
 * @copied, a read notification is sent if one is due, and finally the
 * socket's sk_data_ready callback is invoked.
 *
 * Returns 0 when no data was read, a negative error if the read
 * notification failed (sk_data_ready is then skipped), otherwise the
 * non-negative result of the notification.
 */
static int
vmci_transport_notify_pkt_recv_post_dequeue(
	struct sock *sk,
	size_t target,
	ssize_t copied,
	bool data_read,
	struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	u64 free_space;
	int err;

	if (!data_read)
		return 0;

	/* Full barrier before sampling the queue state. */
	smp_mb();

	free_space = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair);

	/* If the only free space is what we just consumed, the queue was
	 * full before this read and the peer may be blocked on it.
	 */
	if (free_space == copied)
		PKT_FIELD(vsk, peer_waiting_write) = true;

	err = vmci_transport_send_read_notification(sk);
	if (err < 0)
		return err;

	/* NOTE(review): the original comment here defers to a comment in
	 * vmci_transport_notify_pkt_send_post_enqueue() for the rationale,
	 * but none is present there - confirm why the local reader is woken.
	 */
	sk->sk_data_ready(sk, 0);

	return err;
}
/* Reset the per-send notify scratch data by clearing both recorded queue
 * indexes in @data.  Always returns 0.
 */
static int
vmci_transport_notify_pkt_send_init(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
	data->produce_tail = 0;
	data->consume_head = 0;

	return 0;
}
/* Hook run after @written bytes have been enqueued on the produce queue.
 *
 * If the queue now holds exactly @written bytes, i.e. it was empty before
 * this write, the peer is sent a WROTE notification, retrying on failure
 * up to VMCI_TRANSPORT_MAX_DGRAM_RESENDS times and giving up once the peer
 * has shut down its receive side.
 *
 * Returns 0 when no notification was needed, otherwise the result of the
 * last send attempt.  A total failure is additionally logged.
 */
static int
vmci_transport_notify_pkt_send_post_enqueue(
	struct sock *sk,
	ssize_t written,
	struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	bool delivered = false;
	int attempts = 0;
	int err = 0;

	/* Full barrier before sampling the queue state. */
	smp_mb();

	if (vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written) {
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !delivered &&
		       attempts < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			delivered = err >= 0;
			attempts++;
		}
	}

	if (!delivered && attempts >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
		pr_err("%p unable to send wrote notification to peer\n",
		       sk);

	return err;
}
/* Dispatch a notification control packet to its handler.
 *
 * Only WROTE and READ packets are handled by the queue-state protocol; any
 * other packet type is left untouched.  If @pkt_processed is non-NULL it
 * is set to true when the packet was one of the handled types and to false
 * otherwise.
 */
static void
vmci_transport_notify_pkt_handle_pkt(
	struct sock *sk,
	struct vmci_transport_packet *pkt,
	bool bottom_half,
	struct sockaddr_vm *dst,
	struct sockaddr_vm *src, bool *pkt_processed)
{
	bool handled = true;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		break;
	default:
		/* Not a notification packet this protocol knows about. */
		handled = false;
		break;
	}

	if (pkt_processed)
		*pkt_processed = handled;
}
/* Initialise the write-notify windows from the consume queue size.
 *
 * The minimum window is capped at the consume queue size and the current
 * window is set to the full consume queue size.
 */
static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* The minimum window may never exceed the size of the queue. */
	if (PKT_FIELD(vsk, write_notify_min_window) >
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
}
/* Initialise the write-notify windows from the consume queue size.
 *
 * Performs the same window setup as the process_request hook: the minimum
 * window is capped at the consume queue size and the current window is set
 * to the full consume queue size.
 */
static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* The minimum window may never exceed the size of the queue. */
	if (PKT_FIELD(vsk, write_notify_min_window) >
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
}
/* Pre-dequeue hook.  The queue-state protocol has nothing to record
 * before a dequeue, so this only exists to fill the callback table.
 * Always returns 0.
 */
static int vmci_transport_notify_pkt_recv_pre_dequeue(
	struct sock *sk,
	size_t target,
	struct vmci_transport_recv_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
/* Pre-block hook for senders.  The queue-state protocol sends nothing
 * before a sender blocks, so this only exists to fill the callback table.
 * Always returns 0.
 */
static int vmci_transport_notify_pkt_send_pre_block(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
/* Pre-enqueue hook.  The queue-state protocol has nothing to record
 * before an enqueue, so this only exists to fill the callback table.
 * Always returns 0.
 */
static int vmci_transport_notify_pkt_send_pre_enqueue(
	struct sock *sk,
	struct vmci_transport_send_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
/* Socket always on control packet based operations. */
struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
vmci_transport_notify_pkt_socket_init,
vmci_transport_notify_pkt_socket_destruct,
vmci_transport_notify_pkt_poll_in,
vmci_transport_notify_pkt_poll_out,
vmci_transport_notify_pkt_handle_pkt,
vmci_transport_notify_pkt_recv_init,
vmci_transport_notify_pkt_recv_pre_block,
vmci_transport_notify_pkt_recv_pre_dequeue,
vmci_transport_notify_pkt_recv_post_dequeue,
vmci_transport_notify_pkt_send_init,
vmci_transport_notify_pkt_send_pre_block,
vmci_transport_notify_pkt_send_pre_enqueue,
vmci_transport_notify_pkt_send_post_enqueue,
vmci_transport_notify_pkt_process_request,
vmci_transport_notify_pkt_process_negotiate,
};

View File

@ -0,0 +1,86 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2007-2012 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>
#include "vsock_addr.h"
/* Initialise @addr as an AF_VSOCK address with the given @cid and @port.
 * The whole structure is zeroed first, so every other byte of it ends up
 * as zero.
 */
void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port)
{
	memset(addr, 0, sizeof(*addr));

	addr->svm_port = port;
	addr->svm_cid = cid;
	addr->svm_family = AF_VSOCK;
}
EXPORT_SYMBOL_GPL(vsock_addr_init);
/* Check that @addr looks like a valid vsock address.
 *
 * Returns 0 if it does, -EFAULT if @addr is NULL, -EAFNOSUPPORT if the
 * family is not AF_VSOCK, or -EINVAL if the first byte of svm_zero is
 * non-zero (only that first byte is inspected).
 */
int vsock_addr_validate(const struct sockaddr_vm *addr)
{
	int status = 0;

	if (!addr)
		status = -EFAULT;
	else if (addr->svm_family != AF_VSOCK)
		status = -EAFNOSUPPORT;
	else if (addr->svm_zero[0] != 0)
		status = -EINVAL;

	return status;
}
EXPORT_SYMBOL_GPL(vsock_addr_validate);
bool vsock_addr_bound(const struct sockaddr_vm *addr)
{
return addr->svm_port != VMADDR_PORT_ANY;
}
EXPORT_SYMBOL_GPL(vsock_addr_bound);
void vsock_addr_unbind(struct sockaddr_vm *addr)
{
vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
}
EXPORT_SYMBOL_GPL(vsock_addr_unbind);
bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other)
{
return addr->svm_cid == other->svm_cid &&
addr->svm_port == other->svm_port;
}
EXPORT_SYMBOL_GPL(vsock_addr_equals_addr);
bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other)
{
return (addr->svm_cid == VMADDR_CID_ANY ||
other->svm_cid == VMADDR_CID_ANY ||
addr->svm_cid == other->svm_cid) &&
addr->svm_port == other->svm_port;
}
EXPORT_SYMBOL_GPL(vsock_addr_equals_addr_any);
/* Reinterpret a generic socket address as a vsock address.
 *
 * Returns -EFAULT, leaving *@out_addr untouched, when @len is smaller than
 * struct sockaddr_vm.  Otherwise *@out_addr is pointed at @addr (dropping
 * its const qualifier) and the result of vsock_addr_validate() on it is
 * returned; *@out_addr is set even when that validation fails.
 */
int vsock_addr_cast(const struct sockaddr *addr,
		    size_t len, struct sockaddr_vm **out_addr)
{
	struct sockaddr_vm *vm_addr;

	if (len < sizeof(struct sockaddr_vm))
		return -EFAULT;

	vm_addr = (struct sockaddr_vm *)addr;
	*out_addr = vm_addr;

	return vsock_addr_validate(vm_addr);
}
EXPORT_SYMBOL_GPL(vsock_addr_cast);

View File

@ -0,0 +1,32 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _VSOCK_ADDR_H_
#define _VSOCK_ADDR_H_
#include <linux/vm_sockets.h>
/* Zero @addr and fill in AF_VSOCK, @cid and @port. */
void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port);
/* Return 0 if @addr is a plausible vsock address, else a negative errno
 * (-EFAULT for NULL, -EAFNOSUPPORT for a wrong family, -EINVAL for a
 * non-zero first svm_zero byte).
 */
int vsock_addr_validate(const struct sockaddr_vm *addr);
/* True when the port of @addr is not VMADDR_PORT_ANY. */
bool vsock_addr_bound(const struct sockaddr_vm *addr);
/* Reset @addr to VMADDR_CID_ANY / VMADDR_PORT_ANY. */
void vsock_addr_unbind(struct sockaddr_vm *addr);
/* True when both CID and port of the two addresses are equal. */
bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other);
/* As above, but VMADDR_CID_ANY on either side matches any CID. */
bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other);
/* Check @len, store @addr in *@out_addr as a sockaddr_vm pointer and
 * validate it; returns 0 or a negative errno.
 */
int vsock_addr_cast(const struct sockaddr *addr, size_t len,
struct sockaddr_vm **out_addr);
#endif

View File

@ -0,0 +1,22 @@
/*
* VMware vSockets Driver
*
* Copyright (C) 2011-2012 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef _VSOCK_VERSION_H_
#define _VSOCK_VERSION_H_
/* Driver version 1.0.0.0, once as a four-element brace initializer list
 * and once as a string; the string form carries an extra "-k" suffix.
 */
#define VSOCK_DRIVER_VERSION_PARTS { 1, 0, 0, 0 }
#define VSOCK_DRIVER_VERSION_STRING "1.0.0.0-k"
#endif /* _VSOCK_VERSION_H_ */