Merge branch 'blktrace' of git://brick.kernel.dk/data/git/linux-2.6-block

* 'blktrace' of git://brick.kernel.dk/data/git/linux-2.6-block:
  [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23
  [PATCH] relay: consolidate sendfile() and read() code
  [PATCH] relay: add sendfile() support
  [PATCH] relay: migrate from relayfs to a generic relay API
This commit is contained in:
Linus Torvalds 2006-03-23 16:24:24 -08:00
commit cec6062037
29 changed files with 2221 additions and 1293 deletions

View File

@ -11,4 +11,16 @@ config LBD
your machine, or if you want to have a raid or loopback device
bigger than 2TB. Otherwise say N.
config BLK_DEV_IO_TRACE
bool "Support for tracing block io actions"
select RELAY
select DEBUG_FS
help
Say Y here, if you want to be able to trace the block layer actions
on a given queue. Tracing allows you to see any traffic happening
on a block device queue. For more information (and the user space
support tools needed), fetch the blktrace app from:
git://brick.kernel.dk/data/git/blktrace.git
source block/Kconfig.iosched

View File

@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o

538
block/blktrace.c Normal file
View File

@ -0,0 +1,538 @@
/*
* Copyright (C) 2006 Jens Axboe <axboe@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <asm/uaccess.h>
/* Per-CPU value subtracted from sched_clock() when time-stamping trace events */
static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
/*
 * Trace generation counter: bumped every time a trace is started and
 * compared against tsk->btrace_seq to decide whether a task's name still
 * has to be sent out for the current generation
 */
static unsigned int blktrace_seq __read_mostly = 1;
/*
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
struct blk_io_trace *t;
t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->device = bt->dev;
t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
t->pid = tsk->pid;
t->cpu = smp_processor_id();
t->pdu_len = sizeof(tsk->comm);
memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
tsk->btrace_seq = blktrace_seq;
}
}
/*
 * Decide whether an event must be filtered out. Returns 1 when the event
 * should be skipped (its action category is not enabled in the trace mask,
 * the sector lies outside the traced LBA window, or a pid filter is set and
 * does not match), 0 when it should be logged.
 */
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	const int masked_out = ((bt->act_mask << BLK_TC_SHIFT) & what) == 0;
	const int outside_lba = sector < bt->start_lba || sector > bt->end_lba;
	const int other_pid = bt->pid && pid != bt->pid;

	return masked_out || outside_lba || other_pid;
}
/*
 * Data direction bit lookup, indexed by (rw & WRITE): entry 0 holds the
 * read action bit, entry 1 the write action bit
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
/*
 * Bio action bits of interest, indexed by the trace_*_bit() macros below:
 * 0 = nothing to add, 1 = barrier, 2 = sync
 */
static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
/*
 * Each macro isolates one BIO_RW_* flag of rw and shifts it down so that the
 * result is that flag's index into bio_act[], or 0 when the flag is clear.
 * More could be added as needed, taking care to increase the constant
 * subtracted from the shift count by one for every new entry, so the result
 * lands on the matching bio_act[] slot.
 */
#define trace_barrier_bit(rw) \
(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw) \
(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 *
 * @bt: trace state; nothing is logged unless it is in the running state
 * @sector, @bytes, @error: copied verbatim into the trace record
 * @rw: request flags; the direction, barrier and sync bits are folded into
 *      @what before the filters are applied
 * @what: action bits describing the event
 * @pdu_len, @pdu_data: optional payload appended right after the record
 *
 * The event is silently dropped when it is filtered out or when no relay
 * space can be reserved.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
struct task_struct *tsk = current;
struct blk_io_trace *t;
unsigned long flags;
unsigned long *sequence;
pid_t pid;
int cpu;
if (unlikely(bt->trace_state != Blktrace_running))
return;
/* fold data direction and barrier/sync flags into the action word */
what |= ddir_act[rw & WRITE];
what |= bio_act[trace_barrier_bit(rw)];
what |= bio_act[trace_sync_bit(rw)];
pid = tsk->pid;
if (unlikely(act_log_check(bt, what, sector, pid)))
return;
/*
 * A word about the locking here - we disable interrupts to reserve
 * some space in the relay per-cpu buffer, to prevent an irq
 * from coming in and stepping on our toes.
 *
 * NOTE(review): an earlier version of this comment said preemption
 * is disabled via get_cpu()/put_cpu() while the record is filled in;
 * the code below does not call them and instead keeps interrupts
 * disabled until the record is complete.
 */
local_irq_save(flags);
/* first event from this task since the last trace start: send its name */
if (unlikely(tsk->btrace_seq != blktrace_seq))
trace_note_tsk(bt, tsk);
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
if (t) {
cpu = smp_processor_id();
sequence = per_cpu_ptr(bt->sequence, cpu);
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
/* per-cpu, per-trace event counter */
t->sequence = ++(*sequence);
/* timestamp corrected by this cpu's calibrated offset */
t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
t->sector = sector;
t->bytes = bytes;
t->action = what;
t->pid = pid;
t->device = bt->dev;
t->cpu = cpu;
t->error = error;
t->pdu_len = pdu_len;
/* the payload, if any, sits directly behind the header */
if (pdu_len)
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__blk_add_trace);
/* debugfs "block" directory shared by all traces; NULL while it does not exist */
static struct dentry *blk_tree_root;
/* protects blk_tree_root and root_users; initialised in blk_trace_init() */
static struct mutex blk_tree_mutex;
/* number of per-device directories currently living below blk_tree_root */
static unsigned int root_users;
/*
 * Remove the shared debugfs root directory if there is one and forget the
 * pointer to it. Callers in this file invoke it with blk_tree_mutex held.
 */
static inline void blk_remove_root(void)
{
	if (!blk_tree_root)
		return;

	debugfs_remove(blk_tree_root);
	blk_tree_root = NULL;
}
/*
 * Remove one per-device trace directory and drop its reference on the
 * shared root; the root itself goes away together with its last user.
 */
static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);

	debugfs_remove(dir);

	root_users--;
	if (!root_users)
		blk_remove_root();

	mutex_unlock(&blk_tree_mutex);
}
/*
 * Create the per-device debugfs directory @blk_name below the shared
 * "block" root, creating the root first if it does not exist yet.
 *
 * Returns the new directory dentry, or NULL on failure. On success the
 * root's user count is incremented; the matching release is
 * blk_remove_tree().
 */
static struct dentry *blk_create_tree(const char *blk_name)
{
	struct dentry *dir = NULL;

	mutex_lock(&blk_tree_mutex);

	if (!blk_tree_root) {
		blk_tree_root = debugfs_create_dir("block", NULL);
		if (!blk_tree_root)
			goto out;
	}

	dir = debugfs_create_dir(blk_name, blk_tree_root);
	if (dir)
		root_users++;
	else if (!root_users)
		/*
		 * Only tear the root down when nobody else is using it.
		 * Dropping it while other traces still live below it (e.g.
		 * when a second setup on an already traced device fails
		 * because the name exists) would clear blk_tree_root although
		 * the directory is still populated.
		 */
		blk_remove_root();

out:
	mutex_unlock(&blk_tree_mutex);
	return dir;
}
/*
 * Release everything a trace owns: close the relay channel, remove the
 * "dropped" debugfs file and then the per-device directory, and finally
 * free the per-cpu sequence counters and the blk_trace itself.
 */
static void blk_trace_cleanup(struct blk_trace *bt)
{
relay_close(bt->rchan);
debugfs_remove(bt->dropped_file);
/* the directory is removed only after the entries created inside it */
blk_remove_tree(bt->dir);
free_percpu(bt->sequence);
kfree(bt);
}
/*
 * Detach the trace from the queue. Returns -EINVAL when the queue has no
 * trace attached, 0 otherwise. The trace is torn down only if it is in the
 * setup or stopped state.
 */
static int blk_trace_remove(request_queue_t *q)
{
	struct blk_trace *detached = xchg(&q->blk_trace, NULL);

	if (detached == NULL)
		return -EINVAL;

	switch (detached->trace_state) {
	case Blktrace_setup:
	case Blktrace_stopped:
		blk_trace_cleanup(detached);
		break;
	default:
		break;
	}

	return 0;
}
/*
 * open() for the "dropped" file: hand the pointer stored in the inode
 * (the blk_trace passed to debugfs_create_file() in blk_trace_setup())
 * over to the file so that read() can find it.
 */
static int blk_dropped_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->u.generic_ip;
return 0;
}
/*
 * read() for the "dropped" file: report the current dropped-event counter
 * as a decimal number followed by a newline.
 */
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char text[16];
	size_t text_len;

	snprintf(text, sizeof(text), "%u\n", atomic_read(&bt->dropped));
	text_len = strlen(text);

	return simple_read_from_buffer(buffer, count, ppos, text, text_len);
}
/* File operations of the read-only "dropped" counter file */
static struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = blk_dropped_open,
.read = blk_dropped_read,
};
/*
 * relay subbuf_start hook. Returns 1 while the buffer still has room.
 * When the buffer is full, the trace's dropped counter is bumped (so the
 * user space app can tell how many times a full subbuffer was encountered)
 * and 0 is returned.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	if (relay_buf_full(buf)) {
		struct blk_trace *bt = buf->chan->private_data;

		atomic_inc(&bt->dropped);
		return 0;
	}

	return 1;
}
/*
 * relay remove_buf_file hook: the buffer files live in debugfs, so remove
 * them through debugfs. Always reports success.
 */
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
return 0;
}
/*
 * relay create_buf_file hook: create the buffer file in debugfs below
 * @parent, with @buf as its private data and the generic relay file
 * operations. @is_global is left untouched.
 */
static struct dentry *blk_create_buf_file_callback(const char *filename,
struct dentry *parent,
int mode,
struct rchan_buf *buf,
int *is_global)
{
return debugfs_create_file(filename, mode, parent, buf,
&relay_file_operations);
}
/* Callbacks handed to relay_open() for every trace channel */
static struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
/*
 * Setup everything required to start tracing
 *
 * @q: queue to attach the trace to
 * @bdev: block device the ioctl was issued on; provides name and dev_t
 * @arg: user pointer to a struct blk_user_trace_setup. It is read for the
 *       buffer geometry and filters, and written back with the (sanitised)
 *       device name filled in.
 *
 * Returns 0 on success, -EFAULT on a bad user pointer, -EINVAL for a zero
 * buffer size or count, -ENOMEM, -ENOENT or -EIO when an allocation or one
 * of the debugfs/relay objects cannot be created, and -EBUSY when the queue
 * already has a trace attached.
 */
static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
			   char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct blk_trace *old_bt, *bt = NULL;
	struct dentry *dir = NULL;
	char b[BDEVNAME_SIZE];
	int ret, i, name_len;

	if (copy_from_user(&buts, arg, sizeof(buts)))
		return -EFAULT;

	if (!buts.buf_size || !buts.buf_nr)
		return -EINVAL;

	strcpy(buts.name, bdevname(bdev, b));

	/*
	 * some device names have larger paths - convert the slashes
	 * to underscores for this to work as expected
	 */
	name_len = strlen(buts.name);
	for (i = 0; i < name_len; i++)
		if (buts.name[i] == '/')
			buts.name[i] = '_';

	if (copy_to_user(arg, &buts, sizeof(buts)))
		return -EFAULT;

	ret = -ENOMEM;
	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		goto err;

	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts.name);
	if (!dir)
		goto err;

	bt->dir = dir;
	bt->dev = bdev->bd_dev;
	atomic_set(&bt->dropped, 0);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
	if (!bt->rchan)
		goto err;
	bt->rchan->private_data = bt;

	/* an empty mask or end LBA means "trace everything" */
	bt->act_mask = buts.act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	bt->start_lba = buts.start_lba;
	bt->end_lba = buts.end_lba;
	if (!bt->end_lba)
		bt->end_lba = -1ULL;

	bt->pid = buts.pid;
	bt->trace_state = Blktrace_setup;

	/* install the trace; back out if the queue already had one */
	ret = -EBUSY;
	old_bt = xchg(&q->blk_trace, bt);
	if (old_bt) {
		(void) xchg(&q->blk_trace, old_bt);
		goto err;
	}

	return 0;
err:
	/*
	 * Tear down in the same order as blk_trace_cleanup(): the relay
	 * buffer files and the "dropped" file live inside @dir, so they have
	 * to go before the directory itself can be removed.
	 */
	if (bt) {
		if (bt->rchan)
			relay_close(bt->rchan);
		if (bt->dropped_file)
			debugfs_remove(bt->dropped_file);
		if (bt->sequence)
			free_percpu(bt->sequence);
		kfree(bt);
	}
	if (dir)
		blk_remove_tree(dir);
	return ret;
}
/*
 * Start (@start != 0) or stop (@start == 0) the trace attached to @q.
 *
 * For starting a trace, we can transition from a setup or stopped trace.
 * For stopping a trace, the state must be running. Returns 0 on a valid
 * transition and -EINVAL when there is no trace or the current state does
 * not allow the request.
 */
static int blk_trace_startstop(request_queue_t *q, int start)
{
	struct blk_trace *bt = q->blk_trace;

	if (bt == NULL)
		return -EINVAL;

	if (start) {
		if (bt->trace_state != Blktrace_setup &&
		    bt->trace_state != Blktrace_stopped)
			return -EINVAL;

		/* new generation: tasks re-announce their names */
		blktrace_seq++;
		smp_mb();
		bt->trace_state = Blktrace_running;
		return 0;
	}

	if (bt->trace_state != Blktrace_running)
		return -EINVAL;

	bt->trace_state = Blktrace_stopped;
	relay_flush(bt->rchan);
	return 0;
}
/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd: 	the ioctl cmd
 * @arg:	the argument data, if any
 *
 * Dispatches BLKTRACESETUP, BLKTRACESTART, BLKTRACESTOP and
 * BLKTRACETEARDOWN with bdev->bd_mutex held. Returns the handler's result,
 * -ENXIO when the device has no queue and -ENOTTY for any other command.
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int ret;

	if (!q)
		return -ENXIO;

	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		ret = blk_trace_setup(q, bdev, arg);
		break;
	case BLKTRACESTART:
		ret = blk_trace_startstop(q, 1);
		break;
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, 0);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
/**
 * blk_trace_shutdown: - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 * The trace is stopped first because blk_trace_remove() only tears down a
 * trace that is not running. Both return values are ignored.
 **/
void blk_trace_shutdown(request_queue_t *q)
{
blk_trace_startstop(q, 0);
blk_trace_remove(q);
}
/*
 * Average offset over two calls to sched_clock() with a gettimeofday()
 * in the middle
 *
 * Stores in *t the wall-clock time in nanoseconds minus the mean of the two
 * surrounding sched_clock() readings.
 */
static void blk_check_time(unsigned long long *t)
{
	unsigned long long a, b;
	struct timeval tv;

	a = sched_clock();
	do_gettimeofday(&tv);
	b = sched_clock();

	/*
	 * Widen to 64 bits before scaling: tv_sec is a long, so on 32-bit
	 * the seconds-to-nanoseconds multiplication would overflow.
	 */
	*t = (unsigned long long) tv.tv_sec * 1000000000ULL +
	     (unsigned long long) tv.tv_usec * 1000ULL;
	*t -= (a + b) / 2;
}
/*
 * Calibrate the time offset of the CPU this runs on. @data is unused; the
 * signature matches what smp_call_function() expects.
 */
static void blk_trace_check_cpu_time(void *data)
{
	int this_cpu = get_cpu();
	unsigned long long *offset = &per_cpu(blk_trace_cpu_offset, this_cpu);
	int pass;

	/*
	 * Just call it twice, hopefully the second call will be cache hot
	 * and a little more precise
	 */
	for (pass = 0; pass < 2; pass++)
		blk_check_time(offset);

	put_cpu();
}
/*
 * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
 * timings
 */
static void blk_trace_calibrate_offsets(void)
{
unsigned long flags;
/*
 * NOTE(review): smp_call_function() presumably covers only the other
 * CPUs, hence the explicit call for the local CPU below - confirm
 */
smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
/* calibrate the local CPU with interrupts disabled */
local_irq_save(flags);
blk_trace_check_cpu_time(NULL);
local_irq_restore(flags);
}
/*
 * With CONFIG_SCHED_SMT, copy each online CPU's calibrated offset to all of
 * its siblings so that HT siblings share the same time offset. Does nothing
 * otherwise.
 */
static void blk_trace_set_ht_offsets(void)
{
#if defined(CONFIG_SCHED_SMT)
	int cpu, sibling;

	preempt_disable();

	for_each_online_cpu(cpu) {
		unsigned long long *cpu_off;
		unsigned long long *sibling_off;

		cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);

		for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) {
			if (sibling != cpu) {
				sibling_off = &per_cpu(blk_trace_cpu_offset, sibling);
				*sibling_off = *cpu_off;
			}
		}
	}

	preempt_enable();
#endif
}
/*
 * Init-time setup: initialise the mutex guarding the debugfs tree and
 * calibrate the per-cpu time offsets. Always returns 0.
 */
static __init int blk_trace_init(void)
{
mutex_init(&blk_tree_mutex);
blk_trace_calibrate_offsets();
/* siblings then take over the offset of their HT partner */
blk_trace_set_ht_offsets();
return 0;
}
module_init(blk_trace_init);

View File

@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
struct list_head *pos;
unsigned ordseq;
blk_add_trace_rq(q, rq, BLK_TA_INSERT);
rq->q = q;
switch (where) {
@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
* not be passed by new incoming requests
*/
rq->flags |= REQ_STARTED;
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
}
if (!q->boundary_rq || q->boundary_rq == rq) {

View File

@ -5,6 +5,7 @@
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
return put_ulong(arg, bdev->bd_inode->i_size >> 9);
case BLKGETSIZE64:
return put_u64(arg, bdev->bd_inode->i_size);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
}
return -ENOIOCTLCMD;
}

View File

@ -28,6 +28,7 @@
#include <linux/writeback.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/blktrace_api.h>
/*
* for max sense size
@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
return;
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
}
}
EXPORT_SYMBOL(blk_plug_device);
@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
/*
* devices don't necessarily have an ->unplug_fn defined
*/
if (q->unplug_fn)
if (q->unplug_fn) {
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
q->unplug_fn(q);
}
}
static void blk_unplug_work(void *data)
{
request_queue_t *q = data;
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
q->unplug_fn(q);
}
@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
{
request_queue_t *q = (request_queue_t *)data;
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
kblockd_schedule_work(&q->unplug_work);
}
@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
if (q->queue_tags)
__blk_queue_free_tags(q);
if (q->blk_trace)
blk_trace_shutdown(q);
kmem_cache_free(requestq_cachep, q);
}
@ -2129,6 +2145,8 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
rq_init(q, rq);
rq->rl = rl;
blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
out:
return rq;
}
@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
if (!rq) {
struct io_context *ioc;
blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
io_schedule();
@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
*/
void blk_requeue_request(request_queue_t *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
if (blk_rq_tagged(rq))
blk_queue_end_tag(q, rq);
@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
if (!q->back_merge_fn(q, req, bio))
break;
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
req->biotail->bi_next = bio;
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
if (!q->front_merge_fn(q, req, bio))
break;
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
bio->bi_next = req->bio;
req->bio = bio;
@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
request_queue_t *q;
sector_t maxsector;
int ret, nr_sectors = bio_sectors(bio);
dev_t old_dev;
might_sleep();
/* Test device or partition size, when known. */
@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
* NOTE: we don't repeat the blk_size check for each new device.
* Stacking drivers are expected to know what they are doing.
*/
maxsector = -1;
old_dev = 0;
do {
char b[BDEVNAME_SIZE];
@ -3034,6 +3063,15 @@ void generic_make_request(struct bio *bio)
*/
blk_partition_remap(bio);
if (maxsector != -1)
blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
maxsector);
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
maxsector = bio->bi_sector;
old_dev = bio->bi_bdev->bd_dev;
ret = q->make_request_fn(q, bio);
} while (ret);
}
@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
int total_bytes, bio_nbytes, error, next_idx = 0;
struct bio *bio;
blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
/*
* extend uptodate bool to allow < 0 value to be direct io error
*/

View File

@ -38,6 +38,7 @@
#include <linux/hdreg.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
cmd->rq->completion_data = cmd;
cmd->rq->errors = status;
blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
blk_complete_request(cmd->rq);
}

View File

@ -17,6 +17,7 @@
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/blktrace_api.h>
static const char *_name = DM_NAME;
@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);
blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
struct target_io *tio)
{
int r;
sector_t sector;
/*
* Sanity checks.
@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
* this io.
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
if (r > 0)
if (r > 0) {
/* the bio has been remapped so dispatch it */
blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector,
clone->bi_sector);
generic_make_request(clone);
}
else if (r < 0) {
/* error the io and bail out */

View File

@ -859,18 +859,6 @@ config RAMFS
To compile this as a module, choose M here: the module will be called
ramfs.
config RELAYFS_FS
tristate "Relayfs file system support"
---help---
Relayfs is a high-speed data relay filesystem designed to provide
an efficient mechanism for tools and facilities to relay large
amounts of data from kernel space to user space.
To compile this code as a module, choose M here: the module will be
called relayfs.
If unsure, say N.
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
depends on EXPERIMENTAL

View File

@ -91,7 +91,6 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_RELAYFS_FS) += relayfs/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_JFS_FS) += jfs/
obj-$(CONFIG_XFS_FS) += xfs/

View File

@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#define BIO_POOL_SIZE 256
@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
if (!bp)
return bp;
blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
bi->bi_sector + first_sectors);
BUG_ON(bi->bi_vcnt != 1);
BUG_ON(bi->bi_idx != 0);
atomic_set(&bp->cnt, 3);

View File

@ -72,6 +72,7 @@
#include <linux/i2c-dev.h>
#include <linux/wireless.h>
#include <linux/atalk.h>
#include <linux/blktrace_api.h>
#include <net/sock.h> /* siocdevprivate_ioctl */
#include <net/bluetooth/bluetooth.h>

View File

@ -1,4 +0,0 @@
obj-$(CONFIG_RELAYFS_FS) += relayfs.o
relayfs-y := relay.o inode.o buffers.o

View File

@ -1,190 +0,0 @@
/*
* RelayFS buffer management code.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
*
* This file is released under the GPL.
*/
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/relayfs_fs.h>
#include "relay.h"
#include "buffers.h"
/*
 * close() vm_op implementation for relayfs file mapping.
 *
 * Forwards the event to the channel's buf_unmapped() callback.
 */
static void relay_file_mmap_close(struct vm_area_struct *vma)
{
struct rchan_buf *buf = vma->vm_private_data;
buf->chan->cb->buf_unmapped(buf, vma->vm_file);
}
/*
* nopage() vm_op implementation for relayfs file mapping.
*/
static struct page *relay_buf_nopage(struct vm_area_struct *vma,
unsigned long address,
int *type)
{
struct page *page;
struct rchan_buf *buf = vma->vm_private_data;
unsigned long offset = address - vma->vm_start;
if (address > vma->vm_end)
return NOPAGE_SIGBUS; /* Disallow mremap */
if (!buf)
return NOPAGE_OOM;
page = vmalloc_to_page(buf->start + offset);
if (!page)
return NOPAGE_OOM;
get_page(page);
if (type)
*type = VM_FAULT_MINOR;
return page;
}
/*
 * vm_ops for relay file mappings; installed on the vma by relay_mmap_buf().
 */
static struct vm_operations_struct relay_file_mmap_ops = {
.nopage = relay_buf_nopage,
.close = relay_file_mmap_close,
};
/**
 *	relay_mmap_buf: - mmap channel buffer to process address space
 *	@buf: relay channel buffer
 *	@vma: vm_area_struct describing memory to be mapped
 *
 *	Returns 0 if ok, -EBADF if there is no buffer, -EINVAL if the mapping
 *	does not cover exactly the channel's alloc_size.
 *
 *	Caller should already have grabbed mmap_sem.
 */
int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
	if (!buf)
		return -EBADF;

	/* only whole-buffer mappings are accepted */
	if ((vma->vm_end - vma->vm_start) != (unsigned long)buf->chan->alloc_size)
		return -EINVAL;

	vma->vm_ops = &relay_file_mmap_ops;
	vma->vm_private_data = buf;

	/* let the client know the buffer got mapped */
	buf->chan->cb->buf_mapped(buf, vma->vm_file);

	return 0;
}
/**
 *	relay_alloc_buf - allocate a channel buffer
 *	@buf: the buffer struct
 *	@size: total size of the buffer
 *
 *	The buffer is built from individually allocated pages that are mapped
 *	into one contiguous, zeroed kernel virtual range. The page array and
 *	page count are recorded in @buf.
 *
 *	Returns a pointer to the resulting buffer, NULL if unsuccessful
 */
static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
{
	unsigned int n_pages, got, idx;
	void *mem;

	size = PAGE_ALIGN(size);
	n_pages = size >> PAGE_SHIFT;

	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
	if (!buf->page_array)
		return NULL;

	got = 0;
	while (got < n_pages) {
		buf->page_array[got] = alloc_page(GFP_KERNEL);
		if (unlikely(!buf->page_array[got]))
			goto depopulate;
		got++;
	}

	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
	if (!mem)
		goto depopulate;

	memset(mem, 0, size);
	buf->page_count = n_pages;
	return mem;

depopulate:
	/* give back only the pages that were actually obtained */
	for (idx = 0; idx < got; idx++)
		__free_page(buf->page_array[idx]);
	kfree(buf->page_array);
	return NULL;
}
/**
 *	relay_create_buf - allocate and initialize a channel buffer
 *	@chan: the channel the buffer is created for; supplies the buffer
 *		size (chan->alloc_size) and the number of sub-buffers
 *		(chan->n_subbufs)
 *
 *	Returns channel buffer if successful, NULL otherwise. On success a
 *	reference on @chan is taken on behalf of the buffer.
 */
struct rchan_buf *relay_create_buf(struct rchan *chan)
{
	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);

	if (!buf)
		return NULL;

	/*
	 * One padding entry per sub-buffer. Size the allocation from the
	 * element type rather than from a pointer type.
	 */
	buf->padding = kmalloc(chan->n_subbufs * sizeof(*buf->padding), GFP_KERNEL);
	if (!buf->padding)
		goto free_buf;

	buf->start = relay_alloc_buf(buf, chan->alloc_size);
	if (!buf->start)
		goto free_buf;

	buf->chan = chan;
	kref_get(&buf->chan->kref);
	return buf;

free_buf:
	/* padding may still be NULL here; kfree() copes with that */
	kfree(buf->padding);
	kfree(buf);
	return NULL;
}
/**
 *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
 *	@buf: the buffer struct
 *
 *	Unmaps and frees the buffer pages, frees the bookkeeping and drops the
 *	buffer's reference on its channel.
 */
void relay_destroy_buf(struct rchan_buf *buf)
{
	struct rchan *owner = buf->chan;

	if (likely(buf->start)) {
		unsigned int idx;

		vunmap(buf->start);
		for (idx = 0; idx < buf->page_count; idx++)
			__free_page(buf->page_array[idx]);
		kfree(buf->page_array);
	}

	kfree(buf->padding);
	kfree(buf);

	/* buf is gone from here on; only the saved channel may be used */
	kref_put(&owner->kref, relay_destroy_channel);
}
/**
 *	relay_remove_buf - remove a channel buffer
 *	@kref: the kref embedded in the rchan_buf to remove
 *
 *	Removes the file from the relayfs filesystem, which also frees the
 *	rchan_buf_struct and the channel buffer.  Should only be called from
 *	kref_put().
 */
void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
/* the client decides how its buffer file gets removed */
buf->chan->cb->remove_buf_file(buf->dentry);
relay_destroy_buf(buf);
}

View File

@ -1,12 +0,0 @@
#ifndef _BUFFERS_H
#define _BUFFERS_H

/*
 * Page-rounding helper. This inspired by rtai/shmem. The expansion is fully
 * parenthesized so the macro can be used safely inside larger expressions.
 */
#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)

/* Channel buffer management, implemented in buffers.c */
extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
extern struct rchan_buf *relay_create_buf(struct rchan *chan);
extern void relay_destroy_buf(struct rchan_buf *buf);
extern void relay_remove_buf(struct kref *kref);

#endif /* _BUFFERS_H */

View File

@ -1,581 +0,0 @@
/*
* VFS-related code for RelayFS, a high-speed data relay filesystem.
*
* Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
* Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
*
* Based on ramfs, Copyright (C) 2002 - Linus Torvalds
*
* This file is released under the GPL.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/relayfs_fs.h>
#include "relay.h"
#include "buffers.h"
/* NOTE(review): presumably the relayfs superblock magic - its use is not visible here */
#define RELAYFS_MAGIC 0xF0B4A981
/* Internal mount and its pin count, managed through simple_pin_fs()/simple_release_fs() */
static struct vfsmount * relayfs_mount;
static int relayfs_mount_count;
/* Backing-dev settings installed on the mapping of every relayfs inode */
static struct backing_dev_info relayfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};
/*
 * Allocate and initialise a root-owned inode on @sb. Regular files get
 * @fops and, if non-NULL, @data attached; directories get the simple
 * directory operations. Returns NULL if no inode could be allocated.
 */
static struct inode *relayfs_get_inode(struct super_block *sb,
				       int mode,
				       struct file_operations *fops,
				       void *data)
{
	const int file_type = mode & S_IFMT;
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	inode->i_mode = mode;
	inode->i_uid = 0;
	inode->i_gid = 0;
	inode->i_blksize = PAGE_CACHE_SIZE;
	inode->i_blocks = 0;
	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	if (file_type == S_IFREG) {
		inode->i_fop = fops;
		if (data)
			inode->u.generic_ip = data;
	} else if (file_type == S_IFDIR) {
		inode->i_op = &simple_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inode->i_nlink++;
	}

	return inode;
}
/**
 *	relayfs_create_entry - create a relayfs directory or file
 *	@name: the name of the file to create
 *	@parent: parent directory, NULL to use the root of the relayfs mount
 *	@mode: mode; must describe a regular file or a directory
 *	@fops: file operations to use for the file
 *	@data: user-associated data for this file
 *
 *	Returns the new dentry, NULL on failure
 *
 *	Creates a file or directory with the specified permissions. The
 *	filesystem is pinned for as long as the entry exists; the pin is
 *	dropped again on every failure path.
 */
static struct dentry *relayfs_create_entry(const char *name,
					   struct dentry *parent,
					   int mode,
					   struct file_operations *fops,
					   void *data)
{
	struct dentry *d;
	struct inode *inode;
	int error = 0;

	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));

	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
	if (error) {
		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
		return NULL;
	}

	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
		parent = relayfs_mount->mnt_sb->s_root;

	if (!parent) {
		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
		return NULL;
	}

	parent = dget(parent);
	mutex_lock(&parent->d_inode->i_mutex);
	d = lookup_one_len(name, parent, strlen(name));
	if (IS_ERR(d)) {
		d = NULL;
		goto release_mount;
	}

	/* the name is already taken */
	if (d->d_inode)
		goto drop_dentry;

	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
	if (!inode)
		goto drop_dentry;

	d_instantiate(d, inode);
	dget(d);	/* Extra count - pin the dentry in core */

	if (S_ISDIR(mode))
		parent->d_inode->i_nlink++;

	goto exit;

drop_dentry:
	/* give back the reference obtained from lookup_one_len() */
	dput(d);
	d = NULL;
release_mount:
	simple_release_fs(&relayfs_mount, &relayfs_mount_count);

exit:
	mutex_unlock(&parent->d_inode->i_mutex);
	dput(parent);
	return d;
}
/**
 *	relayfs_create_file - create a file in the relay filesystem
 *	@name: the name of the file to create
 *	@parent: parent directory
 *	@mode: mode, if not specified the default perms are used
 *	@fops: file operations to use for the file; must not be NULL
 *	@data: user-associated data for this file
 *
 *	Returns file dentry if successful, NULL otherwise.
 *
 *	If @mode is 0 the file is created readable by its owner only
 *	(S_IRUSR). Only the permission bits of @mode are used; the file type
 *	is always forced to a regular file.
 */
struct dentry *relayfs_create_file(const char *name,
struct dentry *parent,
int mode,
struct file_operations *fops,
void *data)
{
BUG_ON(!fops);
if (!mode)
mode = S_IRUSR;
mode = (mode & S_IALLUGO) | S_IFREG;
return relayfs_create_entry(name, parent, mode, fops, data);
}
/**
 *	relayfs_create_dir - create a directory in the relay filesystem
 *	@name: the name of the directory to create
 *	@parent: parent directory, NULL if parent should be fs root
 *
 *	Returns directory dentry if successful, NULL otherwise.
 *
 *	The directory is created with rwx permission for the owner and
 *	read/search permission for group and others.
 */
struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
{
int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
return relayfs_create_entry(name, parent, mode, NULL, NULL);
}
/**
* relayfs_remove - remove a file or directory in the relay filesystem
* @dentry: file or directory dentry
*
* Returns 0 if successful, negative otherwise.
*/
int relayfs_remove(struct dentry *dentry)
{
struct dentry *parent;
int error = 0;
if (!dentry)
return -EINVAL;
parent = dentry->d_parent;
if (!parent)
return -EINVAL;
parent = dget(parent);
mutex_lock(&parent->d_inode->i_mutex);
if (dentry->d_inode) {
if (S_ISDIR(dentry->d_inode->i_mode))
error = simple_rmdir(parent->d_inode, dentry);
else
error = simple_unlink(parent->d_inode, dentry);
if (!error)
d_delete(dentry);
}
if (!error)
dput(dentry);
mutex_unlock(&parent->d_inode->i_mutex);
dput(parent);
if (!error)
simple_release_fs(&relayfs_mount, &relayfs_mount_count);
return error;
}
/**
 * relayfs_remove_file - remove a file from relay filesystem
 * @dentry: file dentry
 *
 * Thin wrapper around relayfs_remove().
 *
 * Returns 0 if successful, negative otherwise.
 */
int relayfs_remove_file(struct dentry *dentry)
{
return relayfs_remove(dentry);
}
/**
 * relayfs_remove_dir - remove a directory in the relay filesystem
 * @dentry: directory dentry
 *
 * Thin wrapper around relayfs_remove().
 *
 * Returns 0 if successful, negative otherwise.
 */
int relayfs_remove_dir(struct dentry *dentry)
{
return relayfs_remove(dentry);
}
/**
 *	relay_file_open - open file op for relay files
 *	@inode: the inode
 *	@filp: the file
 *
 *	Increments the channel buffer refcount and stores the buffer in the
 *	file's private data. Always returns 0.
 */
static int relay_file_open(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = inode->u.generic_ip;
/* released again in relay_file_release() */
kref_get(&buf->kref);
filp->private_data = buf;
return 0;
}
/**
 *	relay_file_mmap - mmap file op for relay files
 *	@filp: the file
 *	@vma: the vma describing what to map
 *
 *	Hands the buffer attached to the file to relay_mmap_buf(), which maps
 *	it into user space, and returns its result.
 */
static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	return relay_mmap_buf(filp->private_data, vma);
}
/**
 *	relay_file_poll - poll file op for relay files
 *	@filp: the file
 *	@wait: poll table
 *
 *	Poll implementation. Reports POLLERR once the buffer is finalized;
 *	otherwise, for files opened for reading, reports readability as soon
 *	as the buffer is not empty.
 */
static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
{
	struct rchan_buf *buf = filp->private_data;

	if (buf->finalized)
		return POLLERR;

	if (!(filp->f_mode & FMODE_READ))
		return 0;

	poll_wait(filp, &buf->read_wait, wait);
	if (relay_buf_empty(buf))
		return 0;

	return POLLIN | POLLRDNORM;
}
/**
* relay_file_release - release file op for relay files
* @inode: the inode
* @filp: the file
*
* Decrements the channel buffer refcount, as the filesystem is
* no longer using it. relay_remove_buf is the kref release
* function. Always returns 0.
*/
static int relay_file_release(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = filp->private_data;
/* pairs with the kref_get() in relay_file_open() */
kref_put(&buf->kref, relay_remove_buf);
return 0;
}
/**
* relay_file_read_consume - update the consumed count for the buffer
* @buf: channel buffer being read
* @read_pos: read position; selects the sub-buffer whose padding is checked
* @bytes_consumed: number of bytes just read (0 is allowed)
*
* Adds @bytes_consumed to buf->bytes_consumed and reports one sub-buffer
* as consumed via relay_subbufs_consumed() when the running count would
* exceed a sub-buffer, or when count plus padding equals a sub-buffer.
*/
static void relay_file_read_consume(struct rchan_buf *buf,
size_t read_pos,
size_t bytes_consumed)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t read_subbuf;
/* running count would exceed one sub-buffer: retire it and restart the count */
if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
buf->bytes_consumed += bytes_consumed;
read_subbuf = read_pos / buf->chan->subbuf_size;
/* consumed bytes plus the recorded padding account for the whole sub-buffer */
if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
/*
* The sub-buffer being read is the one at the producer index and
* the write offset is exactly subbuf_size: leave it unconsumed.
*/
if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
(buf->offset == subbuf_size))
return;
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
}
/**
* relay_file_read_avail - boolean, are there unconsumed bytes available?
* @buf: channel buffer
* @read_pos: current read position, passed on to relay_file_read_consume()
*
* Builds a produced byte total and a consumed byte total from the
* sub-buffer counters (taken modulo n_subbufs) and compares them.
* Returns 0 when they are equal, 1 otherwise.
*/
static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
{
size_t bytes_produced, bytes_consumed, write_offset;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced % n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
/* clamp: buf->offset may be larger than subbuf_size */
write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
if (consumed > produced) {
/*
* NOTE(review): produced was reduced modulo n_subbufs above, so
* "produced > n_subbufs" cannot be true and this adjustment never
* runs - confirm what was intended.
*/
if ((produced > n_subbufs) &&
(produced + n_subbufs - consumed <= n_subbufs))
produced += n_subbufs;
} else if (consumed == produced) {
if (buf->offset > subbuf_size) {
produced += n_subbufs;
if (buf->subbufs_produced == buf->subbufs_consumed)
consumed += n_subbufs;
}
}
if (buf->offset > subbuf_size)
bytes_produced = (produced - 1) * subbuf_size + write_offset;
else
bytes_produced = produced * subbuf_size + write_offset;
bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
if (bytes_produced == bytes_consumed)
return 0;
/* zero-byte consume: lets the consumed accounting advance past a finished sub-buffer */
relay_file_read_consume(buf, read_pos, 0);
return 1;
}
/**
 *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
 *	@read_pos: read position within the channel buffer
 *	@buf: channel buffer
 *
 *	When @read_pos lies in the sub-buffer currently being written, the
 *	result is the distance from the read offset (plus padding) to the
 *	write offset, or 0 if nothing lies in between.  For any other
 *	sub-buffer it is the unread, non-padding remainder.
 */
static size_t relay_file_read_subbuf_avail(size_t read_pos,
					   struct rchan_buf *buf)
{
	size_t subbuf_size = buf->chan->subbuf_size;
	size_t rd_subbuf = read_pos / subbuf_size;
	size_t rd_offset = read_pos % subbuf_size;
	size_t wr_subbuf = (buf->data - buf->start) / subbuf_size;
	size_t wr_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
	size_t padding = buf->padding[rd_subbuf];

	if (rd_subbuf != wr_subbuf)
		return (subbuf_size - padding) - rd_offset;

	if (rd_offset + padding < wr_offset)
		return wr_offset - (rd_offset + padding);

	return 0;
}
/**
 *	relay_file_read_start_pos - find the first available byte to read
 *	@read_pos: requested read position
 *	@buf: channel buffer
 *
 *	A position that falls inside a sub-buffer's trailing padding is
 *	moved to the start of the next sub-buffer (wrapping to sub-buffer 0
 *	after the last one).  Any other position is returned unchanged.
 */
static size_t relay_file_read_start_pos(size_t read_pos,
					struct rchan_buf *buf)
{
	size_t subbuf_size = buf->chan->subbuf_size;
	size_t n_subbufs = buf->chan->n_subbufs;
	size_t subbuf = read_pos / subbuf_size;
	size_t subbuf_end = (subbuf + 1) * subbuf_size;
	size_t pad_start = subbuf_end - buf->padding[subbuf];

	if (read_pos < pad_start || read_pos >= subbuf_end)
		return read_pos;

	return ((subbuf + 1) % n_subbufs) * subbuf_size;
}
/**
 *	relay_file_read_end_pos - return the new read position
 *	@buf: channel buffer
 *	@read_pos: position the read started at
 *	@count: number of bytes read
 *
 *	Normally the result is @read_pos + @count.  If the read reached the
 *	start of the sub-buffer's padding, the result skips to the next
 *	sub-buffer boundary.  A result at or past the end of the whole
 *	buffer wraps to 0.
 */
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
				      size_t read_pos,
				      size_t count)
{
	size_t subbuf_size = buf->chan->subbuf_size;
	size_t subbuf = read_pos / subbuf_size;
	size_t end_pos = read_pos + count;

	if (read_pos % subbuf_size + count + buf->padding[subbuf] == subbuf_size)
		end_pos = (subbuf + 1) * subbuf_size;

	return end_pos >= subbuf_size * buf->chan->n_subbufs ? 0 : end_pos;
}
/**
 *	relay_file_read - read file op for relay files
 *	@filp: the file
 *	@buffer: the userspace buffer
 *	@count: number of bytes to read
 *	@ppos: position to read from
 *
 *	Copies out at most @count bytes, limited to what is available in
 *	the sub-buffer currently being read.  Runs under the inode's
 *	i_mutex.  Returns the number of bytes copied, 0 when nothing is
 *	available, or -EFAULT if the copy to user space fails.
 */
static ssize_t relay_file_read(struct file *filp,
			       char __user *buffer,
			       size_t count,
			       loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct rchan_buf *buf = filp->private_data;
	size_t start, avail;
	ssize_t ret = 0;

	mutex_lock(&inode->i_mutex);

	if (relay_file_read_avail(buf, *ppos)) {
		start = relay_file_read_start_pos(*ppos, buf);
		avail = relay_file_read_subbuf_avail(start, buf);
		if (avail) {
			count = min(count, avail);
			if (copy_to_user(buffer, buf->start + start, count)) {
				ret = -EFAULT;
			} else {
				/* account for the bytes only after a successful copy */
				relay_file_read_consume(buf, start, count);
				*ppos = relay_file_read_end_pos(buf, start, count);
				ret = count;
			}
		}
	}

	mutex_unlock(&inode->i_mutex);
	return ret;
}
/*
* File operations for relay buffer files. Seeking is disabled
* (no_llseek); the other entries are the relay_file_* handlers.
*/
struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
.mmap = relay_file_mmap,
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
};
/* Superblock operations for relayfs; only generic helpers are used. */
static struct super_operations relayfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
};
/*
 * relayfs_fill_super - initialize a relayfs superblock
 * @sb: superblock to fill in
 * @data: mount data (unused)
 * @silent: unused
 *
 * Sets the block size, magic and superblock operations, then creates the
 * root directory inode and dentry.  Returns 0 on success, or -ENOMEM if
 * the root inode or the root dentry cannot be allocated.
 */
static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
{
	int root_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
	struct inode *root_inode;
	struct dentry *root_dentry;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = RELAYFS_MAGIC;
	sb->s_op = &relayfs_ops;

	root_inode = relayfs_get_inode(sb, root_mode, NULL, NULL);
	if (!root_inode)
		return -ENOMEM;

	root_dentry = d_alloc_root(root_inode);
	if (root_dentry) {
		sb->s_root = root_dentry;
		return 0;
	}

	/* no dentry took ownership of the inode: drop it here */
	iput(root_inode);
	return -ENOMEM;
}
/*
* get_sb hook for relayfs: obtains the superblock through
* get_sb_single(), with relayfs_fill_super as the fill callback.
* dev_name is not used.
*/
static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data)
{
return get_sb_single(fs_type, flags, data, relayfs_fill_super);
}
/* Filesystem type descriptor registered under the name "relayfs". */
static struct file_system_type relayfs_fs_type = {
.owner = THIS_MODULE,
.name = "relayfs",
.get_sb = relayfs_get_sb,
.kill_sb = kill_litter_super,
};
/* Module init: register the relayfs filesystem type; returns the registration result. */
static int __init init_relayfs_fs(void)
{
return register_filesystem(&relayfs_fs_type);
}
/* Module exit: unregister the relayfs filesystem type. */
static void __exit exit_relayfs_fs(void)
{
unregister_filesystem(&relayfs_fs_type);
}
/* Module entry/exit points, GPL-only exported symbols and module metadata. */
module_init(init_relayfs_fs)
module_exit(exit_relayfs_fs)
EXPORT_SYMBOL_GPL(relay_file_operations);
EXPORT_SYMBOL_GPL(relayfs_create_dir);
EXPORT_SYMBOL_GPL(relayfs_remove_dir);
EXPORT_SYMBOL_GPL(relayfs_create_file);
EXPORT_SYMBOL_GPL(relayfs_remove_file);
MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
MODULE_DESCRIPTION("Relay Filesystem");
MODULE_LICENSE("GPL");

View File

@ -1,482 +0,0 @@
/*
* Public API and common code for RelayFS.
*
* See Documentation/filesystems/relayfs.txt for an overview of relayfs.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
*
* This file is released under the GPL.
*/
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/relayfs_fs.h>
#include "relay.h"
#include "buffers.h"
/**
 *	relay_buf_empty - boolean, is the channel buffer empty?
 *	@buf: channel buffer
 *
 *	Returns 1 when every produced sub-buffer has also been consumed,
 *	0 otherwise.
 */
int relay_buf_empty(struct rchan_buf *buf)
{
	return buf->subbufs_produced == buf->subbufs_consumed;
}
/**
 *	relay_buf_full - boolean, is the channel buffer full?
 *	@buf: channel buffer
 *
 *	Returns 1 when the number of produced-but-unconsumed sub-buffers
 *	has reached the channel's n_subbufs, 0 otherwise.
 */
int relay_buf_full(struct rchan_buf *buf)
{
	size_t pending = buf->subbufs_produced - buf->subbufs_consumed;

	if (pending >= buf->chan->n_subbufs)
		return 1;

	return 0;
}
/*
* High-level relayfs kernel API and associated functions.
*/
/*
* rchan_callback implementations defining default channel behavior. Used
* in place of corresponding NULL values in client callback struct.
*/
/*
 * subbuf_start() default callback.  Returns 0 when the buffer is full
 * and 1 otherwise; the remaining arguments are ignored.
 */
static int subbuf_start_default_callback (struct rchan_buf *buf,
					  void *subbuf,
					  void *prev_subbuf,
					  size_t prev_padding)
{
	return relay_buf_full(buf) ? 0 : 1;
}
/*
* buf_mapped() default callback. Does nothing; installed by
* setup_callbacks() when the client supplies no buf_mapped handler.
*/
static void buf_mapped_default_callback(struct rchan_buf *buf,
struct file *filp)
{
}
/*
* buf_unmapped() default callback. Does nothing; installed by
* setup_callbacks() when the client supplies no buf_unmapped handler.
*/
static void buf_unmapped_default_callback(struct rchan_buf *buf,
struct file *filp)
{
}
/*
* create_buf_file() default callback. Creates file to represent buf
* by calling relayfs_create_file() with relay_file_operations and the
* buffer as file data. is_global is left untouched.
*/
static struct dentry *create_buf_file_default_callback(const char *filename,
struct dentry *parent,
int mode,
struct rchan_buf *buf,
int *is_global)
{
return relayfs_create_file(filename, parent, mode,
&relay_file_operations, buf);
}
/*
* remove_buf_file() default callback. Removes file representing relay buffer
* via relayfs_remove() and returns its result.
*/
static int remove_buf_file_default_callback(struct dentry *dentry)
{
return relayfs_remove(dentry);
}
/*
* relay channel default callbacks: used when relay_open() is given no
* callbacks at all, and reinstated on the channel by relay_close_buf().
*/
static struct rchan_callbacks default_channel_callbacks = {
.subbuf_start = subbuf_start_default_callback,
.buf_mapped = buf_mapped_default_callback,
.buf_unmapped = buf_unmapped_default_callback,
.create_buf_file = create_buf_file_default_callback,
.remove_buf_file = remove_buf_file_default_callback,
};
/**
* wakeup_readers - wake up readers waiting on a channel
* @private: the channel buffer
*
* This is the work function used to defer reader waking. The
* reason waking is deferred is that calling directly from write
* causes problems if you're writing from say the scheduler.
* It is scheduled from relay_switch_subbuf().
*/
static void wakeup_readers(void *private)
{
struct rchan_buf *buf = private;
wake_up_interruptible(&buf->read_wait);
}
/**
 *	__relay_reset - reset a channel buffer
 *	@buf: the channel buffer
 *	@init: 1 if this is a first-time initialization
 *
 *	On first-time initialization the wait queue, refcount and work
 *	struct are set up; on a later reset any pending reader wake-up work
 *	is cancelled and flushed instead.  In both cases the counters,
 *	write position and per-sub-buffer padding are cleared and the
 *	client's subbuf_start() callback is invoked for the first
 *	sub-buffer.  See relay_reset for description of effect.
 */
static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
{
	size_t subbuf;

	if (!init) {
		cancel_delayed_work(&buf->wake_readers);
		flush_scheduled_work();
	} else {
		init_waitqueue_head(&buf->read_wait);
		kref_init(&buf->kref);
		INIT_WORK(&buf->wake_readers, NULL, NULL);
	}

	buf->subbufs_produced = 0;
	buf->subbufs_consumed = 0;
	buf->bytes_consumed = 0;
	buf->finalized = 0;
	buf->data = buf->start;
	buf->offset = 0;

	for (subbuf = 0; subbuf < buf->chan->n_subbufs; subbuf++)
		buf->padding[subbuf] = 0;

	/* no previous sub-buffer exists at this point, hence NULL/0 */
	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
}
/**
 *	relay_reset - reset the channel
 *	@chan: the channel
 *
 *	Erases all data from all channel buffers and restarts the channel
 *	in its initial state.  The buffers are not freed, so any mappings
 *	are still in effect.  A NULL @chan is ignored.
 *
 *	NOTE: Care should be taken that the channel isn't actually
 *	being used by anything when this call is made.
 */
void relay_reset(struct rchan *chan)
{
	struct rchan_buf *last = NULL;
	unsigned int cpu;

	if (!chan)
		return;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		struct rchan_buf *buf = chan->buf[cpu];

		/* stop at the first empty slot or at a repeated buffer */
		if (!buf || buf == last)
			break;

		__relay_reset(buf, 0);
		last = buf;
	}
}
/**
 *	relay_open_buf - create a new channel buffer in relayfs
 *	@chan: the channel the buffer belongs to
 *	@filename: name of the file representing the buffer
 *	@parent: parent directory dentry
 *	@is_global: in/out flag; when already set, chan->buf[0] is reused
 *
 *	Internal - used by relay_open().  Returns the buffer, or NULL if
 *	the buffer or its file could not be created.
 */
static struct rchan_buf *relay_open_buf(struct rchan *chan,
					const char *filename,
					struct dentry *parent,
					int *is_global)
{
	struct rchan_buf *new_buf;
	struct dentry *file;

	if (*is_global)
		return chan->buf[0];

	new_buf = relay_create_buf(chan);
	if (!new_buf)
		return NULL;

	/* Create file in fs */
	file = chan->cb->create_buf_file(filename, parent, S_IRUSR,
					 new_buf, is_global);
	if (file) {
		new_buf->dentry = file;
		__relay_reset(new_buf, 1);
		return new_buf;
	}

	relay_destroy_buf(new_buf);
	return NULL;
}
/**
* relay_close_buf - close a channel buffer
* @buf: channel buffer
*
* Marks the buffer finalized and restores the default callbacks.
* The channel buffer and channel buffer data structure are then freed
* automatically when the last reference is given up.
*/
static inline void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
/* note: this replaces the callbacks of the whole channel, not just this buffer */
buf->chan->cb = &default_channel_callbacks;
/* make sure no deferred reader wake-up is still pending before dropping our reference */
cancel_delayed_work(&buf->wake_readers);
flush_scheduled_work();
kref_put(&buf->kref, relay_remove_buf);
}
/*
 * setup_callbacks - install the client callbacks on a channel
 * @chan: the channel
 * @cb: client callbacks, or NULL to use the defaults
 *
 * Any callback the client left NULL is filled in with the matching
 * default.  Note that this writes into the client's own @cb structure.
 */
static inline void setup_callbacks(struct rchan *chan,
				   struct rchan_callbacks *cb)
{
	if (cb) {
		if (!cb->subbuf_start)
			cb->subbuf_start = subbuf_start_default_callback;
		if (!cb->buf_mapped)
			cb->buf_mapped = buf_mapped_default_callback;
		if (!cb->buf_unmapped)
			cb->buf_unmapped = buf_unmapped_default_callback;
		if (!cb->create_buf_file)
			cb->create_buf_file = create_buf_file_default_callback;
		if (!cb->remove_buf_file)
			cb->remove_buf_file = remove_buf_file_default_callback;
		chan->cb = cb;
	} else {
		chan->cb = &default_channel_callbacks;
	}
}
/**
 *	relay_open - create a new relayfs channel
 *	@base_filename: base name of files to create
 *	@parent: dentry of parent directory, NULL for root directory
 *	@subbuf_size: size of sub-buffers
 *	@n_subbufs: number of sub-buffers
 *	@cb: client callback functions
 *
 *	Returns channel pointer if successful, NULL otherwise (NULL
 *	@base_filename, zero @subbuf_size or @n_subbufs, allocation failure,
 *	or failure to create any per-cpu buffer).
 *
 *	Creates a channel buffer for each online cpu using the sizes and
 *	attributes specified.  The created channel buffer files
 *	will be named base_filename0...base_filenameN-1.  File
 *	permissions will be S_IRUSR.
 */
struct rchan *relay_open(const char *base_filename,
			 struct dentry *parent,
			 size_t subbuf_size,
			 size_t n_subbufs,
			 struct rchan_callbacks *cb)
{
	unsigned int i;
	struct rchan *chan;
	char *tmpname;
	int is_global = 0;

	if (!base_filename)
		return NULL;

	if (!(subbuf_size && n_subbufs))
		return NULL;

	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
	if (!chan)
		return NULL;

	chan->version = RELAYFS_CHANNEL_VERSION;
	chan->n_subbufs = n_subbufs;
	chan->subbuf_size = subbuf_size;
	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
	setup_callbacks(chan, cb);
	kref_init(&chan->kref);

	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!tmpname)
		goto free_chan;

	for_each_online_cpu(i) {
		/* bounded: base_filename comes from the caller and may be long */
		snprintf(tmpname, NAME_MAX + 1, "%s%d", base_filename, i);
		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
					      &is_global);
		/* check for failure before touching the buffer */
		if (!chan->buf[i])
			goto free_bufs;

		chan->buf[i]->cpu = i;
	}

	kfree(tmpname);
	return chan;

free_bufs:
	/* close whatever was opened so far; a global buffer is closed once */
	for (i = 0; i < NR_CPUS; i++) {
		if (!chan->buf[i])
			break;
		relay_close_buf(chan->buf[i]);
		if (is_global)
			break;
	}
	kfree(tmpname);

free_chan:
	kref_put(&chan->kref, relay_destroy_channel);
	return NULL;
}
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
* @length: size of current event
*
* Returns either the length passed in or 0 if full.
* Performs sub-buffer-switch tasks such as invoking callbacks,
* updating padding counts, waking up readers, etc.
* 0 is also returned, and the length recorded in chan->last_toobig,
* when the event cannot fit into a sub-buffer.
*/
size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
{
void *old, *new;
size_t old_subbuf, new_subbuf;
if (unlikely(length > buf->chan->subbuf_size))
goto toobig;
/*
* offset == subbuf_size + 1 is the marker set below when subbuf_start()
* refused the switch; in that state the current sub-buffer has already
* been counted as produced, so do not count it again.
*/
if (buf->offset != buf->chan->subbuf_size + 1) {
buf->prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
buf->padding[old_subbuf] = buf->prev_padding;
buf->subbufs_produced++;
/* reader wake-up is deferred to a work item, see wakeup_readers() */
if (waitqueue_active(&buf->read_wait)) {
PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
schedule_delayed_work(&buf->wake_readers, 1);
}
}
old = buf->data;
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
/* client said stop: leave the marker value in offset and drop the event */
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
buf->data = new;
buf->padding[new_subbuf] = 0;
/* presumably offset can be non-zero here if the callback reserved space - TODO confirm */
if (unlikely(length + buf->offset > buf->chan->subbuf_size))
goto toobig;
return length;
toobig:
buf->chan->last_toobig = length;
return 0;
}
/**
 *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
 *	@chan: the channel
 *	@cpu: the cpu associated with the channel buffer to update
 *	@subbufs_consumed: number of sub-buffers to add to current buf's count
 *
 *	Adds to the channel buffer's consumed sub-buffer count, capping it
 *	at the produced count.  @subbufs_consumed should be the number of
 *	sub-buffers newly consumed, not the total consumed.  A NULL @chan,
 *	an out-of-range @cpu or a missing buffer is silently ignored.
 *
 *	NOTE: kernel clients don't need to call this function if the channel
 *	mode is 'overwrite'.
 */
void relay_subbufs_consumed(struct rchan *chan,
			    unsigned int cpu,
			    size_t subbufs_consumed)
{
	struct rchan_buf *buf;

	if (!chan || cpu >= NR_CPUS)
		return;

	buf = chan->buf[cpu];
	if (!buf)
		return;

	buf->subbufs_consumed += subbufs_consumed;
	if (buf->subbufs_consumed > buf->subbufs_produced)
		buf->subbufs_consumed = buf->subbufs_produced;
}
/**
* relay_destroy_channel - free the channel struct
* @kref: the kref embedded in the struct rchan to free
*
* Should only be called from kref_put().
*/
void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
kfree(chan);
}
/**
 *	relay_close - close the channel
 *	@chan: the channel
 *
 *	Closes all channel buffers and drops the channel reference.  If any
 *	event was ever rejected as too large for a sub-buffer, a warning is
 *	logged.  A NULL @chan is ignored.
 */
void relay_close(struct rchan *chan)
{
	struct rchan_buf *last = NULL;
	unsigned int cpu;

	if (!chan)
		return;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		struct rchan_buf *buf = chan->buf[cpu];

		/* stop at the first empty slot or at a repeated buffer */
		if (!buf || buf == last)
			break;

		relay_close_buf(buf);
		last = chan->buf[cpu];
	}

	if (chan->last_toobig)
		printk(KERN_WARNING "relayfs: one or more items not logged "
		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
		       chan->last_toobig, chan->subbuf_size);

	kref_put(&chan->kref, relay_destroy_channel);
}
/**
 *	relay_flush - flush the channel
 *	@chan: the channel
 *
 *	Flushes all channel buffers, i.e. forces a sub-buffer switch on
 *	each of them.  A NULL @chan is ignored.
 */
void relay_flush(struct rchan *chan)
{
	struct rchan_buf *last = NULL;
	unsigned int cpu;

	if (!chan)
		return;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		struct rchan_buf *buf = chan->buf[cpu];

		/* stop at the first empty slot or at a repeated buffer */
		if (!buf || buf == last)
			break;

		relay_switch_subbuf(buf, 0);
		last = buf;
	}
}
/* Channel API exported to GPL modules. */
EXPORT_SYMBOL_GPL(relay_open);
EXPORT_SYMBOL_GPL(relay_close);
EXPORT_SYMBOL_GPL(relay_flush);
EXPORT_SYMBOL_GPL(relay_reset);
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
EXPORT_SYMBOL_GPL(relay_buf_full);

View File

@ -1,8 +0,0 @@
#ifndef _RELAY_H
#define _RELAY_H
/* Declarations shared between the relayfs source files. */
/* remove a relayfs file or directory */
extern int relayfs_remove(struct dentry *dentry);
/* returns 1 if the channel buffer is empty, 0 otherwise */
extern int relay_buf_empty(struct rchan_buf *buf);
/* kref release function for struct rchan */
extern void relay_destroy_channel(struct kref *kref);
#endif /* _RELAY_H */

View File

@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
struct elevator_queue;
typedef struct elevator_queue elevator_t;
struct request_pm_state;
struct blk_trace;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
@ -416,6 +417,8 @@ struct request_queue
unsigned int sg_reserved_size;
int node;
struct blk_trace *blk_trace;
/*
* reserved for flush operations
*/

View File

@ -0,0 +1,277 @@
#ifndef BLKTRACE_H
#define BLKTRACE_H
#include <linux/config.h>
#include <linux/blkdev.h>
#include <linux/relay.h>
/*
* Trace categories. Each category is one bit; BLK_TC_ACT() shifts these
* bits into the upper half of a 32-bit action word.
*/
enum blktrace_cat {
BLK_TC_READ = 1 << 0, /* reads */
BLK_TC_WRITE = 1 << 1, /* writes */
BLK_TC_BARRIER = 1 << 2, /* barrier */
BLK_TC_SYNC = 1 << 3, /* sync */
BLK_TC_QUEUE = 1 << 4, /* queueing/merging */
BLK_TC_REQUEUE = 1 << 5, /* requeueing */
BLK_TC_ISSUE = 1 << 6, /* issue */
BLK_TC_COMPLETE = 1 << 7, /* completions */
BLK_TC_FS = 1 << 8, /* fs requests */
BLK_TC_PC = 1 << 9, /* pc requests */
BLK_TC_NOTIFY = 1 << 10, /* special message */
BLK_TC_END = 1 << 15, /* only 16-bits, reminder */
};
/* Category bits occupy the upper 16 bits of an action word. */
#define BLK_TC_SHIFT (16)
/* Move a blktrace_cat value into the category position of an action word. */
#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT)
/*
* Basic trace actions. These are the raw action numbers (starting at 1);
* the BLK_TA_* macros combine them with category bits.
*/
enum blktrace_act {
__BLK_TA_QUEUE = 1, /* queued */
__BLK_TA_BACKMERGE, /* back merged to existing rq */
__BLK_TA_FRONTMERGE, /* front merge to existing rq */
__BLK_TA_GETRQ, /* allocated new request */
__BLK_TA_SLEEPRQ, /* sleeping on rq allocation */
__BLK_TA_REQUEUE, /* request requeued */
__BLK_TA_ISSUE, /* sent to driver */
__BLK_TA_COMPLETE, /* completed by driver */
__BLK_TA_PLUG, /* queue was plugged */
__BLK_TA_UNPLUG_IO, /* queue was unplugged by io */
__BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */
__BLK_TA_INSERT, /* insert request */
__BLK_TA_SPLIT, /* bio was split */
__BLK_TA_BOUNCE, /* bio was bounced */
__BLK_TA_REMAP, /* bio was remapped */
};
/*
* Trace actions in full: the basic action number in the low bits OR'ed
* with its category shifted up by BLK_TC_ACT(). Additionally, read or
* write is masked. Note that BLK_TA_SPLIT and BLK_TA_BOUNCE carry no
* category bits of their own.
*/
#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_SPLIT (__BLK_TA_SPLIT)
#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE)
#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
/*
* Values for the magic field of struct blk_io_trace: the magic has a
* zero low byte, leaving room for the version number.
*/
#define BLK_IO_TRACE_MAGIC 0x65617400
#define BLK_IO_TRACE_VERSION 0x07
/*
* The trace itself: one fixed-size record per traced event, optionally
* followed by pdu_len bytes of event-specific data.
*/
struct blk_io_trace {
u32 magic; /* MAGIC << 8 | version */
u32 sequence; /* event number */
u64 time; /* in microseconds */
u64 sector; /* disk offset */
u32 bytes; /* transfer length */
u32 action; /* what happened */
u32 pid; /* who did it */
u32 device; /* device number */
u32 cpu; /* on what cpu did it happen */
u16 error; /* completion error */
u16 pdu_len; /* length of data after this trace */
};
/*
* The remap event: payload attached to a BLK_TA_REMAP trace by
* blk_add_trace_remap(), which stores device and sector big-endian.
*/
struct blk_io_trace_remap {
u32 device; /* target device */
u32 __pad; /* explicit padding so that sector is 8-byte aligned */
u64 sector; /* target sector */
};
/* Values for blk_trace.trace_state (presumably; confirm against blktrace.c). */
enum {
Blktrace_setup = 1,
Blktrace_running,
Blktrace_stopped,
};
/*
* Per-queue trace state; a request_queue points at one of these through
* its blk_trace member while tracing is set up.
*/
struct blk_trace {
int trace_state; /* one of the Blktrace_* values - TODO confirm */
struct rchan *rchan; /* relay channel */
unsigned long *sequence;
u16 act_mask;
u64 start_lba;
u64 end_lba;
u32 pid;
u32 dev;
struct dentry *dir;
struct dentry *dropped_file;
atomic_t dropped;
};
/*
* User setup structure passed with BLKTRACESETUP
*/
struct blk_user_trace_setup {
char name[BDEVNAME_SIZE]; /* output */
u16 act_mask; /* input */
u32 buf_size; /* input */
u32 buf_nr; /* input */
u64 start_lba;
u64 end_lba;
u32 pid;
};
#if defined(CONFIG_BLK_DEV_IO_TRACE)
/* Out-of-line parts of the tracer; only declared when CONFIG_BLK_DEV_IO_TRACE is set. */
extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
extern void blk_trace_shutdown(request_queue_t *);
/* low-level record writer used by all the blk_add_trace_*() inline helpers */
extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
/**
 * blk_add_trace_rq - Add a trace for a request oriented action
 * @q:		queue the io is for
 * @rq:		the source request
 * @what:	the action
 *
 * Description:
 *     Records an action against a request.  A pc request is logged with
 *     its data length and its command bytes as payload; any other request
 *     is logged with its starting sector and size in bytes.  Does nothing
 *     when the queue is not being traced.
 *
 **/
static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
				    u32 what)
{
	struct blk_trace *bt = q->blk_trace;
	int rw = rq->flags & 0x07;

	if (likely(!bt))
		return;

	if (!blk_pc_request(rq)) {
		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
				rw, what | BLK_TC_ACT(BLK_TC_FS), rq->errors,
				0, NULL);
		return;
	}

	__blk_add_trace(bt, 0, rq->data_len, rw,
			what | BLK_TC_ACT(BLK_TC_PC), rq->errors,
			sizeof(rq->cmd), rq->cmd);
}
/**
 * blk_add_trace_bio - Add a trace for a bio oriented action
 * @q:		queue the io is for
 * @bio:	the source bio
 * @what:	the action
 *
 * Description:
 *     Records an action against a bio.  Will log the bio offset + size,
 *     with the error field set when the bio is not flagged up to date.
 *     Does nothing when the queue is not being traced.
 *
 **/
static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
				     u32 what)
{
	struct blk_trace *bt = q->blk_trace;
	int error;

	if (likely(!bt))
		return;

	error = !bio_flagged(bio, BIO_UPTODATE);
	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
			error, 0, NULL);
}
/**
 * blk_add_trace_generic - Add a trace for a generic action
 * @q:		queue the io is for
 * @bio:	the source bio, may be NULL
 * @rw:		the data direction, used only when @bio is NULL
 * @what:	the action
 *
 * Description:
 *     Records a simple trace.  With a bio this is the same as
 *     blk_add_trace_bio(); without one, an event with zero sector and
 *     size is logged.  Does nothing when the queue is not being traced.
 *
 **/
static inline void blk_add_trace_generic(struct request_queue *q,
					 struct bio *bio, int rw, u32 what)
{
	struct blk_trace *bt = q->blk_trace;

	if (likely(!bt))
		return;

	if (!bio) {
		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
		return;
	}

	blk_add_trace_bio(q, bio, what);
}
/**
 * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
 * @q:		queue the io is for
 * @what:	the action
 * @bio:	the source bio, may be NULL
 * @pdu:	the integer payload
 *
 * Description:
 *     Adds a trace with some integer payload.  This might be an unplug
 *     option given as the action, with the depth at unplug time given
 *     as the payload.  The payload is logged as a 64-bit big-endian
 *     value.  Does nothing when the queue is not being traced.
 *
 **/
static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
					 struct bio *bio, unsigned int pdu)
{
	struct blk_trace *bt = q->blk_trace;
	u64 payload;

	if (likely(!bt))
		return;

	payload = cpu_to_be64(pdu);

	if (!bio) {
		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(payload), &payload);
		return;
	}

	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
			!bio_flagged(bio, BIO_UPTODATE), sizeof(payload),
			&payload);
}
/**
 * blk_add_trace_remap - Add a trace for a remap operation
 * @q:		queue the io is for
 * @bio:	the source bio
 * @dev:	target device
 * @from:	source sector
 * @to:		target sector
 *
 * Description:
 *     Device mapper or raid target sometimes need to split a bio because
 *     it spans a stripe (or similar). Add a trace for that action.
 *     The event is logged at sector @from with a struct blk_io_trace_remap
 *     payload holding @dev and @to in big-endian form.  Does nothing when
 *     the queue is not being traced.
 *
 **/
static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
				       dev_t dev, sector_t from, sector_t to)
{
	struct blk_trace *bt = q->blk_trace;
	struct blk_io_trace_remap r;

	if (likely(!bt))
		return;

	r.device = cpu_to_be32(dev);
	/*
	 * The whole struct is copied into the trace stream, so the padding
	 * word must not carry uninitialized stack contents.
	 */
	r.__pad = 0;
	r.sector = cpu_to_be64(to);

	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
}
#else /* !CONFIG_BLK_DEV_IO_TRACE */
/*
* Tracing compiled out: the ioctl reports -ENOTTY and every trace hook
* expands to an empty statement.
*/
#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
#define blk_trace_shutdown(q) do { } while (0)
#define blk_add_trace_rq(q, rq, what) do { } while (0)
#define blk_add_trace_bio(q, rq, what) do { } while (0)
#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0)
#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#endif

View File

@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
COMPATIBLE_IOCTL(BLKFLSBUF)
COMPATIBLE_IOCTL(BLKSECTSET)
COMPATIBLE_IOCTL(BLKSSZGET)
COMPATIBLE_IOCTL(BLKTRACESTART)
COMPATIBLE_IOCTL(BLKTRACESTOP)
COMPATIBLE_IOCTL(BLKTRACESETUP)
COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
ULONG_IOCTL(BLKRASET)
ULONG_IOCTL(BLKFRASET)
/* RAID */

View File

@ -197,6 +197,10 @@ extern int dir_notify_enable;
#define BLKBSZGET _IOR(0x12,112,size_t)
#define BLKBSZSET _IOW(0x12,113,size_t)
#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
#define BLKTRACESTART _IO(0x12,116)
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */

281
include/linux/relay.h Normal file
View File

@ -0,0 +1,281 @@
/*
* linux/include/linux/relay.h
*
* Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
*
* CONFIG_RELAY definitions and declarations
*/
#ifndef _LINUX_RELAY_H
#define _LINUX_RELAY_H
#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/kref.h>
/*
* Needs a _much_ better name...
*
* Used by relay_open() to derive a channel's alloc_size from
* subbuf_size * n_subbufs. Presumably rounds x up to a whole number of
* pages (assumes PAGE_MASK clears the in-page offset bits - confirm).
*/
#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
/*
* Tracks changes to rchan/rchan_buf structs; stored in rchan.version
* by relay_open().
*/
#define RELAYFS_CHANNEL_VERSION 6
/*
* Per-cpu relay channel buffer: one large buffer divided into
* chan->n_subbufs sub-buffers of chan->subbuf_size bytes each.
*/
struct rchan_buf
{
void *start; /* start of channel buffer */
void *data; /* start of current sub-buffer */
size_t offset; /* current offset into sub-buffer */
size_t subbufs_produced; /* count of sub-buffers produced */
size_t subbufs_consumed; /* count of sub-buffers consumed */
struct rchan *chan; /* associated channel */
wait_queue_head_t read_wait; /* reader wait queue */
struct work_struct wake_readers; /* reader wake-up work struct */
struct dentry *dentry; /* channel file dentry */
struct kref kref; /* channel buffer refcount */
struct page **page_array; /* array of current buffer pages */
unsigned int page_count; /* number of current buffer pages */
unsigned int finalized; /* buffer has been finalized */
size_t *padding; /* padding counts per sub-buffer */
size_t prev_padding; /* temporary variable */
size_t bytes_consumed; /* bytes consumed in cur read subbuf */
unsigned int cpu; /* this buf's cpu */
} ____cacheline_aligned;
/*
* Relay channel data structure: the settings shared by all of a
* channel's buffers, plus the table of per-cpu buffers.
*/
struct rchan
{
u32 version; /* the version of this struct */
size_t subbuf_size; /* sub-buffer size */
size_t n_subbufs; /* number of sub-buffers per buffer */
size_t alloc_size; /* total buffer size allocated */
struct rchan_callbacks *cb; /* client callbacks */
struct kref kref; /* channel refcount */
void *private_data; /* for user-defined data */
size_t last_toobig; /* tried to log event > subbuf size */
struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
};
/*
* Relay channel client callbacks. A client passes one of these to
* relay_open() to customize channel behavior.
*/
struct rchan_callbacks
{
/*
* subbuf_start - called on buffer-switch to a new sub-buffer
* @buf: the channel buffer containing the new sub-buffer
* @subbuf: the start of the new sub-buffer
* @prev_subbuf: the start of the previous sub-buffer
* @prev_padding: unused space at the end of previous sub-buffer
*
* The client should return 1 to continue logging, 0 to stop
* logging.
*
* NOTE: subbuf_start will also be invoked when the buffer is
* created, so that the first sub-buffer can be initialized
* if necessary. In this case, prev_subbuf will be NULL.
*
* NOTE: the client can reserve bytes at the beginning of the new
* sub-buffer by calling subbuf_start_reserve() in this callback.
*/
int (*subbuf_start) (struct rchan_buf *buf,
void *subbuf,
void *prev_subbuf,
size_t prev_padding);
/*
* buf_mapped - relay buffer mmap notification
* @buf: the channel buffer
* @filp: relay file pointer
*
* Called when a relay file is successfully mmapped
*/
void (*buf_mapped)(struct rchan_buf *buf,
struct file *filp);
/*
* buf_unmapped - relay buffer unmap notification
* @buf: the channel buffer
* @filp: relay file pointer
*
* Called when a relay file is successfully unmapped
*/
void (*buf_unmapped)(struct rchan_buf *buf,
struct file *filp);
/*
* create_buf_file - create file to represent a relay channel buffer
* @filename: the name of the file to create
* @parent: the parent of the file to create
* @mode: the mode of the file to create
* @buf: the channel buffer
* @is_global: outparam - set non-zero if the buffer should be global
*
* Called during relay_open(), once for each per-cpu buffer,
* to allow the client to create a file to be used to
* represent the corresponding channel buffer. If the file is
* created outside of relay, the parent must also exist in
* that filesystem.
*
* The callback should return the dentry of the file created
* to represent the relay buffer.
*
* Setting the is_global outparam to a non-zero value will
* cause relay_open() to create a single global buffer rather
* than the default set of per-cpu buffers.
*
* See Documentation/filesystems/relayfs.txt for more info.
*/
struct dentry *(*create_buf_file)(const char *filename,
struct dentry *parent,
int mode,
struct rchan_buf *buf,
int *is_global);
/*
* remove_buf_file - remove file representing a relay channel buffer
* @dentry: the dentry of the file to remove
*
* Called during relay_close(), once for each per-cpu buffer,
* to allow the client to remove a file used to represent a
* channel buffer.
*
* The callback should return 0 if successful, negative if not.
*/
int (*remove_buf_file)(struct dentry *dentry);
};
/*
 * CONFIG_RELAY kernel API, kernel/relay.c
 *
 * Only the declarations are visible in this header; the notes below are
 * limited to what this header itself states or uses.
 */

/*
 * relay_open: per the create_buf_file callback documentation in this
 * header, that callback is invoked during relay_open() once for each
 * per-cpu buffer, or a single global buffer is created when the callback
 * sets its is_global outparam to a non-zero value.
 */
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
struct rchan_callbacks *cb);
/*
 * relay_close: per the remove_buf_file callback documentation in this
 * header, that callback is invoked during relay_close() once for each
 * per-cpu buffer.
 */
extern void relay_close(struct rchan *chan);
/*
 * NOTE(review): the exact semantics of relay_flush(),
 * relay_subbufs_consumed(), relay_reset() and relay_buf_full() are not
 * visible from this header - see kernel/relay.c.
 */
extern void relay_flush(struct rchan *chan);
extern void relay_subbufs_consumed(struct rchan *chan,
unsigned int cpu,
size_t consumed);
extern void relay_reset(struct rchan *chan);
extern int relay_buf_full(struct rchan_buf *buf);
/*
 * relay_switch_subbuf: called by the inline write/reserve helpers in this
 * header when a request does not fit in the current sub-buffer.  The
 * returned value replaces the requested length in those helpers;
 * relay_reserve() treats a return of 0 as "full".
 */
extern size_t relay_switch_subbuf(struct rchan_buf *buf,
size_t length);
/**
 *	relay_write - copy data into the current cpu's channel buffer
 *	@chan: relay channel
 *	@data: data to be written
 *	@length: number of bytes to write
 *
 *	Local interrupts are disabled for the duration of the write, so
 *	this variant may be used when logging can happen from interrupt
 *	context.  If logging never happens from interrupt context,
 *	__relay_write() can be used instead.
 */
static inline void relay_write(struct rchan *chan,
			       const void *data,
			       size_t length)
{
	struct rchan_buf *cpu_buf;
	unsigned long irq_state;

	local_irq_save(irq_state);

	cpu_buf = chan->buf[smp_processor_id()];
	if (unlikely(cpu_buf->offset + length > chan->subbuf_size)) {
		/*
		 * The request does not fit in the current sub-buffer;
		 * copy only as many bytes as relay_switch_subbuf() returns.
		 */
		length = relay_switch_subbuf(cpu_buf, length);
	}

	memcpy(cpu_buf->data + cpu_buf->offset, data, length);
	cpu_buf->offset += length;

	local_irq_restore(irq_state);
}
/**
 *	__relay_write - copy data into the current cpu's channel buffer
 *	@chan: relay channel
 *	@data: data to be written
 *	@length: number of bytes to write
 *
 *	The buffer is protected only by disabling preemption (the
 *	get_cpu()/put_cpu() pair).  Callers that may log from interrupt
 *	context should use relay_write() instead.
 */
static inline void __relay_write(struct rchan *chan,
				 const void *data,
				 size_t length)
{
	struct rchan_buf *cpu_buf = chan->buf[get_cpu()];

	if (unlikely(cpu_buf->offset + length > cpu_buf->chan->subbuf_size)) {
		/*
		 * The request does not fit in the current sub-buffer;
		 * copy only as many bytes as relay_switch_subbuf() returns.
		 */
		length = relay_switch_subbuf(cpu_buf, length);
	}

	memcpy(cpu_buf->data + cpu_buf->offset, data, length);
	cpu_buf->offset += length;

	put_cpu();
}
/**
 *	relay_reserve - reserve a slot in the current cpu's channel buffer
 *	@chan: relay channel
 *	@length: number of bytes to reserve
 *
 *	Returns a pointer to the reserved slot, or NULL if the buffer is
 *	full.
 *
 *	No locking or interrupt/preemption protection is done here; the
 *	caller must provide whatever synchronization is appropriate.
 */
static inline void *relay_reserve(struct rchan *chan, size_t length)
{
	struct rchan_buf *cpu_buf = chan->buf[smp_processor_id()];
	void *slot;

	if (unlikely(cpu_buf->offset + length > cpu_buf->chan->subbuf_size)) {
		length = relay_switch_subbuf(cpu_buf, length);
		/* A zero length from the switch means nothing can be reserved. */
		if (length == 0)
			return NULL;
	}

	slot = cpu_buf->data + cpu_buf->offset;
	cpu_buf->offset += length;

	return slot;
}
/**
 *	subbuf_start_reserve - set aside bytes at the head of a sub-buffer
 *	@buf: relay channel buffer
 *	@length: number of bytes to reserve
 *
 *	Helper meant to be called from the subbuf_start() callback to
 *	reserve space at the beginning of the new sub-buffer.  The write
 *	offset of @buf is set to @length; @length must be smaller than
 *	the channel's sub-buffer size minus one, otherwise BUG_ON() fires.
 */
static inline void subbuf_start_reserve(struct rchan_buf *buf,
					size_t length)
{
	BUG_ON(!(length < buf->chan->subbuf_size - 1));

	buf->offset = length;
}
/*
 * exported relay file operations, kernel/relay.c
 *
 * NOTE(review): presumably meant for clients whose create_buf_file()
 * callback creates the file representing a channel buffer - confirm
 * against kernel/relay.c and its users.
 */
extern struct file_operations relay_file_operations;
#endif /* _LINUX_RELAY_H */

View File

@ -706,6 +706,7 @@ struct task_struct {
prio_array_t *array;
unsigned short ioprio;
unsigned int btrace_seq;
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;

View File

@ -214,6 +214,17 @@ config CPUSETS
Say N if unsure.
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
This option enables relay interface support in
certain file systems (such as debugfs).
It is designed to provide an efficient mechanism for tools and
facilities to relay large amounts of data from kernel space to
user space.
If unsure, say N.
source "usr/Kconfig"
config UID16

View File

@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

View File

@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
atomic_set(&tsk->fs_excl, 0);
tsk->btrace_seq = 0;
return tsk;
}

1012
kernel/relay.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
static mempool_t *page_pool, *isa_page_pool;
@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
pool = isa_page_pool;
}
blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
/*
* slow path
*/