Merge branch 'bcache-for-upstream' of http://evilpiepirate.org/git/linux-bcache into for-3.10/drivers

Kent writes:

Hey Jens, this is everything I've got ready for 3.10 - there's _still_
one more bug I'm trying to track down.

Andrew - I've got patches that rip out the pkey() and pbtree() macros,
but they're somewhat tied up with some other nontrivial refactorings so
I think I'm going to wait a bit on those.
This commit is contained in:
Jens Axboe 2013-05-01 09:23:05 +02:00
commit f50efd2fdb
6 changed files with 219 additions and 118 deletions

View File

@ -243,31 +243,37 @@ static void invalidate_buckets_lru(struct cache *ca)
ca->heap.used = 0;
for_each_bucket(b, ca) {
/*
* If we fill up the unused list, if we then return before
* adding anything to the free_inc list we'll skip writing
* prios/gens and just go back to allocating from the unused
* list:
*/
if (fifo_full(&ca->unused))
return;
if (!can_invalidate_bucket(ca, b))
continue;
if (!GC_SECTORS_USED(b)) {
if (!bch_bucket_add_unused(ca, b))
return;
} else {
if (!heap_full(&ca->heap))
heap_add(&ca->heap, b, bucket_max_cmp);
else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
ca->heap.data[0] = b;
heap_sift(&ca->heap, 0, bucket_max_cmp);
}
if (!GC_SECTORS_USED(b) &&
bch_bucket_add_unused(ca, b))
continue;
if (!heap_full(&ca->heap))
heap_add(&ca->heap, b, bucket_max_cmp);
else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
ca->heap.data[0] = b;
heap_sift(&ca->heap, 0, bucket_max_cmp);
}
}
if (ca->heap.used * 2 < ca->heap.size)
bch_queue_gc(ca->set);
for (i = ca->heap.used / 2 - 1; i >= 0; --i)
heap_sift(&ca->heap, i, bucket_min_cmp);
while (!fifo_full(&ca->free_inc)) {
if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
/* We don't want to be calling invalidate_buckets()
/*
* We don't want to be calling invalidate_buckets()
* multiple times when it can't do anything
*/
ca->invalidate_needs_gc = 1;
@ -343,15 +349,22 @@ static void invalidate_buckets(struct cache *ca)
invalidate_buckets_random(ca);
break;
}
pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
fifo_used(&ca->free), ca->free.size,
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->unused), ca->unused.size);
}
#define allocator_wait(ca, cond) \
do { \
DEFINE_WAIT(__wait); \
\
while (!(cond)) { \
while (1) { \
prepare_to_wait(&ca->set->alloc_wait, \
&__wait, TASK_INTERRUPTIBLE); \
if (cond) \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
@ -360,7 +373,6 @@ do { \
} \
\
schedule(); \
__set_current_state(TASK_RUNNING); \
mutex_lock(&(ca)->set->bucket_lock); \
} \
\
@ -374,6 +386,11 @@ void bch_allocator_thread(struct closure *cl)
mutex_lock(&ca->set->bucket_lock);
while (1) {
/*
* First, we pull buckets off of the unused and free_inc lists,
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
while (1) {
long bucket;
@ -398,17 +415,26 @@ void bch_allocator_thread(struct closure *cl)
}
}
allocator_wait(ca, ca->set->gc_mark_valid);
/*
* We've run out of free buckets, we need to find some buckets
* we can invalidate. First, invalidate them in memory and add
* them to the free_inc list:
*/
allocator_wait(ca, ca->set->gc_mark_valid &&
(ca->need_save_prio > 64 ||
!ca->invalidate_needs_gc));
invalidate_buckets(ca);
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
!CACHE_SYNC(&ca->set->sb));
/*
* Now, we write their new gens to disk so we can start writing
* new stuff to them:
*/
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
if (CACHE_SYNC(&ca->set->sb) &&
(!fifo_empty(&ca->free_inc) ||
ca->need_save_prio > 64)) {
ca->need_save_prio > 64))
bch_prio_write(ca);
}
}
}
@ -475,7 +501,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
for (i = 0; i < KEY_PTRS(k); i++) {
struct bucket *b = PTR_BUCKET(c, k, i);
SET_GC_MARK(b, 0);
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
SET_GC_SECTORS_USED(b, 0);
bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
}

View File

@ -223,11 +223,17 @@ struct bkey {
#define BKEY_PADDED(key) \
union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
/* Version 1: Backing device
/* Version 0: Cache device
* Version 1: Backing device
* Version 2: Seed pointer into btree node checksum
* Version 3: New UUID format
* Version 3: Cache device with new UUID format
* Version 4: Backing device with data offset
*/
#define BCACHE_SB_VERSION 3
#define BCACHE_SB_VERSION_CDEV 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_MAX_VERSION 4
#define SB_SECTOR 8
#define SB_SIZE 4096
@ -236,13 +242,12 @@ struct bkey {
/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
#define MAX_CACHES_PER_SET 8
#define BDEV_DATA_START 16 /* sectors */
#define BDEV_DATA_START_DEFAULT 16 /* sectors */
struct cache_sb {
uint64_t csum;
uint64_t offset; /* sector where this sb was written */
uint64_t version;
#define CACHE_BACKING_DEV 1
uint8_t magic[16];
@ -257,12 +262,28 @@ struct cache_sb {
uint64_t seq;
uint64_t pad[8];
uint64_t nbuckets; /* device size */
uint16_t block_size; /* sectors */
uint16_t bucket_size; /* sectors */
union {
struct {
/* Cache devices */
uint64_t nbuckets; /* device size */
uint16_t nr_in_set;
uint16_t nr_this_dev;
uint16_t block_size; /* sectors */
uint16_t bucket_size; /* sectors */
uint16_t nr_in_set;
uint16_t nr_this_dev;
};
struct {
/* Backing devices */
uint64_t data_offset;
/*
* block_size from the cache device section is still used by
* backing devices, so don't add anything here until we fix
* things to not need it for backing devices anymore
*/
};
};
uint32_t last_mount; /* time_t */
@ -861,6 +882,12 @@ static inline bool key_merging_disabled(struct cache_set *c)
#endif
}
static inline bool SB_IS_BDEV(const struct cache_sb *sb)
{
return sb->version == BCACHE_SB_VERSION_BDEV
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
}
struct bbio {
unsigned submit_time_us;
union {

View File

@ -984,7 +984,7 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
if (b->prio_blocked &&
!atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
closure_wake_up(&b->c->bucket_wait);
wake_up(&b->c->alloc_wait);
b->prio_blocked = 0;
@ -1548,7 +1548,6 @@ static void bch_btree_gc(struct closure *cl)
trace_bcache_gc_end(c->sb.set_uuid);
wake_up(&c->alloc_wait);
closure_wake_up(&c->bucket_wait);
continue_at(cl, bch_moving_gc, bch_gc_wq);
}

View File

@ -38,6 +38,15 @@ static void bch_generic_make_request_hack(struct bio *bio)
bio = clone;
}
/*
* Hack, since drivers that clone bios clone up to bi_max_vecs, but our
* bios might have had more than that (before we split them per device
* limitations).
*
* To be taken out once immutable bvec stuff is in.
*/
bio->bi_max_vecs = bio->bi_vcnt;
generic_make_request(bio);
}
@ -149,34 +158,32 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
{
unsigned ret = bio_sectors(bio);
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
queue_max_segments(q));
struct bio_vec *bv, *end = bio_iovec(bio) +
min_t(int, bio_segments(bio), queue_max_segments(q));
struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_sector,
.bi_size = 0,
.bi_rw = bio->bi_rw,
};
min_t(int, bio_segments(bio), max_segments);
if (bio->bi_rw & REQ_DISCARD)
return min(ret, q->limits.max_discard_sectors);
if (bio_segments(bio) > queue_max_segments(q) ||
if (bio_segments(bio) > max_segments ||
q->merge_bvec_fn) {
ret = 0;
for (bv = bio_iovec(bio); bv < end; bv++) {
struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_sector,
.bi_size = ret << 9,
.bi_rw = bio->bi_rw,
};
if (q->merge_bvec_fn &&
q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
break;
ret += bv->bv_len >> 9;
bvm.bi_size += bv->bv_len;
ret += bv->bv_len >> 9;
}
if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9)
return (BIO_MAX_PAGES * PAGE_SIZE) >> 9;
}
ret = min(ret, queue_max_sectors(q));

View File

@ -1220,7 +1220,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
part_stat_unlock();
bio->bi_bdev = dc->bdev;
bio->bi_sector += BDEV_DATA_START;
bio->bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) {
s = search_alloc(bio, d);

View File

@ -110,15 +110,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->block_size = le16_to_cpu(s->block_size);
sb->bucket_size = le16_to_cpu(s->bucket_size);
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
sb->last_mount = le32_to_cpu(s->last_mount);
sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
@ -147,53 +139,81 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
if (bch_is_zero(sb->uuid, 16))
goto err;
err = "Unsupported superblock version";
if (sb->version > BCACHE_SB_VERSION)
sb->block_size = le16_to_cpu(s->block_size);
err = "Superblock block size smaller than device block size";
if (sb->block_size << 9 < bdev_logical_block_size(bdev))
goto err;
err = "Bad block/bucket size";
if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS ||
!is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS)
goto err;
switch (sb->version) {
case BCACHE_SB_VERSION_BDEV:
sb->data_offset = BDEV_DATA_START_DEFAULT;
break;
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
sb->data_offset = le64_to_cpu(s->data_offset);
err = "Too many buckets";
if (sb->nbuckets > LONG_MAX)
goto err;
err = "Not enough buckets";
if (sb->nbuckets < 1 << 7)
goto err;
err = "Invalid superblock: device too small";
if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
goto err;
if (sb->version == CACHE_BACKING_DEV)
goto out;
err = "Bad UUID";
if (bch_is_zero(sb->set_uuid, 16))
goto err;
err = "Bad cache device number in set";
if (!sb->nr_in_set ||
sb->nr_in_set <= sb->nr_this_dev ||
sb->nr_in_set > MAX_CACHES_PER_SET)
goto err;
err = "Journal buckets not sequential";
for (i = 0; i < sb->keys; i++)
if (sb->d[i] != sb->first_bucket + i)
err = "Bad data offset";
if (sb->data_offset < BDEV_DATA_START_DEFAULT)
goto err;
err = "Too many journal buckets";
if (sb->first_bucket + sb->keys > sb->nbuckets)
goto err;
break;
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->block_size = le16_to_cpu(s->block_size);
sb->bucket_size = le16_to_cpu(s->bucket_size);
err = "Invalid superblock: first bucket comes before end of super";
if (sb->first_bucket * sb->bucket_size < 16)
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
err = "Too many buckets";
if (sb->nbuckets > LONG_MAX)
goto err;
err = "Not enough buckets";
if (sb->nbuckets < 1 << 7)
goto err;
err = "Bad block/bucket size";
if (!is_power_of_2(sb->block_size) ||
sb->block_size > PAGE_SECTORS ||
!is_power_of_2(sb->bucket_size) ||
sb->bucket_size < PAGE_SECTORS)
goto err;
err = "Invalid superblock: device too small";
if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
goto err;
err = "Bad UUID";
if (bch_is_zero(sb->set_uuid, 16))
goto err;
err = "Bad cache device number in set";
if (!sb->nr_in_set ||
sb->nr_in_set <= sb->nr_this_dev ||
sb->nr_in_set > MAX_CACHES_PER_SET)
goto err;
err = "Journal buckets not sequential";
for (i = 0; i < sb->keys; i++)
if (sb->d[i] != sb->first_bucket + i)
goto err;
err = "Too many journal buckets";
if (sb->first_bucket + sb->keys > sb->nbuckets)
goto err;
err = "Invalid superblock: first bucket comes before end of super";
if (sb->first_bucket * sb->bucket_size < 16)
goto err;
break;
default:
err = "Unsupported superblock version";
goto err;
out:
}
sb->last_mount = get_seconds();
err = NULL;
@ -286,7 +306,7 @@ void bcache_write_super(struct cache_set *c)
for_each_cache(ca, c, i) {
struct bio *bio = &ca->sb_bio;
ca->sb.version = BCACHE_SB_VERSION;
ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
ca->sb.seq = c->sb.seq;
ca->sb.last_mount = c->sb.last_mount;
@ -641,6 +661,35 @@ void bcache_device_stop(struct bcache_device *d)
closure_queue(&d->cl);
}
static void bcache_device_unlink(struct bcache_device *d)
{
unsigned i;
struct cache *ca;
sysfs_remove_link(&d->c->kobj, d->name);
sysfs_remove_link(&d->kobj, "cache");
for_each_cache(ca, d->c, i)
bd_unlink_disk_holder(ca->bdev, d->disk);
}
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
const char *name)
{
unsigned i;
struct cache *ca;
for_each_cache(ca, d->c, i)
bd_link_disk_holder(ca->bdev, d->disk);
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
sysfs_create_link(&c->kobj, &d->kobj, d->name),
"Couldn't create device <-> cache set symlinks");
}
static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@ -656,6 +705,8 @@ static void bcache_device_detach(struct bcache_device *d)
atomic_set(&d->detaching, 0);
}
bcache_device_unlink(d);
d->c->devices[d->id] = NULL;
closure_put(&d->c->caching);
d->c = NULL;
@ -673,17 +724,6 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
closure_get(&c->caching);
}
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
const char *name)
{
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
sysfs_create_link(&c->kobj, &d->kobj, d->name),
"Couldn't create device <-> cache set symlinks");
}
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@ -784,6 +824,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
}
add_disk(d->disk);
bd_link_disk_holder(dc->bdev, dc->disk.disk);
#if 0
char *env[] = { "SYMLINK=label" , NULL };
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
@ -803,9 +844,6 @@ static void cached_dev_detach_finish(struct work_struct *w)
BUG_ON(!atomic_read(&dc->disk.detaching));
BUG_ON(atomic_read(&dc->count));
sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name);
sysfs_remove_link(&dc->disk.kobj, "cache");
mutex_lock(&bch_register_lock);
memset(&dc->sb.set_uuid, 0, 16);
@ -920,7 +958,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
bcache_device_attach(&dc->disk, c, u - c->uuids);
bcache_device_link(&dc->disk, c, "bdev");
list_move(&dc->list, &c->cached_devs);
calc_cached_dev_sectors(c);
@ -938,6 +975,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
pr_info("Caching %s as %s on set %pU",
bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
@ -961,6 +999,7 @@ static void cached_dev_free(struct closure *cl)
mutex_lock(&bch_register_lock);
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
list_del(&dc->list);
@ -1049,7 +1088,11 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
g = dc->disk.disk;
set_capacity(g, dc->bdev->bd_part->nr_sects - 16);
set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
g->queue->backing_dev_info.ra_pages =
max(g->queue->backing_dev_info.ra_pages,
bdev->bd_queue->backing_dev_info.ra_pages);
bch_cached_dev_request_init(dc);
@ -1099,8 +1142,7 @@ static void flash_dev_flush(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
sysfs_remove_link(&d->c->kobj, d->name);
sysfs_remove_link(&d->kobj, "cache");
bcache_device_unlink(d);
kobject_del(&d->kobj);
continue_at(cl, flash_dev_free, system_wq);
}
@ -1802,7 +1844,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (err)
goto err_close;
if (sb->version == CACHE_BACKING_DEV) {
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
err = register_bdev(sb, sb_page, bdev, dc);