for-5.1-part1-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAlx9czQACgkQxWXV+ddt
 WDvC9w/8CxJf1/eZBqb+b+aA38kgZhoaNixMud/IW/IFmIlicX0PoDxk6dh1ZTA+
 3uej/7fyfwjNCVvtrPVVxdT8zhZgyJouHrbhG1PlDWtmTEV2VqV5pBG1xQtCwmZy
 oinQI5oYYM5Le5EXxRGH8TQs6Z3tFuLx2kcrVWBLFKoZ2kZBZxe6KykGF9izve4a
 sVjtOL1CEL1e00vrNLzUmch8qss9Cu0i3qd3k8UANp3SgKIaOkJt4S/HeEcLfy5J
 kf6hVKlgPDuakVtAJKyhbLVQsfHVNkfiyvplta9lDot/iJchJITTRkadP6LblVeo
 knl8V+VO9kzQUvGauxtu66Q3DJ/7mqbzHUwPISetdKCV9ZXkuPFHnu0AEP577mrx
 e1JAPA/a8lF3up5QhqIb0uzH3sczOd8nNN/b1Xnxl7Kogyl8SUjhmX3FFy88borj
 /8Ptv/fFMQZs9IJ0QWlkh5TKRXAtSNAzVy2FpkvLaO0k0gJKQjyJuTKV5ezv/PGU
 +4m5kDtfpyz//KAOZxq4lERj4EMIEDhHhNbA8Qqmdeoj7oaKZ+gW57enOXohCTbi
 gVE6xDr2u4oQ85j3JuQo5W5mZA4uza35Gh4t43n5akdrrkLFVLW584hDxShGx9uS
 B0maToGbzOdGJTZXZ2SLHZ5Da14Lzb/TooCufF8GMAISb99vbjw=
 =D9zc
 -----END PGP SIGNATURE-----

Merge tag 'for-5.1-part1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "This contains usual mix of new features, core changes and fixes; full
  list below. I'm planning second pull request, with a few more fixes
  that arrived recently but too close to merge window, will send it next
  week.

  New features:

   - support zstd compression levels

   - new ioctl to unregister a device from the module (ie. reverse of
     device scan)

   - scrub prints a message to log when it's about to start or finish

  Core changes:

   - qgroups can now skip part of a tree that does not get updated
     during relocation, because this does not affect the quota
     accounting, estimated speedup in run time is about 20%

   - the compression workspace management had to be enhanced due to zstd
     requirements

   - various enospc fixes, when there's high fragmentation the
     over-reservation can cause ENOSPC that might not happen after a
     flush, in such cases try to wait if the situation improves

  Fixes:

   - various ioctls could overwrite previous return value if
     copy_to_user fails, fix this so the original error is reported

   - more reclaim vs GFP_KERNEL fixes

   - other cleanups and refactoring

   - fix a (valid) lockdep warning in a test when device replace is
     destroying worker threads

   - make qgroup async transaction commit more aggressive, this avoids
     some 'quota limit reached' errors if there are not enough data to
     trigger transaction in order to flush

   - fix deadlock between snapshot deletion and quotas when backref
     walking is called from context that already holds the same locks

   - fsync fixes:
      - fix fsync after succession of renames of different files
      - fix fsync after succession of renames and unlink/rmdir"

* tag 'for-5.1-part1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (92 commits)
  btrfs: Remove unnecessary casts in btrfs_read_root_item
  Btrfs: remove assertion when searching for a key in a node/leaf
  Btrfs: add missing error handling after doing leaf/node binary search
  btrfs: drop the lock on error in btrfs_dev_replace_cancel
  btrfs: ensure that a DUP or RAID1 block group has exactly two stripes
  btrfs: init csum_list before possible free
  Btrfs: remove no longer needed range length checks for deduplication
  Btrfs: fix fsync after succession of renames and unlink/rmdir
  Btrfs: fix fsync after succession of renames of different files
  btrfs: honor path->skip_locking in backref code
  btrfs: qgroup: Make qgroup async transaction commit more aggressive
  btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
  btrfs: scrub: remove unused nocow worker pointer
  btrfs: scrub: add assertions for worker pointers
  btrfs: scrub: convert scrub_workers_refcnt to refcount_t
  btrfs: scrub: add scrub_lock lockdep check in scrub_workers_get
  btrfs: scrub: fix circular locking dependency warning
  btrfs: fix comment its device list mutex not volume lock
  btrfs: extent_io: Kill the forward declaration of flush_write_bio
  btrfs: Fix grossly misleading argument names in extent io search
  ...
This commit is contained in:
Linus Torvalds 2019-03-07 09:07:30 -08:00
commit b1e243957e
38 changed files with 1997 additions and 964 deletions

View File

@ -9,6 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
}
if (acl) {
unsigned int nofs_flag;
size = posix_acl_xattr_size(acl->a_count);
/*
* We're holding a transaction handle, so use a NOFS memory
* allocation context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!value) {
ret = -ENOMEM;
goto out;

View File

@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
}
if (flags & WQ_HIGHPRI)
ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
ret->current_active, "btrfs",
name);
ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
ret->current_active, name);
else
ret->normal_wq = alloc_workqueue("%s-%s", flags,
ret->current_active, "btrfs",
name);
ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
ret->current_active, name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;

View File

@ -712,7 +712,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* read tree blocks and add keys where required.
*/
static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct preftrees *preftrees)
struct preftrees *preftrees, bool lock)
{
struct prelim_ref *ref;
struct extent_buffer *eb;
@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
btrfs_tree_read_lock(eb);
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
else
btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
btrfs_tree_read_unlock(eb);
if (lock)
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
cond_resched();
@ -1227,7 +1229,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = add_missing_keys(fs_info, &preftrees);
ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
@ -1288,11 +1290,15 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
ret = -EIO;
goto out;
}
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
if (!path->skip_locking) {
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_read(eb);
}
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
btrfs_tree_read_unlock_blocking(eb);
if (!path->skip_locking)
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
if (!path->skip_locking)
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}

View File

@ -730,6 +730,28 @@ struct heuristic_ws {
struct list_head list;
};
static struct workspace_manager heuristic_wsm;
static void heuristic_init_workspace_manager(void)
{
btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
}
static void heuristic_cleanup_workspace_manager(void)
{
btrfs_cleanup_workspace_manager(&heuristic_wsm);
}
static struct list_head *heuristic_get_workspace(unsigned int level)
{
return btrfs_get_workspace(&heuristic_wsm, level);
}
static void heuristic_put_workspace(struct list_head *ws)
{
btrfs_put_workspace(&heuristic_wsm, ws);
}
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@ -742,7 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
static struct list_head *alloc_heuristic_ws(void)
static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
@ -769,65 +791,59 @@ static struct list_head *alloc_heuristic_ws(void)
return ERR_PTR(-ENOMEM);
}
struct workspaces_list {
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
int free_ws;
/* Total number of allocated workspaces */
atomic_t total_ws;
/* Waiters for a free workspace */
wait_queue_head_t ws_wait;
const struct btrfs_compress_op btrfs_heuristic_compress = {
.init_workspace_manager = heuristic_init_workspace_manager,
.cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
.get_workspace = heuristic_get_workspace,
.put_workspace = heuristic_put_workspace,
.alloc_workspace = alloc_heuristic_ws,
.free_workspace = free_heuristic_ws,
};
static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
static struct workspaces_list btrfs_heuristic_ws;
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
void __init btrfs_init_compress(void)
void btrfs_init_workspace_manager(struct workspace_manager *wsm,
const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
int i;
INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
spin_lock_init(&btrfs_heuristic_ws.ws_lock);
atomic_set(&btrfs_heuristic_ws.total_ws, 0);
init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
wsm->ops = ops;
workspace = alloc_heuristic_ws();
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
init_waitqueue_head(&wsm->ws_wait);
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
"BTRFS: cannot preallocate heuristic workspace, will try later\n");
"BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
atomic_set(&btrfs_heuristic_ws.total_ws, 1);
btrfs_heuristic_ws.free_ws = 1;
list_add(workspace, &btrfs_heuristic_ws.idle_ws);
atomic_set(&wsm->total_ws, 1);
wsm->free_ws = 1;
list_add(workspace, &wsm->idle_ws);
}
}
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
spin_lock_init(&btrfs_comp_ws[i].ws_lock);
atomic_set(&btrfs_comp_ws[i].total_ws, 0);
init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
{
struct list_head *ws;
/*
* Preallocate one workspace for each compression type so
* we can guarantee forward progress in the worst case
*/
workspace = btrfs_compress_op[i]->alloc_workspace();
if (IS_ERR(workspace)) {
pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
atomic_set(&btrfs_comp_ws[i].total_ws, 1);
btrfs_comp_ws[i].free_ws = 1;
list_add(workspace, &btrfs_comp_ws[i].idle_ws);
}
while (!list_empty(&wsman->idle_ws)) {
ws = wsman->idle_ws.next;
list_del(ws);
wsman->ops->free_workspace(ws);
atomic_dec(&wsman->total_ws);
}
}
@ -837,11 +853,11 @@ void __init btrfs_init_compress(void)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
static struct list_head *__find_workspace(int type, bool heuristic)
struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
@ -849,19 +865,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
wait_queue_head_t *ws_wait;
int *free_ws;
if (heuristic) {
idle_ws = &btrfs_heuristic_ws.idle_ws;
ws_lock = &btrfs_heuristic_ws.ws_lock;
total_ws = &btrfs_heuristic_ws.total_ws;
ws_wait = &btrfs_heuristic_ws.ws_wait;
free_ws = &btrfs_heuristic_ws.free_ws;
} else {
idle_ws = &btrfs_comp_ws[idx].idle_ws;
ws_lock = &btrfs_comp_ws[idx].ws_lock;
total_ws = &btrfs_comp_ws[idx].total_ws;
ws_wait = &btrfs_comp_ws[idx].ws_wait;
free_ws = &btrfs_comp_ws[idx].free_ws;
}
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
ws_wait = &wsm->ws_wait;
free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
@ -892,10 +900,7 @@ static struct list_head *__find_workspace(int type, bool heuristic)
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
if (heuristic)
workspace = alloc_heuristic_ws();
else
workspace = btrfs_compress_op[idx]->alloc_workspace();
workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@ -926,85 +931,47 @@ static struct list_head *__find_workspace(int type, bool heuristic)
return workspace;
}
static struct list_head *find_workspace(int type)
static struct list_head *get_workspace(int type, int level)
{
return __find_workspace(type, false);
return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
static void __free_workspace(int type, struct list_head *workspace,
bool heuristic)
void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
if (heuristic) {
idle_ws = &btrfs_heuristic_ws.idle_ws;
ws_lock = &btrfs_heuristic_ws.ws_lock;
total_ws = &btrfs_heuristic_ws.total_ws;
ws_wait = &btrfs_heuristic_ws.ws_wait;
free_ws = &btrfs_heuristic_ws.free_ws;
} else {
idle_ws = &btrfs_comp_ws[idx].idle_ws;
ws_lock = &btrfs_comp_ws[idx].ws_lock;
total_ws = &btrfs_comp_ws[idx].total_ws;
ws_wait = &btrfs_comp_ws[idx].ws_wait;
free_ws = &btrfs_comp_ws[idx].free_ws;
}
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
ws_wait = &wsm->ws_wait;
free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
list_add(workspace, idle_ws);
list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
if (heuristic)
free_heuristic_ws(workspace);
else
btrfs_compress_op[idx]->free_workspace(workspace);
wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
static void free_workspace(int type, struct list_head *ws)
static void put_workspace(int type, struct list_head *ws)
{
return __free_workspace(type, ws, false);
}
/*
* cleanup function for module exit
*/
static void free_workspaces(void)
{
struct list_head *workspace;
int i;
while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
workspace = btrfs_heuristic_ws.idle_ws.next;
list_del(workspace);
free_heuristic_ws(workspace);
atomic_dec(&btrfs_heuristic_ws.total_ws);
}
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
atomic_dec(&btrfs_comp_ws[i].total_ws);
}
}
return btrfs_compress_op[type]->put_workspace(ws);
}
/*
@ -1036,18 +1003,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out)
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
int type = type_level & 0xF;
workspace = find_workspace(type);
btrfs_compress_op[type - 1]->set_level(workspace, type_level);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
workspace = get_workspace(type, level);
ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
free_workspace(type, workspace);
put_workspace(type, workspace);
return ret;
}
@ -1071,9 +1037,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int ret;
int type = cb->compress_type;
workspace = find_workspace(type);
ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
free_workspace(type, workspace);
workspace = get_workspace(type, 0);
ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
put_workspace(type, workspace);
return ret;
}
@ -1089,19 +1055,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
workspace = get_workspace(type, 0);
ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
put_workspace(type, workspace);
free_workspace(type, workspace);
return ret;
}
void __init btrfs_init_compress(void)
{
int i;
for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
btrfs_compress_op[i]->init_workspace_manager();
}
void __cold btrfs_exit_compress(void)
{
free_workspaces();
int i;
for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
@ -1512,7 +1488,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
struct list_head *ws_list = __find_workspace(0, true);
struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@ -1581,18 +1557,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
}
out:
__free_workspace(0, ws_list, true);
put_workspace(0, ws_list);
return ret;
}
unsigned int btrfs_compress_str2level(const char *str)
/*
* Convert the compression suffix (eg. after "zlib" starting with ":") to
* level, unrecognized string will set the default level
*/
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
if (strncmp(str, "zlib", 4) != 0)
unsigned int level = 0;
int ret;
if (!type)
return 0;
/* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
return str[5] - '0';
if (str[0] == ':') {
ret = kstrtouint(str + 1, 10, &level);
if (ret)
level = 0;
}
return BTRFS_ZLIB_DEFAULT_LEVEL;
level = btrfs_compress_op[type]->set_level(level);
return level;
}

View File

@ -64,6 +64,16 @@ struct compressed_bio {
u32 sums;
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
{
return (type_level & 0xF);
}
static inline unsigned int btrfs_compress_level(unsigned int type_level)
{
return ((type_level & 0xF0) >> 4);
}
void __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
unsigned btrfs_compress_str2level(const char *str);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@ -97,8 +107,35 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_TYPES = 3,
};
struct workspace_manager {
const struct btrfs_compress_op *ops;
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
int free_ws;
/* Total number of allocated workspaces */
atomic_t total_ws;
/* Waiters for a free workspace */
wait_queue_head_t ws_wait;
};
void btrfs_init_workspace_manager(struct workspace_manager *wsm,
const struct btrfs_compress_op *ops);
struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
unsigned int level);
void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
struct btrfs_compress_op {
struct list_head *(*alloc_workspace)(void);
void (*init_workspace_manager)(void);
void (*cleanup_workspace_manager)(void);
struct list_head *(*get_workspace)(unsigned int level);
void (*put_workspace)(struct list_head *ws);
struct list_head *(*alloc_workspace)(unsigned int level);
void (*free_workspace)(struct list_head *workspace);
@ -119,9 +156,18 @@ struct btrfs_compress_op {
unsigned long start_byte,
size_t srclen, size_t destlen);
void (*set_level)(struct list_head *ws, unsigned int type);
/*
* This bounds the level set by the user to be within range of a
* particular compression type. It returns the level that will be used
* if the level is out of bounds or the default if 0 is passed in.
*/
unsigned int (*set_level)(unsigned int level);
};
/* The heuristic workspaces are managed via the 0th workspace manager */
#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;

View File

@ -13,6 +13,7 @@
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
if (p->locks[i] == BTRFS_READ_LOCK)
/*
* If we currently have a spinning reader or writer lock this
* will bump the count of blocking holders and drop the
* spinlock.
*/
if (p->locks[i] == BTRFS_READ_LOCK) {
btrfs_set_lock_blocking_read(p->nodes[i]);
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
else if (p->locks[i] == BTRFS_WRITE_LOCK)
} else if (p->locks[i] == BTRFS_WRITE_LOCK) {
btrfs_set_lock_blocking_write(p->nodes[i]);
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
}
}
}
@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
return eb;
btrfs_set_path_blocking(path);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock_blocking(eb_root);
free_extent_buffer(eb_root);
@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
btrfs_set_lock_blocking(parent);
btrfs_set_lock_blocking(buf);
btrfs_set_lock_blocking_write(parent);
btrfs_set_lock_blocking_write(buf);
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
*
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
btrfs_set_lock_blocking(parent);
btrfs_set_lock_blocking_write(parent);
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
btrfs_set_lock_blocking(cur);
btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
if (ret) {
btrfs_tree_unlock(child);
@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left);
if (wret) {
@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right);
if (wret) {
@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@ -2529,26 +2544,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
return ret;
}
static void key_search_validate(struct extent_buffer *b,
const struct btrfs_key *key,
int level)
{
#ifdef CONFIG_BTRFS_ASSERT
struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, key);
if (level == 0)
ASSERT(!memcmp_extent_buffer(b, &disk_key,
offsetof(struct btrfs_leaf, items[0].key),
sizeof(disk_key)));
else
ASSERT(!memcmp_extent_buffer(b, &disk_key,
offsetof(struct btrfs_node, ptrs[0].key),
sizeof(disk_key)));
#endif
}
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
return *prev_cmp;
}
key_search_validate(b, key, level);
*slot = 0;
return 0;
@ -3005,6 +2999,8 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
*/
prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
if (ret < 0)
goto done;
if (level != 0) {
int dec = 0;
@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
@ -5156,6 +5152,10 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, level, &slot);
if (sret < 0) {
ret = sret;
goto out;
}
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {

View File

@ -934,7 +934,8 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
struct mutex cleaner_delayed_iput_mutex;
atomic_t nr_delayed_iputs;
wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@ -1074,10 +1075,13 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
int scrub_workers_refcnt;
/*
* The worker pointers are NULL iff the refcount is 0, ie. scrub is not
* running.
*/
refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_nocow_workers;
struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@ -1199,6 +1203,24 @@ enum {
BTRFS_ROOT_MULTI_LOG_TASKS,
BTRFS_ROOT_DIRTY,
BTRFS_ROOT_DELETING,
/*
* Reloc tree is orphan, only kept here for qgroup delayed subtree scan
*
* Set for the subvolume tree owning the reloc tree.
*/
BTRFS_ROOT_DEAD_RELOC_TREE,
};
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
*/
struct btrfs_qgroup_swapped_blocks {
spinlock_t lock;
/* RM_EMPTY_ROOT() of above blocks[] */
bool swapped;
struct rb_root blocks[BTRFS_MAX_LEVEL];
};
/*
@ -1311,6 +1333,14 @@ struct btrfs_root {
struct list_head ordered_root;
u64 nr_ordered_extents;
/*
* Not empty if this subvolume root has gone through tree block swap
* (relocation)
*
* Will be used by reloc_control::dirty_subvol_roots.
*/
struct list_head reloc_dirty_list;
/*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
@ -1328,6 +1358,9 @@ struct btrfs_root {
/* Number of active swapfiles */
atomic_t nr_swapfiles;
/* Record pairs of swapped blocks for qgroup */
struct btrfs_qgroup_swapped_blocks swapped_blocks;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@ -2775,7 +2808,8 @@ enum btrfs_flush_state {
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
ALLOC_CHUNK = 7,
COMMIT_TRANS = 8,
ALLOC_CHUNK_FORCE = 8,
COMMIT_TRANS = 9,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@ -3181,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start,
u64 len, int create);
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
@ -3254,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@ -3261,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@ -3490,21 +3524,18 @@ do { \
rcu_read_unlock(); \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
__cold
static inline void assfail(const char *expr, const char *file, int line)
{
pr_err("assertion failed: %s, file: %s, line: %d\n",
expr, file, line);
BUG();
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
pr_err("assertion failed: %s, file: %s, line: %d\n",
expr, file, line);
BUG();
}
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr) ((void)0)
#endif
/*
* Use that for functions that are conditionally exported for sanity tests but

View File

@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
head_ref->qgroup_reserved = 0;
head_ref->qgroup_ref_root = 0;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) {
head_ref->qgroup_ref_root = ref_root;
head_ref->qgroup_reserved = reserved;
qrecord->data_rsv = reserved;
qrecord->data_rsv_refroot = ref_root;
}
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
WARN_ON(qrecord && head_ref->qgroup_ref_root
&& head_ref->qgroup_reserved
&& existing->qgroup_ref_root
&& existing->qgroup_reserved);
update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
record = kmalloc(sizeof(*record), GFP_NOFS);
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
record = kmalloc(sizeof(*record), GFP_NOFS);
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,

View File

@ -102,17 +102,6 @@ struct btrfs_delayed_ref_head {
*/
int ref_mod;
/*
* For qgroup reserved space freeing.
*
* ref_root and reserved will be recorded after
* BTRFS_ADD_DELAYED_EXTENT is called.
* And will be used to free reserved qgroup space at
* run_delayed_refs() time.
*/
u64 qgroup_ref_root;
u64 qgroup_reserved;
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree

View File

@ -111,11 +111,11 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
NULL, NULL);
dev_replace->tgtdev = btrfs_find_device(fs_info,
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
src_devid, NULL, NULL, true);
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
NULL, NULL);
NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
up_write(&dev_replace->rwsem);
result = -EINVAL;
}

View File

@ -17,6 +17,7 @@
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (need_lock) {
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@ -1120,7 +1121,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@ -1175,6 +1176,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
@ -1218,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@ -1258,10 +1261,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
@ -1707,9 +1717,7 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
@ -2101,7 +2109,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
fs_info->scrub_workers_refcnt = 0;
refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@ -2666,7 +2674,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@ -2688,6 +2695,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@ -2765,6 +2773,7 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
@ -4238,16 +4247,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
spin_lock(&delayed_refs->lock);
if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
}
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
@ -4263,12 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);

View File

@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
}
}
/* Also free its reserved qgroup space */
btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
head->qgroup_reserved);
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@ -3013,8 +3010,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
if (run_all) {
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans);
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
@ -4280,10 +4276,14 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
* more space is released.
* more space is released. We don't need to
* explicitly run the delayed iputs here because
* the commit_transaction would have woken up
* the cleaner.
*/
mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
ret = btrfs_wait_on_delayed_iputs(fs_info);
if (ret)
return ret;
goto again;
} else {
btrfs_end_transaction(trans);
@ -4396,21 +4396,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
return 1;
/*
* We need to take into account the global rsv because for all intents
* and purposes it's used space. Don't worry about locking the
* global_rsv, it doesn't change except when the transaction commits.
*/
if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
bytes_used += calc_global_rsv_need_space(global_rsv);
/*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 max_reclaim;
u64 async_pages;
u64 items;
long time_left;
unsigned long nr_pages;
@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_SHIFT;
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
/*
* We need to wait for the async pages to actually start before
* we do anything.
* Triggers inode writeback for up to nr_pages. This will invoke
* ->writepages callback and trigger delalloc filling
* (btrfs_run_delalloc_range()).
*/
max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
if (!max_reclaim)
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
/*
* We need to wait for the compressed pages to start before
* we continue.
*/
async_pages = atomic_read(&fs_info->async_delalloc_pages);
if (!async_pages)
goto skip_async;
if (max_reclaim <= nr_pages)
max_reclaim = 0;
/*
* Calculate how many compressed pages we want to be written
* before we continue. I.e if there are more async pages than we
* require wait_event will wait until nr_pages are written.
*/
if (async_pages <= nr_pages)
async_pages = 0;
else
max_reclaim -= nr_pages;
async_pages -= nr_pages;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
(int)async_pages);
skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
@ -4808,6 +4810,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
}
struct reserve_ticket {
u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (!bytes_needed)
return 0;
/* See if there is enough pinned space to make this reservation */
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
/*
* See if there is enough pinned space to make this reservation, or if
* we have block groups that are going to be freed, allowing us to
* possibly do a chunk allocation the next loop through.
*/
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
/*
@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
* this reservation.
*/
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
goto enospc;
spin_lock(&delayed_rsv->lock);
reclaim_bytes += delayed_rsv->reserved;
@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
return -ENOSPC;
}
BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
goto enospc;
commit:
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return -ENOSPC;
return btrfs_commit_transaction(trans);
enospc:
btrfs_end_transaction(trans);
return -ENOSPC;
}
/*
@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
ret = do_chunk_alloc(trans,
btrfs_metadata_alloc_profile(fs_info),
CHUNK_ALLOC_NO_FORCE);
(state == ALLOC_CHUNK) ?
CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* bunch of pinned space, so make sure we run the iputs before
* we do our pinned bytes check below.
*/
mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_wait_on_delayed_iputs(fs_info);
ret = may_commit_transaction(fs_info, space_info);
break;
@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
static void wake_all_tickets(struct list_head *head)
static bool wake_all_tickets(struct list_head *head)
{
struct reserve_ticket *ticket;
@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
list_del_init(&ticket->list);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
if (ticket->bytes != ticket->orig_bytes)
return true;
}
return false;
}
/*
@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
/*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
* freed up a bunch of space and so have a lot of pinned space
* to reclaim. We would rather use that than possibly create a
* underutilized metadata chunk. So if this is our first run
* through the flushing state machine skip ALLOC_CHUNK_FORCE and
* commit the transaction. If nothing has changed the next go
* around then we can force a chunk allocation.
*/
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
wake_all_tickets(&space_info->tickets);
space_info->flush = 0;
if (wake_all_tickets(&space_info->tickets)) {
flush_state = FLUSH_DELAYED_ITEMS_NR;
commit_cycles--;
} else {
space_info->flush = 0;
}
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
ALLOC_CHUNK,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
u64 to_reclaim;
int flush_state = FLUSH_DELAYED_ITEMS_NR;
int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
spin_unlock(&space_info->lock);
flush_state = 0;
do {
flush_space(fs_info, space_info, to_reclaim, flush_state);
flush_space(fs_info, space_info, to_reclaim,
priority_flush_states[flush_state]);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
return;
}
spin_unlock(&space_info->lock);
/*
* Priority flushers can't wait on delalloc without
* deadlocking.
*/
if (flush_state == FLUSH_DELALLOC ||
flush_state == FLUSH_DELALLOC_WAIT)
flush_state = ALLOC_CHUNK;
} while (flush_state < COMMIT_TRANS);
} while (flush_state < ARRAY_SIZE(priority_flush_states));
}
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket, u64 orig_bytes)
struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
u64 reclaim_bytes = 0;
int ret = 0;
spin_lock(&space_info->lock);
@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
ret = ticket->error;
if (!list_empty(&ticket->list))
list_del_init(&ticket->list);
if (ticket->bytes && ticket->bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket->bytes;
update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, num_bytes, 0);
}
if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);
if (reclaim_bytes)
space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
return ret;
}
@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket ticket;
u64 used;
u64 reclaim_bytes = 0;
int ret = 0;
ASSERT(orig_bytes);
@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
ticket.orig_bytes = orig_bytes;
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
return wait_reserve_ticket(fs_info, space_info, &ticket,
orig_bytes);
return wait_reserve_ticket(fs_info, space_info, &ticket);
ret = 0;
priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
if (ticket.bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket.bytes;
update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags,
num_bytes, 0);
}
if (ticket.bytes < orig_bytes)
reclaim_bytes = orig_bytes - ticket.bytes;
list_del_init(&ticket.list);
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
if (reclaim_bytes)
space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
ASSERT(list_empty(&ticket.list));
return ret;
}
@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
u64 *metadata_bytes, u64 *qgroup_bytes)
{
*metadata_bytes = 0;
*qgroup_bytes = 0;
spin_lock(&block_rsv->lock);
if (block_rsv->reserved < block_rsv->size)
*metadata_bytes = block_rsv->size - block_rsv->reserved;
if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
*qgroup_bytes = block_rsv->qgroup_rsv_size -
block_rsv->qgroup_rsv_reserved;
spin_unlock(&block_rsv->lock);
}
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 num_bytes = 0;
u64 qgroup_num_bytes = 0;
u64 num_bytes, last = 0;
u64 qgroup_num_bytes;
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
if (block_rsv->reserved < block_rsv->size)
num_bytes = block_rsv->size - block_rsv->reserved;
if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
qgroup_num_bytes = block_rsv->qgroup_rsv_size -
block_rsv->qgroup_rsv_reserved;
spin_unlock(&block_rsv->lock);
calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
if (num_bytes == 0)
return 0;
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
if (ret)
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
do {
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
true);
if (ret)
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
last = num_bytes;
/*
* If we are fragmented we can end up with a lot of
* outstanding extents which will make our size be much
* larger than our reserved amount.
*
* If the reservation happens here, it might be very
* big though not needed in the end, if the delalloc
* flushing happens.
*
* If this is the case try and do the reserve again.
*/
if (flush == BTRFS_RESERVE_FLUSH_ALL)
calc_refill_bytes(block_rsv, &num_bytes,
&qgroup_num_bytes);
if (num_bytes == 0)
return 0;
}
} while (ret && last != num_bytes);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&block_rsv->lock);
} else
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
}
return ret;
}
@ -8066,6 +8125,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
return ret;
}
#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
do { \
struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
spin_lock(&__rsv->lock); \
btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
__rsv->size, __rsv->reserved); \
spin_unlock(&__rsv->lock); \
} while (0)
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
info->bytes_readonly);
spin_unlock(&info->lock);
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
if (!dump_block_groups)
return;
@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clean_tree_block(fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@ -8917,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
reada = 1;
}
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@ -8977,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
btrfs_set_lock_blocking_write(next);
}
level--;
@ -9089,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@ -9131,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(fs_info, eb);
@ -9298,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
btrfs_set_lock_blocking(path->nodes[level]);
btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
@ -9328,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
btrfs_set_lock_blocking_write(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@ -9595,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@ -9621,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
sinfo_used = btrfs_space_info_used(sinfo, true);
if (btrfs_space_info_used(sinfo, true) + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
if (sinfo_used + num_bytes + min_allocable_bytes <=
sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@ -9632,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro",
cache->key.objectid);
btrfs_info(cache->fs_info,
"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
sinfo_used, num_bytes, min_allocable_bytes);
dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
}
@ -10781,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->dirty_list)) {
WARN_ON(1);
}
if (!list_empty(&block_group->io_list)) {
WARN_ON(1);
}
WARN_ON(!list_empty(&block_group->dirty_list));
WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);

View File

@ -147,7 +147,38 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
static void flush_write_bio(struct extent_page_data *epd);
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
if (tree->ops)
ret = tree->ops->submit_bio_hook(tree->private_data, bio,
mirror_num, bio_flags, start);
else
btrfsic_submit_bio(bio);
return blk_status_to_errno(ret);
}
static void flush_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
int ret;
ret = submit_one_bio(epd->bio, 0, 0);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
int __init extent_io_init(void)
{
@ -281,8 +312,8 @@ static struct rb_node *tree_insert(struct rb_root *root,
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
struct rb_node **prev_ret,
struct rb_node **next_ret,
struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
@ -311,23 +342,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
if (parent_ret)
*parent_ret = prev;
if (prev_ret) {
if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
*prev_ret = prev;
*next_ret = prev;
prev = orig_prev;
}
if (next_ret) {
if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
*next_ret = prev;
*prev_ret = prev;
}
return NULL;
}
@ -338,12 +369,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
struct rb_node *prev = NULL;
struct rb_node *next= NULL;
struct rb_node *ret;
ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
return prev;
return next;
return ret;
}
@ -585,7 +616,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
@ -850,7 +880,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@ -2692,28 +2721,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
if (tree->ops)
ret = tree->ops->submit_bio_hook(tree->private_data, bio,
mirror_num, bio_flags, start);
else
btrfsic_submit_bio(bio);
return blk_status_to_errno(ret);
}
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
@ -4007,17 +4014,6 @@ static int extent_write_cache_pages(struct address_space *mapping,
return ret;
}
static void flush_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
int ret;
ret = submit_one_bio(epd->bio, 0, 0);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
@ -4259,8 +4255,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
len, 0);
em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;

View File

@ -18,17 +18,16 @@
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
#define EXTENT_CLEAR_META_RESV (1U << 11)
#define EXTENT_FIRST_DELALLOC (1U << 12)
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_CLEAR_DATA_RESV (1U << 17)
#define EXTENT_DELALLOC_NEW (1U << 18)
#define EXTENT_NEED_WAIT (1U << 12)
#define EXTENT_DAMAGED (1U << 13)
#define EXTENT_NORESERVE (1U << 14)
#define EXTENT_QGROUP_RESERVED (1U << 15)
#define EXTENT_CLEAR_DATA_RESV (1U << 16)
#define EXTENT_DELALLOC_NEW (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
/*
* flags for bio submission. The high bits indicate the compression

View File

@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
prev->block_start != EXTENT_MAP_DELALLOC);
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) ||
(next->block_start == EXTENT_MAP_DELALLOC &&
prev->block_start == EXTENT_MAP_DELALLOC) ||
(next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
next->block_start == extent_map_block_end(prev)))) {
return 1;

View File

@ -9,6 +9,7 @@
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
/* used only during fiemap calls */
#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */

View File

@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
&cached_state);
while (start < inode->i_size) {
em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
start, len, 0);
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;

View File

@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
actual_end = min_t(u64, isize, end + 1);
actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
static noinline void submit_compressed_extents(struct inode *inode,
struct async_cow *async_cow)
static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
/*
* ->inode could be NULL if async_cow_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
* always adjust ->async_delalloc_pages as its paired with the init
* happening in cow_file_range_async
*/
if (async_cow->inode)
submit_compressed_extents(async_cow->inode, async_cow);
submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
async_cow->inode = igrab(inode);
/*
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
ihold(inode);
async_cow->inode = inode;
async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
/**
* btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
* @fs_info - the fs_info for this fs
* @return - EINTR if we were killed, 0 if nothing's pending
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
int ret = wait_event_killable(fs_info->delayed_iputs_wait,
atomic_read(&fs_info->nr_delayed_iputs) == 0);
if (ret)
return -EINTR;
return 0;
}
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
ret = btrfs_block_rsv_refill(root, rsv,
rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
return ERR_PTR(-ENOSPC);
}
/*
* Evict can generate a large amount of delayed refs without
* having a way to add space back since we exhaust our temporary
* block rsv. We aren't allowed to do FLUSH_ALL in this case
* because we could deadlock with so many things in the flushing
* code, so we have to try and hold some extra space to
* compensate for our delayed ref generation. If we can't get
* that space then we need see if we can steal our minimum from
* the global reserve. We will be ratelimited by the amount of
* space we have for the delayed refs rsv, so we'll end up
* committing and trying again.
*/
trans = btrfs_join_transaction(root);
if (IS_ERR(trans) || !ret)
if (IS_ERR(trans) || !ret) {
if (!IS_ERR(trans)) {
trans->block_rsv = &fs_info->trans_block_rsv;
trans->bytes_reserved = delayed_refs_extra;
btrfs_block_rsv_migrate(rsv, trans->block_rsv,
delayed_refs_extra, 1);
}
return trans;
}
/*
* Try to steal from the global reserve if there is space for
@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
u32 found_type;
u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
}
if (ret != 0) {
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
/* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
found_type = found_key.type;
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto next;
}
found_type = btrfs_file_extent_type(leaf, item);
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
@ -6840,9 +6888,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
}
if (ret > 0)
} else if (ret > 0) {
goto not_found;
}
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@ -6853,19 +6901,22 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto not_found;
if (start > found_key.offset)
goto next;
/* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
goto not_found_em;
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
@ -6916,7 +6967,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
em->start = start;
em->orig_start = start;
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
@ -6946,19 +6996,17 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
struct page *page,
size_t pg_offset, u64 start, u64 len,
int create)
u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
u64 range_start = start;
u64 delalloc_start = start;
u64 end;
u64 found;
u64 found_end;
u64 delalloc_len;
u64 delalloc_end;
int err = 0;
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
found = count_range_bits(&inode->io_tree, &range_start,
delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
found_end = range_start + found;
if (found_end < range_start)
found_end = (u64)-1;
delalloc_end = delalloc_start + delalloc_len;
if (delalloc_end < delalloc_start)
delalloc_end = (u64)-1;
/*
* we didn't find anything useful, return
* the original results from get_extent()
* We didn't find anything useful, return the original results from
* get_extent()
*/
if (range_start > end || found_end <= start) {
if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
/* adjust the range_start to make sure it doesn't
* go backwards from the start they passed in
/*
* Adjust the delalloc_start to make sure it doesn't go backwards from
* the start they passed in
*/
range_start = max(start, range_start);
found = found_end - range_start;
delalloc_start = max(start, delalloc_start);
delalloc_len = delalloc_end - delalloc_start;
if (found > 0) {
u64 hole_start = start;
u64 hole_len = len;
if (delalloc_len > 0) {
u64 hole_start;
u64 hole_len;
const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
/*
* when btrfs_get_extent can't find anything it
* returns one huge hole
*
* make sure what it found really fits our range, and
* adjust to make sure it is based on the start from
* the caller
*/
if (hole_em) {
u64 calc_end = extent_map_end(hole_em);
if (calc_end <= start || (hole_em->start > end)) {
free_extent_map(hole_em);
hole_em = NULL;
} else {
hole_start = max(hole_em->start, start);
hole_len = calc_end - hole_start;
}
}
em->bdev = NULL;
if (hole_em && range_start > hole_start) {
/* our hole starts before our delalloc, so we
* have to return just the parts of the hole
* that go until the delalloc starts
ASSERT(hole_em);
/*
* When btrfs_get_extent can't find anything it returns one
* huge hole
*
* Make sure what it found really fits our range, and adjust to
* make sure it is based on the start from the caller
*/
if (hole_end <= start || hole_em->start > end) {
free_extent_map(hole_em);
hole_em = NULL;
} else {
hole_start = max(hole_em->start, start);
hole_len = hole_end - hole_start;
}
if (hole_em && delalloc_start > hole_start) {
/*
* Our hole starts before our delalloc, so we have to
* return just the parts of the hole that go until the
* delalloc starts
*/
em->len = min(hole_len,
range_start - hole_start);
em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
* don't adjust block start at all,
* it is fixed at EXTENT_MAP_HOLE
* Don't adjust block start at all, it is fixed at
* EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
em->orig_start = range_start;
/*
* Hole is out of passed range or it starts after
* delalloc range
*/
em->start = delalloc_start;
em->len = delalloc_len;
em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
em->block_len = found;
em->block_len = delalloc_len;
}
} else {
return hole_em;
@ -9910,7 +9962,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);

View File

@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
device = btrfs_find_device(fs_info, devid, NULL, NULL);
device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
s_uuid = di_args->uuid;
rcu_read_lock();
dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
NULL, true);
if (!dev) {
ret = -ENODEV;
@ -3241,32 +3242,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
u64 len = olen;
if (loff + len == src->i_size)
len = ALIGN(src->i_size, bs) - loff;
/*
* For same inode case we don't want our length pushed out past i_size
* as comparing that data range makes no sense.
*
* This effectively means we require aligned extents for the single
* inode case, whereas the other cases allow an unaligned length so long
* as it ends at i_size.
*/
if (dst == src && len != olen)
return -EINVAL;
/*
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
@ -3278,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
u64 i, tail_len, chunk_count;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
return -EINVAL;
if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
return -ETXTBSY;
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
if (chunk_count == 0)
num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@ -3908,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
return -EINVAL;
if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
return -ETXTBSY;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
@ -3991,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
else
btrfs_double_inode_lock(inode_in, inode_out);
/* don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
ret = -EINVAL;
goto out_unlock;
}
/*
* Now that the inodes are locked, we need to start writeback ourselves
* and can not rely on the writeback from the VFS's generic helper
@ -4381,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
if (copy_to_user(arg, sa, sizeof(*sa)))
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@ -4414,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
if (copy_to_user(arg, sa, sizeof(*sa)))
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@ -4438,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
ret = btrfs_get_dev_stats(fs_info, sa);
if (copy_to_user(arg, sa, sizeof(*sa)))
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@ -4484,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
break;
}
if (copy_to_user(arg, p, sizeof(*p)))
if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
@ -4790,7 +4764,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
if (arg) {
if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}

View File

@ -14,43 +14,58 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
/*
* if we currently have a spinning reader or writer lock
* (indicated by the rw flag) this will bump the count
* of blocking holders and drop the spinlock.
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
* we have a read lock, but it won't change to or away
* from us. If we have the write lock, we are the owner
* and it'll never change.
* No lock is required. The lock owner may change if we have a read
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
btrfs_assert_tree_locked(eb);
atomic_inc(&eb->blocking_writers);
write_unlock(&eb->lock);
}
} else if (rw == BTRFS_READ_LOCK) {
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
}
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
/*
* No lock is required. The lock owner may change if we have a read
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
btrfs_assert_tree_locked(eb);
atomic_inc(&eb->blocking_writers);
write_unlock(&eb->lock);
}
}
/*
* if we currently have a blocking lock, take the spinlock
* and drop our blocking count
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
{
/*
* No lock is required. The lock owner may change if we have a read
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_readers))
cond_wake_up_nomb(&eb->read_lock_wq);
}
void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_writers))
cond_wake_up_nomb(&eb->write_lock_wq);
} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_readers))
cond_wake_up_nomb(&eb->read_lock_wq);
}
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_writers))
cond_wake_up_nomb(&eb->write_lock_wq);
}
/*
@ -232,16 +237,9 @@ void btrfs_tree_lock(struct extent_buffer *eb)
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
write_lock(&eb->lock);
if (atomic_read(&eb->blocking_readers)) {
if (atomic_read(&eb->blocking_readers) ||
atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
wait_event(eb->read_lock_wq,
atomic_read(&eb->blocking_readers) == 0);
goto again;
}
if (atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
wait_event(eb->write_lock_wq,
atomic_read(&eb->blocking_writers) == 0);
goto again;
}
WARN_ON(atomic_read(&eb->spinning_writers));

View File

@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
{
btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
}
static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
{
btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
}
#endif

View File

@ -61,6 +61,28 @@ struct workspace {
struct list_head list;
};
static struct workspace_manager wsm;
static void lzo_init_workspace_manager(void)
{
btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
}
static void lzo_cleanup_workspace_manager(void)
{
btrfs_cleanup_workspace_manager(&wsm);
}
static struct list_head *lzo_get_workspace(unsigned int level)
{
return btrfs_get_workspace(&wsm, level);
}
static void lzo_put_workspace(struct list_head *ws)
{
btrfs_put_workspace(&wsm, ws);
}
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
static struct list_head *lzo_alloc_workspace(void)
static struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@ -485,11 +507,16 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
return ret;
}
static void lzo_set_level(struct list_head *ws, unsigned int type)
static unsigned int lzo_set_level(unsigned int level)
{
return 0;
}
const struct btrfs_compress_op btrfs_lzo_compress = {
.init_workspace_manager = lzo_init_workspace_manager,
.cleanup_workspace_manager = lzo_cleanup_workspace_manager,
.get_workspace = lzo_get_workspace,
.put_workspace = lzo_put_workspace,
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,

View File

@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
if (bytenr < entry->bytenr)
if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
else if (bytenr > entry->bytenr)
} else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
else
} else {
if (record->data_rsv && !entry->data_rsv) {
entry->data_rsv = record->data_rsv;
entry->data_rsv_refroot =
record->data_rsv_refroot;
}
return 1;
}
}
rb_link_node(&record->node, parent_node, p);
@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
record = kmalloc(sizeof(*record), gfp_flag);
record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
}
@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
need_cleanup = true;
}
@ -2017,86 +2023,30 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
return ret;
}
/*
* Inform qgroup to trace subtree swap used in balance.
*
* Unlike btrfs_qgroup_trace_subtree(), this function will only trace
* new tree blocks whose generation is equal to (or larger than) @last_snapshot.
*
* Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
* @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
* and then go down @src_eb (pointed by @src_parent and @src_slot) to find
* the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
* and skip all tree blocks whose generation is smaller than last_snapshot.
*
* This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
* which could be the cause of very slow balance if the file tree is large.
*
* @src_parent, @src_slot: pointer to src (file tree) eb.
* @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
*/
int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *bg_cache,
struct extent_buffer *src_parent, int src_slot,
struct extent_buffer *dst_parent, int dst_slot,
u64 last_snapshot)
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
struct extent_buffer *src_eb,
struct extent_buffer *dst_eb,
u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
struct btrfs_key first_key;
struct extent_buffer *src_eb = NULL;
struct extent_buffer *dst_eb = NULL;
bool trace_leaf = false;
u64 child_gen;
u64 child_bytenr;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
/* Check parameter order */
if (btrfs_node_ptr_generation(src_parent, src_slot) >
btrfs_node_ptr_generation(dst_parent, dst_slot)) {
/* Wrong parameter order */
if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_node_ptr_generation(src_parent, src_slot),
btrfs_node_ptr_generation(dst_parent, dst_slot));
btrfs_header_generation(src_eb),
btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
/*
* Only trace leaf if we're relocating data block groups, this could
* reduce tons of data extents tracing for meta/sys bg relocation.
*/
if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
trace_leaf = true;
/* Read out real @src_eb, pointed by @src_parent and @src_slot */
child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
btrfs_header_level(src_parent) - 1, &first_key);
if (IS_ERR(src_eb)) {
ret = PTR_ERR(src_eb);
goto out;
}
/* Read out real @dst_eb, pointed by @src_parent and @src_slot */
child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
btrfs_header_level(dst_parent) - 1, &first_key);
if (IS_ERR(dst_eb)) {
ret = PTR_ERR(dst_eb);
goto out;
}
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
ret = -EINVAL;
ret = -EIO;
goto out;
}
@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = -ENOMEM;
goto out;
}
/* For dst_path */
extent_buffer_get(dst_eb);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
/* Do the generation-aware breadth-first search */
/* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = 0;
out:
free_extent_buffer(src_eb);
free_extent_buffer(dst_eb);
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@ -2207,7 +2154,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
goto cleanup;
}
/* Free the reserved data space */
btrfs_qgroup_free_refroot(fs_info,
record->data_rsv_refroot,
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
@ -2842,16 +2794,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
/*
* Two limits to commit transaction in advance.
*
* For RATIO, it will be 1/RATIO of the remaining limit
* (excluding data and prealloc meta) as threshold.
* For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
#define QGROUP_PERTRANS_RATIO 32
#define QGROUP_PERTRANS_SIZE SZ_32M
#define QGROUP_FREE_RATIO 32
#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
{
u64 limit;
u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
limit = qg->max_excl;
else
limit = qg->max_rfer;
threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
QGROUP_PERTRANS_RATIO;
threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
QGROUP_FREE_SIZE);
} else {
free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
QGROUP_FREE_SIZE);
}
/*
* Use transaction_kthread to commit transaction, so we no
* longer need to bother nested transaction nor lock context.
*/
if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
int i;
spin_lock_init(&swapped_blocks->lock);
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
swapped_blocks->blocks[i] = RB_ROOT;
swapped_blocks->swapped = false;
}
/*
* Delete all swapped blocks record of @root.
* Every record here means we skipped a full subtree scan for qgroup.
*
* Gets called when committing one transaction.
*/
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
struct btrfs_qgroup_swapped_blocks *swapped_blocks;
int i;
swapped_blocks = &root->swapped_blocks;
spin_lock(&swapped_blocks->lock);
if (!swapped_blocks->swapped)
goto out;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
struct rb_root *cur_root = &swapped_blocks->blocks[i];
struct btrfs_qgroup_swapped_block *entry;
struct btrfs_qgroup_swapped_block *next;
rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
node)
kfree(entry);
swapped_blocks->blocks[i] = RB_ROOT;
}
swapped_blocks->swapped = false;
out:
spin_unlock(&swapped_blocks->lock);
}
/*
* Add subtree roots record into @subvol_root.
*
* @subvol_root: tree root of the subvolume tree get swapped
* @bg: block group under balance
* @subvol_parent/slot: pointer to the subtree root in subvolume tree
* @reloc_parent/slot: pointer to the subtree root in reloc tree
* BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree
*/
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
struct btrfs_block_group_cache *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot)
{
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct rb_node **cur;
struct rb_node *parent = NULL;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
btrfs_node_ptr_generation(subvol_parent, subvol_slot),
btrfs_node_ptr_generation(reloc_parent, reloc_slot));
return -EUCLEAN;
}
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block) {
ret = -ENOMEM;
goto out;
}
/*
* @reloc_parent/slot is still before swap, while @block is going to
* record the bytenr after swap, so we do the swap here.
*/
block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
reloc_slot);
block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
subvol_slot);
block->last_snapshot = last_snapshot;
block->level = level;
if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
block->trace_leaf = true;
else
block->trace_leaf = false;
btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
cur = &blocks->blocks[level].rb_node;
while (*cur) {
struct btrfs_qgroup_swapped_block *entry;
parent = *cur;
entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
node);
if (entry->subvol_bytenr < block->subvol_bytenr) {
cur = &(*cur)->rb_left;
} else if (entry->subvol_bytenr > block->subvol_bytenr) {
cur = &(*cur)->rb_right;
} else {
if (entry->subvol_generation !=
block->subvol_generation ||
entry->reloc_bytenr != block->reloc_bytenr ||
entry->reloc_generation !=
block->reloc_generation) {
/*
* Duplicated but mismatch entry found.
* Shouldn't happen.
*
* Marking qgroup inconsistent should be enough
* for end users.
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
ret = -EEXIST;
}
kfree(block);
goto out_unlock;
}
}
rb_link_node(&block->node, parent, cur);
rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
/*
* Check if the tree block is a subtree root, and if so do the needed
* delayed subtree trace for qgroup.
*
* This is called during btrfs_cow_block().
*/
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *subvol_eb)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
int i;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
if (!blocks->swapped) {
spin_unlock(&blocks->lock);
return 0;
}
node = blocks->blocks[level].rb_node;
while (node) {
block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
if (block->subvol_bytenr < subvol_eb->start) {
node = node->rb_left;
} else if (block->subvol_bytenr > subvol_eb->start) {
node = node->rb_right;
} else {
found = true;
break;
}
}
if (!found) {
spin_unlock(&blocks->lock);
goto out;
}
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
swapped = true;
break;
}
}
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
/* Read out reloc subtree root */
reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
block->reloc_generation, block->level,
&block->first_key);
if (IS_ERR(reloc_eb)) {
ret = PTR_ERR(reloc_eb);
reloc_eb = NULL;
goto free_out;
}
if (!extent_buffer_uptodate(reloc_eb)) {
ret = -EIO;
goto free_out;
}
ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
block->last_snapshot, block->trace_leaf);
free_out:
kfree(block);
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
return ret;
}

View File

@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@ -37,6 +39,66 @@
* Normally at qgroup rescan and transaction commit time.
*/
/*
* Special performance optimization for balance.
*
* For balance, we need to swap subtree of subvolume and reloc trees.
* In theory, we need to trace all subtree blocks of both subvolume and reloc
* trees, since their owner has changed during such swap.
*
* However since balance has ensured that both subtrees are containing the
* same contents and have the same tree structures, such swap won't cause
* qgroup number change.
*
* But there is a race window between subtree swap and transaction commit,
* during that window, if we increase/decrease tree level or merge/split tree
* blocks, we still need to trace the original subtrees.
*
* So for balance, we use a delayed subtree tracing, whose workflow is:
*
* 1) Record the subtree root block get swapped.
*
* During subtree swap:
* O = Old tree blocks
* N = New tree blocks
* reloc tree subvolume tree X
* Root Root
* / \ / \
* NA OB OA OB
* / | | \ / | | \
* NC ND OE OF OC OD OE OF
*
* In this case, NA and OA are going to be swapped, record (NA, OA) into
* subvolume tree X.
*
* 2) After subtree swap.
* reloc tree subvolume tree X
* Root Root
* / \ / \
* OA OB NA OB
* / | | \ / | | \
* OC OD OE OF NC ND OE OF
*
* 3a) COW happens for OB
* If we are going to COW tree block OB, we check OB's bytenr against
* tree X's swapped_blocks structure.
* If it doesn't fit any, nothing will happen.
*
* 3b) COW happens for NA
* Check NA's bytenr against tree X's swapped_blocks, and get a hit.
* Then we do subtree scan on both subtrees OA and NA.
* Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
*
* Then no matter what we do to subvolume tree X, qgroup numbers will
* still be correct.
* Then NA's record gets removed from X's swapped_blocks.
*
* 4) Transaction commit
* Any record in X's swapped_blocks gets removed, since there is no
* modification to the swapped subtrees, no need to trigger heavy qgroup
* subtree rescan for them.
*/
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
/*
* For qgroup reserved data space freeing.
*
* @data_rsv_refroot and @data_rsv will be recorded after
* BTRFS_ADD_DELAYED_EXTENT is called.
* And will be used to free reserved qgroup space at
* transaction commit time.
*/
u32 data_rsv; /* reserved data space needs to be freed */
u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
struct btrfs_qgroup_swapped_block {
struct rb_node node;
int level;
bool trace_leaf;
/* bytenr/generation of the tree block in subvolume tree after swap */
u64 subvol_bytenr;
u64 subvol_generation;
/* bytenr/generation of the tree block in reloc tree after swap */
u64 reloc_bytenr;
u64 reloc_generation;
u64 last_snapshot;
struct btrfs_key first_key;
};
/*
* Qgroup reservation types:
*
@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *bg_cache,
struct extent_buffer *src_parent, int src_slot,
struct extent_buffer *dst_parent, int dst_slot,
u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
struct btrfs_block_group_cache *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
#endif

View File

@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
return -EIO;
}
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;

View File

@ -162,6 +162,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
/* list of subvolume trees that get relocated */
struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
if (!root->reloc_root)
if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
!root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
/* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@ -1773,7 +1777,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@ -1786,7 +1790,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
BUG_ON(ret);
}
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@ -1802,6 +1806,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
BUG_ON(level < lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
slot--;
@ -1852,7 +1858,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
slot, &eb);
BUG_ON(ret);
}
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@ -1885,15 +1891,18 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
* and tree block numbers, if current trans doesn't free
* data reloc tree inode.
*
* We don't scan the subtree right now, but only record
* the swapped tree blocks.
* The real subtree rescan is delayed until we have new
* CoW on the subtree root node before transaction commit.
*/
ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
parent, slot, path->nodes[level],
path->slots[level], last_snapshot);
ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
rc->block_group, parent, slot,
path->nodes[level], path->slots[level],
last_snapshot);
if (ret < 0)
break;
/*
* swap blocks in fs tree and reloc tree.
*/
@ -2120,6 +2129,58 @@ static int find_next_key(struct btrfs_path *path, int level,
return 1;
}
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
ASSERT(reloc_root);
reloc_root_item = &reloc_root->root_item;
memset(&reloc_root_item->drop_progress, 0,
sizeof(reloc_root_item->drop_progress));
reloc_root_item->drop_level = 0;
btrfs_set_root_refs(reloc_root_item, 0);
btrfs_update_reloc_root(trans, root);
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_fs_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
}
static int clean_dirty_subvols(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *next;
int ret = 0;
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
struct btrfs_root *reloc_root = root->reloc_root;
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
if (reloc_root) {
int ret2;
ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
if (ret2 < 0 && !ret)
ret = ret2;
}
btrfs_put_fs_root(root);
}
return ret;
}
/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
if (err == 0) {
memset(&root_item->drop_progress, 0,
sizeof(root_item->drop_progress));
root_item->drop_level = 0;
btrfs_set_root_refs(root_item, 0);
btrfs_update_reloc_root(trans, root);
}
if (err == 0)
insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans);
@ -2410,14 +2465,6 @@ void merge_reloc_roots(struct reloc_control *rc)
} else {
list_del_init(&reloc_root->root_list);
}
ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
if (ret < 0) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
}
}
if (found) {
@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key,
upper->level, &slot);
if (ret < 0) {
err = ret;
goto next;
}
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
if (ret < 0) {
err = ret;
goto next;
}
BUG_ON(ret);
}
@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
@ -4079,6 +4134,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
goto out_free;
}
btrfs_commit_transaction(trans);
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks, NULL);
@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans);
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
out_free:
kfree(rc);
out:

View File

@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
int len;
u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(int, len, (int)sizeof(*item)));
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)

View File

@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
INIT_LIST_HEAD(&sctx->csum_list);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
lockdep_assert_held(&fs_info->scrub_lock);
if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
ASSERT(fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
ASSERT(fs_info->scrub_parity_workers == NULL);
fs_info->scrub_parity_workers =
btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
} else {
refcount_inc(&fs_info->scrub_workers_refcnt);
}
++fs_info->scrub_workers_refcnt;
return 0;
fail_scrub_parity_workers:
@ -3770,16 +3778,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (--fs_info->scrub_workers_refcnt == 0) {
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
struct btrfs_workqueue *scrub_workers = NULL;
struct btrfs_workqueue *scrub_wr_comp = NULL;
struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
if (!is_dev_replace)
btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
ret ? "not finished" : "finished", devid, ret);
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
scrub_workers_put(fs_info);
if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
scrub_workers = fs_info->scrub_workers;
scrub_wr_comp = fs_info->scrub_wr_completion_workers;
scrub_parity = fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
fs_info->scrub_parity_workers = NULL;
}
mutex_unlock(&fs_info->scrub_lock);
btrfs_destroy_workqueue(scrub_workers);
btrfs_destroy_workqueue(scrub_wr_comp);
btrfs_destroy_workqueue(scrub_parity);
scrub_put_ctx(sctx);
return ret;
@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)

View File

@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
btrfs_compress_str2level(args[0].from);
btrfs_compress_str2level(
BTRFS_COMPRESS_ZLIB,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
} else if (strcmp(args[0].from, "zstd") == 0) {
} else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
info->compress_level =
btrfs_compress_str2level(
BTRFS_COMPRESS_ZSTD,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
ret = btrfs_forget_devices(vol->name);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,

View File

@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans);
btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
btrfs_set_lock_blocking(old);
btrfs_set_lock_blocking_write(old);
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->delayed_refs.flushing = 1;
smp_wmb();
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans);
btrfs_create_pending_block_groups(trans);
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {

View File

@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
btrfs_set_lock_blocking(root_node);
btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */

View File

@ -27,6 +27,7 @@
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
#define LOG_OTHER_INODE_ALL 3
/*
* directory trouble cases
@ -1330,6 +1331,67 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
return ret;
}
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
struct inode *other_inode = NULL;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
dir_item = btrfs_lookup_dir_item(NULL, root, path,
btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
if (!dir_item) {
btrfs_release_path(path);
goto add_link;
} else if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
goto out;
}
/*
* Our inode's dentry collides with the dentry of another inode which is
* in the log but not yet processed since it has a higher inode number.
* So delete that other dentry.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
btrfs_release_path(path);
other_inode = read_one_inode(root, key.objectid);
if (!other_inode) {
ret = -ENOENT;
goto out;
}
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
/*
* If we dropped the link count to 0, bump it so that later the iput()
* on the inode will not free it. We will fixup the link count later.
*/
if (other_inode->i_nlink == 0)
inc_nlink(other_inode);
ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
out:
iput(other_inode);
btrfs_free_path(path);
return ret;
}
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
ret = btrfs_add_link(trans, BTRFS_I(dir),
BTRFS_I(inode),
name, namelen, 0, ref_index);
ret = add_link(trans, root, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
if (ret < 0)
break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
u64 *other_ino)
u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
ret = 1;
*other_ino = di_key.objectid;
if (di_key.objectid != key->objectid) {
ret = 1;
*other_ino = di_key.objectid;
*other_parent = parent;
} else {
ret = 0;
}
} else {
ret = -EAGAIN;
}
@ -4801,6 +4869,144 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
return ret;
}
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx,
u64 ino, u64 parent)
{
struct btrfs_ino_list *ino_elem;
LIST_HEAD(inode_list);
int ret = 0;
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &inode_list);
while (!list_empty(&inode_list)) {
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct inode *inode;
ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
list);
ino = ino_elem->ino;
parent = ino_elem->parent;
list_del(&ino_elem->list);
kfree(ino_elem);
if (ret)
continue;
btrfs_release_path(path);
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
* directory.
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
key.objectid = parent;
inode = btrfs_iget(fs_info->sb, &key, root,
NULL);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
0, LLONG_MAX, ctx);
iput(inode);
}
}
continue;
}
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
if (ret) {
iput(inode);
continue;
}
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
iput(inode);
continue;
}
while (true) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u64 other_ino = 0;
u64 other_parent = 0;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
break;
} else if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino ||
(key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)) {
ret = 0;
break;
}
ret = btrfs_check_ref_name_override(leaf, slot, &key,
BTRFS_I(inode), &other_ino,
&other_parent);
if (ret < 0)
break;
if (ret > 0) {
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem) {
ret = -ENOMEM;
break;
}
ino_elem->ino = other_ino;
ino_elem->parent = other_parent;
list_add_tail(&ino_elem->list, &inode_list);
ret = 0;
}
path->slots[0]++;
}
iput(inode);
}
return ret;
}
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
if (inode_only == LOG_OTHER_INODE) {
inode_only = LOG_INODE_EXISTS;
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
recursive_logging = true;
if (inode_only == LOG_OTHER_INODE)
inode_only = LOG_INODE_EXISTS;
else
inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@ -4981,20 +5192,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid) {
inode->generation == trans->transid &&
!recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
&other_ino);
&other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
struct btrfs_key inode_key;
struct inode *other_inode;
if (ins_nr > 0) {
ins_nr++;
} else {
@ -5010,43 +5220,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
goto out_unlock;
}
ins_nr = 0;
btrfs_release_path(path);
inode_key.objectid = other_ino;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
other_inode = btrfs_iget(fs_info->sb,
&inode_key, root,
NULL);
/*
* If the other inode that had a conflicting dir
* entry was deleted in the current transaction,
* we don't need to do more work nor fallback to
* a transaction commit.
*/
if (other_inode == ERR_PTR(-ENOENT)) {
goto next_key;
} else if (IS_ERR(other_inode)) {
err = PTR_ERR(other_inode);
goto out_unlock;
}
/*
* We are safe logging the other inode without
* acquiring its i_mutex as long as we log with
* the LOG_INODE_EXISTS mode. We're safe against
* concurrent renames of the other inode as well
* because during a rename we pin the log and
* update the log with the new name before we
* unpin it.
*/
err = btrfs_log_inode(trans, root,
BTRFS_I(other_inode),
LOG_OTHER_INODE, 0, LLONG_MAX,
ctx);
iput(other_inode);
err = log_conflicting_inodes(trans, root, path,
ctx, other_ino, other_parent);
if (err)
goto out_unlock;
else
goto next_key;
btrfs_release_path(path);
goto next_key;
}
}

View File

@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
return dev;
}
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, const u8 *uuid)
{
struct btrfs_device *dev;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (dev->devid == devid &&
(!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
return dev;
}
}
return NULL;
}
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
int found;
rcu_read_lock();
found = strcmp(rcu_str_deref(device->name), path);
rcu_read_unlock();
return found == 0;
}
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
* Return: 0 for success or if @path is NULL.
* -EBUSY if @path is a mounted device.
* -ENOENT if @path does not match any device in the list.
*/
static void btrfs_free_stale_devices(const char *path,
static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
int ret = 0;
if (path)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
mutex_unlock(&fs_devices->device_list_mutex);
continue;
}
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
int not_found = 0;
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
rcu_read_lock();
if (path)
not_found = strcmp(rcu_str_deref(device->name),
path);
rcu_read_unlock();
if (not_found)
if (path && !device_path_matched(path, device))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
if (path && ret != 0)
ret = -EBUSY;
break;
}
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
return ret;
}
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = find_device(fs_devices, devid,
disk_super->dev_item.uuid);
device = btrfs_find_device(fs_devices, devid,
disk_super->dev_item.uuid, NULL, false);
/*
* If this disk has been pulled into an fs devices created by
@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
return 0;
}
int btrfs_forget_devices(const char *path)
{
int ret;
mutex_lock(&uuid_mutex);
ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
mutex_unlock(&uuid_mutex);
return ret;
}
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info, devid, dev_uuid,
disk_super->metadata_uuid);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->metadata_uuid, true);
else
device = btrfs_find_device(fs_info, devid,
dev_uuid, disk_super->fsid);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->fsid, true);
brelse(bh);
if (!device)
@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
return device;
}
static struct btrfs_device *btrfs_find_device_missing_or_by_path(
struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_device *device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
devices = &fs_info->fs_devices->devices;
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
device = tmp;
break;
}
}
if (!device)
return ERR_PTR(-ENOENT);
} else {
device = btrfs_find_device_by_path(fs_info, device_path);
}
return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
struct btrfs_device *device;
if (devid) {
device = btrfs_find_device(fs_info, devid, NULL, NULL);
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
} else {
if (!devpath || !devpath[0])
return ERR_PTR(-EINVAL);
device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
return device;
}
return device;
if (!device_path || !device_path[0])
return ERR_PTR(-EINVAL);
if (strcmp(device_path, "missing") == 0) {
/* Find first missing device */
list_for_each_entry(device, &fs_info->fs_devices->devices,
dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) && !device->bdev)
return device;
}
return ERR_PTR(-ENOENT);
}
return btrfs_find_device_by_path(fs_info, device_path);
}
/*
@ -2563,7 +2556,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
return BLK_STS_OK;
}
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid)
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid,
bool seed)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
cur_devices = fs_info->fs_devices;
while (cur_devices) {
while (fs_devices) {
if (!fsid ||
!memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
device = find_device(cur_devices, devid, uuid);
if (device)
return device;
!memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
list_for_each_entry(device, &fs_devices->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
BTRFS_UUID_SIZE) == 0))
return device;
}
}
cur_devices = cur_devices->seed;
if (seed)
fs_devices = fs_devices->seed;
else
return NULL;
}
return NULL;
}
@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
}
if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
(type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
(type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
(type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
btrfs_err(fs_info,
@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info, devid,
uuid, NULL);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
return PTR_ERR(fs_devices);
}
device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
dev = find_device(fs_info->fs_devices->seed, devid, NULL);
dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);

View File

@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid, bool seed);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,

View File

@ -27,6 +27,33 @@ struct workspace {
int level;
};
static struct workspace_manager wsm;
static void zlib_init_workspace_manager(void)
{
btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
}
static void zlib_cleanup_workspace_manager(void)
{
btrfs_cleanup_workspace_manager(&wsm);
}
static struct list_head *zlib_get_workspace(unsigned int level)
{
struct list_head *ws = btrfs_get_workspace(&wsm, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
return ws;
}
static void zlib_put_workspace(struct list_head *ws)
{
btrfs_put_workspace(&wsm, ws);
}
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
static struct list_head *zlib_alloc_workspace(void)
static struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@ -390,18 +418,19 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
return ret;
}
static void zlib_set_level(struct list_head *ws, unsigned int type)
static unsigned int zlib_set_level(unsigned int level)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
unsigned level = (type & 0xF0) >> 4;
if (!level)
return BTRFS_ZLIB_DEFAULT_LEVEL;
if (level > 9)
level = 9;
workspace->level = level > 0 ? level : 3;
return min_t(unsigned int, level, 9);
}
const struct btrfs_compress_op btrfs_zlib_compress = {
.init_workspace_manager = zlib_init_workspace_manager,
.cleanup_workspace_manager = zlib_cleanup_workspace_manager,
.get_workspace = zlib_get_workspace,
.put_workspace = zlib_put_workspace,
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,

View File

@ -6,25 +6,31 @@
*/
#include <linux/bio.h>
#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "compression.h"
#include "ctree.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
size_t src_len)
{
ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
src_len, 0);
ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@ -36,11 +42,290 @@ struct workspace {
void *mem;
size_t size;
char *buf;
unsigned int level;
unsigned int req_level;
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
ZSTD_inBuffer in_buf;
ZSTD_outBuffer out_buf;
};
/*
* Zstd Workspace Management
*
* Zstd workspaces have different memory requirements depending on the level.
* The zstd workspaces are managed by having individual lists for each level
* and a global lru. Forward progress is maintained by protecting a max level
* workspace.
*
* Getting a workspace is done by using the bitmap to identify the levels that
* have available workspaces and scans up. This lets us recycle higher level
* workspaces because of the monotonic memory guarantee. A workspace's
* last_used is only updated if it is being used by the corresponding memory
* level. Putting a workspace involves adding it back to the appropriate places
* and adding it back to the lru if necessary.
*
* A timer is used to reclaim workspaces if they have not been used for
* ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
* The upper bound is provided by the workqueue limit which is 2 (percpu limit).
*/
struct zstd_workspace_manager {
const struct btrfs_compress_op *ops;
spinlock_t lock;
struct list_head lru_list;
struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
unsigned long active_map;
wait_queue_head_t wait;
struct timer_list timer;
};
static struct zstd_workspace_manager wsm;
static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
static inline struct workspace *list_to_workspace(struct list_head *list)
{
return container_of(list, struct workspace, list);
}
/*
* zstd_reclaim_timer_fn - reclaim timer
* @t: timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
spin_unlock(&wsm.lock);
return;
}
list_for_each_prev_safe(pos, next, &wsm.lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
unsigned int level;
if (time_after(victim->last_used, reclaim_threshold))
break;
/* workspace is in use */
if (victim->req_level)
continue;
level = victim->level;
list_del(&victim->lru_list);
list_del(&victim->list);
wsm.ops->free_workspace(&victim->list);
if (list_empty(&wsm.idle_ws[level - 1]))
clear_bit(level - 1, &wsm.active_map);
}
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
spin_unlock(&wsm.lock);
}
/*
* zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
*
* It is possible based on the level configurations that a higher level
* workspace uses less memory than a lower level workspace. In order to reuse
* workspaces, this must be made a monotonic relationship. This precomputes
* the required memory for each level and enforces the monotonicity between
* level and memory required.
*/
static void zstd_calc_ws_mem_sizes(void)
{
size_t max_size = 0;
unsigned int level;
for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
ZSTD_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
max_t(size_t,
ZSTD_CStreamWorkspaceBound(params.cParams),
ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
zstd_ws_mem_sizes[level - 1] = max_size;
}
}
static void zstd_init_workspace_manager(void)
{
struct list_head *ws;
int i;
zstd_calc_ws_mem_sizes();
wsm.ops = &btrfs_zstd_compress;
spin_lock_init(&wsm.lock);
init_waitqueue_head(&wsm.wait);
timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
INIT_LIST_HEAD(&wsm.lru_list);
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&wsm.idle_ws[i]);
ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
pr_warn(
"BTRFS: cannot preallocate zstd compression workspace\n");
} else {
set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
}
}
static void zstd_cleanup_workspace_manager(void)
{
struct workspace *workspace;
int i;
del_timer(&wsm.timer);
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
while (!list_empty(&wsm.idle_ws[i])) {
workspace = container_of(wsm.idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
wsm.ops->free_workspace(&workspace->list);
}
}
}
/*
* zstd_find_workspace - find workspace
* @level: compression level
*
* This iterates over the set bits in the active_map beginning at the requested
* compression level. This lets us utilize already allocated workspaces before
* allocating a new one. If the workspace is of a larger size, it is used, but
* the place in the lru_list and last_used times are not updated. This is to
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
static struct list_head *zstd_find_workspace(unsigned int level)
{
struct list_head *ws;
struct workspace *workspace;
int i = level - 1;
spin_lock(&wsm.lock);
for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
if (!list_empty(&wsm.idle_ws[i])) {
ws = wsm.idle_ws[i].next;
workspace = list_to_workspace(ws);
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
if (level == workspace->level)
list_del(&workspace->lru_list);
if (list_empty(&wsm.idle_ws[i]))
clear_bit(i, &wsm.active_map);
spin_unlock(&wsm.lock);
return ws;
}
}
spin_unlock(&wsm.lock);
return NULL;
}
/*
* zstd_get_workspace - zstd's get_workspace
* @level: compression level
*
* If @level is 0, then any compression level can be used. Therefore, we begin
* scanning from 1. We first scan through possible workspaces and then after
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
static struct list_head *zstd_get_workspace(unsigned int level)
{
struct list_head *ws;
unsigned int nofs_flag;
/* level == 0 means we can use any workspace */
if (!level)
level = 1;
again:
ws = zstd_find_workspace(level);
if (ws)
return ws;
nofs_flag = memalloc_nofs_save();
ws = wsm.ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
DEFINE_WAIT(wait);
prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
finish_wait(&wsm.wait, &wait);
goto again;
}
return ws;
}
/*
* zstd_put_workspace - zstd put_workspace
* @ws: list_head for the workspace
*
* When putting back a workspace, we only need to update the LRU if we are of
* the requested compression level. Here is where we continue to protect the
* max level workspace or update last_used accordingly. If the reclaim timer
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
static void zstd_put_workspace(struct list_head *ws)
{
struct workspace *workspace = list_to_workspace(ws);
spin_lock(&wsm.lock);
/* A node is only taken off the lru if we are the corresponding level */
if (workspace->req_level == workspace->level) {
/* Hide a max level workspace from reclaim */
if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
} else {
workspace->last_used = jiffies;
list_add(&workspace->lru_list, &wsm.lru_list);
if (!timer_pending(&wsm.timer))
mod_timer(&wsm.timer,
jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
}
}
set_bit(workspace->level - 1, &wsm.active_map);
list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
workspace->req_level = 0;
spin_unlock(&wsm.lock);
if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
cond_wake_up(&wsm.wait);
}
static void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
static struct list_head *zstd_alloc_workspace(void)
static struct list_head *zstd_alloc_workspace(unsigned int level)
{
ZSTD_parameters params =
zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
workspace->size = max_t(size_t,
ZSTD_CStreamWorkspaceBound(params.cParams),
ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
workspace->size = zstd_ws_mem_sizes[level - 1];
workspace->level = level;
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
INIT_LIST_HEAD(&workspace->lru_list);
return &workspace->list;
fail:
@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
ZSTD_parameters params = zstd_get_btrfs_parameters(len);
ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
*out_pages = 0;
*total_out = 0;
@ -419,11 +705,19 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
return ret;
}
static void zstd_set_level(struct list_head *ws, unsigned int type)
static unsigned int zstd_set_level(unsigned int level)
{
if (!level)
return ZSTD_BTRFS_DEFAULT_LEVEL;
return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
}
const struct btrfs_compress_op btrfs_zstd_compress = {
.init_workspace_manager = zstd_init_workspace_manager,
.cleanup_workspace_manager = zstd_cleanup_workspace_manager,
.get_workspace = zstd_get_workspace,
.put_workspace = zstd_put_workspace,
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,

View File

@ -1051,6 +1051,7 @@ TRACE_EVENT(btrfs_trigger_flush,
{ FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \
{ FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS"}, \
{ ALLOC_CHUNK, "ALLOC_CHUNK"}, \
{ ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE"}, \
{ COMMIT_TRANS, "COMMIT_TRANS"})
TRACE_EVENT(btrfs_flush_space,
@ -1512,35 +1513,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
TP_ARGS(inode, start, len, reserved, op)
);
DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
TP_PROTO(const struct btrfs_fs_info *fs_info,
u64 ref_root, u64 reserved),
TP_ARGS(fs_info, ref_root, reserved),
TP_STRUCT__entry_btrfs(
__field( u64, ref_root )
__field( u64, reserved )
),
TP_fast_assign_btrfs(fs_info,
__entry->ref_root = ref_root;
__entry->reserved = reserved;
),
TP_printk_btrfs("root=%llu reserved=%llu op=free",
__entry->ref_root, __entry->reserved)
);
DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
TP_PROTO(const struct btrfs_fs_info *fs_info,
u64 ref_root, u64 reserved),
TP_ARGS(fs_info, ref_root, reserved)
);
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup_extent_record *rec),

View File

@ -837,6 +837,8 @@ enum btrfs_err_code {
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
struct btrfs_ioctl_vol_args)
/* trans start and trans end are dangerous, and only for
* use by applications that know how to avoid the
* resulting deadlocks