// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2013 Red Hat, Inc.
 * All Rights Reserved.
 */
|
|
|
|
#ifndef __XFS_DA_BTREE_H__
|
|
|
|
#define __XFS_DA_BTREE_H__
|
|
|
|
|
|
|
|
/*
 * Forward declarations for types that this header refers to only through
 * pointers, so that the prototypes and structures below do not depend on
 * include order for these tags.
 *
 * NOTE(review): struct zone is not referenced anywhere in this header as
 * shown (the zone extern uses struct kmem_zone); it is kept in case other
 * code relies on the declaration - candidate for removal.
 */
struct xfs_buf;
struct xfs_buf_ops;
struct xfs_da_intnode;
struct xfs_da_node_entry;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct zone;
|
|
|
|
|
2014-06-06 13:01:58 +08:00
|
|
|
/*
|
|
|
|
* Directory/attribute geometry information. There will be one of these for each
|
|
|
|
* data fork type, and it will be passed around via the xfs_da_args. Global
|
|
|
|
* structures will be attached to the xfs_mount.
|
|
|
|
*/
|
|
|
|
struct xfs_da_geometry {
|
2019-11-09 06:52:07 +08:00
|
|
|
unsigned int blksize; /* da block size in bytes */
|
|
|
|
unsigned int fsbcount; /* da block size in filesystem blocks */
|
2014-06-06 13:01:58 +08:00
|
|
|
uint8_t fsblog; /* log2 of _filesystem_ block size */
|
|
|
|
uint8_t blklog; /* log2 of da block size */
|
2019-11-09 06:57:49 +08:00
|
|
|
unsigned int node_hdr_size; /* danode header size in bytes */
|
2019-11-09 06:52:07 +08:00
|
|
|
unsigned int node_ents; /* # of entries in a danode */
|
|
|
|
unsigned int magicpct; /* 37% of block size in bytes */
|
2014-06-06 13:01:58 +08:00
|
|
|
xfs_dablk_t datablk; /* blockno of dir data v2 */
|
2019-11-09 06:57:51 +08:00
|
|
|
unsigned int leaf_hdr_size; /* dir2 leaf header size */
|
2019-11-09 06:57:51 +08:00
|
|
|
unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
|
2014-06-06 13:01:58 +08:00
|
|
|
xfs_dablk_t leafblk; /* blockno of leaf data v2 */
|
2019-11-09 07:01:29 +08:00
|
|
|
unsigned int free_hdr_size; /* dir2 free header size */
|
2019-11-09 07:01:30 +08:00
|
|
|
unsigned int free_max_bests; /* # of bests entries in dir2 free */
|
2014-06-06 13:01:58 +08:00
|
|
|
xfs_dablk_t freeblk; /* blockno of free data v2 */
|
2019-11-09 07:05:38 +08:00
|
|
|
|
|
|
|
xfs_dir2_data_aoff_t data_first_offset;
|
|
|
|
size_t data_entry_offset;
|
2014-06-06 13:01:58 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*========================================================================
 * Btree searching and modification structure definitions.
 *========================================================================*/
|
|
|
|
|
2008-05-21 14:41:01 +08:00
|
|
|
/*
 * Search comparison results.
 *
 * The enumerators rely on the default numbering (0, 1, 2); do not reorder
 * them without checking every user.
 */
enum xfs_dacmp {
	XFS_CMP_DIFFERENT,	/* names are completely different */
	XFS_CMP_EXACT,		/* names are exactly the same */
	XFS_CMP_CASE		/* names are same but differ in case */
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Structure to ease passing around component names.
|
|
|
|
*/
|
|
|
|
typedef struct xfs_da_args {
|
2014-06-06 13:01:58 +08:00
|
|
|
struct xfs_da_geometry *geo; /* da block geometry */
|
2017-06-17 02:00:05 +08:00
|
|
|
const uint8_t *name; /* string (maybe not NULL terminated) */
|
2005-04-17 06:20:36 +08:00
|
|
|
int namelen; /* length of string (maybe no NULL) */
|
2017-06-17 02:00:05 +08:00
|
|
|
uint8_t filetype; /* filetype of inode for directories */
|
|
|
|
uint8_t *value; /* set of bytes (maybe contain NULLs) */
|
2005-04-17 06:20:36 +08:00
|
|
|
int valuelen; /* length of value */
|
|
|
|
int flags; /* argument flags (eg: ATTR_NOCREATE) */
|
|
|
|
xfs_dahash_t hashval; /* hash value of name */
|
|
|
|
xfs_ino_t inumber; /* input/output inode number */
|
|
|
|
struct xfs_inode *dp; /* directory inode to manipulate */
|
|
|
|
struct xfs_trans *trans; /* current trans (changes over time) */
|
|
|
|
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
|
|
|
|
int whichfork; /* data or attribute fork */
|
|
|
|
xfs_dablk_t blkno; /* blkno of attr leaf of interest */
|
|
|
|
int index; /* index of attr of interest in blk */
|
|
|
|
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
|
|
|
|
int rmtblkcnt; /* remote attr value block count */
|
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 05:37:31 +08:00
|
|
|
int rmtvaluelen; /* remote attr value length in bytes */
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
|
|
|
|
int index2; /* index of 2nd attr in blk */
|
|
|
|
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
|
|
|
|
int rmtblkcnt2; /* remote attr value block count */
|
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 05:37:31 +08:00
|
|
|
int rmtvaluelen2; /* remote attr value length in bytes */
|
2008-05-21 14:42:05 +08:00
|
|
|
int op_flags; /* operation flags */
|
2008-05-21 14:41:01 +08:00
|
|
|
enum xfs_dacmp cmpresult; /* name compare result for lookups */
|
2005-04-17 06:20:36 +08:00
|
|
|
} xfs_da_args_t;
|
|
|
|
|
2008-05-21 14:42:05 +08:00
|
|
|
/*
 * Operation flags, stored in xfs_da_args.op_flags. Each flag is a distinct
 * single bit so that they can be OR-ed together.
 */
#define XFS_DA_OP_JUSTCHECK	0x0001	/* check for ok with no space */
#define XFS_DA_OP_RENAME	0x0002	/* this is an atomic rename op */
#define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
#define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
/*
 * ALLOCVAL moves the xattr value buffer allocation into the attribute
 * lookup code, which allocates it once the actual xattr length is known,
 * so lookups that find no attribute avoid the allocation entirely.
 */
#define XFS_DA_OP_ALLOCVAL	0x0020	/* lookup to alloc buffer if found */
|
2008-05-21 14:42:05 +08:00
|
|
|
|
2009-12-15 07:14:59 +08:00
|
|
|
/*
 * Table of { flag, "name" } initialiser pairs, one per XFS_DA_OP_* flag, in
 * ascending bit order. Expands to a comma-separated list with no enclosing
 * braces, so it can be dropped into an array initialiser or a macro
 * argument list.
 * NOTE(review): presumably consumed by tracepoint flag printing - confirm.
 * Keep this in sync whenever an XFS_DA_OP_* flag is added.
 */
#define XFS_DA_OP_FLAGS \
	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
	{ XFS_DA_OP_RENAME,	"RENAME" }, \
	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
	{ XFS_DA_OP_ALLOCVAL,	"ALLOCVAL" }
|
2009-12-15 07:14:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Storage for holding state during Btree searches and split/join ops.
|
|
|
|
*
|
|
|
|
* Only need space for 5 intermediate nodes. With a minimum of 62-way
|
|
|
|
* fanout to the Btree, we can support over 900 million directory blocks,
|
|
|
|
* which is slightly more than enough.
|
|
|
|
*/
|
|
|
|
typedef struct xfs_da_state_blk {
|
2012-06-22 16:50:14 +08:00
|
|
|
struct xfs_buf *bp; /* buffer containing block */
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_dablk_t blkno; /* filesystem blkno of buffer */
|
|
|
|
xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
|
|
|
|
int index; /* relevant index into block */
|
|
|
|
xfs_dahash_t hashval; /* last hash value in block */
|
|
|
|
int magic; /* blk's magic number, ie: blk type */
|
|
|
|
} xfs_da_state_blk_t;
|
|
|
|
|
|
|
|
typedef struct xfs_da_state_path {
|
|
|
|
int active; /* number of active levels */
|
|
|
|
xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
|
|
|
|
} xfs_da_state_path_t;
|
|
|
|
|
|
|
|
typedef struct xfs_da_state {
|
|
|
|
xfs_da_args_t *args; /* filename arguments */
|
|
|
|
struct xfs_mount *mp; /* filesystem mount point */
|
|
|
|
xfs_da_state_path_t path; /* search/split paths */
|
|
|
|
xfs_da_state_path_t altpath; /* alternate path for join */
|
|
|
|
unsigned char inleaf; /* insert into 1->lf, 0->splf */
|
|
|
|
unsigned char extravalid; /* T/F: extrablk is in use */
|
|
|
|
unsigned char extraafter; /* T/F: extrablk is after new */
|
2009-03-29 15:55:42 +08:00
|
|
|
xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* for dirv2 extrablk is data */
|
|
|
|
} xfs_da_state_t;
|
|
|
|
|
2019-11-09 06:52:06 +08:00
|
|
|
/*
 * In-core version of the node header to abstract the differences in the v2
 * and v3 disk format of the headers. Callers need to convert to/from disk
 * format as appropriate.
 *
 * NOTE(review): the scalar fields carry no comments in the original; the
 * short descriptions below are inferred from the field names - confirm
 * against the on-disk header definitions.
 */
struct xfs_da3_icnode_hdr {
	uint32_t		forw;	/* presumably: forward sibling link */
	uint32_t		back;	/* presumably: backward sibling link */
	uint16_t		magic;	/* presumably: block magic number */
	uint16_t		count;	/* presumably: entries in use */
	uint16_t		level;	/* presumably: level in the Btree */

	/*
	 * Pointer to the on-disk format entries, which are behind the
	 * variable size (v4 vs v5) header in the on-disk block.
	 */
	struct xfs_da_node_entry *btree;
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Utility macros to aid in logging changed structure fields.
 *
 * XFS_DA_LOGOFF(BASE, ADDR)
 *	Byte offset of ADDR from BASE.
 *
 * XFS_DA_LOGRANGE(BASE, ADDR, SIZE)
 *	Expands to TWO comma-separated expressions: the first and last byte
 *	offsets (inclusive) of the SIZE-byte region at ADDR, relative to
 *	BASE. It is meant to fill two consecutive arguments of a call.
 *	BASE and ADDR are each evaluated twice, so do not pass expressions
 *	with side effects.
 */
#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))
#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
|
|
|
|
|
|
|
|
/*========================================================================
 * Function prototypes.
 *========================================================================*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for growing the Btree.
|
|
|
|
*/
|
2013-04-24 16:58:02 +08:00
|
|
|
int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
|
|
|
|
int level, struct xfs_buf **bpp, int whichfork);
|
|
|
|
int xfs_da3_split(xfs_da_state_t *state);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for shrinking the Btree.
|
|
|
|
*/
|
2013-04-24 16:58:02 +08:00
|
|
|
int xfs_da3_join(xfs_da_state_t *state);
|
|
|
|
void xfs_da3_fixhashpath(struct xfs_da_state *state,
|
|
|
|
struct xfs_da_state_path *path_to_to_fix);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for finding things in the Btree.
|
|
|
|
*/
|
2013-04-24 16:58:02 +08:00
|
|
|
int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
|
|
|
|
int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
|
2005-04-17 06:20:36 +08:00
|
|
|
int forward, int release, int *result);
|
|
|
|
/*
|
|
|
|
* Utility routines.
|
|
|
|
*/
|
2013-04-24 16:58:02 +08:00
|
|
|
int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_da_state_blk_t *new_blk);
|
2013-04-24 16:58:02 +08:00
|
|
|
int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
|
2019-11-21 01:46:04 +08:00
|
|
|
xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
|
|
|
|
int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
|
|
|
|
xfs_daddr_t mappedbno, struct xfs_buf **bpp,
|
|
|
|
int whichfork);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Utility routines.
 */

/*
 * Flag bit (bit 0) for the da buffer routines.
 * NOTE(review): presumably passed in the 'flags' argument of
 * xfs_da_read_buf()/xfs_da_reada_buf() to indicate that finding a hole at
 * the requested block is acceptable rather than an error - confirm.
 */
#define XFS_DABUF_MAP_HOLE_OK	(1 << 0)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
|
2011-07-13 19:43:49 +08:00
|
|
|
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
|
|
|
|
int count);
|
2005-04-17 06:20:36 +08:00
|
|
|
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
|
2019-11-21 01:46:05 +08:00
|
|
|
xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
|
2005-04-17 06:20:36 +08:00
|
|
|
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
|
2019-11-21 01:46:04 +08:00
|
|
|
xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
|
|
|
|
int whichfork, const struct xfs_buf_ops *ops);
|
2017-02-03 07:13:58 +08:00
|
|
|
int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
|
2019-11-21 01:46:02 +08:00
|
|
|
unsigned int flags, int whichfork,
|
|
|
|
const struct xfs_buf_ops *ops);
|
2005-04-17 06:20:36 +08:00
|
|
|
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
|
2012-06-22 16:50:14 +08:00
|
|
|
struct xfs_buf *dead_buf);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-06-17 02:00:05 +08:00
|
|
|
/*
 * Name hashing and comparison. Names are passed as a byte pointer plus an
 * explicit length; xfs_da_compname() reports its verdict as an
 * enum xfs_dacmp.
 */
uint	xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
			       const unsigned char *name, int len);
|
2008-05-21 14:41:01 +08:00
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Allocate and release an xfs_da_state. A state obtained from
 * xfs_da_state_alloc() is handed back with xfs_da_state_free().
 */
xfs_da_state_t *xfs_da_state_alloc(void);
void	xfs_da_state_free(xfs_da_state_t *state);
|
|
|
|
|
2019-11-09 06:53:00 +08:00
|
|
|
/*
 * Convert a node header between the on-disk block (struct xfs_da_intnode)
 * and the in-core representation (struct xfs_da3_icnode_hdr). In both
 * functions 'to' is the destination and 'from' is the source.
 */
void	xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
				   struct xfs_da3_icnode_hdr *to,
				   struct xfs_da_intnode *from);
void	xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
				 struct xfs_da_intnode *to,
				 struct xfs_da3_icnode_hdr *from);
|
2019-11-09 06:53:00 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably the allocation zone backing
 * xfs_da_state_alloc()/xfs_da_state_free() - confirm.
 */
extern struct kmem_zone *xfs_da_state_zone;
|
|
|
|
|
|
|
|
#endif /* __XFS_DA_BTREE_H__ */
|