linux/fs/xfs/xfs_trans_dquot.c

875 lines
21 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2002 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
/*
* Add the locked dquot to the transaction.
* The dquot must be locked, and it cannot be associated with any
* transaction.
*/
void
xfs_trans_dqjoin(
struct xfs_trans *tp,
struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
/*
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
}
/*
* This is called to mark the dquot as needing
* to be logged when the transaction is committed. The dquot must
* already be associated with the given transaction.
* Note that it marks the entire transaction as dirty. In the ordinary
* case, this gets called via xfs_trans_commit, after the transaction
* is already dirty. However, there's nothing stop this from getting
* called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
* flag.
*/
void
xfs_trans_log_dquot(
struct xfs_trans *tp,
struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
/*
* Carry forward whatever is left of the quota blk reservation to
* the spanky new transaction
*/
void
xfs_trans_dup_dqinfo(
struct xfs_trans *otp,
struct xfs_trans *ntp)
{
struct xfs_dqtrx *oq, *nq;
int i, j;
struct xfs_dqtrx *oqa, *nqa;
uint64_t blk_res_used;
if (!otp->t_dqinfo)
return;
xfs_trans_alloc_dqinfo(ntp);
/*
* Because the quota blk reservation is carried forward,
* it is also necessary to carry forward the DQ_DIRTY flag.
*/
if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
oqa = otp->t_dqinfo->dqs[j];
nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
blk_res_used = 0;
if (oqa[i].qt_dquot == NULL)
break;
oq = &oqa[i];
nq = &nqa[i];
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
blk_res_used = oq->qt_bcount_delta;
nq->qt_dquot = oq->qt_dquot;
nq->qt_bcount_delta = nq->qt_icount_delta = 0;
nq->qt_rtbcount_delta = 0;
/*
* Transfer whatever is left of the reservations.
*/
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
oq->qt_blk_res = blk_res_used;
nq->qt_rtblk_res = oq->qt_rtblk_res -
oq->qt_rtblk_res_used;
oq->qt_rtblk_res = oq->qt_rtblk_res_used;
nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
oq->qt_ino_res = oq->qt_ino_res_used;
}
}
}
/*
* Wrap around mod_dquot to account for both user and group quotas.
*/
void
xfs_trans_mod_dquot_byino(
xfs_trans_t *tp,
xfs_inode_t *ip,
uint field,
int64_t delta)
{
xfs_mount_t *mp = tp->t_mountp;
if (!XFS_IS_QUOTA_RUNNING(mp) ||
!XFS_IS_QUOTA_ON(mp) ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
if (tp->t_dqinfo == NULL)
xfs_trans_alloc_dqinfo(tp);
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
}
STATIC struct xfs_dqtrx *
xfs_trans_get_dqtrx(
struct xfs_trans *tp,
struct xfs_dquot *dqp)
{
int i;
struct xfs_dqtrx *qa;
if (XFS_QM_ISUDQ(dqp))
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
else if (XFS_QM_ISGDQ(dqp))
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
else if (XFS_QM_ISPDQ(dqp))
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
else
return NULL;
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
qa[i].qt_dquot == dqp)
return &qa[i];
}
return NULL;
}
/*
* Make the changes in the transaction structure.
* The moral equivalent to xfs_trans_mod_sb().
* We don't touch any fields in the dquot, so we don't care
* if it's locked or not (most of the time it won't be).
*/
void
xfs_trans_mod_dquot(
struct xfs_trans *tp,
struct xfs_dquot *dqp,
uint field,
int64_t delta)
{
struct xfs_dqtrx *qtrx;
ASSERT(tp);
ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
qtrx = NULL;
if (tp->t_dqinfo == NULL)
xfs_trans_alloc_dqinfo(tp);
/*
* Find either the first free slot or the slot that belongs
* to this dquot.
*/
qtrx = xfs_trans_get_dqtrx(tp, dqp);
ASSERT(qtrx);
if (qtrx->qt_dquot == NULL)
qtrx->qt_dquot = dqp;
switch (field) {
/*
* regular disk blk reservation
*/
case XFS_TRANS_DQ_RES_BLKS:
qtrx->qt_blk_res += delta;
break;
/*
* inode reservation
*/
case XFS_TRANS_DQ_RES_INOS:
qtrx->qt_ino_res += delta;
break;
/*
* disk blocks used.
*/
case XFS_TRANS_DQ_BCOUNT:
qtrx->qt_bcount_delta += delta;
break;
case XFS_TRANS_DQ_DELBCOUNT:
qtrx->qt_delbcnt_delta += delta;
break;
/*
* Inode Count
*/
case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
}
qtrx->qt_icount_delta += delta;
break;
/*
* rtblk reservation
*/
case XFS_TRANS_DQ_RES_RTBLKS:
qtrx->qt_rtblk_res += delta;
break;
/*
* rtblk count
*/
case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
}
qtrx->qt_rtbcount_delta += delta;
break;
case XFS_TRANS_DQ_DELRTBCOUNT:
qtrx->qt_delrtb_delta += delta;
break;
default:
ASSERT(0);
}
tp->t_flags |= XFS_TRANS_DQ_DIRTY;
}
/*
xfs: dquot log reservations are too small During review of the separate project quota inode patches, it became obvious that the dquot log reservation calculation underestimated the number dquots that can be modified in a transaction. This has it's roots way back in the Irix quota implementation. That is, when quotas were first implemented in XFS, it only supported user and project quotas as Irix did not have group quotas. Hence the worst case operation involving dquot modification was calculated to involve 2 user dquots and 1 project dquot or 1 user dequot and 2 project dquots. i.e. 3 dquots. This was determined back in 1996, and has remained unchanged ever since. However, back in 2001, the Linux XFS port dropped all support for project quota and implmented group quotas over the top. This was effectively done with a search-and-replace of project with group, and as such the log reservation was not changed. However, with the advent of group quotas, chmod and rename now could modify more than 3 dquots in a single transaction - both could modify 4 dquots. Hence this log reservation has been wrong for a long time. In 2005, project quota support was reintroduced into Linux, but it was implemented to be mutually exclusive to group quotas and so this didn't add any new changes to the dquot log reservation. Hence when project quotas were in use (rather than group quotas) the log reservation was again valid, just like in the Irix days. Now, with the addition of the separate project quota inode, group and project quotas are no longer mutually exclusive, and hence operations can now modify three dquots per inode where previously it was only two. The worst case here is the rename transaction, which can allocate/free space on two different directory inodes, and if they have different uid/gid/prid configurations and are world writeable, then rename can actually modify 6 different dquots now. Further, the dquot log reservation doesn't take into account the space used by the dquot log format structure that precedes the dquot that is logged, and hence further underestimates the worst case log space required by dquots during a transaction. This has been missing since the first commit in 1996. Hence the worst case log reservation needs to be increased from 3 to 6, and it needs to take into account a log format header for each of those dquots. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-07-10 05:04:01 +08:00
* Given an array of dqtrx structures, lock all the dquots associated and join
* them to the transaction, provided they have been modified. We know that the
* highest number of dquots of one type - usr, grp and prj - involved in a
* transaction is 3 so we don't need to make this very generic.
*/
STATIC void
xfs_trans_dqlockedjoin(
struct xfs_trans *tp,
struct xfs_dqtrx *q)
{
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
xfs_dqlock(q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
} else {
ASSERT(XFS_QM_TRANS_MAXDQS == 2);
xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[1].qt_dquot);
}
}
/*
* Called by xfs_trans_commit() and similar in spirit to
* xfs_trans_apply_sb_deltas().
* Go thru all the dquots belonging to this transaction and modify the
* INCORE dquot to reflect the actual usages.
* Unreserve just the reservations done by this transaction.
* dquot is still left locked at exit.
*/
void
xfs_trans_apply_dquot_deltas(
struct xfs_trans *tp)
{
int i, j;
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
struct xfs_disk_dquot *d;
int64_t totalbdelta;
int64_t totalrtbdelta;
if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
ASSERT(tp->t_dqinfo);
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
qa = tp->t_dqinfo->dqs[j];
if (qa[0].qt_dquot == NULL)
continue;
/*
* Lock all of the dquots and join them to the transaction.
*/
xfs_trans_dqlockedjoin(tp, qa);
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
* The array of dquots is filled
* sequentially, not sparsely.
*/
if ((dqp = qtrx->qt_dquot) == NULL)
break;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
/*
* adjust the actual number of blocks used
*/
d = &dqp->q_core;
/*
* The issue here is - sometimes we don't make a blkquota
* reservation intentionally to be fair to users
* (when the amount is small). On the other hand,
* delayed allocs do make reservations, but that's
* outside of a transaction, so we have no
* idea how much was really reserved.
* So, here we've accumulated delayed allocation blks and
* non-delay blks. The assumption is that the
* delayed ones are always reserved (outside of a
* transaction), and the others may or may not have
* quota reservations.
*/
totalbdelta = qtrx->qt_bcount_delta +
qtrx->qt_delbcnt_delta;
totalrtbdelta = qtrx->qt_rtbcount_delta +
qtrx->qt_delrtb_delta;
#ifdef DEBUG
if (totalbdelta < 0)
ASSERT(be64_to_cpu(d->d_bcount) >=
-totalbdelta);
if (totalrtbdelta < 0)
ASSERT(be64_to_cpu(d->d_rtbcount) >=
-totalrtbdelta);
if (qtrx->qt_icount_delta < 0)
ASSERT(be64_to_cpu(d->d_icount) >=
-qtrx->qt_icount_delta);
#endif
if (totalbdelta)
be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
if (qtrx->qt_icount_delta)
be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
if (totalrtbdelta)
be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
/*
* Get any default limits in use.
* Start/reset the timer(s) if needed.
*/
if (d->d_id) {
xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
xfs_qm_adjust_dqtimers(tp->t_mountp, dqp);
}
dqp->dq_flags |= XFS_DQ_DIRTY;
/*
* add this to the list of items to get logged
*/
xfs_trans_log_dquot(tp, dqp);
/*
* Take off what's left of the original reservation.
* In case of delayed allocations, there's no
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
uint64_t blk_res_used = 0;
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
if (qtrx->qt_bcount_delta > 0)
blk_res_used = qtrx->qt_bcount_delta;
if (qtrx->qt_blk_res != blk_res_used) {
if (qtrx->qt_blk_res > blk_res_used)
dqp->q_res_bcount -= (xfs_qcnt_t)
(qtrx->qt_blk_res -
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
blk_res_used);
else
dqp->q_res_bcount -= (xfs_qcnt_t)
xfs: fix quota block reservation leak when tp allocates and frees blocks Al Viro reports that generic/231 fails frequently on XFS and bisected the problem to the following commit: 5d11fb4b xfs: rework zero range to prevent invalid i_size updates ... which is just the first commit that happens to cause fsx to reproduce the problem. fsx reproduces via zero range calls. The aforementioned commit overhauls zero range to use hole punch and fallocate. As it turns out, the problem is reproducible on demand using basic hole punch as follows: $ mkfs.xfs -f -m crc=1,finobt=1 <dev> $ mount <dev> /mnt -o uquota $ xfs_io -f -c "falloc 0 50m" /mnt/file $ for i in $(seq 1 20); do xfs_io -c "fpunch ${i}m 32k" /mnt/file; done $ rm -f /mnt/file $ repquota -us /mnt ... User used soft hard grace used soft hard grace ---------------------------------------------------------------------- root -- 32K 0K 0K 3 0 0 A file is allocated with a single 50m extent. The extent count increases via hole punches until the bmap converts to btree format. The file is removed but quota reports 32k of space usage for the user. This reservation is effectively leaked for the lifetime of the mount. The reason this occurs is because the quota block reservation tracking is confused when a transaction happens to free and allocate blocks at the same time. Consider the following sequence of events: - tp is allocated from xfs_free_file_space() and reserves several blocks for btree management. Blocks are reserved against the dquot and marked as such in the transaction (qtrx->qt_blk_res). - 8 blocks are accounted free when the 32k range is punched out. xfs_trans_mod_dquot() is called with XFS_TRANS_DQ_BCOUNT and sets ->qt_bcount_delta to -8. - Subsequently, a block is allocated against the same transaction by xfs_bmap_extents_to_btree() for btree conversion. A call to xfs_trans_mod_dquot() increases qt_blk_res_used to 1 and qt_bcount_delta to -7. - The transaction is dup'd and committed by xfs_bmap_finish(). xfs_trans_dup_dqinfo() sets the first transaction up such that it has a matching qt_blk_res and qt_blk_res_used of 1. The remaining unused reservation is transferred to the duplicate tp. When the transactions are committed, the dquots are fixed up in xfs_trans_apply_dquot_deltas() according to one of two methods: 1.) If the transaction holds a block reservation (->qt_blk_res != 0), _only_ the unused portion reservation is unaccounted from the dquot. Note that the tp duplication behavior of xfs_bmap_finish() makes it such that qt_blk_res is typically 0 for tp's with unused reservation. 2.) Otherwise, the dquot is fixed up based on the block delta (->qt_bcount_delta) created by the transaction. Therefore, if a transaction has a negative qt_bcount_delta and positive qt_blk_res_used, the former set of blocks that have been removed from the file are never factored out of the in-core dquot reservation. Instead, *_apply_dquot_deltas() sees 1 block used out of a 1 block reservation and believes there is nothing to fix up. The on-disk d_bcount is updated independently from qt_bcount_delta, and thus is correct (and allows the quota usage to correct on remount). To deal with this situation, we effectively want the "used reservation" part of the transaction to be consistent with any freed blocks with respect to quota tracking. For example, if 8 blocks are freed, the subsequent single block allocation does not need to consume the initial reservation made by the tp. Instead, it simply borrows one from the previously freed. One possible implementation of such borrowing is to avoid the blks_res_used increment when bcount_delta is negative. This alone is flawed logic in that it only handles the case where blocks are freed before allocated, however. Rather than add more complexity to manage synchronization between bcount_delta and blks_res_used, kill the latter entirely. blk_res_used is only updated in one place and always in sync with delta_bcount. Therefore, the net block reservation consumption of the transaction is always available from bcount_delta. Calculate the reservation consumption on the fly where necessary based on whether the tp has a reservation and results in a positive net block delta on the inode. Reported-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-06-01 05:15:37 +08:00
(blk_res_used -
qtrx->qt_blk_res);
}
} else {
/*
* These blks were never reserved, either inside
* a transaction or outside one (in a delayed
* allocation). Also, this isn't always a
* negative number since we sometimes
* deliberately skip quota reservations.
*/
if (qtrx->qt_bcount_delta) {
dqp->q_res_bcount +=
(xfs_qcnt_t)qtrx->qt_bcount_delta;
}
}
/*
* Adjust the RT reservation.
*/
if (qtrx->qt_rtblk_res != 0) {
if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
if (qtrx->qt_rtblk_res >
qtrx->qt_rtblk_res_used)
dqp->q_res_rtbcount -= (xfs_qcnt_t)
(qtrx->qt_rtblk_res -
qtrx->qt_rtblk_res_used);
else
dqp->q_res_rtbcount -= (xfs_qcnt_t)
(qtrx->qt_rtblk_res_used -
qtrx->qt_rtblk_res);
}
} else {
if (qtrx->qt_rtbcount_delta)
dqp->q_res_rtbcount +=
(xfs_qcnt_t)qtrx->qt_rtbcount_delta;
}
/*
* Adjust the inode reservation.
*/
if (qtrx->qt_ino_res != 0) {
ASSERT(qtrx->qt_ino_res >=
qtrx->qt_ino_res_used);
if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
dqp->q_res_icount -= (xfs_qcnt_t)
(qtrx->qt_ino_res -
qtrx->qt_ino_res_used);
} else {
if (qtrx->qt_icount_delta)
dqp->q_res_icount +=
(xfs_qcnt_t)qtrx->qt_icount_delta;
}
ASSERT(dqp->q_res_bcount >=
be64_to_cpu(dqp->q_core.d_bcount));
ASSERT(dqp->q_res_icount >=
be64_to_cpu(dqp->q_core.d_icount));
ASSERT(dqp->q_res_rtbcount >=
be64_to_cpu(dqp->q_core.d_rtbcount));
}
}
}
/*
* Release the reservations, and adjust the dquots accordingly.
* This is called only when the transaction is being aborted. If by
* any chance we have done dquot modifications incore (ie. deltas) already,
* we simply throw those away, since that's the expected behavior
* when a transaction is curtailed without a commit.
*/
void
xfs_trans_unreserve_and_mod_dquots(
struct xfs_trans *tp)
{
int i, j;
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
qa = tp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
* We assume that the array of dquots is filled
* sequentially, not sparsely.
*/
if ((dqp = qtrx->qt_dquot) == NULL)
break;
/*
* Unreserve the original reservation. We don't care
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
locked = false;
if (qtrx->qt_blk_res) {
xfs_dqlock(dqp);
locked = true;
dqp->q_res_bcount -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
if (qtrx->qt_ino_res) {
if (!locked) {
xfs_dqlock(dqp);
locked = true;
}
dqp->q_res_icount -=
(xfs_qcnt_t)qtrx->qt_ino_res;
}
if (qtrx->qt_rtblk_res) {
if (!locked) {
xfs_dqlock(dqp);
locked = true;
}
dqp->q_res_rtbcount -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
if (locked)
xfs_dqunlock(dqp);
}
}
}
STATIC void
xfs_quota_warn(
struct xfs_mount *mp,
struct xfs_dquot *dqp,
int type)
{
enum quota_type qtype;
if (dqp->dq_flags & XFS_DQ_PROJ)
qtype = PRJQUOTA;
else if (dqp->dq_flags & XFS_DQ_USER)
qtype = USRQUOTA;
else
qtype = GRPQUOTA;
quota_send_warning(make_kqid(&init_user_ns, qtype,
be32_to_cpu(dqp->q_core.d_id)),
mp->m_super->s_dev, type);
}
/*
* This reserves disk blocks and inodes against a dquot.
* Flags indicate if the dquot is to be locked here and also
* if the blk reservation is for RT or regular blocks.
* Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
*/
STATIC int
xfs_trans_dqresv(
struct xfs_trans *tp,
struct xfs_mount *mp,
struct xfs_dquot *dqp,
int64_t nblks,
long ninos,
uint flags)
{
xfs_qcnt_t hardlimit;
xfs_qcnt_t softlimit;
time64_t timer;
xfs_qwarncnt_t warns;
xfs_qwarncnt_t warnlimit;
xfs_qcnt_t total_count;
xfs_qcnt_t *resbcountp;
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_def_quota *defq;
xfs_dqlock(dqp);
defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
if (flags & XFS_TRANS_DQ_RES_BLKS) {
hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (!hardlimit)
hardlimit = defq->bhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
if (!softlimit)
softlimit = defq->bsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_btimer);
warns = be16_to_cpu(dqp->q_core.d_bwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
resbcountp = &dqp->q_res_bcount;
} else {
ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
if (!hardlimit)
hardlimit = defq->rtbhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
if (!softlimit)
softlimit = defq->rtbsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
resbcountp = &dqp->q_res_rtbcount;
}
if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
dqp->q_core.d_id &&
((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
(XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
(XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
if (nblks > 0) {
/*
* dquot is locked already. See if we'd go over the
* hardlimit or exceed the timelimit if we allocate
* nblks.
*/
total_count = *resbcountp + nblks;
if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
goto error_return;
}
if (softlimit && total_count > softlimit) {
if ((timer != 0 &&
ktime_get_real_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
QUOTA_NL_BSOFTLONGWARN);
goto error_return;
}
xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
}
}
if (ninos > 0) {
total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
timer = be32_to_cpu(dqp->q_core.d_itimer);
warns = be16_to_cpu(dqp->q_core.d_iwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (!hardlimit)
hardlimit = defq->ihardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
if (!softlimit)
softlimit = defq->isoftlimit;
if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
goto error_return;
}
if (softlimit && total_count > softlimit) {
if ((timer != 0 &&
ktime_get_real_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
QUOTA_NL_ISOFTLONGWARN);
goto error_return;
}
xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
}
}
}
/*
* Change the reservation, but not the actual usage.
* Note that q_res_bcount = q_core.d_bcount + resv
*/
(*resbcountp) += (xfs_qcnt_t)nblks;
if (ninos != 0)
dqp->q_res_icount += (xfs_qcnt_t)ninos;
/*
* note the reservation amt in the trans struct too,
* so that the transaction knows how much was reserved by
* it against this particular dquot.
* We don't do this when we are reserving for a delayed allocation,
* because we don't have the luxury of a transaction envelope then.
*/
if (tp) {
ASSERT(tp->t_dqinfo);
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
if (nblks != 0)
xfs_trans_mod_dquot(tp, dqp,
flags & XFS_QMOPT_RESBLK_MASK,
nblks);
if (ninos != 0)
xfs_trans_mod_dquot(tp, dqp,
XFS_TRANS_DQ_RES_INOS,
ninos);
}
ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
xfs_dqunlock(dqp);
return 0;
error_return:
xfs_dqunlock(dqp);
xfs: always return -ENOSPC on project quota reservation failure XFS project quota treats project hierarchies as "mini filesysems" and so rather than -EDQUOT, the intent is to return -ENOSPC when a quota reservation fails, but this behavior is not consistent. The only place we make a decision between -EDQUOT and -ENOSPC returns based on quota type is in xfs_trans_dqresv(). This behavior is currently controlled by whether or not the XFS_QMOPT_ENOSPC flag gets passed into the quota reservation. However, its use is not consistent; paths such as xfs_create() and xfs_symlink() don't set the flag, so a reservation failure will return -EDQUOT for project quota reservation failures rather than -ENOSPC for these sorts of operations, even for project quota: # mkdir mnt/project # xfs_quota -x -c "project -s -p mnt/project 42" mnt # xfs_quota -x -c 'limit -p isoft=2 ihard=3 42' mnt # touch mnt/project/file{1,2,3} touch: cannot touch ‘mnt/project/file3’: Disk quota exceeded We can make this consistent by not requiring the flag to be set at the top of the callchain; instead we can simply test whether we are reserving a project quota with XFS_QM_ISPDQ in xfs_trans_dqresv and if so, return -ENOSPC for that failure. This removes the need for the XFS_QMOPT_ENOSPC altogether and simplifies the code a fair bit. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-05-22 04:06:59 +08:00
if (XFS_QM_ISPDQ(dqp))
return -ENOSPC;
return -EDQUOT;
}
/*
* Given dquot(s), make disk block and/or inode reservations against them.
* The fact that this does the reservation against user, group and
* project quotas is important, because this follows a all-or-nothing
* approach.
*
* flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
* XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
* XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
* XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
* dquots are unlocked on return, if they were not locked by caller.
*/
int
xfs_trans_reserve_quota_bydquots(
struct xfs_trans *tp,
struct xfs_mount *mp,
struct xfs_dquot *udqp,
struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp,
int64_t nblks,
long ninos,
uint flags)
{
int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
if (tp && tp->t_dqinfo == NULL)
xfs_trans_alloc_dqinfo(tp);
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
if (udqp) {
xfs: always return -ENOSPC on project quota reservation failure XFS project quota treats project hierarchies as "mini filesysems" and so rather than -EDQUOT, the intent is to return -ENOSPC when a quota reservation fails, but this behavior is not consistent. The only place we make a decision between -EDQUOT and -ENOSPC returns based on quota type is in xfs_trans_dqresv(). This behavior is currently controlled by whether or not the XFS_QMOPT_ENOSPC flag gets passed into the quota reservation. However, its use is not consistent; paths such as xfs_create() and xfs_symlink() don't set the flag, so a reservation failure will return -EDQUOT for project quota reservation failures rather than -ENOSPC for these sorts of operations, even for project quota: # mkdir mnt/project # xfs_quota -x -c "project -s -p mnt/project 42" mnt # xfs_quota -x -c 'limit -p isoft=2 ihard=3 42' mnt # touch mnt/project/file{1,2,3} touch: cannot touch ‘mnt/project/file3’: Disk quota exceeded We can make this consistent by not requiring the flag to be set at the top of the callchain; instead we can simply test whether we are reserving a project quota with XFS_QM_ISPDQ in xfs_trans_dqresv and if so, return -ENOSPC for that failure. This removes the need for the XFS_QMOPT_ENOSPC altogether and simplifies the code a fair bit. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-05-22 04:06:59 +08:00
error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
if (error)
return error;
}
if (gdqp) {
xfs: always return -ENOSPC on project quota reservation failure XFS project quota treats project hierarchies as "mini filesysems" and so rather than -EDQUOT, the intent is to return -ENOSPC when a quota reservation fails, but this behavior is not consistent. The only place we make a decision between -EDQUOT and -ENOSPC returns based on quota type is in xfs_trans_dqresv(). This behavior is currently controlled by whether or not the XFS_QMOPT_ENOSPC flag gets passed into the quota reservation. However, its use is not consistent; paths such as xfs_create() and xfs_symlink() don't set the flag, so a reservation failure will return -EDQUOT for project quota reservation failures rather than -ENOSPC for these sorts of operations, even for project quota: # mkdir mnt/project # xfs_quota -x -c "project -s -p mnt/project 42" mnt # xfs_quota -x -c 'limit -p isoft=2 ihard=3 42' mnt # touch mnt/project/file{1,2,3} touch: cannot touch ‘mnt/project/file3’: Disk quota exceeded We can make this consistent by not requiring the flag to be set at the top of the callchain; instead we can simply test whether we are reserving a project quota with XFS_QM_ISPDQ in xfs_trans_dqresv and if so, return -ENOSPC for that failure. This removes the need for the XFS_QMOPT_ENOSPC altogether and simplifies the code a fair bit. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-05-22 04:06:59 +08:00
error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
if (error)
goto unwind_usr;
}
if (pdqp) {
error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
if (error)
goto unwind_grp;
}
/*
* Didn't change anything critical, so, no need to log
*/
return 0;
unwind_grp:
flags |= XFS_QMOPT_FORCE_RES;
if (gdqp)
xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
unwind_usr:
flags |= XFS_QMOPT_FORCE_RES;
if (udqp)
xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
return error;
}
/*
* Lock the dquot and change the reservation if we can.
* This doesn't change the actual usage, just the reservation.
* The inode sent in is locked.
*/
int
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
int64_t nblks,
long ninos,
uint flags)
{
struct xfs_mount *mp = ip->i_mount;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
xfs: always return -ENOSPC on project quota reservation failure XFS project quota treats project hierarchies as "mini filesysems" and so rather than -EDQUOT, the intent is to return -ENOSPC when a quota reservation fails, but this behavior is not consistent. The only place we make a decision between -EDQUOT and -ENOSPC returns based on quota type is in xfs_trans_dqresv(). This behavior is currently controlled by whether or not the XFS_QMOPT_ENOSPC flag gets passed into the quota reservation. However, its use is not consistent; paths such as xfs_create() and xfs_symlink() don't set the flag, so a reservation failure will return -EDQUOT for project quota reservation failures rather than -ENOSPC for these sorts of operations, even for project quota: # mkdir mnt/project # xfs_quota -x -c "project -s -p mnt/project 42" mnt # xfs_quota -x -c 'limit -p isoft=2 ihard=3 42' mnt # touch mnt/project/file{1,2,3} touch: cannot touch ‘mnt/project/file3’: Disk quota exceeded We can make this consistent by not requiring the flag to be set at the top of the callchain; instead we can simply test whether we are reserving a project quota with XFS_QM_ISPDQ in xfs_trans_dqresv and if so, return -ENOSPC for that failure. This removes the need for the XFS_QMOPT_ENOSPC altogether and simplifies the code a fair bit. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-05-22 04:06:59 +08:00
ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS ||
(flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS);
/*
* Reserve nblks against these dquots, with trans as the mediator.
*/
return xfs_trans_reserve_quota_bydquots(tp, mp,
ip->i_udquot, ip->i_gdquot,
ip->i_pdquot,
nblks, ninos, flags);
}
/*
* This routine is called to allocate a quotaoff log item.
*/
struct xfs_qoff_logitem *
xfs_trans_get_qoff_item(
struct xfs_trans *tp,
struct xfs_qoff_logitem *startqoff,
uint flags)
{
struct xfs_qoff_logitem *q;
ASSERT(tp != NULL);
q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
ASSERT(q != NULL);
/*
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &q->qql_item);
return q;
}
/*
* This is called to mark the quotaoff logitem as needing
* to be logged when the transaction is committed. The logitem must
* already be associated with the given transaction.
*/
void
xfs_trans_log_quotaoff_item(
struct xfs_trans *tp,
struct xfs_qoff_logitem *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
}
STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
}
void
xfs_trans_free_dqinfo(
xfs_trans_t *tp)
{
if (!tp->t_dqinfo)
return;
kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}