2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-11-02 11:58:39 +08:00
|
|
|
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-11-02 11:58:39 +08:00
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License as
|
2005-04-17 06:20:36 +08:00
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
2005-11-02 11:58:39 +08:00
|
|
|
* This program is distributed in the hope that it would be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-11-02 11:58:39 +08:00
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write the Free Software Foundation,
|
|
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_fs.h"
|
2013-10-23 07:36:05 +08:00
|
|
|
#include "xfs_shared.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_format.h"
|
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_bit.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_sb.h"
|
|
|
|
#include "xfs_mount.h"
|
2013-10-15 06:17:51 +08:00
|
|
|
#include "xfs_da_format.h"
|
2014-12-04 06:43:17 +08:00
|
|
|
#include "xfs_da_btree.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_inode.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_dir2.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_ialloc.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_alloc.h"
|
|
|
|
#include "xfs_rtalloc.h"
|
|
|
|
#include "xfs_bmap.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_trans.h"
|
|
|
|
#include "xfs_trans_priv.h"
|
|
|
|
#include "xfs_log.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_error.h"
|
|
|
|
#include "xfs_quota.h"
|
|
|
|
#include "xfs_fsops.h"
|
2009-12-15 07:14:59 +08:00
|
|
|
#include "xfs_trace.h"
|
2012-10-08 18:56:09 +08:00
|
|
|
#include "xfs_icache.h"
|
2014-07-15 06:07:01 +08:00
|
|
|
#include "xfs_sysfs.h"
|
2009-12-15 07:14:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-03-30 16:21:31 +08:00
|
|
|
static DEFINE_MUTEX(xfs_uuid_table_mutex);
|
|
|
|
static int xfs_uuid_table_size;
|
|
|
|
static uuid_t *xfs_uuid_table;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See if the UUID is unique among mounted XFS filesystems.
|
|
|
|
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_uuid_mount(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
uuid_t *uuid = &mp->m_sb.sb_uuid;
|
|
|
|
int hole, i;
|
|
|
|
|
|
|
|
if (mp->m_flags & XFS_MOUNT_NOUUID)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (uuid_is_nil(uuid)) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Filesystem has nil UUID - can't mount");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2009-03-30 16:21:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&xfs_uuid_table_mutex);
|
|
|
|
for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
|
|
|
|
if (uuid_is_nil(&xfs_uuid_table[i])) {
|
|
|
|
hole = i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (uuid_equal(uuid, &xfs_uuid_table[i]))
|
|
|
|
goto out_duplicate;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (hole < 0) {
|
|
|
|
xfs_uuid_table = kmem_realloc(xfs_uuid_table,
|
|
|
|
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
|
|
|
|
xfs_uuid_table_size * sizeof(*xfs_uuid_table),
|
|
|
|
KM_SLEEP);
|
|
|
|
hole = xfs_uuid_table_size++;
|
|
|
|
}
|
|
|
|
xfs_uuid_table[hole] = *uuid;
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_duplicate:
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
2012-01-13 13:58:39 +08:00
|
|
|
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2009-03-30 16:21:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
STATIC void
|
|
|
|
xfs_uuid_unmount(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
uuid_t *uuid = &mp->m_sb.sb_uuid;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (mp->m_flags & XFS_MOUNT_NOUUID)
|
|
|
|
return;
|
|
|
|
|
|
|
|
mutex_lock(&xfs_uuid_table_mutex);
|
|
|
|
for (i = 0; i < xfs_uuid_table_size; i++) {
|
|
|
|
if (uuid_is_nil(&xfs_uuid_table[i]))
|
|
|
|
continue;
|
|
|
|
if (!uuid_equal(uuid, &xfs_uuid_table[i]))
|
|
|
|
continue;
|
|
|
|
memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ASSERT(i < xfs_uuid_table_size);
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-09-22 08:47:20 +08:00
|
|
|
STATIC void
|
|
|
|
__xfs_free_perag(
|
|
|
|
struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
|
|
|
|
|
|
|
|
ASSERT(atomic_read(&pag->pag_ref) == 0);
|
|
|
|
kmem_free(pag);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2010-09-22 08:47:20 +08:00
|
|
|
* Free up the per-ag resources associated with the mount structure.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-05-20 13:10:52 +08:00
|
|
|
STATIC void
|
2008-08-13 14:50:47 +08:00
|
|
|
xfs_free_perag(
|
2007-08-30 15:20:39 +08:00
|
|
|
xfs_mount_t *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-01-11 19:47:44 +08:00
|
|
|
xfs_agnumber_t agno;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
|
|
|
|
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
|
|
|
|
spin_lock(&mp->m_perag_lock);
|
|
|
|
pag = radix_tree_delete(&mp->m_perag_tree, agno);
|
|
|
|
spin_unlock(&mp->m_perag_lock);
|
2010-09-22 08:47:20 +08:00
|
|
|
ASSERT(pag);
|
2010-11-08 16:55:04 +08:00
|
|
|
ASSERT(atomic_read(&pag->pag_ref) == 0);
|
2010-09-22 08:47:20 +08:00
|
|
|
call_rcu(&pag->rcu_head, __xfs_free_perag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-05-14 16:24:02 +08:00
|
|
|
/*
|
|
|
|
* Check size of device based on the (data/realtime) block count.
|
|
|
|
* Note: this check is used by the growfs code as well as mount.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_sb_validate_fsb_count(
|
|
|
|
xfs_sb_t *sbp,
|
|
|
|
__uint64_t nblocks)
|
|
|
|
{
|
|
|
|
ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
|
|
|
|
ASSERT(sbp->sb_blocklog >= BBSHIFT);
|
|
|
|
|
2014-07-30 07:12:05 +08:00
|
|
|
/* Limited by ULONG_MAX of page cache index */
|
2007-05-14 16:24:02 +08:00
|
|
|
if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EFBIG;
|
2007-05-14 16:24:02 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
int
|
2005-11-02 12:11:45 +08:00
|
|
|
xfs_initialize_perag(
|
|
|
|
xfs_mount_t *mp,
|
2010-01-11 19:47:44 +08:00
|
|
|
xfs_agnumber_t agcount,
|
|
|
|
xfs_agnumber_t *maxagi)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2012-09-20 21:32:38 +08:00
|
|
|
xfs_agnumber_t index;
|
2010-01-11 19:47:48 +08:00
|
|
|
xfs_agnumber_t first_initialised = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_perag_t *pag;
|
|
|
|
xfs_agino_t agino;
|
|
|
|
xfs_ino_t ino;
|
|
|
|
xfs_sb_t *sbp = &mp->m_sb;
|
2010-01-11 19:47:48 +08:00
|
|
|
int error = -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
/*
|
|
|
|
* Walk the current per-ag tree so we don't try to initialise AGs
|
|
|
|
* that already exist (growfs case). Allocate and insert all the
|
|
|
|
* AGs we don't find ready for initialisation.
|
|
|
|
*/
|
|
|
|
for (index = 0; index < agcount; index++) {
|
|
|
|
pag = xfs_perag_get(mp, index);
|
|
|
|
if (pag) {
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
continue;
|
|
|
|
}
|
2010-01-11 19:47:48 +08:00
|
|
|
if (!first_initialised)
|
|
|
|
first_initialised = index;
|
2010-05-29 03:03:10 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
|
|
|
|
if (!pag)
|
2010-01-11 19:47:48 +08:00
|
|
|
goto out_unwind;
|
2010-05-29 03:03:10 +08:00
|
|
|
pag->pag_agno = index;
|
|
|
|
pag->pag_mount = mp;
|
2010-12-16 14:08:41 +08:00
|
|
|
spin_lock_init(&pag->pag_ici_lock);
|
2010-09-27 09:09:51 +08:00
|
|
|
mutex_init(&pag->pag_ici_reclaim_lock);
|
2010-05-29 03:03:10 +08:00
|
|
|
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
|
2010-09-24 17:59:04 +08:00
|
|
|
spin_lock_init(&pag->pag_buf_lock);
|
|
|
|
pag->pag_buf_tree = RB_ROOT;
|
2010-05-29 03:03:10 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
if (radix_tree_preload(GFP_NOFS))
|
2010-01-11 19:47:48 +08:00
|
|
|
goto out_unwind;
|
2010-05-29 03:03:10 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
spin_lock(&mp->m_perag_lock);
|
|
|
|
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
|
|
|
|
BUG();
|
|
|
|
spin_unlock(&mp->m_perag_lock);
|
2010-01-11 19:47:48 +08:00
|
|
|
radix_tree_preload_end();
|
|
|
|
error = -EEXIST;
|
|
|
|
goto out_unwind;
|
2010-01-11 19:47:44 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&mp->m_perag_lock);
|
|
|
|
radix_tree_preload_end();
|
|
|
|
}
|
|
|
|
|
2010-05-29 03:03:10 +08:00
|
|
|
/*
|
|
|
|
* If we mount with the inode64 option, or no inode overflows
|
|
|
|
* the legacy 32-bit address space clear the inode32 option.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2010-05-29 03:03:10 +08:00
|
|
|
agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
|
|
|
|
ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
|
|
|
|
|
|
|
|
if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_flags |= XFS_MOUNT_32BITINODES;
|
2010-05-29 03:03:10 +08:00
|
|
|
else
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
|
|
|
|
|
2012-09-20 21:32:38 +08:00
|
|
|
if (mp->m_flags & XFS_MOUNT_32BITINODES)
|
xfs: allow inode allocations in post-growfs disk space
Today, if we perform an xfs_growfs which adds allocation groups,
mp->m_maxagi is not properly updated when the growfs is complete.
Therefore inodes will continue to be allocated only in the
AGs which existed prior to the growfs, and the new space
won't be utilized.
This is because of this path in xfs_growfs_data_private():
xfs_growfs_data_private
xfs_initialize_perag(mp, nagcount, &nagimax);
if (mp->m_flags & XFS_MOUNT_32BITINODES)
index = xfs_set_inode32(mp);
else
index = xfs_set_inode64(mp);
if (maxagi)
*maxagi = index;
where xfs_set_inode* iterates over the (old) agcount in
mp->m_sb.sb_agblocks, which has not yet been updated
in the growfs path. So "index" will be returned based on
the old agcount, not the new one, and new AGs are not available
for inode allocation.
Fix this by explicitly passing the proper AG count (which
xfs_initialize_perag() already has) down another level,
so that xfs_set_inode* can make the proper decision about
acceptable AGs for inode allocation in the potentially
newly-added AGs.
This has been broken since 3.7, when these two
xfs_set_inode* functions were added in commit 2d2194f.
Prior to that, we looped over "agcount" not sb_agblocks
in these calculations.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-24 18:51:54 +08:00
|
|
|
index = xfs_set_inode32(mp, agcount);
|
2012-09-20 21:32:38 +08:00
|
|
|
else
|
xfs: allow inode allocations in post-growfs disk space
Today, if we perform an xfs_growfs which adds allocation groups,
mp->m_maxagi is not properly updated when the growfs is complete.
Therefore inodes will continue to be allocated only in the
AGs which existed prior to the growfs, and the new space
won't be utilized.
This is because of this path in xfs_growfs_data_private():
xfs_growfs_data_private
xfs_initialize_perag(mp, nagcount, &nagimax);
if (mp->m_flags & XFS_MOUNT_32BITINODES)
index = xfs_set_inode32(mp);
else
index = xfs_set_inode64(mp);
if (maxagi)
*maxagi = index;
where xfs_set_inode* iterates over the (old) agcount in
mp->m_sb.sb_agblocks, which has not yet been updated
in the growfs path. So "index" will be returned based on
the old agcount, not the new one, and new AGs are not available
for inode allocation.
Fix this by explicitly passing the proper AG count (which
xfs_initialize_perag() already has) down another level,
so that xfs_set_inode* can make the proper decision about
acceptable AGs for inode allocation in the potentially
newly-added AGs.
This has been broken since 3.7, when these two
xfs_set_inode* functions were added in commit 2d2194f.
Prior to that, we looped over "agcount" not sb_agblocks
in these calculations.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-24 18:51:54 +08:00
|
|
|
index = xfs_set_inode64(mp, agcount);
|
2010-05-29 03:03:10 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
if (maxagi)
|
|
|
|
*maxagi = index;
|
|
|
|
return 0;
|
2010-01-11 19:47:48 +08:00
|
|
|
|
|
|
|
out_unwind:
|
|
|
|
kmem_free(pag);
|
|
|
|
for (; index > first_initialised; index--) {
|
|
|
|
pag = radix_tree_delete(&mp->m_perag_tree, index);
|
|
|
|
kmem_free(pag);
|
|
|
|
}
|
|
|
|
return error;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* xfs_readsb
|
|
|
|
*
|
|
|
|
* Does the initial read of the superblock.
|
|
|
|
*/
|
|
|
|
int
|
2013-08-12 18:49:41 +08:00
|
|
|
xfs_readsb(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned int sector_size;
|
2013-04-03 13:11:31 +08:00
|
|
|
struct xfs_buf *bp;
|
|
|
|
struct xfs_sb *sbp = &mp->m_sb;
|
2005-04-17 06:20:36 +08:00
|
|
|
int error;
|
2011-03-07 07:04:35 +08:00
|
|
|
int loud = !(flags & XFS_MFSI_QUIET);
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
const struct xfs_buf_ops *buf_ops;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
ASSERT(mp->m_sb_bp == NULL);
|
|
|
|
ASSERT(mp->m_ddev_targp != NULL);
|
|
|
|
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
/*
|
|
|
|
* For the initial read, we must guess at the sector
|
|
|
|
* size based on the block device. It's enough to
|
|
|
|
* get the sb_sectsize out of the superblock and
|
|
|
|
* then reread with the proper length.
|
|
|
|
* We don't verify it yet, because it may not be complete.
|
|
|
|
*/
|
|
|
|
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
|
|
|
|
buf_ops = NULL;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Allocate a (locked) buffer to hold the superblock.
|
|
|
|
* This will be kept around at all times to optimize
|
|
|
|
* access to the superblock.
|
|
|
|
*/
|
2010-09-22 08:47:20 +08:00
|
|
|
reread:
|
2014-10-02 07:05:32 +08:00
|
|
|
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
|
|
|
|
BTOBB(sector_size), 0, &bp, buf_ops);
|
|
|
|
if (error) {
|
2012-11-12 19:54:02 +08:00
|
|
|
if (loud)
|
2013-04-03 13:11:32 +08:00
|
|
|
xfs_warn(mp, "SB validate failed with error %d.", error);
|
2014-03-07 13:19:14 +08:00
|
|
|
/* bad CRC means corrupted metadata */
|
2014-06-25 12:58:08 +08:00
|
|
|
if (error == -EFSBADCRC)
|
|
|
|
error = -EFSCORRUPTED;
|
2014-10-02 07:05:32 +08:00
|
|
|
return error;
|
2012-11-12 19:54:02 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the mount structure from the superblock.
|
|
|
|
*/
|
2014-06-06 14:00:43 +08:00
|
|
|
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we haven't validated the superblock, do so now before we try
|
|
|
|
* to check the sector size and reread the superblock appropriately.
|
|
|
|
*/
|
|
|
|
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
|
|
|
|
if (loud)
|
|
|
|
xfs_warn(mp, "Invalid superblock magic number");
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EINVAL;
|
2014-06-06 14:00:43 +08:00
|
|
|
goto release_buf;
|
|
|
|
}
|
2013-08-12 18:49:41 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We must be able to do sector-sized and sector-aligned IO.
|
|
|
|
*/
|
2013-04-03 13:11:31 +08:00
|
|
|
if (sector_size > sbp->sb_sectsize) {
|
2011-03-07 07:04:35 +08:00
|
|
|
if (loud)
|
|
|
|
xfs_warn(mp, "device supports %u byte sectors (not %u)",
|
2013-04-03 13:11:31 +08:00
|
|
|
sector_size, sbp->sb_sectsize);
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -ENOSYS;
|
2010-09-22 08:47:20 +08:00
|
|
|
goto release_buf;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
if (buf_ops == NULL) {
|
2014-06-06 14:00:43 +08:00
|
|
|
/*
|
|
|
|
* Re-read the superblock so the buffer is correctly sized,
|
|
|
|
* and properly verified.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_buf_relse(bp);
|
2013-04-03 13:11:31 +08:00
|
|
|
sector_size = sbp->sb_sectsize;
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
|
2010-09-22 08:47:20 +08:00
|
|
|
goto reread;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2015-02-23 18:22:31 +08:00
|
|
|
xfs_reinit_percpu_counters(mp);
|
2006-03-14 10:13:09 +08:00
|
|
|
|
2013-04-03 13:11:31 +08:00
|
|
|
/* no need to be quiet anymore, so reset the buf ops */
|
|
|
|
bp->b_ops = &xfs_sb_buf_ops;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_sb_bp = bp;
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_unlock(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
|
2010-09-22 08:47:20 +08:00
|
|
|
release_buf:
|
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-10-12 09:03:40 +08:00
|
|
|
* Update alignment values based on mount options and sb values
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
STATIC int
|
2009-01-19 09:04:07 +08:00
|
|
|
xfs_update_alignment(xfs_mount_t *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
xfs_sb_t *sbp = &(mp->m_sb);
|
|
|
|
|
2008-08-13 14:49:32 +08:00
|
|
|
if (mp->m_dalign) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* If stripe unit and stripe width are not multiples
|
|
|
|
* of the fs blocksize turn off alignment.
|
|
|
|
*/
|
|
|
|
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
|
|
|
|
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
|
2013-05-02 19:27:47 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"alignment check failed: sunit/swidth vs. blocksize(%d)",
|
|
|
|
sbp->sb_blocksize);
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Convert the stripe unit and width to FSBs.
|
|
|
|
*/
|
|
|
|
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
|
|
|
|
if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
|
2011-03-07 07:05:35 +08:00
|
|
|
xfs_warn(mp,
|
2013-05-02 19:27:47 +08:00
|
|
|
"alignment check failed: sunit/swidth vs. agsize(%d)",
|
|
|
|
sbp->sb_agblocks);
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else if (mp->m_dalign) {
|
|
|
|
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
|
|
|
|
} else {
|
2013-05-02 19:27:47 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"alignment check failed: sunit(%d) less than bsize(%d)",
|
|
|
|
mp->m_dalign, sbp->sb_blocksize);
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update superblock with new values
|
|
|
|
* and log changes
|
|
|
|
*/
|
2008-03-06 10:44:28 +08:00
|
|
|
if (xfs_sb_version_hasdalign(sbp)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (sbp->sb_unit != mp->m_dalign) {
|
|
|
|
sbp->sb_unit = mp->m_dalign;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
if (sbp->sb_width != mp->m_swidth) {
|
|
|
|
sbp->sb_width = mp->m_swidth;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
xfs: Don't keep silent if sunit/swidth can not be changed via mount
As per the mount man page, sunit and swidth can be changed via
mount options. For XFS, on the face of it, those options seems
works if the specified alignments is properly, e.g.
# mount -o sunit=4096,swidth=8192 /dev/sdb1 /mnt
# mount | grep sdb1
/dev/sdb1 on /mnt type xfs (rw,sunit=4096,swidth=8192)
However, neither sunit nor swidth is shown from the xfs_info output.
# xfs_info /mnt
meta-data=/dev/sdb1 isize=256 agcount=4, agsize=262144 blks
= sectsz=512 attr=2
data = bsize=4096 blocks=1048576, imaxpct=25
= sunit=0 swidth=0 blks
^^^^^^^^^^^^^^^^^^^^^^^^^^
naming =version 2 bsize=4096 ascii-ci=0
log =internal bsize=4096 blocks=2560, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
The reason is that the alignment can only be changed if the relevant
super block is already configured with alignments, otherwise, the
given value is silently ignored.
With this fix, the attempt to mount a storage without strip alignment
setup on a super block will get an error with a warning in syslog to
indicate the true cause, e.g.
# mount -o sunit=4096,swidth=8192 /dev/sdb1 /mnt
mount: wrong fs type, bad option, bad superblock on /dev/sdb1,
missing codepage or helper program, or other error
In some cases useful info is found in syslog - try
dmesg | tail or so
.......
XFS (sdb1): cannot change alignment: superblock does not support data
alignment
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Cc: Mark Tinguely <tinguely@sgi.com>
Cc: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-05-02 19:27:53 +08:00
|
|
|
} else {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"cannot change alignment: superblock does not support data alignment");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
|
2008-03-06 10:44:28 +08:00
|
|
|
xfs_sb_version_hasdalign(&mp->m_sb)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_dalign = sbp->sb_unit;
|
|
|
|
mp->m_swidth = sbp->sb_width;
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Set the maximum inode count for this filesystem
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_set_maxicount(xfs_mount_t *mp)
|
|
|
|
{
|
|
|
|
xfs_sb_t *sbp = &(mp->m_sb);
|
|
|
|
__uint64_t icount;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
if (sbp->sb_imax_pct) {
|
|
|
|
/*
|
|
|
|
* Make sure the maximum inode count is a multiple
|
|
|
|
* of the units we allocate inodes in.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
icount = sbp->sb_dblocks * sbp->sb_imax_pct;
|
|
|
|
do_div(icount, 100);
|
|
|
|
do_div(icount, mp->m_ialloc_blks);
|
|
|
|
mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
|
|
|
|
sbp->sb_inopblog;
|
2007-10-12 09:03:40 +08:00
|
|
|
} else {
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_maxicount = 0;
|
|
|
|
}
|
2007-10-12 09:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the default minimum read and write sizes unless
|
|
|
|
* already specified in a mount option.
|
|
|
|
* We use smaller I/O sizes when the file system
|
|
|
|
* is being used for NFS service (wsync mount option).
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_set_rw_sizes(xfs_mount_t *mp)
|
|
|
|
{
|
|
|
|
xfs_sb_t *sbp = &(mp->m_sb);
|
|
|
|
int readio_log, writeio_log;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
|
|
|
|
if (mp->m_flags & XFS_MOUNT_WSYNC) {
|
|
|
|
readio_log = XFS_WSYNC_READIO_LOG;
|
|
|
|
writeio_log = XFS_WSYNC_WRITEIO_LOG;
|
|
|
|
} else {
|
|
|
|
readio_log = XFS_READIO_LOG_LARGE;
|
|
|
|
writeio_log = XFS_WRITEIO_LOG_LARGE;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
readio_log = mp->m_readio_log;
|
|
|
|
writeio_log = mp->m_writeio_log;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sbp->sb_blocklog > readio_log) {
|
|
|
|
mp->m_readio_log = sbp->sb_blocklog;
|
|
|
|
} else {
|
|
|
|
mp->m_readio_log = readio_log;
|
|
|
|
}
|
|
|
|
mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
|
|
|
|
if (sbp->sb_blocklog > writeio_log) {
|
|
|
|
mp->m_writeio_log = sbp->sb_blocklog;
|
|
|
|
} else {
|
|
|
|
mp->m_writeio_log = writeio_log;
|
|
|
|
}
|
|
|
|
mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
|
2007-10-12 09:03:40 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-04 08:35:03 +08:00
|
|
|
/*
|
|
|
|
* precalculate the low space thresholds for dynamic speculative preallocation.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_set_low_space_thresholds(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < XFS_LOWSP_MAX; i++) {
|
|
|
|
__uint64_t space = mp->m_sb.sb_dblocks;
|
|
|
|
|
|
|
|
do_div(space, 100);
|
|
|
|
mp->m_low_space[i] = space * (i + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Set whether we're using inode alignment.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_set_inoalignment(xfs_mount_t *mp)
|
|
|
|
{
|
2008-03-06 10:44:28 +08:00
|
|
|
if (xfs_sb_version_hasalign(&mp->m_sb) &&
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_sb.sb_inoalignmt >=
|
|
|
|
XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
|
|
|
|
mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
|
|
|
|
else
|
|
|
|
mp->m_inoalign_mask = 0;
|
|
|
|
/*
|
|
|
|
* If we are using stripe alignment, check whether
|
|
|
|
* the stripe unit is a multiple of the inode alignment
|
|
|
|
*/
|
|
|
|
if (mp->m_dalign && mp->m_inoalign_mask &&
|
|
|
|
!(mp->m_dalign & mp->m_inoalign_mask))
|
|
|
|
mp->m_sinoalign = mp->m_dalign;
|
|
|
|
else
|
|
|
|
mp->m_sinoalign = 0;
|
2007-10-12 09:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-08-07 18:10:58 +08:00
|
|
|
* Check that the data (and log if separate) is an ok size.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
|
|
|
STATIC int
|
2014-10-02 07:05:32 +08:00
|
|
|
xfs_check_sizes(
|
|
|
|
struct xfs_mount *mp)
|
2007-10-12 09:03:40 +08:00
|
|
|
{
|
2014-10-02 07:05:32 +08:00
|
|
|
struct xfs_buf *bp;
|
2007-10-12 09:03:40 +08:00
|
|
|
xfs_daddr_t d;
|
2014-10-02 07:05:32 +08:00
|
|
|
int error;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
|
|
|
|
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "filesystem size mismatch detected");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EFBIG;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2014-10-02 07:05:32 +08:00
|
|
|
error = xfs_buf_read_uncached(mp->m_ddev_targp,
|
2010-09-22 08:47:20 +08:00
|
|
|
d - XFS_FSS_TO_BB(mp, 1),
|
2014-10-02 07:05:32 +08:00
|
|
|
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "last sector read failed");
|
2014-10-02 07:05:32 +08:00
|
|
|
return error;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-10-02 07:05:32 +08:00
|
|
|
if (mp->m_logdev_targp == mp->m_ddev_targp)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
|
|
|
|
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
|
|
|
|
xfs_warn(mp, "log size mismatch detected");
|
|
|
|
return -EFBIG;
|
|
|
|
}
|
|
|
|
error = xfs_buf_read_uncached(mp->m_logdev_targp,
|
2010-09-22 08:47:20 +08:00
|
|
|
d - XFS_FSB_TO_BB(mp, 1),
|
2014-10-02 07:05:32 +08:00
|
|
|
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
|
|
|
|
if (error) {
|
|
|
|
xfs_warn(mp, "log device read failed");
|
|
|
|
return error;
|
2007-10-12 09:03:40 +08:00
|
|
|
}
|
2014-10-02 07:05:32 +08:00
|
|
|
xfs_buf_relse(bp);
|
2007-10-12 09:03:40 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-06-08 21:33:32 +08:00
|
|
|
/*
|
|
|
|
* Clear the quotaflags in memory and in the superblock.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_mount_reset_sbqflags(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
mp->m_qflags = 0;
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
/* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
|
2009-06-08 21:33:32 +08:00
|
|
|
if (mp->m_sb.sb_qflags == 0)
|
|
|
|
return 0;
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
mp->m_sb.sb_qflags = 0;
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
|
2009-06-08 21:33:32 +08:00
|
|
|
return 0;
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
return xfs_sync_sb(mp, false);
|
2009-06-08 21:33:32 +08:00
|
|
|
}
|
|
|
|
|
2010-02-06 06:59:53 +08:00
|
|
|
__uint64_t
|
|
|
|
xfs_default_resblks(xfs_mount_t *mp)
|
|
|
|
{
|
|
|
|
__uint64_t resblks;
|
|
|
|
|
|
|
|
/*
|
2010-03-04 09:46:25 +08:00
|
|
|
* We default to 5% or 8192 fsbs of space reserved, whichever is
|
|
|
|
* smaller. This is intended to cover concurrent allocation
|
|
|
|
* transactions when we initially hit enospc. These each require a 4
|
|
|
|
* block reservation. Hence by default we cover roughly 2000 concurrent
|
|
|
|
* allocation reservations.
|
2010-02-06 06:59:53 +08:00
|
|
|
*/
|
|
|
|
resblks = mp->m_sb.sb_dblocks;
|
|
|
|
do_div(resblks, 20);
|
2010-03-04 09:46:25 +08:00
|
|
|
resblks = min_t(__uint64_t, resblks, 8192);
|
2010-02-06 06:59:53 +08:00
|
|
|
return resblks;
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* This function does the following on an initial mount of a file system:
|
|
|
|
* - reads the superblock from disk and init the mount struct
|
|
|
|
* - if we're a 32-bit kernel, do a size check on the superblock
|
|
|
|
* so we don't mount terabyte filesystems
|
|
|
|
* - init mount struct realtime fields
|
|
|
|
* - allocate inode hash table for fs
|
|
|
|
* - init directory manager
|
|
|
|
* - perform recovery and init the log manager
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_mountfs(
|
2008-08-13 14:49:32 +08:00
|
|
|
xfs_mount_t *mp)
|
2007-10-12 09:03:40 +08:00
|
|
|
{
|
|
|
|
xfs_sb_t *sbp = &(mp->m_sb);
|
|
|
|
xfs_inode_t *rip;
|
|
|
|
__uint64_t resblks;
|
2009-06-08 21:33:32 +08:00
|
|
|
uint quotamount = 0;
|
|
|
|
uint quotaflags = 0;
|
2007-10-12 09:03:40 +08:00
|
|
|
int error = 0;
|
|
|
|
|
2013-08-12 18:49:41 +08:00
|
|
|
xfs_sb_mount_common(mp, sbp);
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2008-03-06 10:45:50 +08:00
|
|
|
/*
|
2015-01-22 06:10:33 +08:00
|
|
|
* Check for a mismatched features2 values. Older kernels read & wrote
|
|
|
|
* into the wrong sb offset for sb_features2 on some platforms due to
|
|
|
|
* xfs_sb_t not being 64bit size aligned when sb_features2 was added,
|
|
|
|
* which made older superblock reading/writing routines swap it as a
|
|
|
|
* 64-bit value.
|
2008-03-06 10:45:50 +08:00
|
|
|
*
|
2008-04-10 10:19:34 +08:00
|
|
|
* For backwards compatibility, we make both slots equal.
|
|
|
|
*
|
2015-01-22 06:10:33 +08:00
|
|
|
* If we detect a mismatched field, we OR the set bits into the existing
|
|
|
|
* features2 field in case it has already been modified; we don't want
|
|
|
|
* to lose any features. We then update the bad location with the ORed
|
|
|
|
* value so that older kernels will see any features2 flags. The
|
|
|
|
* superblock writeback code ensures the new sb_features2 is copied to
|
|
|
|
* sb_bad_features2 before it is logged or written to disk.
|
2008-03-06 10:45:50 +08:00
|
|
|
*/
|
2008-04-10 10:19:34 +08:00
|
|
|
if (xfs_sb_has_mismatched_features2(sbp)) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "correcting sb_features alignment problem");
|
2008-03-06 10:45:50 +08:00
|
|
|
sbp->sb_features2 |= sbp->sb_bad_features2;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2008-04-10 10:19:34 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Re-check for ATTR2 in case it was found in bad_features2
|
|
|
|
* slot.
|
|
|
|
*/
|
2008-04-30 16:15:28 +08:00
|
|
|
if (xfs_sb_version_hasattr2(&mp->m_sb) &&
|
|
|
|
!(mp->m_flags & XFS_MOUNT_NOATTR2))
|
2008-04-10 10:19:34 +08:00
|
|
|
mp->m_flags |= XFS_MOUNT_ATTR2;
|
2008-04-30 16:15:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (xfs_sb_version_hasattr2(&mp->m_sb) &&
|
|
|
|
(mp->m_flags & XFS_MOUNT_NOATTR2)) {
|
|
|
|
xfs_sb_version_removeattr2(&mp->m_sb);
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2008-04-10 10:19:34 +08:00
|
|
|
|
2008-04-30 16:15:28 +08:00
|
|
|
/* update sb_versionnum for the clearing of the morebits */
|
|
|
|
if (!sbp->sb_features2)
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2008-03-06 10:45:50 +08:00
|
|
|
}
|
|
|
|
|
2014-05-20 05:46:40 +08:00
|
|
|
/* always use v2 inodes by default now */
|
|
|
|
if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
|
|
|
|
mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2014-05-20 05:46:40 +08:00
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Check if sb_agblocks is aligned at stripe boundary
|
|
|
|
* If sb_agblocks is NOT aligned turn off m_dalign since
|
|
|
|
* allocator alignment is within an ag, therefore ag has
|
|
|
|
* to be aligned at stripe boundary.
|
|
|
|
*/
|
2009-01-19 09:04:07 +08:00
|
|
|
error = xfs_update_alignment(mp);
|
2007-10-12 09:03:40 +08:00
|
|
|
if (error)
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
|
|
|
xfs_alloc_compute_maxlevels(mp);
|
|
|
|
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
|
|
|
|
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
|
|
|
|
xfs_ialloc_compute_maxlevels(mp);
|
|
|
|
|
|
|
|
xfs_set_maxicount(mp);
|
|
|
|
|
2014-07-15 06:07:01 +08:00
|
|
|
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
|
2009-03-30 16:21:31 +08:00
|
|
|
if (error)
|
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-07-15 06:07:01 +08:00
|
|
|
error = xfs_uuid_mount(mp);
|
|
|
|
if (error)
|
|
|
|
goto out_remove_sysfs;
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Set the minimum read and write sizes
|
|
|
|
*/
|
|
|
|
xfs_set_rw_sizes(mp);
|
|
|
|
|
2011-01-04 08:35:03 +08:00
|
|
|
/* set the low space thresholds for dynamic preallocation */
|
|
|
|
xfs_set_low_space_thresholds(mp);
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Set the inode cluster size.
|
|
|
|
* This may still be overridden by the file system
|
|
|
|
* block size if it is larger than the chosen cluster size.
|
xfs: increase inode cluster size for v5 filesystems
v5 filesystems use 512 byte inodes as a minimum, so read inodes in
clusters that are effectively half the size of a v4 filesystem with
256 byte inodes. For v5 fielsystems, scale the inode cluster size
with the size of the inode so that we keep a constant 32 inodes per
cluster ratio for all inode IO.
This only works if mkfs.xfs sets the inode alignment appropriately
for larger inode clusters, so this functionality is made conditional
on mkfs doing the right thing. xfs_repair needs to know about
the inode alignment changes, too.
Wall time:
create bulkstat find+stat ls -R unlink
v4 237s 161s 173s 201s 299s
v5 235s 163s 205s 31s 356s
patched 234s 160s 182s 29s 317s
System time:
create bulkstat find+stat ls -R unlink
v4 2601s 2490s 1653s 1656s 2960s
v5 2637s 2497s 1681s 20s 3216s
patched 2613s 2451s 1658s 20s 3007s
So, wall time same or down across the board, system time same or
down across the board, and cache hit rates all improve except for
the ls -R case which is a pure cold cache directory read workload
on v5 filesystems...
So, this patch removes most of the performance and CPU usage
differential between v4 and v5 filesystems on traversal related
workloads.
Note: while this patch is currently for v5 filesystems only, there
is no reason it can't be ported back to v4 filesystems. This hasn't
been done here because bringing the code back to v4 requires
forwards and backwards kernel compatibility testing. i.e. to
deterine if older kernels(*) do the right thing with larger inode
alignments but still only using 8k inode cluster sizes. None of this
testing and validation on v4 filesystems has been done, so for the
moment larger inode clusters is limited to v5 superblocks.
(*) a current default config v4 filesystem should mount just fine on
2.6.23 (when lazy-count support was introduced), and so if we change
the alignment emitted by mkfs without a feature bit then we have to
make sure it works properly on all kernels since 2.6.23. And if we
allow it to be changed when the lazy-count bit is not set, then it's
all kernels since v2 logs were introduced that need to be tested for
compatibility...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-11-01 12:27:20 +08:00
|
|
|
*
|
|
|
|
* For v5 filesystems, scale the cluster size with the inode size to
|
|
|
|
* keep a constant ratio of inode per cluster buffer, but only if mkfs
|
|
|
|
* has set the inode alignment value appropriately for larger cluster
|
|
|
|
* sizes.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
|
|
|
mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
|
xfs: increase inode cluster size for v5 filesystems
v5 filesystems use 512 byte inodes as a minimum, so read inodes in
clusters that are effectively half the size of a v4 filesystem with
256 byte inodes. For v5 fielsystems, scale the inode cluster size
with the size of the inode so that we keep a constant 32 inodes per
cluster ratio for all inode IO.
This only works if mkfs.xfs sets the inode alignment appropriately
for larger inode clusters, so this functionality is made conditional
on mkfs doing the right thing. xfs_repair needs to know about
the inode alignment changes, too.
Wall time:
create bulkstat find+stat ls -R unlink
v4 237s 161s 173s 201s 299s
v5 235s 163s 205s 31s 356s
patched 234s 160s 182s 29s 317s
System time:
create bulkstat find+stat ls -R unlink
v4 2601s 2490s 1653s 1656s 2960s
v5 2637s 2497s 1681s 20s 3216s
patched 2613s 2451s 1658s 20s 3007s
So, wall time same or down across the board, system time same or
down across the board, and cache hit rates all improve except for
the ls -R case which is a pure cold cache directory read workload
on v5 filesystems...
So, this patch removes most of the performance and CPU usage
differential between v4 and v5 filesystems on traversal related
workloads.
Note: while this patch is currently for v5 filesystems only, there
is no reason it can't be ported back to v4 filesystems. This hasn't
been done here because bringing the code back to v4 requires
forwards and backwards kernel compatibility testing. i.e. to
deterine if older kernels(*) do the right thing with larger inode
alignments but still only using 8k inode cluster sizes. None of this
testing and validation on v4 filesystems has been done, so for the
moment larger inode clusters is limited to v5 superblocks.
(*) a current default config v4 filesystem should mount just fine on
2.6.23 (when lazy-count support was introduced), and so if we change
the alignment emitted by mkfs without a feature bit then we have to
make sure it works properly on all kernels since 2.6.23. And if we
allow it to be changed when the lazy-count bit is not set, then it's
all kernels since v2 logs were introduced that need to be tested for
compatibility...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-11-01 12:27:20 +08:00
|
|
|
if (xfs_sb_version_hascrc(&mp->m_sb)) {
|
|
|
|
int new_size = mp->m_inode_cluster_size;
|
|
|
|
|
|
|
|
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
|
|
|
|
if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
|
|
|
|
mp->m_inode_cluster_size = new_size;
|
|
|
|
}
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2015-05-29 06:57:27 +08:00
|
|
|
/*
|
|
|
|
* If enabled, sparse inode chunk alignment is expected to match the
|
|
|
|
* cluster size. Full inode chunk alignment must match the chunk size,
|
|
|
|
* but that is checked on sb read verification...
|
|
|
|
*/
|
|
|
|
if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
|
|
|
|
mp->m_sb.sb_spino_align !=
|
|
|
|
XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"Sparse inode block alignment (%u) must match cluster size (%llu).",
|
|
|
|
mp->m_sb.sb_spino_align,
|
|
|
|
XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
|
|
|
|
error = -EINVAL;
|
|
|
|
goto out_remove_uuid;
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* Set inode alignment fields
|
|
|
|
*/
|
|
|
|
xfs_set_inoalignment(mp);
|
|
|
|
|
|
|
|
/*
|
2013-08-12 11:15:03 +08:00
|
|
|
* Check that the data (and log if separate) is an ok size.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
2008-08-13 14:49:32 +08:00
|
|
|
error = xfs_check_sizes(mp);
|
2007-10-12 09:03:40 +08:00
|
|
|
if (error)
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_remove_uuid;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Initialize realtime fields in the mount structure
|
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
error = xfs_rtmount_init(mp);
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "RT mount failed");
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_remove_uuid;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copies the low order bits of the timestamp and the randomly
|
|
|
|
* set "sequence" number out of a UUID.
|
|
|
|
*/
|
|
|
|
uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
|
|
|
|
|
|
|
|
mp->m_dmevmask = 0; /* not persistent; set after each mount */
|
|
|
|
|
2014-06-06 13:01:58 +08:00
|
|
|
error = xfs_da_mount(mp);
|
|
|
|
if (error) {
|
|
|
|
xfs_warn(mp, "Failed dir/attr init: %d", error);
|
|
|
|
goto out_remove_uuid;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the precomputed transaction reservations values.
|
|
|
|
*/
|
|
|
|
xfs_trans_init(mp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate and initialize the per-ag data.
|
|
|
|
*/
|
2010-01-11 19:47:44 +08:00
|
|
|
spin_lock_init(&mp->m_perag_lock);
|
2010-05-27 09:58:13 +08:00
|
|
|
INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
|
2010-01-11 19:47:44 +08:00
|
|
|
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Failed per-ag init: %d", error);
|
2014-06-06 13:01:58 +08:00
|
|
|
goto out_free_dir;
|
2010-01-11 19:47:44 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-02-04 16:31:52 +08:00
|
|
|
if (!sbp->sb_logblocks) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "no log defined");
|
2009-02-04 16:31:52 +08:00
|
|
|
XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EFSCORRUPTED;
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_free_perag;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* log's mount-time initialization. Perform 1st part recovery if needed
|
|
|
|
*/
|
2009-02-04 16:31:52 +08:00
|
|
|
error = xfs_log_mount(mp, mp->m_logdev_targp,
|
|
|
|
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
|
|
|
|
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "log mount failed");
|
2012-04-23 13:59:06 +08:00
|
|
|
goto out_fail_wait;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
/*
|
|
|
|
* Now the log is mounted, we know if it was an unclean shutdown or
|
|
|
|
* not. If it was, with the first phase of recovery has completed, we
|
|
|
|
* have consistent AG blocks on disk. We have not recovered EFIs yet,
|
|
|
|
* but they are recovered transactionally in the second recovery phase
|
|
|
|
* later.
|
|
|
|
*
|
|
|
|
* Hence we can safely re-initialise incore superblock counters from
|
|
|
|
* the per-ag data. These may not be correct if the filesystem was not
|
|
|
|
* cleanly unmounted, so we need to wait for recovery to finish before
|
|
|
|
* doing this.
|
|
|
|
*
|
|
|
|
* If the filesystem was cleanly unmounted, then we can trust the
|
|
|
|
* values in the superblock to be correct and we don't need to do
|
|
|
|
* anything here.
|
|
|
|
*
|
|
|
|
* If we are currently making the filesystem, the initialisation will
|
|
|
|
* fail as the perag data is in an undefined state.
|
|
|
|
*/
|
|
|
|
if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
|
|
|
|
!XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
|
|
|
|
!mp->m_sb.sb_inprogress) {
|
|
|
|
error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
|
2009-02-04 16:31:52 +08:00
|
|
|
if (error)
|
2014-08-04 11:49:40 +08:00
|
|
|
goto out_log_dealloc;
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
}
|
2009-02-04 16:31:52 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Get and sanity-check the root inode.
|
|
|
|
* Save the pointer to it in the mount structure.
|
|
|
|
*/
|
2010-06-24 09:35:17 +08:00
|
|
|
error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "failed to read root inode");
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_log_dealloc;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(rip != NULL);
|
|
|
|
|
2011-07-26 14:31:30 +08:00
|
|
|
if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "corrupted root inode %llu: not a directory",
|
2006-06-09 13:29:40 +08:00
|
|
|
(unsigned long long)rip->i_ino);
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_iunlock(rip, XFS_ILOCK_EXCL);
|
|
|
|
XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
|
|
|
|
mp);
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EFSCORRUPTED;
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_rele_rip;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
mp->m_rootip = rip; /* save it */
|
|
|
|
|
|
|
|
xfs_iunlock(rip, XFS_ILOCK_EXCL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize realtime inode pointers in the mount structure
|
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
error = xfs_rtmount_inodes(mp);
|
|
|
|
if (error) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Free up the root inode.
|
|
|
|
*/
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "failed to read RT inodes");
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_rele_rip;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2009-01-19 09:04:07 +08:00
|
|
|
* If this is a read-only mount defer the superblock updates until
|
|
|
|
* the next remount into writeable mode. Otherwise we would never
|
|
|
|
* perform the update e.g. for the root filesystem.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2015-01-22 06:10:31 +08:00
|
|
|
if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
|
|
|
|
error = xfs_sync_sb(mp, false);
|
2008-04-10 10:21:18 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "failed to write sb changes");
|
2009-02-04 16:33:58 +08:00
|
|
|
goto out_rtunmount;
|
2008-04-10 10:21:18 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialise the XFS quota management subsystem for this mount
|
|
|
|
*/
|
2009-06-08 21:33:32 +08:00
|
|
|
if (XFS_IS_QUOTA_RUNNING(mp)) {
|
|
|
|
error = xfs_qm_newmount(mp, "amount, "aflags);
|
|
|
|
if (error)
|
|
|
|
goto out_rtunmount;
|
|
|
|
} else {
|
|
|
|
ASSERT(!XFS_IS_QUOTA_ON(mp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If a file system had quotas running earlier, but decided to
|
|
|
|
* mount without -o uquota/pquota/gquota options, revoke the
|
|
|
|
* quotachecked license.
|
|
|
|
*/
|
|
|
|
if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_notice(mp, "resetting quota flags");
|
2009-06-08 21:33:32 +08:00
|
|
|
error = xfs_mount_reset_sbqflags(mp);
|
|
|
|
if (error)
|
2014-07-15 05:41:25 +08:00
|
|
|
goto out_rtunmount;
|
2009-06-08 21:33:32 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Finish recovering the file system. This part needed to be
|
|
|
|
* delayed until after the root and real-time bitmap inodes
|
|
|
|
* were consistently read in.
|
|
|
|
*/
|
2008-08-13 14:49:32 +08:00
|
|
|
error = xfs_log_mount_finish(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "log mount finish failed");
|
2009-02-04 16:33:58 +08:00
|
|
|
goto out_rtunmount;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Complete the quota initialisation, post-log-replay component.
|
|
|
|
*/
|
2009-06-08 21:33:32 +08:00
|
|
|
if (quotamount) {
|
|
|
|
ASSERT(mp->m_qflags == 0);
|
|
|
|
mp->m_qflags = quotaflags;
|
|
|
|
|
|
|
|
xfs_qm_mount_quotas(mp);
|
|
|
|
}
|
|
|
|
|
2007-06-18 14:50:27 +08:00
|
|
|
/*
|
|
|
|
* Now we are mounted, reserve a small amount of unused space for
|
|
|
|
* privileged transactions. This is needed so that transaction
|
|
|
|
* space required for critical operations can dip into this pool
|
|
|
|
* when at ENOSPC. This is needed for operations like create with
|
|
|
|
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
|
|
|
|
* are not allowed to use this reserved space.
|
2010-03-04 09:46:25 +08:00
|
|
|
*
|
|
|
|
* This may drive us straight to ENOSPC on mount, but that implies
|
|
|
|
* we were already there on the last unmount. Warn if this occurs.
|
2007-06-18 14:50:27 +08:00
|
|
|
*/
|
2010-02-06 06:59:53 +08:00
|
|
|
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
|
|
|
|
resblks = xfs_default_resblks(mp);
|
|
|
|
error = xfs_reserve_blocks(mp, &resblks, NULL);
|
|
|
|
if (error)
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"Unable to allocate reserve blocks. Continuing without reserve pool.");
|
2010-02-06 06:59:53 +08:00
|
|
|
}
|
2007-06-18 14:50:27 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
|
2009-02-04 16:33:58 +08:00
|
|
|
out_rtunmount:
|
|
|
|
xfs_rtunmount_inodes(mp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_rele_rip:
|
2008-03-27 15:01:08 +08:00
|
|
|
IRELE(rip);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_log_dealloc:
|
2009-03-16 15:19:29 +08:00
|
|
|
xfs_log_unmount(mp);
|
2012-04-23 13:59:06 +08:00
|
|
|
out_fail_wait:
|
|
|
|
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
|
|
|
|
xfs_wait_buftarg(mp->m_logdev_targp);
|
|
|
|
xfs_wait_buftarg(mp->m_ddev_targp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_free_perag:
|
2008-08-13 14:50:47 +08:00
|
|
|
xfs_free_perag(mp);
|
2014-06-06 13:01:58 +08:00
|
|
|
out_free_dir:
|
|
|
|
xfs_da_unmount(mp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_remove_uuid:
|
2009-03-30 16:21:31 +08:00
|
|
|
xfs_uuid_unmount(mp);
|
2014-07-15 06:07:01 +08:00
|
|
|
out_remove_sysfs:
|
|
|
|
xfs_sysfs_del(&mp->m_kobj);
|
2009-02-04 16:31:52 +08:00
|
|
|
out:
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This flushes out the inodes,dquots and the superblock, unmounts the
|
|
|
|
* log and makes sure that incore structures are freed.
|
|
|
|
*/
|
2008-08-13 14:49:57 +08:00
|
|
|
void
|
|
|
|
xfs_unmountfs(
|
|
|
|
struct xfs_mount *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-08-13 14:49:57 +08:00
|
|
|
__uint64_t resblks;
|
|
|
|
int error;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-11-06 22:50:47 +08:00
|
|
|
cancel_delayed_work_sync(&mp->m_eofblocks_work);
|
|
|
|
|
2009-06-08 21:33:32 +08:00
|
|
|
xfs_qm_unmount_quotas(mp);
|
2009-02-04 16:33:58 +08:00
|
|
|
xfs_rtunmount_inodes(mp);
|
2008-08-13 14:49:04 +08:00
|
|
|
IRELE(mp->m_rootip);
|
|
|
|
|
2007-06-18 14:50:17 +08:00
|
|
|
/*
|
|
|
|
* We can potentially deadlock here if we have an inode cluster
|
2009-03-29 15:55:42 +08:00
|
|
|
* that has been freed has its buffer still pinned in memory because
|
2007-06-18 14:50:17 +08:00
|
|
|
* the transaction is still sitting in a iclog. The stale inodes
|
|
|
|
* on that buffer will have their flush locks held until the
|
|
|
|
* transaction hits the disk and the callbacks run. the inode
|
|
|
|
* flush takes the flush lock unconditionally and with nothing to
|
|
|
|
* push out the iclog we will never get that unlocked. hence we
|
|
|
|
* need to force the log first.
|
|
|
|
*/
|
2010-01-19 17:56:46 +08:00
|
|
|
xfs_log_force(mp, XFS_LOG_SYNC);
|
2010-02-06 09:39:36 +08:00
|
|
|
|
|
|
|
/*
|
2012-04-23 13:58:34 +08:00
|
|
|
* Flush all pending changes from the AIL.
|
|
|
|
*/
|
|
|
|
xfs_ail_push_all_sync(mp->m_ail);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* And reclaim all inodes. At this point there should be no dirty
|
2012-10-08 18:56:00 +08:00
|
|
|
* inodes and none should be pinned or locked, but use synchronous
|
|
|
|
* reclaim just to be sure. We can stop background inode reclaim
|
|
|
|
* here as well if it is still running.
|
2010-02-06 09:39:36 +08:00
|
|
|
*/
|
2012-10-08 18:56:00 +08:00
|
|
|
cancel_delayed_work_sync(&mp->m_reclaim_work);
|
2010-02-06 09:39:36 +08:00
|
|
|
xfs_reclaim_inodes(mp, SYNC_WAIT);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-08 21:33:32 +08:00
|
|
|
xfs_qm_unmount(mp);
|
2008-10-30 13:53:25 +08:00
|
|
|
|
2007-06-18 14:50:27 +08:00
|
|
|
/*
|
|
|
|
* Unreserve any blocks we have so that when we unmount we don't account
|
|
|
|
* the reserved free space as used. This is really only necessary for
|
|
|
|
* lazy superblock counting because it trusts the incore superblock
|
2009-03-29 15:55:42 +08:00
|
|
|
* counters to be absolutely correct on clean unmount.
|
2007-06-18 14:50:27 +08:00
|
|
|
*
|
|
|
|
* We don't bother correcting this elsewhere for lazy superblock
|
|
|
|
* counting because on mount of an unclean filesystem we reconstruct the
|
|
|
|
* correct counter value and this is irrelevant.
|
|
|
|
*
|
|
|
|
* For non-lazy counter filesystems, this doesn't matter at all because
|
|
|
|
* we only every apply deltas to the superblock and hence the incore
|
|
|
|
* value does not matter....
|
|
|
|
*/
|
|
|
|
resblks = 0;
|
2008-04-10 10:20:03 +08:00
|
|
|
error = xfs_reserve_blocks(mp, &resblks, NULL);
|
|
|
|
if (error)
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Unable to free reserved block pool. "
|
2008-04-10 10:20:03 +08:00
|
|
|
"Freespace may not be correct on next mount.");
|
|
|
|
|
2011-06-30 06:10:14 +08:00
|
|
|
error = xfs_log_sbcount(mp);
|
2008-04-10 10:21:18 +08:00
|
|
|
if (error)
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Unable to update superblock counters. "
|
2008-04-10 10:21:18 +08:00
|
|
|
"Freespace may not be correct on next mount.");
|
2011-09-14 22:08:26 +08:00
|
|
|
|
2009-03-16 15:19:29 +08:00
|
|
|
xfs_log_unmount(mp);
|
2014-06-06 13:01:58 +08:00
|
|
|
xfs_da_unmount(mp);
|
2009-03-30 16:21:31 +08:00
|
|
|
xfs_uuid_unmount(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-08-13 14:17:37 +08:00
|
|
|
#if defined(DEBUG)
|
2007-08-30 15:20:53 +08:00
|
|
|
xfs_errortag_clearall(mp, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
2008-08-13 14:50:47 +08:00
|
|
|
xfs_free_perag(mp);
|
2014-07-15 06:07:01 +08:00
|
|
|
|
|
|
|
xfs_sysfs_del(&mp->m_kobj);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-11-28 11:02:59 +08:00
|
|
|
/*
|
|
|
|
* Determine whether modifications can proceed. The caller specifies the minimum
|
|
|
|
* freeze level for which modifications should not be allowed. This allows
|
|
|
|
* certain operations to proceed while the freeze sequence is in progress, if
|
|
|
|
* necessary.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
xfs_fs_writable(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int level)
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
{
|
2014-11-28 11:02:59 +08:00
|
|
|
ASSERT(level > SB_UNFROZEN);
|
|
|
|
if ((mp->m_super->s_writers.frozen >= level) ||
|
|
|
|
XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-07-11 22:51:44 +08:00
|
|
|
* xfs_log_sbcount
|
|
|
|
*
|
2011-06-30 06:10:14 +08:00
|
|
|
* Sync the superblock counters to disk.
|
2011-07-11 22:51:44 +08:00
|
|
|
*
|
2014-11-28 11:02:59 +08:00
|
|
|
* Note this code can be called during the process of freezing, so we use the
|
|
|
|
* transaction allocator that does not block when the transaction subsystem is
|
|
|
|
* in its frozen state.
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
*/
|
|
|
|
int
|
2011-06-30 06:10:14 +08:00
|
|
|
xfs_log_sbcount(xfs_mount_t *mp)
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
{
|
2014-11-28 11:02:59 +08:00
|
|
|
/* allow this to proceed during the freeze sequence... */
|
|
|
|
if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we don't need to do this if we are updating the superblock
|
|
|
|
* counters on every modification.
|
|
|
|
*/
|
|
|
|
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
|
|
|
|
return 0;
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
return xfs_sync_sb(mp, true);
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
}
|
|
|
|
|
2015-05-29 05:39:34 +08:00
|
|
|
/*
|
|
|
|
* Deltas for the inode count are +/-64, hence we use a large batch size
|
|
|
|
* of 128 so we don't need to take the counter lock on every update.
|
|
|
|
*/
|
|
|
|
#define XFS_ICOUNT_BATCH 128
|
2015-02-23 18:19:28 +08:00
|
|
|
int
|
|
|
|
xfs_mod_icount(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int64_t delta)
|
|
|
|
{
|
2015-05-29 05:39:34 +08:00
|
|
|
__percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
|
|
|
|
if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
|
2015-02-23 18:19:28 +08:00
|
|
|
ASSERT(0);
|
|
|
|
percpu_counter_add(&mp->m_icount, -delta);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-23 18:19:53 +08:00
|
|
|
int
|
|
|
|
xfs_mod_ifree(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int64_t delta)
|
|
|
|
{
|
|
|
|
percpu_counter_add(&mp->m_ifree, delta);
|
|
|
|
if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
|
|
|
|
ASSERT(0);
|
|
|
|
percpu_counter_add(&mp->m_ifree, -delta);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2015-02-23 18:22:03 +08:00
|
|
|
|
2015-05-29 05:39:34 +08:00
|
|
|
/*
|
|
|
|
* Deltas for the block count can vary from 1 to very large, but lock contention
|
|
|
|
* only occurs on frequent small block count updates such as in the delayed
|
|
|
|
* allocation path for buffered writes (page a time updates). Hence we set
|
|
|
|
* a large batch count (1024) to minimise global counter updates except when
|
|
|
|
* we get near to ENOSPC and we have to be very accurate with our updates.
|
|
|
|
*/
|
|
|
|
#define XFS_FDBLOCKS_BATCH 1024
|
2015-02-23 18:22:03 +08:00
|
|
|
int
|
|
|
|
xfs_mod_fdblocks(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int64_t delta,
|
|
|
|
bool rsvd)
|
|
|
|
{
|
|
|
|
int64_t lcounter;
|
|
|
|
long long res_used;
|
|
|
|
s32 batch;
|
|
|
|
|
|
|
|
if (delta > 0) {
|
|
|
|
/*
|
|
|
|
* If the reserve pool is depleted, put blocks back into it
|
|
|
|
* first. Most of the time the pool is full.
|
|
|
|
*/
|
|
|
|
if (likely(mp->m_resblks == mp->m_resblks_avail)) {
|
|
|
|
percpu_counter_add(&mp->m_fdblocks, delta);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
|
|
|
|
|
|
|
|
if (res_used > delta) {
|
|
|
|
mp->m_resblks_avail += delta;
|
|
|
|
} else {
|
|
|
|
delta -= res_used;
|
|
|
|
mp->m_resblks_avail = mp->m_resblks;
|
|
|
|
percpu_counter_add(&mp->m_fdblocks, delta);
|
|
|
|
}
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Taking blocks away, need to be more accurate the closer we
|
|
|
|
* are to zero.
|
|
|
|
*
|
|
|
|
* If the counter has a value of less than 2 * max batch size,
|
|
|
|
* then make everything serialise as we are real close to
|
|
|
|
* ENOSPC.
|
|
|
|
*/
|
2015-05-29 05:39:34 +08:00
|
|
|
if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
|
|
|
|
XFS_FDBLOCKS_BATCH) < 0)
|
2015-02-23 18:22:03 +08:00
|
|
|
batch = 1;
|
|
|
|
else
|
2015-05-29 05:39:34 +08:00
|
|
|
batch = XFS_FDBLOCKS_BATCH;
|
2015-02-23 18:22:03 +08:00
|
|
|
|
|
|
|
__percpu_counter_add(&mp->m_fdblocks, delta, batch);
|
2015-05-29 05:39:34 +08:00
|
|
|
if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
|
|
|
|
XFS_FDBLOCKS_BATCH) >= 0) {
|
2015-02-23 18:22:03 +08:00
|
|
|
/* we had space! */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* lock up the sb for dipping into reserves before releasing the space
|
|
|
|
* that took us to ENOSPC.
|
|
|
|
*/
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
percpu_counter_add(&mp->m_fdblocks, -delta);
|
|
|
|
if (!rsvd)
|
|
|
|
goto fdblocks_enospc;
|
|
|
|
|
|
|
|
lcounter = (long long)mp->m_resblks_avail + delta;
|
|
|
|
if (lcounter >= 0) {
|
|
|
|
mp->m_resblks_avail = lcounter;
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
printk_once(KERN_WARNING
|
|
|
|
"Filesystem \"%s\": reserve blocks depleted! "
|
|
|
|
"Consider increasing reserve pool size.",
|
|
|
|
mp->m_fsname);
|
|
|
|
fdblocks_enospc:
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
|
2015-02-23 18:22:54 +08:00
|
|
|
int
|
|
|
|
xfs_mod_frextents(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int64_t delta)
|
|
|
|
{
|
|
|
|
int64_t lcounter;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
lcounter = mp->m_sb.sb_frextents + delta;
|
|
|
|
if (lcounter < 0)
|
|
|
|
ret = -ENOSPC;
|
|
|
|
else
|
|
|
|
mp->m_sb.sb_frextents = lcounter;
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* xfs_getsb() is called to obtain the buffer for the superblock.
|
|
|
|
* The buffer is returned locked and read in from disk.
|
|
|
|
* The buffer should be released with a call to xfs_brelse().
|
|
|
|
*
|
|
|
|
* If the flags parameter is BUF_TRYLOCK, then we'll only return
|
|
|
|
* the superblock buffer if it can be locked without sleeping.
|
|
|
|
* If it can't then we'll return NULL.
|
|
|
|
*/
|
2011-07-08 20:36:19 +08:00
|
|
|
struct xfs_buf *
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_getsb(
|
2011-07-08 20:36:19 +08:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-07-08 20:36:19 +08:00
|
|
|
struct xfs_buf *bp = mp->m_sb_bp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-07-08 20:36:19 +08:00
|
|
|
if (!xfs_buf_trylock(bp)) {
|
|
|
|
if (flags & XBF_TRYLOCK)
|
2005-04-17 06:20:36 +08:00
|
|
|
return NULL;
|
2011-07-08 20:36:19 +08:00
|
|
|
xfs_buf_lock(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-07-08 20:36:19 +08:00
|
|
|
|
2011-07-23 07:40:04 +08:00
|
|
|
xfs_buf_hold(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
ASSERT(XFS_BUF_ISDONE(bp));
|
2006-01-15 09:37:08 +08:00
|
|
|
return bp;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Used to free the superblock along various error paths.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_freesb(
|
2010-09-22 08:47:20 +08:00
|
|
|
struct xfs_mount *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-09-22 08:47:20 +08:00
|
|
|
struct xfs_buf *bp = mp->m_sb_bp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_lock(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_sb_bp = NULL;
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-02-15 17:44:46 +08:00
|
|
|
/*
|
|
|
|
* If the underlying (data/log/rt) device is readonly, there are some
|
|
|
|
* operations that cannot proceed.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_dev_is_read_only(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
char *message)
|
|
|
|
{
|
|
|
|
if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
|
|
|
|
xfs_readonly_buftarg(mp->m_logdev_targp) ||
|
|
|
|
(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_notice(mp, "%s required on read-only device.", message);
|
|
|
|
xfs_notice(mp, "write access unavailable, cannot proceed.");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EROFS;
|
2010-02-15 17:44:46 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|