2004-08-02 05:59:26 +08:00
|
|
|
/*
|
|
|
|
* QEMU System Emulator block driver
|
2007-09-17 05:08:06 +08:00
|
|
|
*
|
2004-08-02 05:59:26 +08:00
|
|
|
* Copyright (c) 2003 Fabrice Bellard
|
2007-09-17 05:08:06 +08:00
|
|
|
*
|
2004-08-02 05:59:26 +08:00
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
#ifndef BLOCK_INT_H
|
|
|
|
#define BLOCK_INT_H
|
|
|
|
|
2014-09-05 21:46:16 +08:00
|
|
|
#include "block/accounting.h"
|
2012-12-18 01:19:44 +08:00
|
|
|
#include "block/block.h"
|
2015-10-19 23:53:24 +08:00
|
|
|
#include "block/throttle-groups.h"
|
2012-12-18 01:20:00 +08:00
|
|
|
#include "qemu/option.h"
|
|
|
|
#include "qemu/queue.h"
|
2015-09-01 21:48:02 +08:00
|
|
|
#include "qemu/coroutine.h"
|
2012-12-18 01:20:00 +08:00
|
|
|
#include "qemu/timer.h"
|
2011-09-22 04:16:47 +08:00
|
|
|
#include "qapi-types.h"
|
2013-01-22 00:09:41 +08:00
|
|
|
#include "qemu/hbitmap.h"
|
2013-05-25 11:09:44 +08:00
|
|
|
#include "block/snapshot.h"
|
2013-08-21 23:02:47 +08:00
|
|
|
#include "qemu/main-loop.h"
|
2013-09-02 20:14:39 +08:00
|
|
|
#include "qemu/throttle.h"
|
2007-11-11 10:51:17 +08:00
|
|
|
|
2012-07-27 16:05:22 +08:00
|
|
|
/*
 * Image feature flags.  Each flag has a distinct power-of-two value so
 * several of them can be OR-ed together into one integer.
 * NOTE(review): the value 2 is not assigned here -- confirm whether it is
 * reserved by an older flag before reusing it.
 */
#define BLOCK_FLAG_ENCRYPT          1
#define BLOCK_FLAG_COMPAT6          4
#define BLOCK_FLAG_LAZY_REFCOUNTS   8

/*
 * Option name strings.
 * NOTE(review): presumably the keys looked up in a driver's create_opts
 * list (see BlockDriver.create_opts) -- confirm against the drivers.
 */
#define BLOCK_OPT_SIZE              "size"
#define BLOCK_OPT_ENCRYPT           "encryption"
#define BLOCK_OPT_COMPAT6           "compat6"
#define BLOCK_OPT_BACKING_FILE      "backing_file"
#define BLOCK_OPT_BACKING_FMT       "backing_fmt"
#define BLOCK_OPT_CLUSTER_SIZE      "cluster_size"
#define BLOCK_OPT_TABLE_SIZE        "table_size"
#define BLOCK_OPT_PREALLOC          "preallocation"
#define BLOCK_OPT_SUBFMT            "subformat"
#define BLOCK_OPT_COMPAT_LEVEL      "compat"
#define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
#define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
#define BLOCK_OPT_REDUNDANCY        "redundancy"
/*
 * Lets the user set the NOCOW file attribute on a newly created image
 * file.  Mainly useful on btrfs, where copy-on-write hurts performance of
 * VM images.  The attribute can only be applied to empty or new files.
 */
#define BLOCK_OPT_NOCOW             "nocow"
#define BLOCK_OPT_OBJECT_SIZE       "object_size"
#define BLOCK_OPT_REFCOUNT_BITS     "refcount_bits"

/*
 * NOTE(review): presumably the number of bytes read from the start of an
 * image and handed to BlockDriver.bdrv_probe() -- confirm against the
 * probing code.
 */
#define BLOCK_PROBE_BUF_SIZE        512
|
|
|
|
|
2015-11-09 18:16:46 +08:00
|
|
|
/*
 * Kind of operation a tracked request represents.  The value is stored in
 * BdrvTrackedRequest.type.  Enumerators take the implicit values 0..4 in
 * declaration order.
 */
enum BdrvTrackedRequestType {
    BDRV_TRACKED_READ,
    BDRV_TRACKED_WRITE,
    BDRV_TRACKED_FLUSH,
    BDRV_TRACKED_IOCTL,
    BDRV_TRACKED_DISCARD,
};
|
|
|
|
|
2013-06-24 23:13:10 +08:00
|
|
|
typedef struct BdrvTrackedRequest {
|
|
|
|
BlockDriverState *bs;
|
2013-12-03 22:31:25 +08:00
|
|
|
int64_t offset;
|
|
|
|
unsigned int bytes;
|
2015-11-09 18:16:46 +08:00
|
|
|
enum BdrvTrackedRequestType type;
|
2013-12-05 00:08:50 +08:00
|
|
|
|
2013-12-04 23:43:44 +08:00
|
|
|
bool serialising;
|
2013-12-05 00:08:50 +08:00
|
|
|
int64_t overlap_offset;
|
|
|
|
unsigned int overlap_bytes;
|
|
|
|
|
2013-06-24 23:13:10 +08:00
|
|
|
QLIST_ENTRY(BdrvTrackedRequest) list;
|
|
|
|
Coroutine *co; /* owner, used for deadlock detection */
|
|
|
|
CoQueue wait_queue; /* coroutines blocked on this request */
|
2013-12-13 20:04:35 +08:00
|
|
|
|
|
|
|
struct BdrvTrackedRequest *waiting_for;
|
2013-06-24 23:13:10 +08:00
|
|
|
} BdrvTrackedRequest;
|
|
|
|
|
2004-08-02 05:59:26 +08:00
|
|
|
struct BlockDriver {
|
|
|
|
const char *format_name;
|
|
|
|
int instance_size;
|
2013-10-02 20:33:48 +08:00
|
|
|
|
2014-03-04 02:11:34 +08:00
|
|
|
/* set to true if the BlockDriver is a block filter */
|
|
|
|
bool is_filter;
|
|
|
|
/* for snapshots block filter like Quorum can implement the
|
|
|
|
* following recursive callback.
|
2014-01-24 04:31:36 +08:00
|
|
|
* It's purpose is to recurse on the filter children while calling
|
|
|
|
* bdrv_recurse_is_first_non_filter on them.
|
|
|
|
* For a sample implementation look in the future Quorum block filter.
|
2013-10-02 20:33:48 +08:00
|
|
|
*/
|
2014-01-24 04:31:36 +08:00
|
|
|
bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
|
|
|
|
BlockDriverState *candidate);
|
2013-10-02 20:33:48 +08:00
|
|
|
|
2004-08-02 05:59:26 +08:00
|
|
|
int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
|
2009-06-15 20:04:22 +08:00
|
|
|
int (*bdrv_probe_device)(const char *filename);
|
2013-03-18 23:40:51 +08:00
|
|
|
|
|
|
|
/* Any driver implementing this callback is expected to be able to handle
|
|
|
|
* NULL file names in its .bdrv_open() implementation */
|
2013-03-16 01:47:22 +08:00
|
|
|
void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
|
2013-09-24 23:07:04 +08:00
|
|
|
/* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
|
|
|
|
* this field set to true, except ones that are defined only by their
|
|
|
|
* child's bs.
|
|
|
|
* An example of the last type will be the quorum block driver.
|
|
|
|
*/
|
|
|
|
bool bdrv_needs_filename;
|
2012-09-21 03:13:19 +08:00
|
|
|
|
2014-06-04 21:09:35 +08:00
|
|
|
/* Set if a driver can support backing files */
|
|
|
|
bool supports_backing;
|
|
|
|
|
2012-09-21 03:13:19 +08:00
|
|
|
/* For handling image reopen for split or non-split files */
|
|
|
|
int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
|
|
|
|
BlockReopenQueue *queue, Error **errp);
|
|
|
|
void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
|
|
|
|
void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
|
2015-11-16 22:34:59 +08:00
|
|
|
void (*bdrv_join_options)(QDict *options, QDict *old_options);
|
2012-09-21 03:13:19 +08:00
|
|
|
|
2013-09-05 20:22:29 +08:00
|
|
|
int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp);
|
|
|
|
int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp);
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num,
|
2004-08-02 05:59:26 +08:00
|
|
|
uint8_t *buf, int nb_sectors);
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_write)(BlockDriverState *bs, int64_t sector_num,
|
2004-08-02 05:59:26 +08:00
|
|
|
const uint8_t *buf, int nb_sectors);
|
2004-09-19 03:32:11 +08:00
|
|
|
void (*bdrv_close)(BlockDriverState *bs);
|
2014-06-05 17:21:11 +08:00
|
|
|
int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
|
2004-08-02 05:59:26 +08:00
|
|
|
int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
|
2005-12-19 02:28:15 +08:00
|
|
|
int (*bdrv_make_empty)(BlockDriverState *bs);
|
2014-07-19 02:24:56 +08:00
|
|
|
|
2015-04-27 19:50:54 +08:00
|
|
|
void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
|
2014-07-19 02:24:56 +08:00
|
|
|
|
2006-08-02 00:21:11 +08:00
|
|
|
/* aio */
|
2014-10-07 19:59:14 +08:00
|
|
|
BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
|
2009-04-08 02:43:24 +08:00
|
|
|
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2014-10-07 19:59:14 +08:00
|
|
|
BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
|
2009-04-08 02:43:24 +08:00
|
|
|
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2014-10-07 19:59:14 +08:00
|
|
|
BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2014-10-07 19:59:14 +08:00
|
|
|
BlockAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs,
|
2011-10-17 18:32:14 +08:00
|
|
|
int64_t sector_num, int nb_sectors,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2006-08-02 00:21:11 +08:00
|
|
|
|
2011-07-14 23:27:13 +08:00
|
|
|
int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
|
|
|
|
int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
|
|
|
|
int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
|
|
|
|
int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
|
2012-02-07 21:27:25 +08:00
|
|
|
/*
|
|
|
|
* Efficiently zero a region of the disk image. Typically an image format
|
|
|
|
* would use a compact metadata representation to implement this. This
|
|
|
|
* function pointer may be NULL and .bdrv_co_writev() will be called
|
|
|
|
* instead.
|
|
|
|
*/
|
|
|
|
int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
|
2013-10-24 18:06:51 +08:00
|
|
|
int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
|
2011-10-17 18:32:14 +08:00
|
|
|
int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
|
|
|
|
int64_t sector_num, int nb_sectors);
|
2013-09-05 01:00:28 +08:00
|
|
|
int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
|
2011-11-14 20:44:19 +08:00
|
|
|
int64_t sector_num, int nb_sectors, int *pnum);
|
2011-07-14 23:27:13 +08:00
|
|
|
|
2011-11-15 05:09:45 +08:00
|
|
|
/*
|
|
|
|
* Invalidate any cached meta-data.
|
|
|
|
*/
|
2014-03-12 22:59:16 +08:00
|
|
|
void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
|
2015-12-22 21:07:08 +08:00
|
|
|
int (*bdrv_inactivate)(BlockDriverState *bs);
|
2011-11-15 05:09:45 +08:00
|
|
|
|
2011-11-11 00:25:44 +08:00
|
|
|
/*
|
|
|
|
* Flushes all data that was already written to the OS all the way down to
|
|
|
|
* the disk (for example raw-posix calls fsync()).
|
|
|
|
*/
|
|
|
|
int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
|
|
|
|
|
2011-11-11 01:10:11 +08:00
|
|
|
/*
|
|
|
|
* Flushes all internal caches to the OS. The data may still sit in a
|
|
|
|
* writeback cache of the host OS, but it will survive a crash of the qemu
|
|
|
|
* process.
|
|
|
|
*/
|
|
|
|
int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
|
|
|
|
|
2006-08-02 00:21:11 +08:00
|
|
|
const char *protocol_name;
|
|
|
|
int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);
|
block: Avoid unecessary drv->bdrv_getlength() calls
The block layer generally keeps the size of an image cached in
bs->total_sectors so that it doesn't have to perform expensive
operations to get the size whenever it needs it.
This doesn't work however when using a backend that can change its size
without qemu being aware of it, i.e. passthrough of removable media like
CD-ROMs or floppy disks. For this reason, the caching is disabled when a
removable device is used.
It is obvious that checking whether the _guest_ device has removable
media isn't the right thing to do when we want to know whether the size
of the host backend can change. To make things worse, non-top-level
BlockDriverStates never have any device attached, which makes qemu
assume they are removable, so drv->bdrv_getlength() is always called on
the protocol layer. In the case of raw-posix, this causes unnecessary
lseek() system calls, which turned out to be rather expensive.
This patch completely changes the logic and disables bs->total_sectors
caching only for certain block driver types, for which a size change is
expected: host_cdrom and host_floppy on POSIX, host_device on win32; also
the raw format in case it sits on top of one of these protocols, but in
the common case the nested bdrv_getlength() call on the protocol driver
will use the cache again and avoid an expensive drv->bdrv_getlength()
call.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2013-10-29 19:18:58 +08:00
|
|
|
|
2006-08-02 00:21:11 +08:00
|
|
|
int64_t (*bdrv_getlength)(BlockDriverState *bs);
|
block: Avoid unecessary drv->bdrv_getlength() calls
The block layer generally keeps the size of an image cached in
bs->total_sectors so that it doesn't have to perform expensive
operations to get the size whenever it needs it.
This doesn't work however when using a backend that can change its size
without qemu being aware of it, i.e. passthrough of removable media like
CD-ROMs or floppy disks. For this reason, the caching is disabled when a
removable device is used.
It is obvious that checking whether the _guest_ device has removable
media isn't the right thing to do when we want to know whether the size
of the host backend can change. To make things worse, non-top-level
BlockDriverStates never have any device attached, which makes qemu
assume they are removable, so drv->bdrv_getlength() is always called on
the protocol layer. In the case of raw-posix, this causes unnecessary
lseek() system calls, which turned out to be rather expensive.
This patch completely changes the logic and disables bs->total_sectors
caching only for certain block driver types, for which a size change is
expected: host_cdrom and host_floppy on POSIX, host_device on win32; also
the raw format in case it sits on top of one of these protocols, but in
the common case the nested bdrv_getlength() call on the protocol driver
will use the cache again and avoid an expensive drv->bdrv_getlength()
call.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2013-10-29 19:18:58 +08:00
|
|
|
bool has_variable_length;
|
2011-07-12 19:56:39 +08:00
|
|
|
int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
|
block: Avoid unecessary drv->bdrv_getlength() calls
The block layer generally keeps the size of an image cached in
bs->total_sectors so that it doesn't have to perform expensive
operations to get the size whenever it needs it.
This doesn't work however when using a backend that can change its size
without qemu being aware of it, i.e. passthrough of removable media like
CD-ROMs or floppy disks. For this reason, the caching is disabled when a
removable device is used.
It is obvious that checking whether the _guest_ device has removable
media isn't the right thing to do when we want to know whether the size
of the host backend can change. To make things worse, non-top-level
BlockDriverStates never have any device attached, which makes qemu
assume they are removable, so drv->bdrv_getlength() is always called on
the protocol layer. In the case of raw-posix, this causes unnecessary
lseek() system calls, which turned out to be rather expensive.
This patch completely changes the logic and disables bs->total_sectors
caching only for certain block driver types, for which a size change is
expected: host_cdrom and host_floppy on POSIX, host_device on win32; also
the raw format in case it sits on top of one of these protocols, but in
the common case the nested bdrv_getlength() call on the protocol driver
will use the cache again and avoid an expensive drv->bdrv_getlength()
call.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2013-10-29 19:18:58 +08:00
|
|
|
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_write_compressed)(BlockDriverState *bs, int64_t sector_num,
|
2006-08-06 05:31:00 +08:00
|
|
|
const uint8_t *buf, int nb_sectors);
|
|
|
|
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_snapshot_create)(BlockDriverState *bs,
|
2006-08-06 05:31:00 +08:00
|
|
|
QEMUSnapshotInfo *sn_info);
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_snapshot_goto)(BlockDriverState *bs,
|
2006-08-06 05:31:00 +08:00
|
|
|
const char *snapshot_id);
|
snapshot: distinguish id and name in snapshot delete
Snapshot creation actually already distinguish id and name since it take
a structured parameter *sn, but delete can't. Later an accurate delete
is needed in qmp_transaction abort and blockdev-snapshot-delete-sync,
so change its prototype. Also *errp is added to tip error, but return
value is kepted to let caller check what kind of error happens. Existing
caller for it are savevm, delvm and qemu-img, they are not impacted by
introducing a new function bdrv_snapshot_delete_by_id_or_name(), which
check the return value and do the operation again.
Before this patch:
For qcow2, it search id first then name to find the one to delete.
For rbd, it search name.
For sheepdog, it does nothing.
After this patch:
For qcow2, logic is the same by call it twice in caller.
For rbd, it always fails in delete with id, but still search for name
in second try, no change to user.
Some code for *errp is based on Pavel's patch.
Signed-off-by: Wenchao Xia <xiawenc@linux.vnet.ibm.com>
Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-09-11 14:04:33 +08:00
|
|
|
int (*bdrv_snapshot_delete)(BlockDriverState *bs,
|
|
|
|
const char *snapshot_id,
|
|
|
|
const char *name,
|
|
|
|
Error **errp);
|
2007-09-17 05:08:06 +08:00
|
|
|
int (*bdrv_snapshot_list)(BlockDriverState *bs,
|
2006-08-06 05:31:00 +08:00
|
|
|
QEMUSnapshotInfo **psn_info);
|
2010-09-22 10:58:41 +08:00
|
|
|
int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
|
2013-12-04 17:10:54 +08:00
|
|
|
const char *snapshot_id,
|
|
|
|
const char *name,
|
|
|
|
Error **errp);
|
2006-08-06 05:31:00 +08:00
|
|
|
int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
|
2013-10-09 16:46:16 +08:00
|
|
|
ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);
|
2006-08-02 00:21:11 +08:00
|
|
|
|
2013-04-06 03:27:53 +08:00
|
|
|
int (*bdrv_save_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
|
|
|
|
int64_t pos);
|
2009-07-11 05:11:57 +08:00
|
|
|
int (*bdrv_load_vmstate)(BlockDriverState *bs, uint8_t *buf,
|
|
|
|
int64_t pos, int size);
|
2009-04-06 03:10:55 +08:00
|
|
|
|
2010-01-12 19:55:17 +08:00
|
|
|
int (*bdrv_change_backing_file)(BlockDriverState *bs,
|
|
|
|
const char *backing_file, const char *backing_fmt);
|
|
|
|
|
2006-08-19 19:45:59 +08:00
|
|
|
/* removable device specific */
|
2015-10-19 23:53:11 +08:00
|
|
|
bool (*bdrv_is_inserted)(BlockDriverState *bs);
|
2006-08-19 19:45:59 +08:00
|
|
|
int (*bdrv_media_changed)(BlockDriverState *bs);
|
2012-02-04 02:24:53 +08:00
|
|
|
void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
|
2011-09-07 00:58:47 +08:00
|
|
|
void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
|
2007-09-17 16:09:54 +08:00
|
|
|
|
2007-12-25 00:10:43 +08:00
|
|
|
/* to control generic scsi devices */
|
2014-10-07 19:59:14 +08:00
|
|
|
BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
|
2009-03-29 01:28:41 +08:00
|
|
|
unsigned long int req, void *buf,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2007-12-25 00:10:43 +08:00
|
|
|
|
2009-05-18 22:42:10 +08:00
|
|
|
/* List of options for creating images, terminated by name == NULL */
|
2014-06-05 17:20:51 +08:00
|
|
|
QemuOptsList *create_opts;
|
2009-03-29 01:55:10 +08:00
|
|
|
|
2010-06-29 18:37:54 +08:00
|
|
|
/*
|
|
|
|
* Returns 0 for completed check, -errno for internal errors.
|
|
|
|
* The check results are stored in result.
|
|
|
|
*/
|
2012-05-11 22:07:02 +08:00
|
|
|
int (*bdrv_check)(BlockDriverState* bs, BdrvCheckResult *result,
|
|
|
|
BdrvCheckMode fix);
|
2009-04-22 07:11:50 +08:00
|
|
|
|
2014-10-27 18:12:50 +08:00
|
|
|
int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
|
2015-07-27 23:51:32 +08:00
|
|
|
BlockDriverAmendStatusCB *status_cb,
|
|
|
|
void *cb_opaque);
|
2013-09-03 16:09:50 +08:00
|
|
|
|
2015-11-18 16:52:54 +08:00
|
|
|
void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
|
2010-03-16 00:27:00 +08:00
|
|
|
|
2012-12-06 21:32:58 +08:00
|
|
|
/* TODO Better pass a option string/QDict/QemuOpts to add any rule? */
|
|
|
|
int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
|
|
|
|
const char *tag);
|
2013-11-20 10:01:54 +08:00
|
|
|
int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
|
|
|
|
const char *tag);
|
2012-12-06 21:32:58 +08:00
|
|
|
int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
|
|
|
|
bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
|
|
|
|
|
2014-07-16 23:48:16 +08:00
|
|
|
void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
|
2013-12-12 02:26:16 +08:00
|
|
|
|
2010-07-28 17:26:29 +08:00
|
|
|
/*
|
|
|
|
* Returns 1 if newly created images are guaranteed to contain only
|
|
|
|
* zeros, 0 otherwise.
|
|
|
|
*/
|
|
|
|
int (*bdrv_has_zero_init)(BlockDriverState *bs);
|
2009-11-30 23:54:15 +08:00
|
|
|
|
2014-05-08 22:34:37 +08:00
|
|
|
/* Remove fd handlers, timers, and other event loop callbacks so the event
|
|
|
|
* loop is no longer in use. Called with no in-flight requests and in
|
|
|
|
* depth-first traversal order with parents before child nodes.
|
|
|
|
*/
|
|
|
|
void (*bdrv_detach_aio_context)(BlockDriverState *bs);
|
|
|
|
|
|
|
|
/* Add fd handlers, timers, and other event loop callbacks so I/O requests
|
|
|
|
* can be processed again. Called with no in-flight requests and in
|
|
|
|
* depth-first traversal order with child nodes before parent nodes.
|
|
|
|
*/
|
|
|
|
void (*bdrv_attach_aio_context)(BlockDriverState *bs,
|
|
|
|
AioContext *new_context);
|
|
|
|
|
2014-07-04 18:04:33 +08:00
|
|
|
/* io queue for linux-aio */
|
|
|
|
void (*bdrv_io_plug)(BlockDriverState *bs);
|
|
|
|
void (*bdrv_io_unplug)(BlockDriverState *bs);
|
|
|
|
void (*bdrv_flush_io_queue)(BlockDriverState *bs);
|
|
|
|
|
2015-02-16 19:47:54 +08:00
|
|
|
/**
|
|
|
|
* Try to get @bs's logical and physical block size.
|
|
|
|
* On success, store them in @bsz and return zero.
|
|
|
|
* On failure, return negative errno.
|
|
|
|
*/
|
|
|
|
int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
|
|
|
|
/**
|
|
|
|
* Try to get @bs's geometry (cyls, heads, sectors)
|
|
|
|
* On success, store them in @geo and return 0.
|
|
|
|
* On failure return -errno.
|
|
|
|
* Only drivers that want to override guest geometry implement this
|
|
|
|
* callback; see hd_geometry_guess().
|
|
|
|
*/
|
|
|
|
int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
|
|
|
|
|
2015-11-09 18:16:53 +08:00
|
|
|
/**
|
|
|
|
* Drain and stop any internal sources of requests in the driver, and
|
|
|
|
* remain so until next I/O callback (e.g. bdrv_co_writev) is called.
|
|
|
|
*/
|
|
|
|
void (*bdrv_drain)(BlockDriverState *bs);
|
|
|
|
|
2010-04-13 17:29:33 +08:00
|
|
|
QLIST_ENTRY(BlockDriver) list;
|
2004-08-02 05:59:26 +08:00
|
|
|
};
|
|
|
|
|
2013-10-24 18:06:56 +08:00
|
|
|
/*
 * I/O limits and alignment hints of a block node.  Embedded in
 * BlockDriverState as the "bl" member.
 * NOTE(review): how a value of 0 is interpreted for each field is decided
 * by the users of this struct, which are not part of this header.
 */
typedef struct BlockLimits {
    /* maximum number of sectors that can be discarded at once */
    int max_discard;

    /* optimal alignment for discard requests in sectors */
    int64_t discard_alignment;

    /* maximum number of sectors that can be zeroized at once */
    int max_write_zeroes;

    /* optimal alignment for write zeroes requests in sectors */
    int64_t write_zeroes_alignment;

    /* optimal transfer length in sectors */
    int opt_transfer_length;

    /* maximal transfer length in sectors */
    int max_transfer_length;

    /* memory alignment so that no bounce buffer is needed */
    size_t min_mem_alignment;

    /* memory alignment for bounce buffer */
    size_t opt_mem_alignment;

    /* maximum number of iovec elements */
    int max_iov;
} BlockLimits;
|
|
|
|
|
2014-05-23 21:29:42 +08:00
|
|
|
/* Forward declaration only: struct BdrvOpBlocker is not defined in the
 * part of this header shown here, so the type is opaque at this point. */
typedef struct BdrvOpBlocker BdrvOpBlocker;
|
|
|
|
|
2014-06-21 03:57:33 +08:00
|
|
|
typedef struct BdrvAioNotifier {
|
|
|
|
void (*attached_aio_context)(AioContext *new_context, void *opaque);
|
|
|
|
void (*detach_aio_context)(void *opaque);
|
|
|
|
|
|
|
|
void *opaque;
|
|
|
|
|
|
|
|
QLIST_ENTRY(BdrvAioNotifier) list;
|
|
|
|
} BdrvAioNotifier;
|
|
|
|
|
2015-04-08 19:43:47 +08:00
|
|
|
struct BdrvChildRole {
|
2015-04-29 23:29:39 +08:00
|
|
|
void (*inherit_options)(int *child_flags, QDict *child_options,
|
|
|
|
int parent_flags, QDict *parent_options);
|
2015-04-08 19:43:47 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Role objects shared across the block layer; declared here, defined in a
 * source file outside this header.
 * NOTE(review): by their names these are presumably the roles used for a
 * node's protocol-level ("file") child and for format-level children --
 * confirm at the definitions.
 */
extern const BdrvChildRole child_file;
extern const BdrvChildRole child_format;
|
|
|
|
|
2015-06-15 19:24:19 +08:00
|
|
|
struct BdrvChild {
|
2015-04-08 19:49:41 +08:00
|
|
|
BlockDriverState *bs;
|
2015-04-27 19:46:22 +08:00
|
|
|
char *name;
|
2015-04-08 19:49:41 +08:00
|
|
|
const BdrvChildRole *role;
|
|
|
|
QLIST_ENTRY(BdrvChild) next;
|
2015-09-17 19:18:23 +08:00
|
|
|
QLIST_ENTRY(BdrvChild) next_parent;
|
2015-06-15 19:24:19 +08:00
|
|
|
};
|
2015-04-08 19:49:41 +08:00
|
|
|
|
2012-02-29 04:54:06 +08:00
|
|
|
/*
|
|
|
|
* Note: the function bdrv_append() copies and swaps contents of
|
|
|
|
* BlockDriverStates, so if you add new fields to this struct, please
|
|
|
|
* inspect bdrv_append() to determine if the new fields need to be
|
|
|
|
* copied as well.
|
|
|
|
*/
|
2004-08-02 05:59:26 +08:00
|
|
|
struct BlockDriverState {
|
2006-08-06 21:35:09 +08:00
|
|
|
int64_t total_sectors; /* if we are reading a disk image, give its
|
|
|
|
size in sectors */
|
2004-08-02 05:59:26 +08:00
|
|
|
int read_only; /* if true, the media is read only */
|
2010-02-14 19:39:18 +08:00
|
|
|
int open_flags; /* flags used to open the file, re-used for re-open */
|
2004-08-02 05:59:26 +08:00
|
|
|
int encrypted; /* if true, the media is encrypted */
|
2009-03-06 07:01:01 +08:00
|
|
|
int valid_key; /* if true, a valid encryption key has been set */
|
2007-12-25 00:10:43 +08:00
|
|
|
int sg; /* if true, the device is a /dev/sg* */
|
2011-11-29 00:08:47 +08:00
|
|
|
int copy_on_read; /* if true, copy read backing sectors into image
|
|
|
|
note this is a reference count */
|
raw: Prohibit dangerous writes for probed images
If the user neglects to specify the image format, QEMU probes the
image to guess it automatically, for convenience.
Relying on format probing is insecure for raw images (CVE-2008-2004).
If the guest writes a suitable header to the device, the next probe
will recognize a format chosen by the guest. A malicious guest can
abuse this to gain access to host files, e.g. by crafting a QCOW2
header with backing file /etc/shadow.
Commit 1e72d3b (April 2008) provided -drive parameter format to let
users disable probing. Commit f965509 (March 2009) extended QCOW2 to
optionally store the backing file format, to let users disable backing
file probing. QED has had a flag to suppress probing since the
beginning (2010), set whenever a raw backing file is assigned.
All of these additions that allow to avoid format probing have to be
specified explicitly. The default still allows the attack.
In order to fix this, commit 79368c8 (July 2010) put probed raw images
in a restricted mode, in which they wouldn't be able to overwrite the
first few bytes of the image so that they would identify as a different
image. If a write to the first sector would write one of the signatures
of another driver, qemu would instead zero out the first four bytes.
This patch was later reverted in commit 8b33d9e (September 2010) because
it didn't get the handling of unaligned qiov members right.
Today's block layer that is based on coroutines and has qiov utility
functions makes it much easier to get this functionality right, so this
patch implements it.
The other differences of this patch to the old one are that it doesn't
silently write something different than the guest requested by zeroing
out some bytes (it fails the request instead) and that it doesn't
maintain a list of signatures in the raw driver (it calls the usual
probe function instead).
Note that this change doesn't introduce new breakage for false positive
cases where the guest legitimately writes data into the first sector
that matches the signatures of an image format (e.g. for nested virt):
These cases were broken before, only the failure mode changes from
corruption after the next restart (when the wrong format is probed) to
failing the problematic write request.
Also note that like in the original patch, the restrictions only apply
if the image format has been guessed by probing. Explicitly specifying a
format allows guests to write anything they like.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1416497234-29880-8-git-send-email-kwolf@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-11-20 23:27:12 +08:00
|
|
|
bool probed;
|
2004-08-02 05:59:26 +08:00
|
|
|
|
2006-08-19 19:45:59 +08:00
|
|
|
BlockDriver *drv; /* NULL means no media */
|
2004-08-02 05:59:26 +08:00
|
|
|
void *opaque;
|
|
|
|
|
2014-10-07 19:59:05 +08:00
|
|
|
BlockBackend *blk; /* owning backend, if any */
|
|
|
|
|
2014-05-08 22:34:37 +08:00
|
|
|
AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
|
2014-06-21 03:57:33 +08:00
|
|
|
/* long-running tasks intended to always use the same AioContext as this
|
|
|
|
* BDS may register themselves in this list to be notified of changes
|
|
|
|
* regarding this BDS's context */
|
|
|
|
QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
|
2014-05-08 22:34:37 +08:00
|
|
|
|
2015-01-22 21:03:30 +08:00
|
|
|
char filename[PATH_MAX];
|
|
|
|
char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
|
|
|
|
this file image */
|
2009-03-29 01:55:10 +08:00
|
|
|
char backing_format[16]; /* if non-zero and backing_file exists */
|
2006-08-19 19:45:59 +08:00
|
|
|
|
2014-07-19 02:24:56 +08:00
|
|
|
QDict *full_open_options;
|
2015-01-22 21:03:30 +08:00
|
|
|
char exact_filename[PATH_MAX];
|
2014-07-19 02:24:56 +08:00
|
|
|
|
2015-06-17 20:55:21 +08:00
|
|
|
BdrvChild *backing;
|
2015-06-16 20:19:22 +08:00
|
|
|
BdrvChild *file;
|
2010-04-14 20:17:38 +08:00
|
|
|
|
2013-06-24 23:13:10 +08:00
|
|
|
/* Callback before write request is processed */
|
|
|
|
NotifierWithReturnList before_write_notifiers;
|
|
|
|
|
2013-12-04 23:43:44 +08:00
|
|
|
/* number of in-flight serialising requests */
|
|
|
|
unsigned int serialising_in_flight;
|
2012-01-18 22:40:42 +08:00
|
|
|
|
2015-11-04 21:15:36 +08:00
|
|
|
/* I/O throttling.
|
|
|
|
* throttle_state tells us if this BDS has I/O limits configured.
|
|
|
|
* io_limits_enabled tells us if they are currently being
|
|
|
|
* enforced, but it can be temporarily set to false */
|
2013-09-02 20:14:39 +08:00
|
|
|
CoQueue throttled_reqs[2];
|
2011-11-03 16:57:25 +08:00
|
|
|
bool io_limits_enabled;
|
2015-06-09 00:17:44 +08:00
|
|
|
/* The following fields are protected by the ThrottleGroup lock.
|
|
|
|
* See the ThrottleGroup documentation for details. */
|
|
|
|
ThrottleState *throttle_state;
|
|
|
|
ThrottleTimers throttle_timers;
|
|
|
|
unsigned pending_reqs[2];
|
2015-06-09 00:17:42 +08:00
|
|
|
QLIST_ENTRY(BlockDriverState) round_robin;
|
2011-11-03 16:57:25 +08:00
|
|
|
|
2015-10-19 23:53:20 +08:00
|
|
|
/* Offset after the highest byte written to */
|
|
|
|
uint64_t wr_highest_offset;
|
|
|
|
|
2013-10-24 18:06:56 +08:00
|
|
|
/* I/O Limits */
|
|
|
|
BlockLimits bl;
|
|
|
|
|
2013-08-22 15:24:14 +08:00
|
|
|
/* Whether reads beyond EOF produce zeros */
|
|
|
|
bool zero_beyond_eof;
|
|
|
|
|
2011-11-29 19:42:20 +08:00
|
|
|
/* Alignment requirement for offset/length of I/O requests */
|
|
|
|
unsigned int request_alignment;
|
|
|
|
|
2009-09-05 01:01:15 +08:00
|
|
|
/* do we need to tell the guest if we have a volatile write cache? */
|
|
|
|
int enable_write_cache;
|
|
|
|
|
2014-01-24 04:31:32 +08:00
|
|
|
/* the following member gives a name to every node on the bs graph. */
|
|
|
|
char node_name[32];
|
|
|
|
/* element of the list of named nodes building the graph */
|
|
|
|
QTAILQ_ENTRY(BlockDriverState) node_list;
|
|
|
|
/* element of the list of "drives" the guest sees */
|
|
|
|
QTAILQ_ENTRY(BlockDriverState) device_list;
|
2016-01-29 23:36:11 +08:00
|
|
|
/* element of the list of all BlockDriverStates (all_bdrv_states) */
|
|
|
|
QTAILQ_ENTRY(BlockDriverState) bs_list;
|
2016-01-29 23:36:12 +08:00
|
|
|
/* element of the list of monitor-owned BDS */
|
|
|
|
QTAILQ_ENTRY(BlockDriverState) monitor_list;
|
2013-11-13 18:29:43 +08:00
|
|
|
QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
|
2013-08-23 09:14:46 +08:00
|
|
|
int refcnt;
|
2011-11-17 21:40:27 +08:00
|
|
|
|
|
|
|
QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
|
2012-01-18 22:40:43 +08:00
|
|
|
|
2014-05-23 21:29:42 +08:00
|
|
|
/* operation blockers */
|
|
|
|
QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
|
|
|
|
|
2012-01-18 22:40:43 +08:00
|
|
|
/* long-running background operation */
|
|
|
|
BlockJob *job;
|
2012-09-21 03:13:19 +08:00
|
|
|
|
2015-04-10 00:47:50 +08:00
|
|
|
/* The node that this node inherited default options from (and a reopen on
|
|
|
|
* which can affect this node by changing these defaults). This is always a
|
|
|
|
* parent node of this node. */
|
|
|
|
BlockDriverState *inherits_from;
|
2015-04-08 19:49:41 +08:00
|
|
|
QLIST_HEAD(, BdrvChild) children;
|
2015-09-17 19:18:23 +08:00
|
|
|
QLIST_HEAD(, BdrvChild) parents;
|
2015-04-08 19:49:41 +08:00
|
|
|
|
2013-03-15 17:35:02 +08:00
|
|
|
QDict *options;
|
2015-05-08 22:15:03 +08:00
|
|
|
QDict *explicit_options;
|
2014-05-18 06:58:19 +08:00
|
|
|
BlockdevDetectZeroesOptions detect_zeroes;
|
2014-05-23 21:29:47 +08:00
|
|
|
|
|
|
|
/* The error object in use for blocking operations on backing_hd */
|
|
|
|
Error *backing_blocker;
|
block: add event when disk usage exceeds threshold
Managing applications, like oVirt (http://www.ovirt.org), make extensive
use of thin-provisioned disk images.
To let the guest run smoothly and be not unnecessarily paused, oVirt sets
a disk usage threshold (so called 'high water mark') based on the occupation
of the device, and automatically extends the image once the threshold
is reached or exceeded.
In order to detect the crossing of the threshold, oVirt has no choice but
to aggressively poll the QEMU monitor using the query-blockstats command.
This leads to unnecessary system load, and is made even worse at scale:
deployments with hundreds of VMs are no longer rare.
To fix this, this patch adds:
* A new monitor command `block-set-write-threshold', to set a mark for
a given block device.
* A new event `BLOCK_WRITE_THRESHOLD', to report if a block device
usage exceeds the threshold.
* A new `write_threshold' field into the `BlockDeviceInfo' structure,
to report the configured threshold.
This will allow the managing application to use smarter and more
efficient monitoring, greatly reducing the need of polling.
[Updated qemu-iotests 067 output to add the new 'write_threshold'
property. --Stefan]
[Changed g_assert_false() to !g_assert() to fix the build on older glib
versions. --Kevin]
Signed-off-by: Francesco Romani <fromani@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1421068273-692-1-git-send-email-fromani@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2015-01-12 21:11:13 +08:00
|
|
|
|
|
|
|
/* threshold limit for writes, in bytes. "High water mark". */
|
|
|
|
uint64_t write_threshold_offset;
|
|
|
|
NotifierWithReturn write_threshold_notifier;
|
2015-10-23 11:08:09 +08:00
|
|
|
|
|
|
|
int quiesce_counter;
|
2004-08-02 05:59:26 +08:00
|
|
|
};
|
|
|
|
|
2015-10-19 23:53:24 +08:00
|
|
|
/*
 * Bundle of root-level settings: open flags, a read-only flag, the
 * zero-detection mode and the throttling group name/state.
 *
 * NOTE(review): presumably kept by a BlockBackend so that these settings
 * survive while no BlockDriverState is attached -- confirm against the
 * users of this struct.
 */
struct BlockBackendRootState {
|
|
|
|
int open_flags; /* NOTE(review): presumably BDRV_O_* open flags -- confirm */
|
|
|
|
bool read_only;
|
|
|
|
/* Same type as the detect_zeroes member of BlockDriverState */
BlockdevDetectZeroesOptions detect_zeroes;
|
|
|
|
|
|
|
|
/* NOTE(review): presumably the throttle group's name; ownership of the
 * string is not visible here -- confirm who frees it */
char *throttle_group;
|
|
|
|
/* Same type as the throttle_state member of BlockDriverState */
ThrottleState *throttle_state;
|
|
|
|
};
|
|
|
|
|
2015-06-17 20:55:21 +08:00
|
|
|
/**
 * backing_bs:
 * @bs: Node whose backing child is looked up.
 *
 * Returns the BlockDriverState referenced by @bs->backing, or %NULL when
 * @bs has no backing child.
 */
static inline BlockDriverState *backing_bs(BlockDriverState *bs)
{
    if (bs->backing == NULL) {
        return NULL;
    }

    return bs->backing->bs;
}
|
|
|
|
|
2014-12-03 01:32:41 +08:00
|
|
|
|
|
|
|
/* Essential block drivers which must always be statically linked into qemu, and
|
|
|
|
* which therefore can be accessed without using bdrv_find_format() */
|
|
|
|
extern BlockDriver bdrv_file;
|
|
|
|
extern BlockDriver bdrv_raw;
|
|
|
|
extern BlockDriver bdrv_qcow2;
|
|
|
|
|
2015-10-27 04:39:06 +08:00
|
|
|
extern QTAILQ_HEAD(BdrvStates, BlockDriverState) bdrv_states;
|
|
|
|
|
2015-04-28 21:27:51 +08:00
|
|
|
/**
|
|
|
|
* bdrv_setup_io_funcs:
|
|
|
|
*
|
|
|
|
* Prepare a #BlockDriver for I/O request processing by populating
|
|
|
|
* unimplemented coroutine and AIO interfaces with generic wrapper functions
|
|
|
|
* that fall back to implemented interfaces.
|
|
|
|
*/
|
|
|
|
void bdrv_setup_io_funcs(BlockDriver *bdrv);
|
2014-12-03 01:32:41 +08:00
|
|
|
|
2012-05-28 15:27:54 +08:00
|
|
|
int get_tmp_filename(char *filename, int size);
|
raw: Prohibit dangerous writes for probed images
If the user neglects to specify the image format, QEMU probes the
image to guess it automatically, for convenience.
Relying on format probing is insecure for raw images (CVE-2008-2004).
If the guest writes a suitable header to the device, the next probe
will recognize a format chosen by the guest. A malicious guest can
abuse this to gain access to host files, e.g. by crafting a QCOW2
header with backing file /etc/shadow.
Commit 1e72d3b (April 2008) provided -drive parameter format to let
users disable probing. Commit f965509 (March 2009) extended QCOW2 to
optionally store the backing file format, to let users disable backing
file probing. QED has had a flag to suppress probing since the
beginning (2010), set whenever a raw backing file is assigned.
All of these additions that allow to avoid format probing have to be
specified explicitly. The default still allows the attack.
In order to fix this, commit 79368c8 (July 2010) put probed raw images
in a restricted mode, in which they wouldn't be able to overwrite the
first few bytes of the image so that they would identify as a different
image. If a write to the first sector would write one of the signatures
of another driver, qemu would instead zero out the first four bytes.
This patch was later reverted in commit 8b33d9e (September 2010) because
it didn't get the handling of unaligned qiov members right.
Today's block layer that is based on coroutines and has qiov utility
functions makes it much easier to get this functionality right, so this
patch implements it.
The other differences of this patch to the old one are that it doesn't
silently write something different than the guest requested by zeroing
out some bytes (it fails the request instead) and that it doesn't
maintain a list of signatures in the raw driver (it calls the usual
probe function instead).
Note that this change doesn't introduce new breakage for false positive
cases where the guest legitimately writes data into the first sector
that matches the signatures of an image format (e.g. for nested virt):
These cases were broken before, only the failure mode changes from
corruption after the next restart (when the wrong format is probed) to
failing the problematic write request.
Also note that like in the original patch, the restrictions only apply
if the image format has been guessed by probing. Explicitly specifying a
format allows guests to write anything they like.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1416497234-29880-8-git-send-email-kwolf@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-11-20 23:27:12 +08:00
|
|
|
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
|
|
|
|
const char *filename);
|
2005-12-19 02:28:15 +08:00
|
|
|
|
2011-11-03 16:57:25 +08:00
|
|
|
void bdrv_set_io_limits(BlockDriverState *bs,
|
2013-09-02 20:14:39 +08:00
|
|
|
ThrottleConfig *cfg);
|
|
|
|
|
2011-11-03 16:57:25 +08:00
|
|
|
|
2013-06-24 23:13:10 +08:00
|
|
|
/**
|
|
|
|
* bdrv_add_before_write_notifier:
|
|
|
|
*
|
|
|
|
* Register a callback that is invoked before write requests are processed but
|
|
|
|
* after any throttling or waiting for overlapping requests.
|
|
|
|
*/
|
|
|
|
void bdrv_add_before_write_notifier(BlockDriverState *bs,
|
|
|
|
NotifierWithReturn *notifier);
|
|
|
|
|
2014-05-08 22:34:37 +08:00
|
|
|
/**
|
|
|
|
* bdrv_detach_aio_context:
|
|
|
|
*
|
|
|
|
* May be called from .bdrv_detach_aio_context() to detach children from the
|
|
|
|
* current #AioContext. This is only needed by block drivers that manage their
|
2015-06-17 20:55:21 +08:00
|
|
|
* own children. Both ->file and ->backing are automatically handled and
|
2014-05-08 22:34:37 +08:00
|
|
|
* block drivers should not call this function on them explicitly.
|
|
|
|
*/
|
|
|
|
void bdrv_detach_aio_context(BlockDriverState *bs);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* bdrv_attach_aio_context:
|
|
|
|
*
|
|
|
|
* May be called from .bdrv_attach_aio_context() to attach children to the new
|
|
|
|
* #AioContext. This is only needed by block drivers that manage their own
|
2015-06-17 20:55:21 +08:00
|
|
|
* children. Both ->file and ->backing are automatically handled and block
|
2014-05-08 22:34:37 +08:00
|
|
|
* drivers should not call this function on them explicitly.
|
|
|
|
*/
|
|
|
|
void bdrv_attach_aio_context(BlockDriverState *bs,
|
|
|
|
AioContext *new_context);
|
|
|
|
|
2014-06-21 03:57:33 +08:00
|
|
|
/**
|
|
|
|
* bdrv_add_aio_context_notifier:
|
|
|
|
*
|
|
|
|
* If a long-running job intends to be always run in the same AioContext as a
|
|
|
|
* certain BDS, it may use this function to be notified of changes regarding the
|
|
|
|
* association of the BDS to an AioContext.
|
|
|
|
*
|
|
|
|
* attached_aio_context() is called after the target BDS has been attached to a
|
|
|
|
* new AioContext; detach_aio_context() is called before the target BDS is being
|
|
|
|
* detached from its old AioContext.
|
|
|
|
*/
|
|
|
|
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
|
|
|
|
void (*attached_aio_context)(AioContext *new_context, void *opaque),
|
|
|
|
void (*detach_aio_context)(void *opaque), void *opaque);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* bdrv_remove_aio_context_notifier:
|
|
|
|
*
|
|
|
|
* Unsubscribe from change notifications regarding the BDS's AioContext. The
|
|
|
|
* parameters given here have to be the same as those given to
|
|
|
|
* bdrv_add_aio_context_notifier().
|
|
|
|
*/
|
|
|
|
void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
|
|
|
|
void (*aio_context_attached)(AioContext *,
|
|
|
|
void *),
|
|
|
|
void (*aio_context_detached)(void *),
|
|
|
|
void *opaque);
|
|
|
|
|
2009-06-15 20:04:22 +08:00
|
|
|
#ifdef _WIN32
|
|
|
|
int is_windows_drive(const char *filename);
|
|
|
|
#endif
|
|
|
|
|
2012-03-30 19:17:13 +08:00
|
|
|
/**
|
|
|
|
* stream_start:
|
|
|
|
* @bs: Block device to operate on.
|
|
|
|
* @base: Block device that will become the new base, or %NULL to
|
|
|
|
* flatten the whole backing file chain onto @bs.
|
|
|
|
* @base_id: The file name that will be written to @bs as the new
|
|
|
|
* backing file if the job completes. Ignored if @base is %NULL.
|
2012-04-25 23:51:03 +08:00
|
|
|
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
|
2012-09-28 23:22:59 +08:00
|
|
|
* @on_error: The action to take upon error.
|
2012-03-30 19:17:13 +08:00
|
|
|
* @cb: Completion function for the job.
|
|
|
|
* @opaque: Opaque pointer value passed to @cb.
|
2012-04-25 23:51:00 +08:00
|
|
|
* @errp: Error object.
|
2012-03-30 19:17:13 +08:00
|
|
|
*
|
|
|
|
* Start a streaming operation on @bs. Clusters that are unallocated
|
|
|
|
* in @bs, but allocated in any image between @base and @bs (both
|
|
|
|
* exclusive) will be written to @bs. At the end of a successful
|
|
|
|
* streaming job, the backing file of @bs will be changed to
|
|
|
|
* @base_id in the written image and to @base in the live BlockDriverState.
|
|
|
|
*/
|
2012-04-25 23:51:00 +08:00
|
|
|
void stream_start(BlockDriverState *bs, BlockDriverState *base,
|
2012-09-28 23:22:59 +08:00
|
|
|
const char *base_id, int64_t speed, BlockdevOnError on_error,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb,
|
2012-04-25 23:51:00 +08:00
|
|
|
void *opaque, Error **errp);
|
2012-01-18 22:40:44 +08:00
|
|
|
|
2012-09-28 01:29:13 +08:00
|
|
|
/**
|
|
|
|
* commit_start:
|
2013-12-16 14:45:30 +08:00
|
|
|
* @bs: Active block device.
|
|
|
|
* @top: Top block device to be committed.
|
|
|
|
* @base: Block device that will be written into, and become the new top.
|
2012-09-28 01:29:13 +08:00
|
|
|
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
|
|
|
|
* @on_error: The action to take upon error.
|
|
|
|
* @cb: Completion function for the job.
|
|
|
|
* @opaque: Opaque pointer value passed to @cb.
|
block: extend block-commit to accept a string for the backing file
On some image chains, QEMU may not always be able to resolve the
filenames properly, when updating the backing file of an image
after a block commit.
For instance, certain relative pathnames may fail, or drives may
have been specified originally by file descriptor (e.g. /dev/fd/???),
or a relative protocol pathname may have been used.
In these instances, QEMU may lack the information to be able to make
the correct choice, but the user or management layer most likely does
have that knowledge.
With this extension to the block-commit api, the user is able to change
the backing file of the overlay image as part of the block-commit
operation.
This allows the change to be 'safe', in the sense that if the attempt
to write the overlay image metadata fails, then the block-commit
operation returns failure, without disrupting the guest.
If the commit top is the active layer, then specifying the backing
file string will be treated as an error (there is no overlay image
to modify in that case).
If a backing file string is not specified in the command, the backing
file string to use is determined in the same manner as it was
previously.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-26 03:40:10 +08:00
|
|
|
* @backing_file_str: String to use as the backing file in @top's overlay
|
2012-09-28 01:29:13 +08:00
|
|
|
* @errp: Error object.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
void commit_start(BlockDriverState *bs, BlockDriverState *base,
|
|
|
|
BlockDriverState *top, int64_t speed,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockdevOnError on_error, BlockCompletionFunc *cb,
|
block: extend block-commit to accept a string for the backing file
On some image chains, QEMU may not always be able to resolve the
filenames properly, when updating the backing file of an image
after a block commit.
For instance, certain relative pathnames may fail, or drives may
have been specified originally by file descriptor (e.g. /dev/fd/???),
or a relative protocol pathname may have been used.
In these instances, QEMU may lack the information to be able to make
the correct choice, but the user or management layer most likely does
have that knowledge.
With this extension to the block-commit api, the user is able to change
the backing file of the overlay image as part of the block-commit
operation.
This allows the change to be 'safe', in the sense that if the attempt
to write the overlay image metadata fails, then the block-commit
operation returns failure, without disrupting the guest.
If the commit top is the active layer, then specifying the backing
file string will be treated as an error (there is no overlay image
to modify in that case).
If a backing file string is not specified in the command, the backing
file string to use is determined in the same manner as it was
previously.
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-26 03:40:10 +08:00
|
|
|
void *opaque, const char *backing_file_str, Error **errp);
|
2013-12-16 14:45:30 +08:00
|
|
|
/**
|
|
|
|
* commit_active_start:
|
|
|
|
* @bs: Active block device to be committed.
|
|
|
|
* @base: Block device that will be written into, and become the new top.
|
|
|
|
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
|
|
|
|
* @on_error: The action to take upon error.
|
|
|
|
* @cb: Completion function for the job.
|
|
|
|
* @opaque: Opaque pointer value passed to @cb.
|
|
|
|
* @errp: Error object.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
|
|
|
|
int64_t speed,
|
|
|
|
BlockdevOnError on_error,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb,
|
2013-12-16 14:45:30 +08:00
|
|
|
void *opaque, Error **errp);
|
2012-10-18 22:49:23 +08:00
|
|
|
/**
|
|
|
|
* mirror_start:
|
|
|
|
* @bs: Block device to operate on.
|
|
|
|
* @target: Block device to write to.
|
2014-06-28 00:25:25 +08:00
|
|
|
* @replaces: Block graph node name to replace once the mirror is done. Can
|
|
|
|
* only be used when full mirroring is selected.
|
2012-10-18 22:49:23 +08:00
|
|
|
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
|
2013-01-22 00:09:46 +08:00
|
|
|
* @granularity: The chosen granularity for the dirty bitmap.
|
2013-01-22 16:03:13 +08:00
|
|
|
* @buf_size: The amount of data that can be in flight at one time.
|
2012-10-18 22:49:23 +08:00
|
|
|
* @mode: Whether to collapse all images in the chain to the target.
|
2012-10-18 22:49:28 +08:00
|
|
|
* @on_source_error: The action to take upon error reading from the source.
|
|
|
|
* @on_target_error: The action to take upon error writing to the target.
|
2015-06-08 13:56:08 +08:00
|
|
|
* @unmap: Whether to unmap target where source sectors only contain zeroes.
|
2012-10-18 22:49:23 +08:00
|
|
|
* @cb: Completion function for the job.
|
|
|
|
* @opaque: Opaque pointer value passed to @cb.
|
|
|
|
* @errp: Error object.
|
|
|
|
*
|
|
|
|
* Start a mirroring operation on @bs. Clusters that are allocated
|
|
|
|
* in @bs will be written to @target until the job is cancelled or
|
|
|
|
* manually completed. At the end of a successful mirroring job,
|
|
|
|
* @bs will be switched to read from @target.
|
|
|
|
*/
|
|
|
|
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
|
2014-06-28 00:25:25 +08:00
|
|
|
const char *replaces,
|
2015-04-18 07:49:51 +08:00
|
|
|
int64_t speed, uint32_t granularity, int64_t buf_size,
|
2013-01-22 16:03:13 +08:00
|
|
|
MirrorSyncMode mode, BlockdevOnError on_source_error,
|
2012-10-18 22:49:28 +08:00
|
|
|
BlockdevOnError on_target_error,
|
2015-06-08 13:56:08 +08:00
|
|
|
bool unmap,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb,
|
2012-10-18 22:49:23 +08:00
|
|
|
void *opaque, Error **errp);
|
|
|
|
|
2013-06-24 23:13:11 +08:00
|
|
|
/**
|
|
|
|
* backup_start:
|
|
|
|
* @bs: Block device to operate on.
|
|
|
|
* @target: Block device to write to.
|
|
|
|
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
|
2013-07-27 02:39:04 +08:00
|
|
|
* @sync_mode: What parts of the disk image should be copied to the destination.
|
2015-06-05 08:20:34 +08:00
|
|
|
* @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
|
2013-06-24 23:13:11 +08:00
|
|
|
* @on_source_error: The action to take upon error reading from the source.
|
|
|
|
* @on_target_error: The action to take upon error writing to the target.
|
|
|
|
* @cb: Completion function for the job.
|
|
|
|
* @opaque: Opaque pointer value passed to @cb.
|
2015-11-06 07:13:17 +08:00
|
|
|
* @txn: Transaction that this job is part of (may be NULL).
* @errp: Error object.
|
2013-06-24 23:13:11 +08:00
|
|
|
*
|
|
|
|
* Start a backup operation on @bs. Clusters in @bs are written to @target
|
|
|
|
* until the job is cancelled or manually completed.
|
|
|
|
*/
|
|
|
|
void backup_start(BlockDriverState *bs, BlockDriverState *target,
|
2013-07-27 02:39:04 +08:00
|
|
|
int64_t speed, MirrorSyncMode sync_mode,
|
2015-04-18 07:49:58 +08:00
|
|
|
BdrvDirtyBitmap *sync_bitmap,
|
2013-07-27 02:39:04 +08:00
|
|
|
BlockdevOnError on_source_error,
|
2013-06-24 23:13:11 +08:00
|
|
|
BlockdevOnError on_target_error,
|
2014-10-07 19:59:15 +08:00
|
|
|
BlockCompletionFunc *cb, void *opaque,
|
2015-11-06 07:13:17 +08:00
|
|
|
BlockJobTxn *txn, Error **errp);
|
2013-06-24 23:13:11 +08:00
|
|
|
|
2015-09-17 19:01:50 +08:00
|
|
|
void blk_set_bs(BlockBackend *blk, BlockDriverState *bs);
|
|
|
|
|
2014-10-07 19:59:25 +08:00
|
|
|
void blk_dev_change_media_cb(BlockBackend *blk, bool load);
|
|
|
|
bool blk_dev_has_removable_media(BlockBackend *blk);
|
2016-01-30 03:49:10 +08:00
|
|
|
bool blk_dev_has_tray(BlockBackend *blk);
|
2014-10-07 19:59:25 +08:00
|
|
|
void blk_dev_eject_request(BlockBackend *blk, bool force);
|
|
|
|
bool blk_dev_is_tray_open(BlockBackend *blk);
|
|
|
|
bool blk_dev_is_medium_locked(BlockBackend *blk);
|
|
|
|
void blk_dev_resize_cb(BlockBackend *blk);
|
|
|
|
|
2015-04-28 21:27:50 +08:00
|
|
|
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
|
2015-09-16 22:08:17 +08:00
|
|
|
bool bdrv_requests_pending(BlockDriverState *bs);
|
2015-04-28 21:27:50 +08:00
|
|
|
|
2015-11-09 18:16:54 +08:00
|
|
|
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
|
|
|
|
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
|
|
|
|
|
2016-01-29 23:36:12 +08:00
|
|
|
void blockdev_close_all_bdrv_states(void);
|
|
|
|
|
2004-08-02 05:59:26 +08:00
|
|
|
#endif /* BLOCK_INT_H */
|