From c8bf1336824ebd698d37b71763e1c43190f2229a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 10 Sep 2010 20:07:38 +0200 Subject: [PATCH 01/39] Consolidate min_not_zero We have several users of min_not_zero, each of them using their own definition. Move the define to kernel.h. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 ----- drivers/block/drbd/drbd_receiver.c | 1 - drivers/md/dm-snap.c | 2 -- drivers/md/dm-table.c | 5 ----- include/linux/kernel.h | 10 ++++++++++ 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..8d592b559bd3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -455,11 +455,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 081522d3c742..484ecbb6b772 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2972,7 +2972,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) * we still need to figure out whether we accept that. */ mdev->p_size = p_size; -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) if (get_ldev(mdev)) { warn_if_differ_considerably(mdev, "lower level device sizes", p_size, drbd_get_max_capacity(mdev->ldev)); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5974d3094d97..f30f6e8d594e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -706,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) return 0; } -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) - /* * Return a minimum chunk size of all snapshots that have the specified origin. * Return zero if the origin has no snapshots. diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9fc07d7a4b9..90267f8d64ee 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -486,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b0a35e6bc69..f5df2f4acb0d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -640,6 +640,16 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; }) +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Fri, 10 Sep 2010 20:50:10 +0200 Subject: [PATCH 02/39] block/scsi: Provide a limit on the number of integrity segments Some controllers have a hardware limit on the number of protection information scatter-gather list segments they can handle. Introduce a max_integrity_segments limit in the block layer and provide a new scsi_host_template setting that allows HBA drivers to provide a value suitable for the hardware. Add support for honoring the integrity segment limit when merging both bios and requests. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 93 ++++++++++++++++++++++++++++++--------- block/blk-merge.c | 23 ++++++---- block/blk-settings.c | 3 ++ block/blk-sysfs.c | 11 +++++ block/blk.h | 8 ---- drivers/scsi/hosts.c | 1 + drivers/scsi/scsi_lib.c | 26 +++++++---- drivers/scsi/scsi_sysfs.c | 2 + include/linux/bio.h | 4 ++ include/linux/blkdev.h | 33 ++++++++++++-- include/scsi/scsi.h | 6 +++ include/scsi/scsi_host.h | 7 +++ 12 files changed, 167 insertions(+), 50 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..885cbb59967e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep; /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a - * scatterlist corresponding to the integrity metadata in a request. + * scatterlist corresponding to the integrity metadata in a bio. */ -int blk_rq_count_integrity_sg(struct request *rq) +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + unsigned int segments = 0; + unsigned int seg_size = 0; + unsigned int i = 0; - ivprv = NULL; - segments = 0; + bio_for_each_integrity_vec(iv, bio, i) { - rq_for_each_integrity_segment(iv, rq, iter) { + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (seg_size + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + + seg_size += iv->bv_len; + } else { +new_segment: segments++; + seg_size = iv->bv_len; + } ivprv = iv; } @@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). 
*/ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - struct scatterlist *sg; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + unsigned int i = 0; - ivprv = NULL; - sg = NULL; - segments = 0; - - rq_for_each_integrity_segment(iv, rq, iter) { + bio_for_each_integrity_vec(iv, bio, i) { if (ivprv) { if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (sg->length + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + sg->length += iv->bv_len; } else { new_segment: @@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return -1; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return -1; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_rq); + +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return -1; + + req->nr_integrity_segments += nr_integrity_segs; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_bio); + struct integrity_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_integrity *, char *); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd4249671..6a725461654d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + goto no_merge; + + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + goto no_merge; /* * This will form the start of a new hw segment. Bump both @@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, */ req->nr_phys_segments += nr_phys_segs; return 1; + +no_merge: + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; + return 0; } int ll_back_merge_fn(struct request_queue *q, struct request *req, @@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -372,9 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn diff --git a/block/blk-settings.c b/block/blk-settings.c index 8d592b559bd3..f8f2ddf20613 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -509,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->seg_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18078f5..b014f7739e98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) @@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .show = queue_max_integrity_segments_show, +}; + static struct queue_sysfs_entry queue_max_segment_size_entry = { .attr = {.name = "max_segment_size", .mode = S_IRUGO }, .show = queue_max_segment_size_show, @@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = { &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e4..6738831ba447 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - static inline int blk_cpu_to_group(int cpu) { #ifdef CONFIG_SCHED_MC diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 8a8f803439e1..10478153641b 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -376,6 +376,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->this_id = sht->this_id; shost->can_queue = sht->can_queue; shost->sg_tablesize = sht->sg_tablesize; + shost->sg_prot_tablesize = sht->sg_prot_tablesize; shost->cmd_per_lun = sht->cmd_per_lun; shost->unchecked_isa_dma = sht->unchecked_isa_dma; shost->use_clustering = sht->use_clustering; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9ade720422c6..861c0b937ac9 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -968,11 +968,13 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, */ 
int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) { - int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); + struct request *rq = cmd->request; + + int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask); if (error) goto err_exit; - if (blk_bidi_rq(cmd->request)) { + if (blk_bidi_rq(rq)) { struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( scsi_sdb_cache, GFP_ATOMIC); if (!bidi_sdb) { @@ -980,28 +982,28 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) goto err_exit; } - cmd->request->next_rq->special = bidi_sdb; - error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, - GFP_ATOMIC); + rq->next_rq->special = bidi_sdb; + error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC); if (error) goto err_exit; } - if (blk_integrity_rq(cmd->request)) { + if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs, count; BUG_ON(prot_sdb == NULL); - ivecs = blk_rq_count_integrity_sg(cmd->request); + ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) { error = BLKPREP_DEFER; goto err_exit; } - count = blk_rq_map_integrity_sg(cmd->request, + count = blk_rq_map_integrity_sg(rq->q, rq->bio, prot_sdb->table.sgl); BUG_ON(unlikely(count > ivecs)); + BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q))); cmd->prot_sdb = prot_sdb; cmd->prot_sdb->table.nents = count; @@ -1625,6 +1627,14 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, SCSI_MAX_SG_CHAIN_SEGMENTS)); + if (scsi_host_prot_dma(shost)) { + shost->sg_prot_tablesize = + min_not_zero(shost->sg_prot_tablesize, + (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS); + BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize); + blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize); + } + blk_queue_max_hw_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index c3f67373a4f8..20ad59dff730 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -251,6 +251,7 @@ shost_rd_attr(host_busy, "%hu\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%hd\n"); shost_rd_attr(sg_tablesize, "%hu\n"); +shost_rd_attr(sg_prot_tablesize, "%hu\n"); shost_rd_attr(unchecked_isa_dma, "%d\n"); shost_rd_attr(prot_capabilities, "%u\n"); shost_rd_attr(prot_guard_type, "%hd\n"); @@ -262,6 +263,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_cmd_per_lun.attr, &dev_attr_can_queue.attr, &dev_attr_sg_tablesize.attr, + &dev_attr_sg_prot_tablesize.attr, &dev_attr_unchecked_isa_dma.attr, &dev_attr_proc_name.attr, &dev_attr_scan.attr, diff --git a/include/linux/bio.h b/include/linux/bio.h index 5274103434ad..2c3fd7421607 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -496,6 +496,10 @@ static inline struct bio *bio_list_get(struct bio_list *bl) #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ + for_each_bio(_bio) \ + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) + #define bio_integrity(bio) (bio->bi_integrity != NULL) extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c54906f678f..7e661106270a 100644 --- 
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -124,6 +124,9 @@ struct request { * physical address coalescing is performed. */ unsigned short nr_phys_segments; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + unsigned short nr_integrity_segments; +#endif unsigned short ioprio; @@ -243,6 +246,7 @@ struct queue_limits { unsigned short logical_block_size; unsigned short max_segments; + unsigned short max_integrity_segments; unsigned char misaligned; unsigned char discard_misaligned; @@ -1213,8 +1217,13 @@ struct blk_integrity { extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); extern int blk_integrity_compare(struct gendisk *, struct gendisk *); -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -extern int blk_rq_count_integrity_sg(struct request *); +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, + struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); +extern int blk_integrity_merge_rq(struct request_queue *, struct request *, + struct request *); +extern int blk_integrity_merge_bio(struct request_queue *, struct request *, + struct bio *); static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) @@ -1235,16 +1244,32 @@ static inline int blk_integrity_rq(struct request *rq) return bio_integrity(rq->bio); } +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ + q->limits.max_integrity_segments = segs; +} + +static inline unsigned short +queue_max_integrity_segments(struct request_queue *q) +{ + return q->limits.max_integrity_segments; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ #define blk_integrity_rq(rq) (0) -#define blk_rq_count_integrity_sg(a) (0) -#define blk_rq_map_integrity_sg(a, b) (0) +#define blk_rq_count_integrity_sg(a, b) (0) +#define blk_rq_map_integrity_sg(a, b, c) (0) #define bdev_get_integrity(a) (0) #define blk_get_integrity(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) #define blk_integrity_unregister(a) do { } while (0); +#define blk_queue_max_integrity_segments(a, b) do { } while (0); +#define queue_max_integrity_segments(a) (0) +#define blk_integrity_merge_rq(a, b, c) (0) +#define blk_integrity_merge_bio(a, b, c) (0) #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 8fcb6e0e9e72..d63533a4a59e 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -31,6 +31,12 @@ struct scsi_cmnd; #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS #endif +/* + * DIX-capable adapters effectively support infinite chaining for the + * protection information scatterlist + */ +#define SCSI_MAX_PROT_SG_SEGMENTS 0xFFFF + /* * Special value for scanning to specify scanning or rescanning of all * possible channels, (target) ids, or luns on a given shost. diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index b7bdecb7b76e..d0a6a845f204 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -388,6 +388,7 @@ struct scsi_host_template { * of scatter-gather. */ unsigned short sg_tablesize; + unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. 
@@ -599,6 +600,7 @@ struct Scsi_Host { int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; + short unsigned int sg_prot_tablesize; short unsigned int max_sectors; unsigned long dma_boundary; /* @@ -823,6 +825,11 @@ static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) return shost->prot_capabilities; } +static inline int scsi_host_prot_dma(struct Scsi_Host *shost) +{ + return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; +} + static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, From 175b79f0632544d62aae72e5496c14e3e3ff2ae7 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Fri, 10 Sep 2010 20:50:40 +0200 Subject: [PATCH 03/39] zfcp: Report scatter gather limit for DIX protection information When sending DIX integrity segments with an I/O request, the restriction for the maximum number of segments is still the same for the zfcp hardware. Report the new sg_prot_tablesize for the SCSI host, so that the number of integrity segments plus the number of data segments is not larger than the hardware limit. This results in using half of the hardware segments for integrity data and the other half for regular data. Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: Jens Axboe --- drivers/s390/scsi/zfcp_scsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index cb000c9833bb..208256e39def 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -681,6 +681,7 @@ void zfcp_scsi_set_prot(struct zfcp_adapter *adapter) adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) { mask |= SHOST_DIX_TYPE1_PROTECTION; scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + shost->sg_prot_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2; } From 8dcbdc742fec9e6c542ff9cb599d2ee620753d07 Mon Sep 17 00:00:00 2001 From: San Mehat Date: Tue, 14 Sep 2010 08:48:01 +0200 Subject: [PATCH 04/39] block: block_dump: Add number of sectors to debug output Signed-off-by: San Mehat Signed-off-by: Linus Walleij Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63cc..8d07c1b7e701 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1612,11 +1612,12 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); + bdevname(bio->bi_bdev, b), + count); } } From 144177991ca624841ddbd1e7edff958fc0f6d1fe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 13:08:27 +0200 Subject: [PATCH 05/39] block: fix an address space warning in blk-map.c Change type of 2nd parameter of blk_rq_aligned() into unsigned long and remove unnecessary casting. 
Now we can call it with 'uaddr' instead of 'ubuf' in __blk_rq_map_user() so that it can remove following warnings from sparse: block/blk-map.c:57:31: warning: incorrect type in argument 2 (different address spaces) block/blk-map.c:57:31: expected void *addr block/blk-map.c:57:31: got void [noderef] *ubuf However blk_rq_map_kern() needs one more local variable to handle it. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- include/linux/blkdev.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index c65d7593f7f1..ac0f7d46db63 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, * direct dma. else, set up kernel bounce buffers */ uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, ubuf, len) && !map_data) + if (blk_rq_aligned(q, uaddr, len) && !map_data) bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); else bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); @@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; int do_copy = 0; struct bio *bio; int ret; @@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e661106270a..780824edac16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1097,11 +1097,11 @@ static inline int queue_dma_alignment(struct request_queue *q) return q ? q->dma_alignment : 511; } -static inline int blk_rq_aligned(struct request_queue *q, void *addr, +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; - return !((unsigned long)addr & alignment) && !(len & alignment); + return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:05 -0500 Subject: [PATCH 06/39] block, partition: add partition_meta_info to hd_struct I'm reposting this patch series as v4 since there have been no additional comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c (2.6.36-rc3). Would this patchset be suitable for inclusion in an mm branch? This changes adds a partition_meta_info struct which itself contains a union of structures that provide partition table specific metadata. This change leaves the union empty. The subsequent patch includes an implementation for CONFIG_EFI_PARTITION-based metadata. 
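The struct described above is small enough to restate outside the kernel. Below is a plain userspace C sketch, not kernel code: the field sizes mirror the genhd.h hunk of this patch (a 16-byte big-endian uuid plus a 64-byte volname), while struct fake_part, add_fake_partition() and main() are invented for illustration. It shows only the ownership model the patch uses, where the scanner's stack copy is duplicated into an allocation owned by the partition, as add_partition() does with memcpy().

/* Userspace sketch of the copy-on-add ownership model; illustrative names only. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PARTITION_META_INFO_VOLNAMELTH 64
#define PARTITION_META_INFO_UUIDLTH    16

struct partition_meta_info {
	unsigned char uuid[PARTITION_META_INFO_UUIDLTH];	/* always big endian */
	unsigned char volname[PARTITION_META_INFO_VOLNAMELTH];
};

struct fake_part {				/* stand-in for struct hd_struct */
	struct partition_meta_info *info;	/* NULL when the table has no metadata */
};

static int add_fake_partition(struct fake_part *p,
			      const struct partition_meta_info *info)
{
	if (info) {
		/* Same move as add_partition(): duplicate the caller's copy. */
		struct partition_meta_info *pinfo = calloc(1, sizeof(*pinfo));
		if (!pinfo)
			return -1;
		memcpy(pinfo, info, sizeof(*info));
		p->info = pinfo;
	}
	return 0;
}

int main(void)
{
	struct partition_meta_info scratch = { .volname = "rootfs" };
	struct fake_part part = { 0 };

	if (add_fake_partition(&part, &scratch) == 0 && part.info)
		printf("attached label: %s\n", (const char *)part.info->volname);
	free(part.info);	/* mirrors free_part_info() on release */
	return 0;
}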
Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 1 + block/ioctl.c | 2 +- fs/partitions/check.c | 23 ++++++++++++++++--- fs/partitions/check.h | 3 +++ include/linux/genhd.h | 53 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 76 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..c8da12055264 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1004,6 +1004,7 @@ static void disk_release(struct device *dev) kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); kfree(disk); } struct class block_class = { diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..2c15fe0912c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user /* all seems OK */ part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); return IS_ERR(part) ? PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 79fbf3f390f0..6dfbee03ccc6 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -352,6 +352,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); free_part_stats(p); + free_part_info(p); kfree(p); } @@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags) + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) { struct hd_struct *p; dev_t devt = MKDEV(0, 0); @@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->partno = partno; p->policy = get_disk_ro(disk); + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) + goto out_free_stats; + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_stats; + goto out_free_info; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; +out_free_info: + free_part_info(p); out_free_stats: free_part_stats(p); out_free: @@ -642,6 +654,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; + struct partition_meta_info *info = NULL; size = state->parts[p].size; if (!size) @@ -675,8 +688,12 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) size = get_capacity(disk) - from; } } + + if (state->parts[p].has_info) + info = &state->parts[p].info; part = add_partition(disk, p, from, size, - state->parts[p].flags); + state->parts[p].flags, + &state->parts[p].info); if (IS_ERR(part)) { printk(KERN_ERR " %s: p%d could not be added: %ld\n", disk->disk_name, p, -PTR_ERR(part)); diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 8e4e103ba216..d68bf4dc3bc2 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -1,5 +1,6 @@ #include #include +#include /* * add_gd_partition adds a partitions details to the devices partition @@ -12,6 +13,8 @@ 
struct parsed_partitions { sector_t from; sector_t size; int flags; + bool has_info; + struct partition_meta_info info; } parts[DISK_MAX_PARTS]; int next; int limit; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb0..66e26b5a1537 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -86,7 +87,15 @@ struct disk_stats { unsigned long io_ticks; unsigned long time_in_queue; }; - + +#define PARTITION_META_INFO_VOLNAMELTH 64 +#define PARTITION_META_INFO_UUIDLTH 16 + +struct partition_meta_info { + u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; +}; + struct hd_struct { sector_t start_sect; sector_t nr_sects; @@ -95,6 +104,7 @@ struct hd_struct { struct device __dev; struct kobject *holder_dir; int policy, partno; + struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif @@ -181,6 +191,30 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } +static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) +{ + int i; + for (i = 0; i < 16; ++i) { + *to++ = (hex_to_bin(*uuid_str) << 4) | + (hex_to_bin(*(uuid_str + 1))); + uuid_str += 2; + switch (i) { + case 3: + case 5: + case 7: + case 9: + uuid_str++; + continue; + } + } +} + +static inline char *part_unpack_uuid(const u8 *uuid, char *out) +{ + sprintf(out, "%pU", uuid); + return out; +} + static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -342,6 +376,19 @@ static inline int part_in_flight(struct hd_struct *part) return part->in_flight[0] + part->in_flight[1]; } +static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) +{ + if (disk) + return kzalloc_node(sizeof(struct partition_meta_info), + GFP_KERNEL, disk->node_id); + return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL); +} + +static inline void free_part_info(struct hd_struct *part) +{ + kfree(part->info); +} + /* block/blk-core.c */ extern void part_round_stats(int cpu, struct hd_struct *part); @@ -533,7 +580,9 @@ extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, - sector_t len, int flags); + sector_t len, int flags, + struct partition_meta_info + *info); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); From eec7ecfede74bb996060efefd5c157acd5794e8a Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:06 -0500 Subject: [PATCH 07/39] genhd, efi: add efi partition metadata to hd_structs This change extends the partition_meta_info structure to support EFI GPT-specific metadata and ensures that data is copied in on partition scanning. 
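That copy ultimately flows through the part_pack_uuid() helper added in the genhd.h hunk of the previous patch, which the EFI scan below and the later root=PARTUUID= support both rely on. The following is a self-contained userspace restatement of that conversion, assuming nothing beyond what the hunk shows: hex_to_bin() here is a local stand-in for the kernel helper of the same name, and pack_uuid()/main() are illustrative names.

#include <stdio.h>

/* Local stand-in for the kernel's hex_to_bin(). */
static int hex_to_bin(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;
}

/*
 * Same walk as part_pack_uuid(): two hex digits per output byte, stepping
 * over the '-' separators after bytes 3, 5, 7 and 9 of the 36-char string.
 */
static void pack_uuid(const char *uuid_str, unsigned char *to)
{
	int i;

	for (i = 0; i < 16; ++i) {
		*to++ = (hex_to_bin(*uuid_str) << 4) | hex_to_bin(*(uuid_str + 1));
		uuid_str += 2;
		switch (i) {
		case 3: case 5: case 7: case 9:
			uuid_str++;
		}
	}
}

int main(void)
{
	unsigned char uuid[16];
	int i;

	pack_uuid("00112233-4455-6677-8899-aabbccddeeff", uuid);
	for (i = 0; i < 16; i++)
		printf("%02x", uuid[i]);
	printf("\n");	/* prints 00112233445566778899aabbccddeeff */
	return 0;
}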
Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index dbb44d4bb8a7..ac0ccb5026a2 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -94,6 +94,7 @@ * ************************************************************/ #include +#include #include #include #include "check.h" @@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state) gpt_entry *ptes = NULL; u32 i; unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; @@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state) if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(sizeof(info->volname) - 1, + sizeof(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); From b5af921ec02333e943efb59aca4f56b78fc0e100 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:07 -0500 Subject: [PATCH 08/39] init: add support for root devices specified by partition UUID This is the third patch in a series which adds support for storing partition metadata, optionally, off of the hd_struct. One major use for that data is being able to resolve partition by other identities than just the index on a block device. Device enumeration varies by platform and there's a benefit to being able to use something like EFI GPT's GUIDs to determine the correct block device and partition to mount as the root. This change adds that support to root= by adding support for the following syntax: root=PARTUUID=hex-uuid Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++-- init/do_mounts.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index c8da12055264..5c9c503de423 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -642,6 +642,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been @@ -660,10 +661,14 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? 
"" : " ", + uuid[0] = 0; + if (part->info) + part_unpack_uuid(part->info->uuid, uuid); + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + disk_name(disk, part->partno, name_buf), uuid); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) diff --git a/init/do_mounts.c b/init/do_mounts.c index 02e3ca4fc527..804f9c6ba216 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -58,6 +58,60 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to a 36 byte char array with a UUID + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int __init match_dev_by_uuid(struct device *dev, void *data) +{ + u8 *uuid = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid: 36 byte char array containing a hex ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t __init devt_from_partuuid(char *uuid_str) +{ + dev_t res = 0; + struct device *dev = NULL; + u8 uuid[16]; + + /* Pack the requested UUID in the expected format. */ + part_pack_uuid(uuid_str, uuid); + + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + put_device(dev); + +done: + return res; +} + /* * Convert a name into device number. We accept the following variants: * @@ -68,6 +122,8 @@ __setup("rw", readwrite); * of partition - device number of disk plus the partition number * 5) /dev/p - same as the above, that form is * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. * * If name doesn't have fall into the categories above, we return (0,0). * block_class is used to check if something is a disk name. If the disk @@ -82,6 +138,16 @@ dev_t name_to_dev_t(char *name) dev_t res = 0; int part; + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + if (strlen(name) != 36) + goto fail; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } + if (strncmp(name, "/dev/", 5) != 0) { unsigned maj, min; From 2610a25406087ef797f4187e7f82dd04335056c7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Sep 2010 12:55:57 +0900 Subject: [PATCH 09/39] sg: fix a warning in blk_rq_aligned() call 2nd argument of blk_rq_aligned() has changed to 'unsigned long' by the previous commit 'block: fix an address space warning in blk-map.c'. That commit neglected to update a user of that function. 
Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 78d616315d8e..655ab920cff4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1657,7 +1657,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO && dxfer_dir != SG_DXFER_UNKNOWN && !iov_count && !sfp->parentdp->device->host->unchecked_isa_dma && - blk_rq_aligned(q, hp->dxferp, dxfer_len)) + blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len)) md = NULL; else md = &map_data; From 38b6f45a97bbb8536cc7d095b577f580bd4d643e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Sep 2010 08:33:54 +0200 Subject: [PATCH 10/39] core: match_dev_by_uuid() should not be marked __init It is also called outside the scope of init functions. Stephen reports: WARNING: init/mounts.o(.text+0x21a): Section mismatch in reference from the function name_to_dev_t() to the function .init.text:match_dev_by_uuid() The function name_to_dev_t() references the function __init match_dev_by_uuid(). This is often because name_to_dev_t lacks a __init annotation or the annotation of match_dev_by_uuid is wrong. Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- init/do_mounts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 804f9c6ba216..b7fc83994f39 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -65,7 +65,7 @@ __setup("rw", readwrite); * * Returns 1 if the device matches, and 0 otherwise. */ -static int __init match_dev_by_uuid(struct device *dev, void *data) +static int match_dev_by_uuid(struct device *dev, void *data) { u8 *uuid = data; struct hd_struct *part = dev_to_part(dev); From af41d7bd9b685ab4e8f930627874ba4f4728e128 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:32 -0400 Subject: [PATCH 11/39] blk-cgroup: Kill the header printed at the start of blkio.weight_device file o Kill extra "dev weight" header which is printed when somebody reads blkio.weight_device file. This really seems to be out of convention. No other blkio files are printing any header at the start of file. I think it is ok to just print values and how to interpret values should be part of documentation. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a6809645d212..c1a39d90d14a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -811,8 +811,6 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - seq_printf(m, "dev\tweight\n"); - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); From 062a644d6121d5e2f51c0b2ca0cbc5155ebf845b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:33 -0400 Subject: [PATCH 12/39] blk-cgroup: Prepare the base for supporting more than one IO control policies o This patch prepares the base for introducing new IO control policies. Currently all the code is written knowing there is only one policy and that is proportional bandwidth. Creating infrastructure for newer policies to come in. o Also there were many functions which were generated using macro. It was very confusing. Got rid of those. 
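The consolidation hinges on packing a policy id and a per-file attribute into cft->private so that one read/write handler can dispatch for every cgroup file. The fragment below restates those macros from the diff that follows so the encoding can be seen in isolation; the two enum values are the ones used by the proportional-weight policy, and main() is illustrative only.

#include <stdio.h>

/* Encoding of cft->private as introduced by this patch. */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

enum blkio_policy_id { BLKIO_POLICY_PROP = 0 };
enum blkcg_file_name_prop { BLKIO_PROP_weight = 1, BLKIO_PROP_weight_device };

int main(void)
{
	int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);

	/* A common handler recovers both halves and switches on them. */
	printf("policy=%d attr=%d\n", BLKIOFILE_POLICY(priv), BLKIOFILE_ATTR(priv));
	return 0;
}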
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 544 ++++++++++++++++++++++++++++++++------------ block/blk-cgroup.h | 36 ++- block/cfq-iosched.c | 1 + block/cfq.h | 2 +- 4 files changed, 429 insertions(+), 154 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c1a39d90d14a..7762987fdf9e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, @@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, list_add(&pn->node, &blkcg->policy_list); } +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + /* Must be called with blkcg->lock held */ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { @@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) /* Must be called with blkcg->lock held */ static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { struct blkio_policy_node *pn; list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev) + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) return pn; } @@ -86,6 +114,20 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. 
@@ -341,7 +383,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) { unsigned long flags; @@ -350,6 +393,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; spin_unlock_irqrestore(&blkcg->lock, flags); /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); @@ -408,51 +452,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -#define SHOW_FUNCTION(__VAR) \ -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct blkio_cgroup *blkcg; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - return (u64)blkcg->__VAR; \ -} - -SHOW_FUNCTION(weight); -#undef SHOW_FUNCTION - -static int -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) -{ - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn; - - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; - - blkcg = cgroup_to_blkio_cgroup(cgroup); - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev); - - if (pn) - continue; - - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} - static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -593,52 +592,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct cgroup_map_cb *cb) \ -{ \ - struct blkio_cgroup *blkcg; \ - struct blkio_group *blkg; \ - struct hlist_node *n; \ - uint64_t cgroup_total = 0; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - rcu_read_lock(); \ - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) { \ - spin_lock_irq(&blkg->stats_lock); \ - cgroup_total += blkio_get_stat(blkg, cb, \ - blkg->dev, type); \ - spin_unlock_irq(&blkg->stats_lock); \ - } \ - } \ - if (show_total) \ - cb->fill(cb, "Total", cgroup_total); \ - rcu_read_unlock(); \ - cgroup_unlock(); \ - return 0; \ -} - -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); 
-SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); -#endif -#undef SHOW_FUNCTION_PER_GROUP - static int blkio_check_dev_num(dev_t dev) { int part = 0; @@ -652,7 +605,7 @@ static int blkio_check_dev_num(dev_t dev) } static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn) + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; @@ -705,12 +658,20 @@ static int blkio_policy_parse_and_set(char *buf, if (s[1] == NULL) return -EINVAL; - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; - newpn->weight = temp; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->weight = temp; + break; + default: + BUG(); + } return 0; } @@ -720,7 +681,8 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, { struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); if (pn) return pn->weight; else @@ -728,18 +690,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, } EXPORT_SYMBOL_GPL(blkcg_get_weight); +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->weight == 0) + return 1; + break; + default: + BUG(); + } -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->weight = newpn->weight; + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->weight ? pn->weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + default: + BUG(); + } +} + +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. 
+ */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} + +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { int ret = 0; char *buf; struct blkio_policy_node *newpn, *pn; struct blkio_cgroup *blkcg; - struct blkio_group *blkg; int keep_newpn = 0; - struct hlist_node *n; - struct blkio_policy_type *blkiop; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); buf = kstrdup(buffer, GFP_KERNEL); if (!buf) @@ -751,7 +781,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto free_buf; } - ret = blkio_policy_parse_and_set(buf, newpn); + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); if (ret) goto free_newpn; @@ -759,9 +789,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, spin_lock_irq(&blkcg->lock); - pn = blkio_policy_search_node(blkcg, newpn->dev); + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); if (!pn) { - if (newpn->weight != 0) { + if (!blkio_delete_rule_command(newpn)) { blkio_policy_insert_node(blkcg, newpn); keep_newpn = 1; } @@ -769,33 +799,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto update_io_group; } - if (newpn->weight == 0) { - /* weight == 0 means deleteing a specific weight */ + if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } spin_unlock_irq(&blkcg->lock); - pn->weight = newpn->weight; + blkio_update_policy_rule(pn, newpn); update_io_group: - /* update weight for each cfqg */ - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (newpn->dev == blkg->dev) { - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - newpn->weight ? 
- newpn->weight : - blkcg->weight); - } - } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + blkio_update_policy_node_blkg(blkcg, newpn); free_newpn: if (!keep_newpn) @@ -805,21 +819,219 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + break; + default: + BUG(); + } +} + +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) { - struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); } spin_unlock_irq(&blkcg->lock); } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, + bool show_total) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, + type); + spin_unlock_irq(&blkg->stats_lock); + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1); + case BLKIO_PROP_io_merged: + return 
blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0); +#endif + default: + BUG(); + } + break; + + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } return 0; } @@ -827,46 +1039,66 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct cftype blkio_files[] = { { .name = "weight_device", - .read_seq_string = blkiocg_weight_device_read, - .write_string = blkiocg_weight_device_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, .max_write_len = 256, }, { .name = "weight", - .read_u64 = blkiocg_weight_read, - .write_u64 = blkiocg_weight_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, }, { .name = "time", - .read_map = blkiocg_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, }, { .name = "sectors", - .read_map = blkiocg_sectors_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_bytes", - .read_map = blkiocg_io_service_bytes_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = 
blkiocg_file_read_map, }, { .name = "io_serviced", - .read_map = blkiocg_io_serviced_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_time", - .read_map = blkiocg_io_service_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_wait_time", - .read_map = blkiocg_io_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_merged", - .read_map = blkiocg_io_merged_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, }, { .name = "io_queued", - .read_map = blkiocg_io_queued_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, }, { .name = "reset_stats", @@ -875,23 +1107,33 @@ struct cftype blkio_files[] = { #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_map = blkiocg_avg_queue_size_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, }, { .name = "group_wait_time", - .read_map = blkiocg_group_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "idle_time", - .read_map = blkiocg_idle_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, }, { .name = "empty_time", - .read_map = blkiocg_empty_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, }, { .name = "dequeue", - .read_map = blkiocg_dequeue_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..c8de2598429d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,10 @@ #include +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ +}; + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP @@ -65,6 +69,25 @@ enum blkg_state_flags { BLKG_empty, }; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -112,6 +135,8 @@ struct blkio_group { char path[128]; /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -122,6 +147,10 @@ struct blkio_policy_node { struct list_head node; dev_t dev; unsigned int weight; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; }; extern unsigned int blkcg_get_weight(struct 
blkio_cgroup *blkcg, @@ -139,6 +168,7 @@ struct blkio_policy_ops { struct blkio_policy_type { struct list_head list; struct blkio_policy_ops ops; + enum blkio_policy_id plid; }; /* Blkio controller policy registration */ @@ -212,7 +242,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev); + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -234,7 +265,8 @@ static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f7dfef..b9f86190763b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4013,6 +4013,7 @@ static struct blkio_policy_type blkio_policy_cfq = { .blkio_unlink_group_fn = cfq_unlink_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, + .plid = BLKIO_POLICY_PROP, }; #else static struct blkio_policy_type blkio_policy_cfq; diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..54a6d90f8e8c 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev); + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); } static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) From 4c9eefa16c6f124ffcc736cb719b24ea27f85017 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:34 -0400 Subject: [PATCH 13/39] blk-cgroup: Introduce cgroup changes for throttling policy o cgroup chagnes for throttle policy. o Introduces READ and WRITE bytes per second throttling rules. 
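Example usage of the new rule files (a minimal sketch; the /cgroup/blkio mount point and the 8:16 device numbers are placeholders, not part of this patch). Rules are written as "major:minor value", and writing a value of 0 deletes the rule for that device:

  /* Hypothetical userspace sketch: set a 1 MB/s read limit on device 8:16,
   * then delete the rule again by writing 0.  Assumes the blkio controller
   * is mounted at /cgroup/blkio. */
  #include <stdio.h>

  int main(void)
  {
          const char *path = "/cgroup/blkio/blkio.throttle.read_bps_device";
          FILE *f;

          f = fopen(path, "w");
          if (!f)
                  return 1;
          fprintf(f, "8:16 %llu\n", 1048576ULL); /* "major:minor bytes_per_sec" */
          fclose(f);

          f = fopen(path, "w");
          if (!f)
                  return 1;
          fprintf(f, "8:16 0\n");                /* value 0 removes the rule */
          fclose(f);
          return 0;
  }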
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 145 ++++++++++++++++++++++++++++++++++++++++++--- block/blk-cgroup.h | 30 +++++++++- 2 files changed, 167 insertions(+), 8 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7762987fdf9e..aae8c930a6f8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -128,6 +128,27 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) } } +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -612,6 +633,7 @@ static int blkio_policy_parse_and_set(char *buf, unsigned long major, minor, temp; int i = 0; dev_t dev; + u64 bps; memset(s, 0, sizeof(s)); @@ -667,7 +689,16 @@ static int blkio_policy_parse_and_set(char *buf, newpn->plid = plid; newpn->fileid = fileid; - newpn->weight = temp; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; break; default: BUG(); @@ -684,18 +715,45 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); if (pn) - return pn->weight; + return pn->val.weight; else return blkcg->weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { switch(pn->plid) { case BLKIO_POLICY_PROP: - if (pn->weight == 0) + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + if (pn->val.bps == 0) return 1; break; default: @@ -710,7 +768,10 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, { switch(oldpn->plid) { case BLKIO_POLICY_PROP: - oldpn->weight = newpn->weight; + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + oldpn->val.bps = newpn->val.bps; break; default: BUG(); @@ -725,13 +786,23 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { unsigned int weight; + u64 bps; switch(pn->plid) { case BLKIO_POLICY_PROP: - weight = pn->weight ? pn->weight : + weight = pn->val.weight ? 
pn->val.weight : blkcg->weight; blkio_update_group_weight(blkg, weight); break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + } + break; default: BUG(); } @@ -826,7 +897,17 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) case BLKIO_POLICY_PROP: if (pn->fileid == BLKIO_PROP_weight_device) seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + if (pn->fileid == BLKIO_THROTL_read_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else if (pn->fileid == BLKIO_THROTL_write_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else + BUG(); break; default: BUG(); @@ -869,6 +950,16 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; default: BUG(); } @@ -959,7 +1050,18 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; - + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + default: + BUG(); + } + break; default: BUG(); } @@ -1052,6 +1154,23 @@ struct cftype blkio_files[] = { .read_u64 = blkiocg_file_read_u64, .write_u64 = blkiocg_file_write_u64, }, + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, @@ -1070,12 +1189,24 @@ struct cftype blkio_files[] = { BLKIO_PROP_io_service_bytes), .read_map = blkiocg_file_read_map, }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, { .name = "io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_serviced), .read_map = blkiocg_file_read_map, }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, { .name = "io_service_time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c8de2598429d..1b738827b2f6 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ }; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) @@ -88,6 +89,14 @@ enum blkcg_file_name_prop { BLKIO_PROP_dequeue, }; +/* cgroup files owned by throttle policy */ +enum 
blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -146,23 +155,42 @@ struct blkio_group { struct blkio_policy_node { struct list_head node; dev_t dev; - unsigned int weight; /* This node belongs to max bw policy or porportional weight policy */ enum blkio_policy_id plid; /* cgroup file to which this rule belongs to */ int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + } val; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, + u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, + u64 write_bps); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; }; struct blkio_policy_type { From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:35 -0400 Subject: [PATCH 14/39] blkio: Core implementation of throttle policy o Actual implementation of throttling policy in block layer. Currently it implements READ and WRITE bytes per second throttling logic. IOPS throttling comes in later patches. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 24 + block/blk-throttle.c | 909 ++++++++++++++++++++++++++++++++++++++ include/linux/blk_types.h | 3 + include/linux/blkdev.h | 24 + init/Kconfig | 9 +- 7 files changed, 979 insertions(+), 3 deletions(-) create mode 100644 block/blk-throttle.c diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee1..6c9213ef15a1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. 
+ endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 0bb499a739cd..c850d5ef80a2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-core.c b/block/blk-core.c index 8d07c1b7e701..797d5095eb83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); @@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } + blk_throtl_bio(q, &bio); + + /* + * If bio = NULL, bio has been throttled and will be submitted + * later. + */ + if (!bio) + break; + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..4b492011e0de --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,909 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. 
+ */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups (excluding root group) + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. 
+ */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); 
+} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, bytes_trim, time_elapsed; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + + bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. 
+ */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + + if (wait) + *wait = jiffy_wait; + + if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) + throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* + * If group is already on active tree, then update dispatch time + * only if it is lesser than existing dispatch time. 
Otherwise + * always update the dispatch time + */ + + if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) + return; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. 
+ */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, + u64 read_bps) +{ + tg_of_blkg(blkg)->bps[READ] = read_bps; +} + +static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, + u64 write_bps) +{ + tg_of_blkg(blkg)->bps[WRITE] = write_bps; +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + }, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + */ + update_disptime = false; + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" + " queued=%d/%d", rw == READ ? 
'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + atomic_set(&tg->ref, 1); + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + blkiocg_del_blkio_group(&td->root_tg.blkg); + + /* If there are other groups */ + if (td->nr_undestroyed_grps >= 1) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ca83a97c9715..10a0c291b55a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -130,6 +130,8 @@ enum rq_flag_bits { /* bio only flags */ __REQ_UNPLUG, /* unplug the immediately after submission */ __REQ_RAHEAD, /* read ahead, can fail anytime */ + __REQ_THROTTLED, /* This bio has already been subjected to + * throttling rules. Don't do it again. 
*/ /* request only flags */ __REQ_SORTED, /* elevator knows about this request */ @@ -172,6 +174,7 @@ enum rq_flag_bits { #define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_RAHEAD (1 << __REQ_RAHEAD) +#define REQ_THROTTLED (1 << __REQ_THROTTLED) #define REQ_SORTED (1 << __REQ_SORTED) #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 780824edac16..1341df5806df 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -371,6 +371,11 @@ struct request_queue #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; #endif + +#ifdef CONFIG_BLK_DEV_THROTTLING + /* Throttle data */ + struct throtl_data *td; +#endif }; #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ @@ -1131,6 +1136,7 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* @@ -1174,6 +1180,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) } #endif +#ifdef CONFIG_BLK_DEV_THROTTLING +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay); +extern void throtl_shutdown_timer_wq(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) +{ + return 0; +} + +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline int blk_throtl_exit(struct request_queue *q) { return 0; } +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {} +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ diff --git a/init/Kconfig b/init/Kconfig index 2de5b1cbadd9..950ba26f7233 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -634,11 +634,14 @@ config BLK_CGROUP Currently, CFQ IO scheduler uses it to recognize task groups and control disk bandwidth allocation (proportional time slice allocation) - to such task groups. + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. This option only enables generic Block IO controller infrastructure. - One needs to also enable actual IO controlling logic in CFQ for it - to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ seti + CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set + CONFIG_BLK_THROTTLE=y. See Documentation/cgroups/blkio-controller.txt for more information. From 7702e8f45b0a3bb262b9366c60beb5445758d94c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:36 -0400 Subject: [PATCH 15/39] blk-cgroup: cgroup changes for IOPS limit support o cgroup changes for IOPS throttling rules. 
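Example usage of the new IOPS files (a sketch; the /cgroup/blkio mount point and the 8:16 device are placeholders): write a rule of the form "major:minor io_per_sec", then read the file back, which prints one tab-separated "major:minor<TAB>iops" line per configured rule:

  /* Hypothetical userspace sketch: limit device 8:16 to 100 read IOPS and
   * read the configured rule back.  Assumes the blkio controller is mounted
   * at /cgroup/blkio. */
  #include <stdio.h>

  int main(void)
  {
          const char *path = "/cgroup/blkio/blkio.throttle.read_iops_device";
          char line[64];
          FILE *f;

          f = fopen(path, "w");
          if (!f)
                  return 1;
          fprintf(f, "8:16 100\n");              /* "major:minor io_per_sec" */
          fclose(f);

          f = fopen(path, "r");
          if (!f)
                  return 1;
          while (fgets(line, sizeof(line), f))
                  fputs(line, stdout);           /* e.g. "8:16\t100" */
          fclose(f);
          return 0;
  }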
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 139 +++++++++++++++++++++++++++++++++++++++------ block/blk-cgroup.h | 13 +++++ 2 files changed, 135 insertions(+), 17 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index aae8c930a6f8..20ce6f584e43 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -149,6 +149,27 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, } } +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -630,7 +651,7 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp; + unsigned long major, minor, temp, iops; int i = 0; dev_t dev; u64 bps; @@ -692,13 +713,28 @@ static int blkio_policy_parse_and_set(char *buf, newpn->val.weight = temp; break; case BLKIO_POLICY_THROTL: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = bps; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + ret = strict_strtoul(s[1], 10, &iops); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = iops; + break; + } break; default: BUG(); @@ -744,6 +780,29 @@ uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) return -1; } +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { @@ -753,8 +812,17 @@ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) return 1; break; case BLKIO_POLICY_THROTL: - if (pn->val.bps == 0) - return 1; + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } break; default: BUG(); @@ -771,7 +839,15 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, 
oldpn->val.weight = newpn->val.weight; break; case BLKIO_POLICY_THROTL: - oldpn->val.bps = newpn->val.bps; + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } break; default: BUG(); @@ -785,7 +861,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { - unsigned int weight; + unsigned int weight, iops; u64 bps; switch(pn->plid) { @@ -801,6 +877,11 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, bps = pn->val.bps ? pn->val.bps : (-1); blkio_update_group_bps(blkg, bps, pn->fileid); break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; } break; default: @@ -900,14 +981,18 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) MINOR(pn->dev), pn->val.weight); break; case BLKIO_POLICY_THROTL: - if (pn->fileid == BLKIO_THROTL_read_bps_device) + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), MINOR(pn->dev), pn->val.bps); - else if (pn->fileid == BLKIO_THROTL_write_bps_device) - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); - else - BUG(); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } break; default: BUG(); @@ -954,6 +1039,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_read_bps_device: case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: blkio_read_policy_node_files(cft, blkcg, m); return 0; default: @@ -1171,6 +1258,24 @@ struct cftype blkio_files[] = { .write_string = blkiocg_file_write, .max_write_len = 256, }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1b738827b2f6..2070053a30b1 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -93,6 +93,8 @@ enum blkcg_file_name_prop { enum blkcg_file_name_throtl { BLKIO_THROTL_read_bps_device, BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, BLKIO_THROTL_io_service_bytes, BLKIO_THROTL_io_serviced, }; @@ -168,6 +170,7 @@ struct blkio_policy_node { * by file type "fileid". 
*/ u64 bps; + unsigned int iops; } val; }; @@ -177,6 +180,10 @@ extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev); extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, @@ -185,12 +192,18 @@ typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, u64 read_bps); typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, + unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, + unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; }; struct blkio_policy_type { From 8e89d13f4ede2467629a971618537430fafaaea3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:37 -0400 Subject: [PATCH 16/39] blkio: Implementation of IOPS limit logic o core logic of implementing IOPS throttling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 164 +++++++++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 37 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4b492011e0de..af53f37c1b13 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -59,8 +59,13 @@ struct throtl_grp { /* bytes per second rate limits */ uint64_t bps[2]; + /* IOPS limits */ + unsigned int iops[2]; + /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; /* When did we start a new slice */ unsigned long slice_start[2]; @@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); hlist_add_head(&tg->tg_node, &td->tg_list); td->nr_undestroyed_grps++; @@ -335,6 +342,7 @@ static inline void throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed; + unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) return; bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + io_trim = (tg->iops[rw] 
* throtl_slice * nr_slices)/HZ; - if (!bytes_trim) + if (!bytes_trim && !io_trim) return; if (tg->bytes_disp[rw] >= bytes_trim) @@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) else tg->bytes_disp[rw] = 0; + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } -/* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched - */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - /* - * Currently whole state machine of group depends on first bio - * queued in the group bio list. So one should not be calling - * this function with a different bio if there are other bios - * queued. - */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1) { + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; return 1; } - /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. - */ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); - else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); - } + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) *wait = jiffy_wait; + return 0; +} - if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) - throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); +/* + * Returns whether one can dispatch a bio or not. 
Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); return 0; } @@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; /* * TODO: This will take blkg->stats_lock. Figure out a way * to avoid this cost. */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); - } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, tg_of_blkg(blkg)->bps[WRITE] = write_bps; } +static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, + unsigned int read_iops) +{ + tg_of_blkg(blkg)->iops[READ] = read_iops; +} + +static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, + unsigned int write_iops) +{ + tg_of_blkg(blkg)->iops[WRITE] = write_iops; +} + void throtl_shutdown_timer_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = { throtl_update_blkio_group_read_bps, .blkio_update_group_write_bps_fn = throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, }, + .plid = BLKIO_POLICY_THROTL, }; int blk_throtl_bio(struct request_queue *q, struct bio **biop) @@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" - " queued=%d/%d", rw == READ ? 'R' : 'W', + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 
'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); @@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; atomic_set(&tg->ref, 1); INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); From 2786c4e5e54802c34297e55050fef3e862a27b3f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:38 -0400 Subject: [PATCH 17/39] blkio: Documentation Update o Documentation update Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 106 ++++++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 48e0b21b0059..2f5613fbe680 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -8,12 +8,17 @@ both at leaf nodes as well as at intermediate nodes in a storage hierarchy. Plan is to use the same cgroup based management interface for blkio controller and based on user options switch IO policies in the background. -In the first phase, this patchset implements proportional weight time based -division of disk policy. It is implemented in CFQ. Hence this policy takes -effect only on leaf nodes when CFQ is being used. +Currently two IO control policies are implemented. First one is proportional +weight time based division of disk policy. It is implemented in CFQ. Hence +this policy takes effect only on leaf nodes when CFQ is being used. The second +one is throttling policy which can be used to specify upper IO rate limits +on devices. This policy is implemented in generic block layer and can be +used on leaf nodes as well as higher level logical devices like device mapper. HOWTO ===== +Proportional Weight division of bandwidth +----------------------------------------- You can do a very simple testing of running two dd threads in two different cgroups. Here is what you can do. @@ -55,6 +60,35 @@ cgroups. Here is what you can do. group dispatched to the disk. We provide fairness in terms of disk time, so ideally io.disk_time of cgroups should be in proportion to the weight. +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller + mount -t cgroup -o blkio none /cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is ": ". + + echo "8:16 1048576" > /cgroup/blkio/blkio.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16. + +- Run dd to read a file and see if rate is throttled to 1MB/s or not. + + # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + # iflag=direct + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.write_bps_device file. + Various user visible config options =================================== CONFIG_BLK_CGROUP @@ -68,8 +102,13 @@ CONFIG_CFQ_GROUP_IOSCHED - Enables group scheduling in CFQ. Currently only 1 level of group creation is allowed. +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. 
+ Details of cgroup files ======================= +Proportional weight policy files +-------------------------------- - blkio.weight - Specifies per cgroup weight. This is default weight of the group on all the devices until and unless overridden by per device rule. @@ -210,6 +249,67 @@ Details of cgroup files and minor number of the device and third field specifies the number of times a group was dequeued from a particular device. +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per deivce. Following is + the format. + + echo ": " > /cgrp/blkio.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per deivce. Following is + the format. + + echo ": " > /cgrp/blkio.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per deivce. Following is + the format. + + echo ": " > /cgrp/blkio.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in io per second. Rules are per deivce. Following is + the format. + + echo ": " > /cgrp/blkio.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjectd to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) completed to/from the disk by the group (as + seen by throttling policy). These are further divided by the type + of operation - read or write, sync or async. First two fields specify + the major and minor number of the device, third field specifies the + operation type and the fourth field specifies the number of IOs. + + blkio.io_serviced does accounting as seen by CFQ and counts are in + number of requests (struct request). On the other hand, + blkio.throttle.io_serviced counts number of IO in terms of number + of bios as seen by throttling policy. These bios can later be + merged by elevator and total number of requests completed can be + lesser. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + + These numbers should roughly be same as blkio.io_service_bytes as + updated by CFQ. The difference between two is that + blkio.io_service_bytes will not be updated if CFQ is not operating + on request queue. + +Common files among various policies +----------------------------------- - blkio.reset_stats - Writing an int to this file will result in resetting all the stats for that cgroup. From 01ea50638bc04ca5259f5711fcdedefcdde1cf43 Mon Sep 17 00:00:00 2001 From: "Signed-off-by: Jan Kara" Date: Thu, 16 Sep 2010 20:36:36 +0200 Subject: [PATCH 18/39] block: Fix race during disk initialization When a new disk is being discovered, add_disk() first ties the bdev to gendisk (via register_disk()->blkdev_get()) and only after that calls bdi_register_bdev(). 
Because register_disk() also creates disk's kobject, it can happen that userspace manages to open and modify the device's data (or inode) before its BDI is properly initialized leading to a warning in __mark_inode_dirty(). Fix the problem by registering BDI early enough. This patch addresses https://bugzilla.kernel.org/show_bug.cgi?id=16312 Cc: stable@kernel.org Reported-by: Larry Finger Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5c9c503de423..7923e720ddf5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); From 6d0aed7a38d06284db2a0e46c0a072b0c1c3299b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Sep 2010 10:00:46 +0200 Subject: [PATCH 19/39] do_mounts: only enable PARTUUID for CONFIG_BLOCK When CONFIG_BLOCK is not enabled: init/do_mounts.c:71: error: implicit declaration of function 'dev_to_part' init/do_mounts.c:71: warning: initialization makes pointer from integer without a cast init/do_mounts.c:73: error: dereferencing pointer to incomplete type init/do_mounts.c:76: error: dereferencing pointer to incomplete type init/do_mounts.c:76: error: dereferencing pointer to incomplete type init/do_mounts.c:102: error: implicit declaration of function 'part_pack_uuid' init/do_mounts.c:104: error: 'block_class' undeclared (first use in this function) Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- init/do_mounts.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/init/do_mounts.c b/init/do_mounts.c index b7fc83994f39..42db0551c3aa 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -58,6 +58,7 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); +#ifdef CONFIG_BLOCK /** * match_dev_by_uuid - callback for finding a partition using its uuid * @dev: device passed in by the caller @@ -111,6 +112,7 @@ static dev_t __init devt_from_partuuid(char *uuid_str) done: return res; } +#endif /* * Convert a name into device number. We accept the following variants: @@ -138,6 +140,7 @@ dev_t name_to_dev_t(char *name) dev_t res = 0; int part; +#ifdef CONFIG_BLOCK if (strncmp(name, "PARTUUID=", 9) == 0) { name += 9; if (strlen(name) != 36) @@ -147,6 +150,7 @@ dev_t name_to_dev_t(char *name) goto fail; goto done; } +#endif if (strncmp(name, "/dev/", 5) != 0) { unsigned maj, min; From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 20 Sep 2010 15:24:50 +0200 Subject: [PATCH 20/39] cfq: improve fsync performance for small files Fsync performance for small files achieved by cfq on high-end disks is lower than what deadline can achieve, due to idling introduced between the sync write happening in process context and the journal commit. Moreover, when competing with a sequential reader, a process writing small files and fsync-ing them is starved. 
This patch fixes the two problems by: - marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE flag set, - force all queues that have REQ_NOIDLE requests to be put in the noidle tree. Having the queue associated to the fsync-ing process and the one associated to journal commits in the noidle tree allows: - switching between them without idling, - fairness vs. competing idling queues, since they will be serviced only after the noidle tree expires its slice. Acked-by: Vivek Goyal Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ++++-------------- fs/jbd/commit.c | 2 +- fs/jbd2/commit.c | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9f86190763b..684592621736 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -216,7 +216,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. 
- * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 95d8c11c929e..3f030e9efea6 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -318,7 +318,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7c068c189d80..80910f51d4b4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -360,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for From 4b1977698ceb4c4caa800d475127139da49966f9 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 24 Sep 2010 09:51:13 -0400 Subject: [PATCH 21/39] block: Prevent hang_check firing during long I/O During long I/O operations, the hang_check timer may fire, trigger stack dumps that unnecessarily alarm the user. Eg. hdparm --security-erase NULL /dev/sdb ## can take *hours* to complete So, if hang_check is armed, we should wake up periodically to prevent it from triggering. This patch uses a wake-up interval equal to half the hang_check timer period, which keeps overhead low enough. Signed-off-by: Mark Lord Signed-off-by: Jens Axboe --- block/blk-exec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..cf1456a02acd 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, DECLARE_COMPLETION_ONSTACK(wait); char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; + unsigned long hang_check; /* * we need an extra reference to the request, so we can look at @@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion(&wait); if (rq->errors) err = -EIO; From c49825facfd4969585224a896a5e717f88450cad Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 24 Sep 2010 20:25:49 +0200 Subject: [PATCH 22/39] block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). 
Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf20613..f47af5031eaa 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,16 +214,14 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; - q->limits.bounce_pfn = b_pfn; #endif + q->limits.bounce_pfn = b_pfn; if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); From e4ecda1b60bfd2333c12bbe71b153d3b6bdc831a Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Sat, 25 Sep 2010 11:17:22 +0200 Subject: [PATCH 23/39] Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK Ensure that 'sysctl_hung_task_timeout_secs' is defined even when CONFIG_DETECT_HUNG_TASK is not set. This way we can safely reference it without need for ifdefs in the code elsewhere. eg. in block/blk-exec.c Signed-off-by: Mark Lord Signed-off-by: Jens Axboe --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..dbafa9e34a2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -336,6 +336,9 @@ extern unsigned long sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; #endif /* Attach to any functions which should be ignored in wchan output. */ From 260a67a9e534f0c7d49ddd6451833d54ba39ac81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Oct 2010 14:42:43 +0200 Subject: [PATCH 24/39] block: revert bad fix for memory hotplug causing bounces Revert "block: set the bounce_pfn to the actual DMA limit rather than to max memory" This reverts commit c49825facfd4969585224a896a5e717f88450cad. Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f47af5031eaa..f8f2ddf20613 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,14 +214,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; + q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; -#endif q->limits.bounce_pfn = b_pfn; +#endif if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); From efb012b361cf9319cd86ff169afa7550b7aa9336 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 1 Oct 2010 14:45:27 +0200 Subject: [PATCH 25/39] block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). 
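For illustration only (not part of the patch): a minimal userspace sketch of why basing the bounce limit on the boot-time max_low_pfn misclassifies hot-added pages, and how taking the larger of max_low_pfn and the DMA-mask-derived pfn avoids the spurious bouncing. The pfn values below are made-up example numbers, not kernel defaults.

/* Illustrative userspace sketch (not kernel code). */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

static int needs_bounce(uint64_t page_pfn, uint64_t bounce_pfn)
{
        /* A page above the queue's bounce_pfn must be bounced. */
        return page_pfn > bounce_pfn;
}

int main(void)
{
        uint64_t dma_mask = 0xffffffffffffffffULL;  /* 64-bit capable HBA */
        uint64_t b_pfn = dma_mask >> PAGE_SHIFT;
        uint64_t boot_max_low_pfn = 0x400000;       /* ~16 GB present at boot (assumed) */
        uint64_t hotplug_pfn = 0x800000;            /* page hot-added later, above boot RAM */

        uint64_t old_limit = boot_max_low_pfn;      /* pre-patch behaviour */
        uint64_t new_limit = b_pfn > boot_max_low_pfn ? b_pfn : boot_max_low_pfn;

        printf("old: bounce=%d  new: bounce=%d\n",
               needs_bounce(hotplug_pfn, old_limit),
               needs_bounce(hotplug_pfn, new_limit));
        return 0;
}

With the old limit the hot-added page looks like it needs a bounce buffer even though the controller can address it; with max(max_low_pfn, b_pfn) it does not, while devices with a small DMA mask keep bouncing as before.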
Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf20613..a3600a7ab8bb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else if (b_pfn < blk_max_low_pfn) dma = 1; From 13f98250f587b7defa39ed738dfa74b600e46e7b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:41 +0200 Subject: [PATCH 26/39] blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n Currently throttling related files were visible even if user had disabled throttling using config options. It was switching off background throttling of bio but not the cgroup files. This patch fixes it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 107 +++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 20ce6f584e43..86e7066a0b62 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1241,6 +1241,59 @@ struct cftype blkio_files[] = { .read_u64 = blkiocg_file_read_u64, .write_u64 = blkiocg_file_write_u64, }, + { + .name = "time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "sectors", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_wait_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_merged", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_queued", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, + }, +#ifdef CONFIG_BLK_DEV_THROTTLING { .name = "throttle.read_bps_device", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, @@ -1276,70 +1329,20 @@ struct cftype blkio_files[] = { .write_string = blkiocg_file_write, .max_write_len = 256, }, - { - .name = "time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "sectors", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_sectors), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, { .name = "throttle.io_service_bytes", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_io_service_bytes), .read_map = blkiocg_file_read_map, }, - { - .name = 
"io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_serviced), - .read_map = blkiocg_file_read_map, - }, { .name = "throttle.io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_io_serviced), .read_map = blkiocg_file_read_map, }, - { - .name = "io_service_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_wait_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_wait_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_merged", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_merged), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_queued", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_queued), - .read_map = blkiocg_file_read_map, - }, - { - .name = "reset_stats", - .write_u64 = blkiocg_reset_stats, - }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", From 61014e96e6ed55b8db0af31574eec2a75d4e8755 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:44 +0200 Subject: [PATCH 27/39] blkio: deletion of a cgroup was causes oops o Now a cgroup list of blkg elements can contain blkg from multiple policies. Before sending an unlink event, make sure blkg belongs to they policy. If policy does not own the blkg, do not send update for this blkg. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 86e7066a0b62..b06ca70354e3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1411,13 +1411,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* * This blkio_group is being unlinked as associated cgroup is * going away. Let all the IO controlling policies know about - * this event. Currently this is static call to one io - * controlling policy. Once we have more policies in place, we - * need some dynamic registration of callback function. + * this event. */ spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; blkiop->ops.blkio_unlink_group_fn(key, blkg); + } spin_unlock(&blkio_list_lock); } while (1); From 02977e4af7ed3b478c505e50491ffdf3e1314cf4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:48 +0200 Subject: [PATCH 28/39] blkio: Add root group to td->tg_list o Currently all the dynamically allocated groups, except root grp is added to td->tg_list. This was not a problem so far but in next patch I will travel through td->tg_list to process any updates of limits on the group. If root group is not in tg_list, then root group's updates are not processed. o It is better to root group also to tg_list instead of doing special processing for it during limit updates. 
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index af53f37c1b13..bc2936b80add 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -87,7 +87,7 @@ struct throtl_data unsigned int nr_queued[2]; /* - * number of total undestroyed groups (excluding root group) + * number of total undestroyed groups */ unsigned int nr_undestroyed_grps; @@ -940,7 +940,17 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; - atomic_set(&tg->ref, 1); + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); @@ -966,10 +976,9 @@ void blk_throtl_exit(struct request_queue *q) spin_lock_irq(q->queue_lock); throtl_release_tgs(td); - blkiocg_del_blkio_group(&td->root_tg.blkg); /* If there are other groups */ - if (td->nr_undestroyed_grps >= 1) + if (td->nr_undestroyed_grps > 0) wait = true; spin_unlock_irq(q->queue_lock); From fe0714377ee2ca161bf2afb7773e22f15f1786d4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:49 +0200 Subject: [PATCH 29/39] blkio: Recalculate the throttled bio dispatch time upon throttle limit change o Currently any cgroup throttle limit changes are processed asynchronousy and the change does not take affect till a new bio is dispatched from same group. o It might happen that a user sets a redicuously low limit on throttling. Say 1 bytes per second on reads. In such cases simple operations like mount a disk can wait for a very long time. o Once bio is throttled, there is no easy way to come out of that wait even if user increases the read limit later. o This patch fixes it. Now if a user changes the cgroup limits, we recalculate the bio dispatch time according to new limits. o Can't take queueu lock under blkcg_lock, hence after the change I wake up the dispatch thread again which recalculates the time. So there are some variables being synchronized across two threads without lock and I had to make use of barriers. Hoping I have used barriers correctly. Any review of memory barrier code especially will help. 
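To make the intended ordering easier to review, here is a minimal userspace sketch of the contract the patch relies on, written with C11 release/acquire in place of the kernel's smp_wmb()/smp_rmb() and the barriers around the atomic counter. The names (grp_limit_changed, limits_changed) are illustrative, it is single-threaded, and the locking and work-queue details are omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool grp_limit_changed;          /* per-group flag (plain store) */
static atomic_int limits_changed;       /* global "something changed" count */

static void writer_update_limit(void)
{
        grp_limit_changed = true;
        /* Publish the per-group flag before bumping the counter
         * (smp_wmb() + smp_mb__before_atomic_inc() in the patch). */
        atomic_fetch_add_explicit(&limits_changed, 1, memory_order_release);
        /* ...then kick the dispatch work. */
}

static void worker_process_limit_change(void)
{
        /* Read the counter first; the acquire pairs with the release
         * above, so seeing the increment implies seeing the flag. */
        if (atomic_load_explicit(&limits_changed, memory_order_acquire) == 0)
                return;

        if (grp_limit_changed) {
                printf("recomputing dispatch time for group\n");
                grp_limit_changed = false;
        }
        atomic_fetch_sub_explicit(&limits_changed, 1, memory_order_release);
}

int main(void)
{
        writer_update_limit();
        worker_process_limit_change();
        return 0;
}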
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 +++-- block/blk-cgroup.h | 21 +++---- block/blk-throttle.c | 134 +++++++++++++++++++++++++++++++++++++------ block/cfq-iosched.c | 4 +- 4 files changed, 139 insertions(+), 35 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b06ca70354e3..52c12130a5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -124,7 +124,8 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) if (blkiop->plid != blkg->plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); } } @@ -141,11 +142,13 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); } } @@ -162,11 +165,13 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg, if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2070053a30b1..034c35562dba 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,16 +186,17 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, - u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, - u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, - unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, - unsigned int write_iops); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bc2936b80add..11713ed852f4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,9 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + /* Some throttle limits got 
updated for the group */ + bool limits_changed; }; struct throtl_data @@ -93,6 +96,8 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; + + atomic_t limits_changed; }; enum tg_state_flags { @@ -592,15 +597,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; - /* - * If group is already on active tree, then update dispatch time - * only if it is lesser than existing dispatch time. Otherwise - * always update the dispatch time - */ - - if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) - return; - /* Update dispatch time */ throtl_dequeue_tg(td, tg); tg->disptime = disptime; @@ -691,6 +687,46 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + /* Dispatch throttled bios. Should be called without queue lock held. */ static int throtl_dispatch(struct request_queue *q) { @@ -701,6 +737,8 @@ static int throtl_dispatch(struct request_queue *q) spin_lock_irq(q->queue_lock); + throtl_process_limit_change(td); + if (!total_nr_queued(td)) goto out; @@ -821,28 +859,74 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } -static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, - u64 read_bps) +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. 
+ */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, - u64 write_bps) +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, - unsigned int read_iops) +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, - unsigned int write_iops) +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } void throtl_shutdown_timer_wq(struct request_queue *q) @@ -886,8 +970,14 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. */ - update_disptime = false; + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + goto queue_bio; } @@ -929,6 +1019,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); /* Init root group */ tg = &td->root_tg; @@ -996,6 +1087,13 @@ void blk_throtl_exit(struct request_queue *q) */ if (wait) synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. 
+ */ + throtl_shutdown_timer_wq(q); throtl_td_free(td); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 684592621736..86338d5d4d09 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -951,8 +951,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } From 3aad5d3ee4e4fce8f4b5bb6ca73342dcade42b33 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:51:14 +0200 Subject: [PATCH 30/39] blkio-throttle: Fix link failure failure on i386 o Randy Dunlap reported following linux-next failure. This patch fixes it. on i386: blk-throttle.c:(.text+0x1abb8): undefined reference to `__udivdi3' blk-throttle.c:(.text+0x1b1dc): undefined reference to `__udivdi3' o bytes_per_second interface is 64bit and I was continuing to do 64 bit division even on 32bit platform without help of special macros/functions hence the failure. Signed-off-by: Vivek Goyal Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 11713ed852f4..a46700255719 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -378,7 +378,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -396,8 +397,10 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (!nr_slices) return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; - bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; if (!bytes_trim && !io_trim) @@ -415,7 +418,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" " start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); @@ -462,7 +465,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -473,8 +476,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); + do_div(tmp, MSEC_PER_SEC); + bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { if (wait) From 5e901a2b95db709c5e40599ff4df6029be1e2a12 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:38 +0200 Subject: [PATCH 31/39] blkio-throttle: There is no need to convert jiffies to milli seconds o Do not convert jiffies to mili seconds as it is not required. Just work with jiffies and HZ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a46700255719..c1bc1b6c887a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -439,8 +439,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) @@ -476,8 +475,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); - do_div(tmp, MSEC_PER_SEC); + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { From 9355aede5a3c4975e0ba8bbfe2b9d1fd73308916 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:41 +0200 Subject: [PATCH 32/39] blkio-throttle: limit max iops value to UINT_MAX - Limit max iops value to UINT_MAX and return error to user if value is more than that instead of accepting bigger values and truncating implicitly. 
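For reference, an equivalent userspace sketch of the new bounds check: parse the value as 64-bit first, then reject anything above UINT_MAX rather than letting a 32-bit parse silently wrap. parse_iops() is an illustrative helper written for this example, not a kernel function.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_iops(const char *s, unsigned int *out)
{
        char *end;
        unsigned long long v;

        errno = 0;
        v = strtoull(s, &end, 10);
        if (errno || *end != '\0')
                return -EINVAL;
        if (v > UINT_MAX)               /* THROTL_IOPS_MAX in the patch */
                return -EINVAL;
        *out = (unsigned int)v;
        return 0;
}

int main(void)
{
        unsigned int iops;

        /* 2^32: previously truncated on 32-bit, now rejected. */
        printf("rc=%d\n", parse_iops("4294967296", &iops));
        printf("rc=%d iops=%u\n", parse_iops("1000", &iops), iops);
        return 0;
}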
Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 +++++++---- block/blk-cgroup.h | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52c12130a5de..0f59b23096db 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -656,10 +656,10 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp, iops; + unsigned long major, minor, temp; int i = 0; dev_t dev; - u64 bps; + u64 bps, iops; memset(s, 0, sizeof(s)); @@ -731,13 +731,16 @@ static int blkio_policy_parse_and_set(char *buf, break; case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - ret = strict_strtoul(s[1], 10, &iops); + ret = strict_strtoull(s[1], 10, &iops); if (ret) return -EINVAL; + if (iops > THROTL_IOPS_MAX) + return -EINVAL; + newpn->plid = plid; newpn->fileid = fileid; - newpn->val.iops = iops; + newpn->val.iops = (unsigned int)iops; break; } break; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 034c35562dba..ea4861bdd549 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -20,6 +20,9 @@ enum blkio_policy_id { BLKIO_POLICY_THROTL, /* Throttling */ }; +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP From c49c06e4960949a9bced708858433fcf6ca36a9c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:42 +0200 Subject: [PATCH 33/39] blkio-throttle: Fix possible multiplication overflow in iops calculations o User can specify max iops value of 32bit (UINT_MAX), through cgroup interface. If a user has specified say 4294967294 (UNIT_MAX - 2), then on 32bit platform, following multiplication can overflow. io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) o Explicitly cast the multiplication to 64bit and then perform division and then check whether result is still great then UNINT_MAX. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bc1b6c887a..56ad4531b412 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -430,6 +430,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -439,7 +440,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) From 892b6f90db81cccb723d5d92f4fddc2d68b206e1 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Wed, 13 Oct 2010 21:18:03 +0200 Subject: [PATCH 34/39] block: Ensure physical block size is unsigned int Physical block size was declared unsigned int to accomodate the maximum size reported by READ CAPACITY(16). Make sure we use the right type in the related functions. Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- include/linux/blkdev.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index a3600a7ab8bb..315b88c8cbbb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1341df5806df..8f3dd981b973 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -860,7 +860,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); @@ -1013,7 +1013,7 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q) return q->limits.physical_block_size; } -static inline int bdev_physical_block_size(struct block_device *bdev) +static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } From e817bf3f68f55e7307c3e9abe5f32d0c05c83988 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 15 Oct 2010 15:49:18 +0200 Subject: [PATCH 35/39] block: Fix double free in blk_integrity_unregister Commit 3839e4b introduced a kobject_put but failed to remove the kmem_cache_free beneath it, leading to a double free. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 885cbb59967e..54bcba6c02a7 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -432,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); From 495d2b3883682fcd1c3dee3a45e38fd00154ae25 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 15 Oct 2010 15:49:20 +0200 Subject: [PATCH 36/39] block: Make the integrity mapped property a bio flag Previously we tracked whether the integrity metadata had been remapped using a request flag. This was fine for low-level retries. However, if an I/O was redriven by upper layers we would end up remapping again, causing the retry to fail. 
Deprecate the REQ_INTEGRITY flag and introduce BIO_MAPPED_INTEGRITY which enables filesystems to notify lower layers that the bio in question has already been remapped. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/sd_dif.c | 11 ++++++----- include/linux/blk_types.h | 3 +-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 84be62149c6c..0cb39ff21171 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -375,21 +375,20 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s unsigned int i, j; u32 phys, virt; - /* Already remapped? */ - if (rq->cmd_flags & REQ_INTEGRITY) - return 0; - sdkp = rq->bio->bi_bdev->bd_disk->private_data; if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) return 0; - rq->cmd_flags |= REQ_INTEGRITY; phys = hw_sector & 0xffffffff; __rq_for_each_bio(bio, rq) { struct bio_vec *iv; + /* Already remapped? */ + if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) + break; + virt = bio->bi_integrity->bip_sector & 0xffffffff; bip_for_each_vec(iv, bio->bi_integrity, i) { @@ -408,6 +407,8 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s kunmap_atomic(sdt, KM_USER0); } + + bio->bi_flags |= BIO_MAPPED_INTEGRITY; } return 0; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 10a0c291b55a..d36629620a4f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -97,6 +97,7 @@ struct bio { #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ #define BIO_QUIET 11 /* Make BIO Quiet */ +#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -148,7 +149,6 @@ enum rq_flag_bits { __REQ_ORDERED_COLOR, /* is before or after barrier */ __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_FLUSH, /* request for cache flush */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ @@ -190,7 +190,6 @@ enum rq_flag_bits { #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_COPY_USER (1 << __REQ_COPY_USER) -#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_FLUSH (1 << __REQ_FLUSH) #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 19 Oct 2010 09:05:00 +0200 Subject: [PATCH 37/39] block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. 
Hence sda1's hd_struct->in_flight is 0 and sda2's is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio that belongs to sda1 is issued and is merged into the request mentioned in step 1 by ELEVATOR_FRONT_MERGE. The first sector of the request is changed from the sda2 region to the sda1 region. However, neither partition's hd_struct->in_flight is changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not sda1's, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. When reloading partition tables, quiesce IO to ensure that no request references the partition struct. When it is safe to free the partition table, the IO for that device is restarted. Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++++++-------- block/blk-merge.c | 2 +- block/blk.h | 4 ---- block/genhd.c | 14 ++++++++++++++ fs/partitions/check.c | 12 ++++++++++++ include/linux/blkdev.h | 1 + include/linux/elevator.h | 2 ++ include/linux/genhd.h | 1 + 8 files changed, 47 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 797d5095eb83..ddc68332d655 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (priv) { rl->elvpriv++; - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; + /* + * Don't do stats for non-priv requests + */ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + } + spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6a725461654d..38ff234012a4 100644 --- a/block/blk-merge.c
+++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 6738831ba447..1340cce5721a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - - /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index 7923e720ddf5..8313834596db 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); + struct gendisk *disk = ptbl->disk; + struct request_queue *q = disk->queue; + unsigned long flags; kfree(ptbl); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); + + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; + new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6dfbee03ccc6..30f46c2cb9d5 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -365,17 +365,25 @@ struct device_type part_type = { static void delete_partition_rcu_cb(struct rcu_head *head) { struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + struct gendisk *disk = part_to_disk(part); + struct request_queue *q = disk->queue; + unsigned long flags; part->start_sect = 0; part->nr_sects = 0; part_stat_set_all(part, 0); put_device(part_to_dev(part)); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; struct hd_struct *part; + struct request_queue *q = disk->queue; if (partno >= ptbl->len) return; @@ -390,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&part->rcu_head, delete_partition_rcu_cb); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8f3dd981b973..16f7f1be1acf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 
2c958f4fce1e..df1ee866d715 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); +extern void elv_quiesce_start(struct request_queue *); +extern void elv_quiesce_end(struct request_queue *); /* * io scheduler registration diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 66e26b5a1537..57647ecfc1bd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct disk_part_tbl { struct rcu_head rcu_head; int len; struct hd_struct *last_lookup; + struct gendisk *disk; struct hd_struct *part[]; }; From 11a691bea48887c27425cc40bf291e74c922df25 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 21 Oct 2010 10:32:29 +0200 Subject: [PATCH 38/39] block: Turn bvec_k{un,}map_irq() into static inline functions Convert bvec_k{un,}map_irq() from macros to static inline functions if !CONFIG_HIGHMEM, so we can more easily detect mistakes like the one fixed in 93055c31045a2d5599ec613a0c6cdcefc481a460 ("ps3disk: passing wrong variable to bvec_kunmap_irq()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c3fd7421607..ba679992d39b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -346,8 +346,15 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #else -#define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) -#define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) +static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) +{ + return page_address(bvec->bv_page) + bvec->bv_offset; +} + +static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) +{ + *flags = 0; +} #endif static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, From b4627321e18582dcbdeb45d77df29d3177107c65 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 22 Oct 2010 09:48:43 +0200 Subject: [PATCH 39/39] cfq-iosched: Fix a gcc 4.5 warning and put some comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Andi encountered the following warning with gcc 4.5: linux/block/cfq-iosched.c: In function ‘cfq_dispatch_requests’: linux/block/cfq-iosched.c:2156:3: warning: array subscript is above array bounds - The warning happens due to the following code. slice = group_slice * count / max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); gcc is complaining about cfqg->busy_queues_avg[] being indexed by CFQ prio classes (RT, BE and IDLE) while the array size is only 2. - At run time, we never access cfqg->busy_queues_avg[IDLE] and return from the function before this code is reached. - To fix the warning, increase the array size even though the extra slot will remain unused. This patch also adds some comments to clarify things. - I have taken Jens's patch and modified it a bit. - Compile tested with gcc 4.4 and boot tested. I don't have gcc 4.5 running; Andi, can you please test it with gcc 4.5 to make sure it works.
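Before the diff below, a standalone sketch of the idiom the fix relies on: adding a trailing *_NR sentinel to the enum and sizing the per-class array with it keeps every class value a provably valid index, even if some slots are never used at run time. Names here are illustrative, not the CFQ code itself:

#include <stdio.h>

/* Illustrative priority classes; PRIO_NR is a sentinel, not a real class. */
enum wl_prio {
        BE_WORKLOAD = 0,
        RT_WORKLOAD = 1,
        IDLE_WORKLOAD = 2,
        PRIO_NR,                /* number of classes defined above */
};

struct group {
        /*
         * Sized with the sentinel so indexing by any enum value stays in
         * bounds, even though the IDLE slot may never be consulted.
         */
        unsigned int busy_queues_avg[PRIO_NR];
};

int main(void)
{
        struct group g = { .busy_queues_avg = { 4, 2, 0 } };
        enum wl_prio prio;

        for (prio = BE_WORKLOAD; prio < PRIO_NR; prio++)
                printf("class %d: avg %u\n", (int)prio, g.busy_queues_avg[prio]);
        return 0;
}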
Reported-by: Andi Kleen Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 86338d5d4d09..dfceb6386bd5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -157,6 +157,7 @@ enum wl_prio_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, + CFQ_PRIO_NR, }; /* @@ -181,10 +182,19 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; - /* Per group busy queus average. Useful for workload slice calc. */ - unsigned int busy_queues_avg[2]; /* - * rr lists of queues with requests, onle rr for each priority class. + * Per group busy queus average. Useful for workload slice calc. We + * create the array for each prio class but at run time it is used + * only for RT and BE class and slot for IDLE class remains unused. + * This is primarily done to avoid confusion and a gcc warning. + */ + unsigned int busy_queues_avg[CFQ_PRIO_NR]; + /* + * rr lists of queues with requests. We maintain service trees for + * RT and BE classes. These trees are subdivided in subclasses + * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE + * class there is no subclassification and all the cfq queues go on + * a single tree service_tree_idle. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3];
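Closing aside on the earlier bvec_k{un,}map_irq() conversion: the point of using static inlines instead of function-like macros is that argument types are checked at every call site even when the body is trivial, which is what catches mistakes like the ps3disk one. A small sketch with simplified signatures, not the kernel's:

#include <stdio.h>

/* Old style: the macro never looks at its first argument, so almost anything compiles. */
#define kunmap_macro(buf, flags)        do { *(flags) = 0; } while (0)

/* New style: both argument types are checked at each call site. */
static inline void kunmap_inline(char *buffer, unsigned long *flags)
{
        (void)buffer;
        *flags = 0;
}

int main(void)
{
        unsigned long flags = 1;
        char *buf = NULL;

        /*
         * Swapping the arguments, e.g. kunmap_macro(&flags, &buf), still
         * compiles because *(&buf) = 0 is a legal assignment, while the
         * inline version flags the same mistake at compile time.
         */
        kunmap_macro(buf, &flags);
        kunmap_inline(buf, &flags);

        printf("flags = %lu\n", flags);
        return 0;
}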