From d15156138dad40205c9fdd9abe85c9e1479ae272 Mon Sep 17 00:00:00 2001
From: Douglas Gilbert <dgilbert@interlog.com>
Date: Tue, 1 Jul 2014 10:48:05 -0600
Subject: block SG_IO: add SG_FLAG_Q_AT_HEAD flag

After the SG_IO ioctl was copied into the block layer and
later into the bsg driver, subtle differences emerged.

One difference is the way injected commands are queued through
the block layer (i.e. this is not SCSI device queueing nor SATA
NCQ). Summarizing:
  - SG_IO on block layer device: blk_exec*(at_head=false)
  - sg device SG_IO: at_head=true
  - bsg device SG_IO: at_head=true

Some time ago Boaz Harrosh introduced a sg v4 flag called
BSG_FLAG_Q_AT_TAIL to override the bsg driver default. A
recent patch titled: "sg: add SG_FLAG_Q_AT_TAIL flag"
allowed the sg driver default to be overridden. This patch
allows a SG_IO ioctl sent to a block layer device to have
its default overridden.

ChangeLog:
    - introduce SG_FLAG_Q_AT_HEAD flag in sg.h to cause
      commands that are injected via a block layer
      device SG_IO ioctl to set at_head=true
    - make comments clearer about queueing in sg.h since the
      header is used both by the sg device and block layer
      device implementations of the SG_IO ioctl.
    - introduce BSG_FLAG_Q_AT_HEAD in bsg.h for compatibility
      (it does nothing) and update comments.

Signed-off-by: Douglas Gilbert <dgilbert@interlog.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c       |  5 ++++-
 include/scsi/sg.h        |  3 +++
 include/uapi/linux/bsg.h | 11 ++++++-----
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index bda1497add4c..51bf5155ee75 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -290,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	unsigned long start_time;
 	ssize_t ret = 0;
 	int writing = 0;
+	int at_head = 0;
 	struct request *rq;
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	struct bio *bio;
@@ -313,6 +314,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		case SG_DXFER_FROM_DEV:
 			break;
 		}
+	if (hdr->flags & SG_FLAG_Q_AT_HEAD)
+		at_head = 1;
 
 	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
 	if (!rq)
@@ -369,7 +372,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	 * (if he doesn't check that is his problem).
 	 * N.B. a non-zero SCSI status is _not_ necessarily an error.
 	 */
-	blk_execute_rq(q, bd_disk, rq, 0);
+	blk_execute_rq(q, bd_disk, rq, at_head);
 
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
diff --git a/include/scsi/sg.h b/include/scsi/sg.h
index a9f3c6fc3f57..4734c15ab5d6 100644
--- a/include/scsi/sg.h
+++ b/include/scsi/sg.h
@@ -129,6 +129,9 @@ typedef struct sg_io_hdr
 #define SG_FLAG_MMAP_IO 4       /* request memory mapped IO */
 #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */
 				/* user space (debug indirect IO) */
+/* defaults:: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */
+#define SG_FLAG_Q_AT_TAIL 0x10
+#define SG_FLAG_Q_AT_HEAD 0x20
 
 /* following 'info' values are "or"-ed together */
 #define SG_INFO_OK_MASK 0x1
diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h
index 7a12e1c0f371..02986cf8b6f1 100644
--- a/include/uapi/linux/bsg.h
+++ b/include/uapi/linux/bsg.h
@@ -10,12 +10,13 @@
 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT	2
 
 /*
- * For flags member below
- * sg.h sg_io_hdr also has bits defined for it's flags member. However
- * none of these bits are implemented/used by bsg. The bits below are
- * allocated to not conflict with sg.h ones anyway.
+ * For flag constants below:
+ * sg.h sg_io_hdr also has bits defined for it's flags member. These
+ * two flag values (0x10 and 0x20) have the same meaning in sg.h . For
+ * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the deafult.
  */
-#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_HEAD 0x20
 
 struct sg_io_v4 {
 	__s32 guard;		/* [i] 'Q' to differentiate from v3 */
-- 
cgit v1.2.3


From cb553215d5d277d4838d7d6b7722e964bcf5ca1f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 26 Jun 2014 17:41:47 +0800
Subject: include/uapi/linux/virtio_blk.h: introduce feature of VIRTIO_BLK_F_MQ

Current virtio-blk spec only supports one virtual queue for transfering
data between VM and host, and inside VM all kinds of operations on
the virtual queue needs to hold one lock, so cause below problems:

	- bad scalability
	- bad throughput

This patch requests to introduce feature of VIRTIO_BLK_F_MQ
so that more than one virtual queues can be used to virtio-blk
device, then above problems can be solved or eased.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/uapi/linux/virtio_blk.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 6d8e61c48563..9ad67b267584 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_WCE	9	/* Writeback mode enabled after reset */
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
 #define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 
 #ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
@@ -77,6 +78,10 @@ struct virtio_blk_config {
 
 	/* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
 	__u8 wce;
+	__u8 unused;
+
+	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
+	__u16 num_queues;
 } __attribute__((packed));
 
 /*
-- 
cgit v1.2.3


From 6a27b656fc0210e976db362e1368c56db05c8f08 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 26 Jun 2014 17:41:48 +0800
Subject: block: virtio-blk: support multi virt queues per virtio-blk device

Firstly this patch supports more than one virtual queues for virtio-blk
device.

Secondly this patch maps the virtual queue to blk-mq's hardware queue.

With this approach, both scalability and performance can be improved.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/virtio_blk.c | 104 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 20 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f63d358f3d93..0a581400de0f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -15,17 +15,22 @@
 #include <linux/numa.h>
 
 #define PART_BITS 4
+#define VQ_NAME_LEN 16
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
 static struct workqueue_struct *virtblk_wq;
 
+struct virtio_blk_vq {
+	struct virtqueue *vq;
+	spinlock_t lock;
+	char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
 struct virtio_blk
 {
 	struct virtio_device *vdev;
-	struct virtqueue *vq;
-	spinlock_t vq_lock;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
@@ -47,6 +52,10 @@ struct virtio_blk
 
 	/* Ida index - used to track minor number allocations. */
 	int index;
+
+	/* num of vqs */
+	int num_vqs;
+	struct virtio_blk_vq *vqs;
 };
 
 struct virtblk_req
@@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq)
 {
 	struct virtio_blk *vblk = vq->vdev->priv;
 	bool req_done = false;
+	int qid = vq->index;
 	struct virtblk_req *vbr;
 	unsigned long flags;
 	unsigned int len;
 
-	spin_lock_irqsave(&vblk->vq_lock, flags);
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
 	do {
 		virtqueue_disable_cb(vq);
-		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
 			blk_mq_complete_request(vbr->req);
 			req_done = true;
 		}
@@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq)
 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
 		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
+	int qid = hctx->queue_num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
 	bool notify = false;
@@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
 	}
 
-	spin_lock_irqsave(&vblk->vq_lock, flags);
-	err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+	err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
 	if (err) {
-		virtqueue_kick(vblk->vq);
+		virtqueue_kick(vblk->vqs[qid].vq);
 		blk_mq_stop_hw_queue(hctx);
-		spin_unlock_irqrestore(&vblk->vq_lock, flags);
+		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 		/* Out of mem doesn't actually happen, since we fall back
 		 * to direct descriptors */
 		if (err == -ENOMEM || err == -ENOSPC)
@@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
-	if (last && virtqueue_kick_prepare(vblk->vq))
+	if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
 		notify = true;
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
 	if (notify)
-		virtqueue_notify(vblk->vq);
+		virtqueue_notify(vblk->vqs[qid].vq);
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
@@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 static int init_vq(struct virtio_blk *vblk)
 {
 	int err = 0;
+	int i;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+	unsigned short num_vqs;
+	struct virtio_device *vdev = vblk->vdev;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+				   struct virtio_blk_config, num_queues,
+				   &num_vqs);
+	if (err)
+		num_vqs = 1;
+
+	vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+	if (!vblk->vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+	if (!callbacks)
+		goto err_callbacks;
+
+	vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+	if (!vqs)
+		goto err_vqs;
 
-	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
-	if (IS_ERR(vblk->vq))
-		err = PTR_ERR(vblk->vq);
+	for (i = 0; i < num_vqs; i++) {
+		callbacks[i] = virtblk_done;
+		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		names[i] = vblk->vqs[i].name;
+	}
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	if (err)
+		goto err_find_vqs;
 
+	for (i = 0; i < num_vqs; i++) {
+		spin_lock_init(&vblk->vqs[i].lock);
+		vblk->vqs[i].vq = vqs[i];
+	}
+	vblk->num_vqs = num_vqs;
+
+ err_find_vqs:
+	kfree(vqs);
+ err_vqs:
+	kfree(callbacks);
+ err_callbacks:
+	kfree(names);
+ err_names:
+	if (err)
+		kfree(vblk->vqs);
+ out:
 	return err;
 }
 
@@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 	err = init_vq(vblk);
 	if (err)
 		goto out_free_vblk;
-	spin_lock_init(&vblk->vq_lock);
 
 	/* FIXME: How many partitions?  How long is a piece of string? */
 	vblk->disk = alloc_disk(1 << PART_BITS);
@@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	/* Default queue sizing is to fill the ring. */
 	if (!virtblk_queue_depth) {
-		virtblk_queue_depth = vblk->vq->num_free;
+		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
 		/* ... but without indirect descs, we use 2 descs per req */
 		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
 			virtblk_queue_depth /= 2;
@@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
 	vblk->tag_set.ops = &virtio_mq_ops;
-	vblk->tag_set.nr_hw_queues = 1;
 	vblk->tag_set.queue_depth = virtblk_queue_depth;
 	vblk->tag_set.numa_node = NUMA_NO_NODE;
 	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
 	vblk->tag_set.driver_data = vblk;
+	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 
 	err = blk_mq_alloc_tag_set(&vblk->tag_set);
 	if (err)
@@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
 	put_disk(vblk->disk);
 	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
 	kfree(vblk);
 
 	/* Only free device id if we don't have any users */
@@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
+	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_MQ,
 };
 
 static struct virtio_driver virtio_blk = {
-- 
cgit v1.2.3


From e952658020c5150ad4987d313e25e8e2fb38d529 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 22 Nov 2013 15:53:41 +0100
Subject: drbd: Move write_ordering from connection to resource

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  5 +++--
 drivers/block/drbd/drbd_main.c     |  2 +-
 drivers/block/drbd/drbd_nl.c       |  4 ++--
 drivers/block/drbd/drbd_proc.c     |  2 +-
 drivers/block/drbd/drbd_receiver.c | 25 ++++++++++++-------------
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a76ceb344d64..1ef2474e8f11 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -594,6 +594,8 @@ struct drbd_resource {
 	unsigned susp_nod:1;		/* IO suspended because no data */
 	unsigned susp_fen:1;		/* IO suspended because fence peer handler runs */
 
+	enum write_ordering_e write_ordering;
+
 	cpumask_var_t cpu_mask;
 };
 
@@ -636,7 +638,6 @@ struct drbd_connection {
 	struct drbd_epoch *current_epoch;
 	spinlock_t epoch_lock;
 	unsigned int epochs;
-	enum write_ordering_e write_ordering;
 	atomic_t current_tle_nr;	/* transfer log epoch number */
 	unsigned current_tle_writes;	/* writes seen within this tl epoch */
 
@@ -1478,7 +1479,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 		generic_make_request(bio);
 }
 
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_resource *resource, enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 960645c26e6f..17b9a237f2e6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2579,6 +2579,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	kref_init(&resource->kref);
 	idr_init(&resource->devices);
 	INIT_LIST_HEAD(&resource->connections);
+	resource->write_ordering = WO_bdev_flush;
 	list_add_tail_rcu(&resource->resources, &drbd_resources);
 	mutex_init(&resource->conf_update);
 	mutex_init(&resource->adm_mutex);
@@ -2617,7 +2618,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 	INIT_LIST_HEAD(&connection->current_epoch->list);
 	connection->epochs = 1;
 	spin_lock_init(&connection->epoch_lock);
-	connection->write_ordering = WO_bdev_flush;
 
 	connection->send.seen_any_write_yet = false;
 	connection->send.current_epoch_nr = 0;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 1b35c45c92b7..43fad2c1ba01 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1399,7 +1399,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	else
 		set_bit(MD_NO_FUA, &device->flags);
 
-	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, WO_bdev_flush);
 
 	drbd_md_sync(device);
 
@@ -1704,7 +1704,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	new_disk_conf = NULL;
 	new_plan = NULL;
 
-	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, WO_bdev_flush);
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &device->flags);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 89736bdbbc70..f11e57308104 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -281,7 +281,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   atomic_read(&device->unacked_cnt),
 			   atomic_read(&device->ap_bio_cnt),
 			   first_peer_device(device)->connection->epochs,
-			   write_ordering_chars[first_peer_device(device)->connection->write_ordering]
+			   write_ordering_chars[device->resource->write_ordering]
 			);
 			seq_printf(seq, " oos:%llu\n",
 				   Bit2KB((unsigned long long)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5b17ec88ea05..c7084188c2ae 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1151,7 +1151,7 @@ static void drbd_flush(struct drbd_connection *connection)
 	struct drbd_peer_device *peer_device;
 	int vnr;
 
-	if (connection->write_ordering >= WO_bdev_flush) {
+	if (connection->resource->write_ordering >= WO_bdev_flush) {
 		rcu_read_lock();
 		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 			struct drbd_device *device = peer_device->device;
@@ -1168,7 +1168,7 @@ static void drbd_flush(struct drbd_connection *connection)
 				/* would rather check on EOPNOTSUPP, but that is not reliable.
 				 * don't try again for ANY return value != 0
 				 * if (rv == -EOPNOTSUPP) */
-				drbd_bump_write_ordering(connection, WO_drain_io);
+				drbd_bump_write_ordering(connection->resource, WO_drain_io);
 			}
 			put_ldev(device);
 			kref_put(&device->kref, drbd_destroy_device);
@@ -1262,10 +1262,10 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
  * @connection:	DRBD connection.
  * @wo:		Write ordering method to try.
  */
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
+void drbd_bump_write_ordering(struct drbd_resource *resource, enum write_ordering_e wo)
 {
 	struct disk_conf *dc;
-	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
 	enum write_ordering_e pwo;
 	int vnr;
 	static char *write_ordering_str[] = {
@@ -1274,12 +1274,10 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
 		[WO_bdev_flush] = "flush",
 	};
 
-	pwo = connection->write_ordering;
+	pwo = resource->write_ordering;
 	wo = min(pwo, wo);
 	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-
+	idr_for_each_entry(&resource->devices, device, vnr) {
 		if (!get_ldev_if_state(device, D_ATTACHING))
 			continue;
 		dc = rcu_dereference(device->ldev->disk_conf);
@@ -1291,9 +1289,9 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
 		put_ldev(device);
 	}
 	rcu_read_unlock();
-	connection->write_ordering = wo;
-	if (pwo != connection->write_ordering || wo == WO_bdev_flush)
-		drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
+	resource->write_ordering = wo;
+	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
 /**
@@ -1471,7 +1469,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	 * R_PRIMARY crashes now.
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
-	switch (connection->write_ordering) {
+	switch (connection->resource->write_ordering) {
 	case WO_none:
 		if (rv == FE_RECYCLED)
 			return 0;
@@ -1498,7 +1496,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 
 		return 0;
 	default:
-		drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+			 connection->resource->write_ordering);
 		return -EIO;
 	}
 
-- 
cgit v1.2.3


From 8fe39aac0578cbb0abf27e1be70ff581e0c1d836 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 22 Nov 2013 13:22:13 +0100
Subject: drbd: device->ldev is not guaranteed on an D_ATTACHING disk

Some parts of the code assumed that get_ldev_if_state(device, D_ATTACHING)
is sufficient to access the ldev member of the device object. That was
wrong. ldev may not be there or might be freed at any time if the device
has a disk state of D_ATTACHING.

bm_rw()
  Documented that drbd_bm_read() is only called from drbd_adm_attach.
  drbd_bm_write() is only called when a reference is held, and it is
  documented that a caller has to hold a reference before calling
  drbd_bm_write()

drbd_bm_write_page()
  Use get_ldev() instead of get_ldev_if_state(device, D_ATTACHING)

drbd_bmio_set_n_write()
  No longer use get_ldev_if_state(device, D_ATTACHING). All callers
  hold a reference to ldev now.

drbd_bmio_clear_n_write()
  All callers where holding a reference of ldev anyways. Remove the
  misleading get_ldev_if_state(device, D_ATTACHING)

drbd_reconsider_max_bio_size()
  Removed the get_ldev_if_state(device, D_ATTACHING). All callers
  now pass a struct drbd_backing_dev* when they have a proper
  reference, or a NULL pointer.
  Before this fix, the receiver could trigger a NULL pointer
  deref when in drbd_reconsider_max_bio_size()

drbd_bump_write_ordering()
  Used get_ldev_if_state(device, D_ATTACHING) with the wrong assumption.
  Remove it, and allow the caller to pass in a struct drbd_backing_dev*
  when the caller knows that accessing this bdev is safe.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c   |  4 +++-
 drivers/block/drbd/drbd_int.h      |  9 ++++----
 drivers/block/drbd/drbd_main.c     | 36 +++++++++++++------------------
 drivers/block/drbd/drbd_nl.c       | 41 +++++++++++++++++++++++-------------
 drivers/block/drbd/drbd_receiver.c | 43 ++++++++++++++++++++++++++------------
 include/linux/drbd.h               |  2 +-
 6 files changed, 79 insertions(+), 56 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 1aa29f8fdfe1..ed310415020b 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1085,6 +1085,8 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		kfree(ctx);
 		return -ENODEV;
 	}
+	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+	   drbd_adm_attach(), after device->ldev was assigned. */
 
 	if (!ctx->flags)
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
@@ -1260,7 +1262,7 @@ int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold
 		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+	if (!get_ldev(device)) {  /* put is in bm_aio_ctx_destroy() */
 		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
 		kfree(ctx);
 		return -ENODEV;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 1ef2474e8f11..c87bc8e8fd82 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -984,8 +984,8 @@ extern int drbd_bitmap_io(struct drbd_device *device,
 extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
 		int (*io_fn)(struct drbd_device *),
 		char *why, enum bm_flag flags);
-extern int drbd_bmio_set_n_write(struct drbd_device *device);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
 extern void drbd_ldev_destroy(struct drbd_device *device);
 
 /* Meta data layout
@@ -1313,7 +1313,7 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
 					enum drbd_role new_role,
 					int force);
@@ -1479,7 +1479,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 		generic_make_request(bio);
 }
 
-void drbd_bump_write_ordering(struct drbd_resource *resource, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 17b9a237f2e6..a6af93528d57 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3466,23 +3466,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
  *
  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_set_n_write(struct drbd_device *device)
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 {
 	int rv = -EIO;
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		drbd_md_set_flag(device, MDF_FULL_SYNC);
-		drbd_md_sync(device);
-		drbd_bm_set_all(device);
-
-		rv = drbd_bm_write(device);
+	drbd_md_set_flag(device, MDF_FULL_SYNC);
+	drbd_md_sync(device);
+	drbd_bm_set_all(device);
 
-		if (!rv) {
-			drbd_md_clear_flag(device, MDF_FULL_SYNC);
-			drbd_md_sync(device);
-		}
+	rv = drbd_bm_write(device);
 
-		put_ldev(device);
+	if (!rv) {
+		drbd_md_clear_flag(device, MDF_FULL_SYNC);
+		drbd_md_sync(device);
 	}
 
 	return rv;
@@ -3494,18 +3490,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device)
  *
  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_clear_n_write(struct drbd_device *device)
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
 {
-	int rv = -EIO;
-
 	drbd_resume_al(device);
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		drbd_bm_clear_all(device);
-		rv = drbd_bm_write(device);
-		put_ldev(device);
-	}
-
-	return rv;
+	drbd_bm_clear_all(device);
+	return drbd_bm_write(device);
 }
 
 static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3603,6 +3592,9 @@ static int w_go_diskless(struct drbd_work *w, int unused)
  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
  * called from worker context. It MUST NOT be used while a previous such
  * work is still pending!
+ *
+ * Its worker function encloses the call of io_fn() by get_ldev() and
+ * put_ldev().
  */
 void drbd_queue_bitmap_io(struct drbd_device *device,
 			  int (*io_fn)(struct drbd_device *),
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 43fad2c1ba01..25f4b6f67c21 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1110,15 +1110,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
 	return 0;
 }
 
-static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+				   unsigned int max_bio_size)
 {
 	struct request_queue * const q = device->rq_queue;
 	unsigned int max_hw_sectors = max_bio_size >> 9;
 	unsigned int max_segments = 0;
 	struct request_queue *b = NULL;
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		b = device->ldev->backing_bdev->bd_disk->queue;
+	if (bdev) {
+		b = bdev->backing_bdev->bd_disk->queue;
 
 		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
 		rcu_read_lock();
@@ -1163,11 +1164,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
 				 b->backing_dev_info.ra_pages);
 			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
 		}
-		put_ldev(device);
 	}
 }
 
-void drbd_reconsider_max_bio_size(struct drbd_device *device)
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
 {
 	unsigned int now, new, local, peer;
 
@@ -1175,10 +1175,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
 	local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
 	peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+	if (bdev) {
+		local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
 		device->local_max_bio_size = local;
-		put_ldev(device);
 	}
 	local = min(local, DRBD_MAX_BIO_SIZE);
 
@@ -1211,7 +1210,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
 	if (new != now)
 		drbd_info(device, "max BIO size = %u\n", new);
 
-	drbd_setup_queue_param(device, new);
+	drbd_setup_queue_param(device, bdev, new);
 }
 
 /* Starts the worker thread */
@@ -1399,7 +1398,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	else
 		set_bit(MD_NO_FUA, &device->flags);
 
-	drbd_bump_write_ordering(device->resource, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
 
 	drbd_md_sync(device);
 
@@ -1704,7 +1703,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	new_disk_conf = NULL;
 	new_plan = NULL;
 
-	drbd_bump_write_ordering(device->resource, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1720,7 +1719,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	device->read_cnt = 0;
 	device->writ_cnt = 0;
 
-	drbd_reconsider_max_bio_size(device);
+	drbd_reconsider_max_bio_size(device, device->ldev);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
@@ -2648,8 +2647,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
-	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync.
@@ -2673,6 +2677,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
 	drbd_resume_io(device);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2698,7 +2703,7 @@ out:
 	return 0;
 }
 
-static int drbd_bmio_set_susp_al(struct drbd_device *device)
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
 {
 	int rv;
 
@@ -2719,8 +2724,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
-	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync.
@@ -2747,6 +2757,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
 	drbd_resume_io(device);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index c7084188c2ae..be0c3761cdc6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1168,7 +1168,7 @@ static void drbd_flush(struct drbd_connection *connection)
 				/* would rather check on EOPNOTSUPP, but that is not reliable.
 				 * don't try again for ANY return value != 0
 				 * if (rv == -EOPNOTSUPP) */
-				drbd_bump_write_ordering(connection->resource, WO_drain_io);
+				drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
 			}
 			put_ldev(device);
 			kref_put(&device->kref, drbd_destroy_device);
@@ -1257,14 +1257,29 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 	return rv;
 }
 
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+	struct disk_conf *dc;
+
+	dc = rcu_dereference(bdev->disk_conf);
+
+	if (wo == WO_bdev_flush && !dc->disk_flushes)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && !dc->disk_drain)
+		wo = WO_none;
+
+	return wo;
+}
+
 /**
  * drbd_bump_write_ordering() - Fall back to an other write ordering method
  * @connection:	DRBD connection.
  * @wo:		Write ordering method to try.
  */
-void drbd_bump_write_ordering(struct drbd_resource *resource, enum write_ordering_e wo)
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo)
 {
-	struct disk_conf *dc;
 	struct drbd_device *device;
 	enum write_ordering_e pwo;
 	int vnr;
@@ -1278,17 +1293,18 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, enum write_orderin
 	wo = min(pwo, wo);
 	rcu_read_lock();
 	idr_for_each_entry(&resource->devices, device, vnr) {
-		if (!get_ldev_if_state(device, D_ATTACHING))
-			continue;
-		dc = rcu_dereference(device->ldev->disk_conf);
-
-		if (wo == WO_bdev_flush && !dc->disk_flushes)
-			wo = WO_drain_io;
-		if (wo == WO_drain_io && !dc->disk_drain)
-			wo = WO_none;
-		put_ldev(device);
+		if (get_ldev(device)) {
+			wo = max_allowed_wo(device->ldev, wo);
+			if (device->ldev == bdev)
+				bdev = NULL;
+			put_ldev(device);
+		}
 	}
 	rcu_read_unlock();
+
+	if (bdev)
+		wo = max_allowed_wo(bdev, wo);
+
 	resource->write_ordering = wo;
 	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
 		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
@@ -3709,7 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 	}
 
 	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-	drbd_reconsider_max_bio_size(device);
 	/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
 	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
 	   drbd_reconsider_max_bio_size(), we can be sure that after
@@ -3717,6 +3732,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(device)) {
+		drbd_reconsider_max_bio_size(device, device->ldev);
 		dd = drbd_determine_dev_size(device, ddsf, NULL);
 		put_ldev(device);
 		if (dd == DS_ERROR)
@@ -3724,6 +3740,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 		drbd_md_sync(device);
 	} else {
 		/* I am diskless, need to accept the peer's size. */
+		drbd_reconsider_max_bio_size(device, NULL);
 		drbd_set_my_capacity(device, p_size);
 	}
 
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 3dbe9bd57a09..20ec8903b1e4 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -245,7 +245,7 @@ enum drbd_disk_state {
 	D_DISKLESS,
 	D_ATTACHING,      /* In the process of reading the meta-data */
 	D_FAILED,         /* Becomes D_DISKLESS as soon as we told it the peer */
-			/* when >= D_FAILED it is legal to access mdev->bc */
+			  /* when >= D_FAILED it is legal to access mdev->ldev */
 	D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
 	D_INCONSISTENT,
 	D_OUTDATED,
-- 
cgit v1.2.3


From 28995af5cf3efd0bf4a74f5044cee0ae8cbc6b12 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 22 Nov 2013 16:48:14 +0100
Subject: drbd: rename drbd_free_bc() to drbd_free_ldev()

Since the member of drbd_device is called ldev

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  | 2 +-
 drivers/block/drbd/drbd_main.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c87bc8e8fd82..82ece1b1a701 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -950,7 +950,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a6af93528d57..8e0813ddf72a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1992,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
 		drbd_bm_cleanup(device);
 	}
 
-	drbd_free_bc(device->ldev);
+	drbd_free_ldev(device->ldev);
 	device->ldev = NULL;
 
 	clear_bit(AL_SUSPENDED, &device->flags);
@@ -2187,7 +2187,7 @@ void drbd_destroy_device(struct kref *kref)
 	if (device->this_bdev)
 		bdput(device->this_bdev);
 
-	drbd_free_bc(device->ldev);
+	drbd_free_ldev(device->ldev);
 	device->ldev = NULL;
 
 	drbd_release_all_peer_reqs(device);
@@ -2960,7 +2960,7 @@ fail:
 	return err;
 }
 
-void drbd_free_bc(struct drbd_backing_dev *ldev)
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
 {
 	if (ldev == NULL)
 		return;
@@ -3533,7 +3533,7 @@ void drbd_ldev_destroy(struct drbd_device *device)
 	lc_destroy(device->act_log);
 	device->act_log = NULL;
 	__no_warn(local,
-		drbd_free_bc(device->ldev);
+		drbd_free_ldev(device->ldev);
 		device->ldev = NULL;);
 
 	clear_bit(GO_DISKLESS, &device->flags);
-- 
cgit v1.2.3


From 35b5ed5bbac2432acdfce1d9dec8dbf8fe7d60dd Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 4 Dec 2013 12:07:09 +0100
Subject: drbd: reduce number of spinlock drop/re-aquire cycles

Instead of dropping and re-aquiring the spinlock around the submit,
just remember that we want to submit, and do that only once we have
dropped the spinlock for good.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 09803d0d5207..4c7fee1a5a85 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1086,11 +1086,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 
 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
 {
+	struct drbd_resource *resource = device->resource;
 	const int rw = bio_rw(req->master_bio);
 	struct bio_and_error m = { NULL, };
 	bool no_remote = false;
+	bool submit_private_bio = false;
 
-	spin_lock_irq(&device->resource->req_lock);
+	spin_lock_irq(&resource->req_lock);
 	if (rw == WRITE) {
 		/* This may temporarily give up the req_lock,
 		 * but will re-aquire it before it returns here.
@@ -1152,9 +1154,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 		/* needs to be marked within the same spinlock */
 		_req_mod(req, TO_BE_SUBMITTED);
 		/* but we need to give up the spinlock to submit */
-		spin_unlock_irq(&device->resource->req_lock);
-		drbd_submit_req_private_bio(req);
-		spin_lock_irq(&device->resource->req_lock);
+		submit_private_bio = true;
 	} else if (no_remote) {
 nodata:
 		if (__ratelimit(&drbd_ratelimit_state))
@@ -1167,8 +1167,16 @@ nodata:
 out:
 	if (drbd_req_put_completion_ref(req, &m, 1))
 		kref_put(&req->kref, drbd_req_destroy);
-	spin_unlock_irq(&device->resource->req_lock);
-
+	spin_unlock_irq(&resource->req_lock);
+
+	/* Even though above is a kref_put(), this is safe.
+	 * As long as we still need to submit our private bio,
+	 * we hold a completion ref, and the request cannot disappear.
+	 * If however this request did not even have a private bio to submit
+	 * (e.g. remote read), req may already be invalid now.
+	 * That's why we cannot check on req->private_bio. */
+	if (submit_private_bio)
+		drbd_submit_req_private_bio(req);
 	if (m.bio)
 		complete_master_bio(device, &m);
 }
-- 
cgit v1.2.3


From 44a4d551846b8c61aa430b9432c1fcdf88444708 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 22 Nov 2013 12:40:58 +0100
Subject: drbd: refactor use of first_peer_device()

Reduce the number of calls to first_peer_device(). Instead, call
first_peer_device() just once to assign a local variable peer_device.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c       | 30 +++++++++++-------
 drivers/block/drbd/drbd_receiver.c | 18 ++++++-----
 drivers/block/drbd/drbd_req.c      | 25 ++++++++-------
 drivers/block/drbd/drbd_state.c    | 65 +++++++++++++++++++-------------------
 drivers/block/drbd/drbd_worker.c   | 55 +++++++++++++++++---------------
 5 files changed, 105 insertions(+), 88 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 25f4b6f67c21..0bf8a6082bb8 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -552,8 +552,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection)
 }
 
 enum drbd_state_rv
-drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	const int max_tries = 4;
 	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 	struct net_conf *nc;
@@ -601,7 +603,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 		    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 			D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
 
-			if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
+			if (conn_try_outdate_peer(connection)) {
 				val.disk = D_UP_TO_DATE;
 				mask.disk = D_MASK;
 			}
@@ -611,7 +613,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 		if (rv == SS_NOTHING_TO_DO)
 			goto out;
 		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
-			if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
+			if (!conn_try_outdate_peer(connection) && force) {
 				drbd_warn(device, "Forced into split brain situation!\n");
 				mask.pdsk = D_MASK;
 				val.pdsk  = D_OUTDATED;
@@ -624,7 +626,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 			   retry at most once more in this case. */
 			int timeo;
 			rcu_read_lock();
-			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+			nc = rcu_dereference(connection->net_conf);
 			timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
 			rcu_read_unlock();
 			schedule_timeout_interruptible(timeo);
@@ -661,7 +663,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 	} else {
 		/* Called from drbd_adm_set_role only.
 		 * We are still holding the conf_update mutex. */
-		nc = first_peer_device(device)->connection->net_conf;
+		nc = connection->net_conf;
 		if (nc)
 			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
 
@@ -683,8 +685,8 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 	if (device->state.conn >= C_WF_REPORT_PARAMS) {
 		/* if this was forced, we should consider sync */
 		if (forced)
-			drbd_send_uuids(first_peer_device(device));
-		drbd_send_current_state(first_peer_device(device));
+			drbd_send_uuids(peer_device);
+		drbd_send_current_state(peer_device);
 	}
 
 	drbd_md_sync(device);
@@ -1433,6 +1435,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
+	struct drbd_peer_device *peer_device;
+	struct drbd_connection *connection;
 	int err;
 	enum drbd_ret_code retcode;
 	enum determine_dev_size dd;
@@ -1455,7 +1459,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	device = adm_ctx.device;
 	mutex_lock(&adm_ctx.resource->adm_mutex);
-	conn_reconfig_start(first_peer_device(device)->connection);
+	peer_device = first_peer_device(device);
+	connection = peer_device ? peer_device->connection : NULL;
+	conn_reconfig_start(connection);
 
 	/* if you want to reconfigure, please tear down first */
 	if (device->state.disk > D_DISKLESS) {
@@ -1522,7 +1528,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 
 	rcu_read_lock();
-	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	nc = rcu_dereference(connection->net_conf);
 	if (nc) {
 		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
 			rcu_read_unlock();
@@ -1642,7 +1648,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	 */
 	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
 	/* and for any other previously queued work */
-	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+	drbd_flush_workqueue(&connection->sender_work);
 
 	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
 	retcode = rv;  /* FIXME: Type mismatch. */
@@ -1838,7 +1844,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 	put_ldev(device);
-	conn_reconfig_done(first_peer_device(device)->connection);
+	conn_reconfig_done(connection);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -1849,7 +1855,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	drbd_force_state(device, NS(disk, D_DISKLESS));
 	drbd_md_sync(device);
  fail:
-	conn_reconfig_done(first_peer_device(device)->connection);
+	conn_reconfig_done(connection);
 	if (nbc) {
 		if (nbc->backing_bdev)
 			blkdev_put(nbc->backing_bdev,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index be0c3761cdc6..bb1434dfec8a 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2857,8 +2857,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
 -1091   requires proto 91
 -1096   requires proto 96
  */
-static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	u64 self, peer;
 	int i, j;
 
@@ -2884,7 +2886,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
 		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2907,7 +2909,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
 		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2940,7 +2942,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 		case 1: /*  self_pri && !peer_pri */ return 1;
 		case 2: /* !self_pri &&  peer_pri */ return -1;
 		case 3: /*  self_pri &&  peer_pri */
-			dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
 			return dc ? -1 : 1;
 		}
 	}
@@ -2953,14 +2955,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 	*rule_nr = 51;
 	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		if (connection->agreed_pro_version < 96 ?
 		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
 		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
 		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of the peer's UUIDs. */
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
@@ -2990,14 +2992,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 	*rule_nr = 71;
 	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		if (connection->agreed_pro_version < 96 ?
 		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
 		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
 		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of our UUIDs. */
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4c7fee1a5a85..042bbc689f5e 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -454,7 +454,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		struct bio_and_error *m)
 {
-	struct drbd_device *device = req->device;
+	struct drbd_device *const device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	struct net_conf *nc;
 	int p, rv = 0;
 
@@ -477,7 +479,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * and from w_read_retry_remote */
 		D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
 		rcu_read_lock();
-		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		nc = rcu_dereference(connection->net_conf);
 		p = nc->wire_protocol;
 		rcu_read_unlock();
 		req->rq_state |=
@@ -549,7 +551,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
 		req->w.cb = w_send_read_req;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -585,23 +587,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
 		req->w.cb =  w_send_dblock;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 
 		/* close the epoch, in case it outgrew the limit */
 		rcu_read_lock();
-		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		nc = rcu_dereference(connection->net_conf);
 		p = nc->max_epoch_size;
 		rcu_read_unlock();
-		if (first_peer_device(device)->connection->current_tle_writes >= p)
-			start_new_tl_epoch(first_peer_device(device)->connection);
+		if (connection->current_tle_writes >= p)
+			start_new_tl_epoch(connection);
 
 		break;
 
 	case QUEUE_FOR_SEND_OOS:
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
 		req->w.cb =  w_send_out_of_sync;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -714,7 +716,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 		get_ldev(device); /* always succeeds in this call path */
 		req->w.cb = w_restart_disk_io;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -736,7 +738,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 			mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
 			if (req->w.cb) {
-				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				/* w.cb expected to be w_send_dblock, or w_send_read_req */
+				drbd_queue_work(&connection->sender_work,
 						&req->w);
 				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
 			} /* else: FIXME can this happen? */
@@ -769,7 +772,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case QUEUE_AS_DRBD_BARRIER:
-		start_new_tl_epoch(first_peer_device(device)->connection);
+		start_new_tl_epoch(connection);
 		mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
 		break;
 	};
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index a5d8aae00e04..19da7c7590cd 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -952,6 +952,8 @@ enum drbd_state_rv
 __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	         enum chg_state_flags flags, struct completion *done)
 {
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	union drbd_state os;
 	enum drbd_state_rv rv = SS_SUCCESS;
 	enum sanitize_state_warnings ssw;
@@ -978,9 +980,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 			   this happen...*/
 
 			if (is_valid_state(device, os) == rv)
-				rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+				rv = is_valid_soft_transition(os, ns, connection);
 		} else
-			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+			rv = is_valid_soft_transition(os, ns, connection);
 	}
 
 	if (rv < SS_SUCCESS) {
@@ -997,7 +999,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	   sanitize_state(). Only display it here if we where not called from
 	   _conn_request_state() */
 	if (!(flags & CS_DC_SUSP))
-		conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+		conn_pr_state_change(connection, os, ns,
 				     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
 
 	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
@@ -1017,19 +1019,19 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
 	/* put replicated vs not-replicated requests in seperate epochs */
 	if (did_remote != should_do_remote)
-		start_new_tl_epoch(first_peer_device(device)->connection);
+		start_new_tl_epoch(connection);
 
 	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
 		drbd_print_uuids(device, "attached to UUIDs");
 
 	/* Wake up role changes, that were delayed because of connection establishing */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
-	    no_peer_wf_report_params(first_peer_device(device)->connection))
-		clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+	    no_peer_wf_report_params(connection))
+		clear_bit(STATE_SENT, &connection->flags);
 
 	wake_up(&device->misc_wait);
 	wake_up(&device->state_wait);
-	wake_up(&first_peer_device(device)->connection->ping_wait);
+	wake_up(&connection->ping_wait);
 
 	/* Aborted verify run, or we reached the stop sector.
 	 * Log the last position, unless end-of-device. */
@@ -1118,21 +1120,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
 	/* Receiver should clean up itself */
 	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_stop_nowait(&connection->receiver);
 
 	/* Now the receiver finished cleaning up itself, it should die */
 	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_stop_nowait(&connection->receiver);
 
 	/* Upon network failure, we need to restart the receiver. */
 	if (os.conn > C_WF_CONNECTION &&
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-		drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_restart_nowait(&connection->receiver);
 
 	/* Resume AL writing if we get a connection */
 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 		drbd_resume_al(device);
-		first_peer_device(device)->connection->connect_cnt++;
+		connection->connect_cnt++;
 	}
 
 	/* remember last attach time so request_timer_fn() won't
@@ -1150,7 +1152,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 		ascw->w.cb = w_after_state_ch;
 		ascw->device = device;
 		ascw->done = done;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&ascw->w);
 	} else {
 		drbd_err(device, "Could not kmalloc an ascw\n");
@@ -1222,6 +1224,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 			   union drbd_state ns, enum chg_state_flags flags)
 {
 	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	struct sib_info sib;
 
 	sib.sib_reason = SIB_STATE_CHANGE;
@@ -1245,7 +1249,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	   state change. This function might sleep */
 
 	if (ns.susp_nod) {
-		struct drbd_connection *connection = first_peer_device(device)->connection;
 		enum drbd_req_event what = NOTHING;
 
 		spin_lock_irq(&device->resource->req_lock);
@@ -1267,8 +1270,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	}
 
 	if (ns.susp_fen) {
-		struct drbd_connection *connection = first_peer_device(device)->connection;
-
 		spin_lock_irq(&device->resource->req_lock);
 		if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
 			/* case2: The connection was established again: */
@@ -1294,8 +1295,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	 * which is unexpected. */
 	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
 	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-	    first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
-		drbd_gen_and_send_sync_uuid(first_peer_device(device));
+	    connection->agreed_pro_version >= 96 && get_ldev(device)) {
+		drbd_gen_and_send_sync_uuid(peer_device);
 		put_ldev(device);
 	}
 
@@ -1309,8 +1310,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		atomic_set(&device->rs_pending_cnt, 0);
 		drbd_rs_cancel_all(device);
 
-		drbd_send_uuids(first_peer_device(device));
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
 	}
 	/* No point in queuing send_bitmap if we don't have a connection
 	 * anymore, so check also the _current_ state, not only the new state
@@ -1335,7 +1336,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 					set_bit(NEW_CUR_UUID, &device->flags);
 				} else {
 					drbd_uuid_new_current(device);
-					drbd_send_uuids(first_peer_device(device));
+					drbd_send_uuids(peer_device);
 				}
 			}
 			put_ldev(device);
@@ -1346,7 +1347,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
 		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(device);
-			drbd_send_uuids(first_peer_device(device));
+			drbd_send_uuids(peer_device);
 		}
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1373,16 +1374,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* Last part of the attaching process ... */
 	if (ns.conn >= C_CONNECTED &&
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-		drbd_send_sizes(first_peer_device(device), 0, 0);  /* to start sync... */
-		drbd_send_uuids(first_peer_device(device));
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
 	}
 
 	/* We want to pause/continue resync, tell peer. */
 	if (ns.conn >= C_CONNECTED &&
 	     ((os.aftr_isp != ns.aftr_isp) ||
 	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* In case one of the isp bits got set, suspend other devices. */
 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1392,10 +1393,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* Make sure the peer gets informed about eventual state
 	   changes (ISP bits) while we were in WFReportParams. */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1449,7 +1450,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 					drbd_disk_str(device->state.disk));
 
 			if (ns.conn >= C_CONNECTED)
-				drbd_send_state(first_peer_device(device), ns);
+				drbd_send_state(peer_device, ns);
 
 			drbd_rs_cancel_all(device);
 
@@ -1473,7 +1474,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 				 drbd_disk_str(device->state.disk));
 
 		if (ns.conn >= C_CONNECTED)
-			drbd_send_state(first_peer_device(device), ns);
+			drbd_send_state(peer_device, ns);
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finally trigger drbd_ldev_destroy. */
 		put_ldev(device);
@@ -1481,7 +1482,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
 	/* Notify peer that I had a local IO error, and did not detached.. */
 	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* Disks got bigger while they were detached */
 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1499,14 +1500,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* sync target done with resync.  Explicitly notify peer, even though
 	 * it should (at least for non-empty resyncs) already know itself. */
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* Verify finished, or reached stop sector.  Peer did not know about
 	 * the stop sector, and we may even have changed the stop sector during
 	 * verify to interrupt/stop early.  Send the new state. */
 	if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
 	&& verify_can_do_stop_sector(device))
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* This triggers bitmap writeout of potentially still unwritten pages
 	 * if the resync finished cleanly, or aborted because of peer disk
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d8f57b6305cd..595ab57aea96 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -583,8 +583,10 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	return number;
 }
 
-static int make_resync_request(struct drbd_device *device, int cancel)
+static int make_resync_request(struct drbd_device *const device, int cancel)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	unsigned long bit;
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
@@ -618,15 +620,15 @@ static int make_resync_request(struct drbd_device *device, int cancel)
 
 	for (i = 0; i < number; i++) {
 		/* Stop generating RS requests, when half of the send buffer is filled */
-		mutex_lock(&first_peer_device(device)->connection->data.mutex);
-		if (first_peer_device(device)->connection->data.socket) {
-			queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
-			sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
+		mutex_lock(&connection->data.mutex);
+		if (connection->data.socket) {
+			queued = connection->data.socket->sk->sk_wmem_queued;
+			sndbuf = connection->data.socket->sk->sk_sndbuf;
 		} else {
 			queued = 1;
 			sndbuf = 0;
 		}
-		mutex_unlock(&first_peer_device(device)->connection->data.mutex);
+		mutex_unlock(&connection->data.mutex);
 		if (queued > sndbuf / 2)
 			goto requeue;
 
@@ -696,9 +698,9 @@ next_sector:
 		/* adjust very last sectors, in case we are oddly sized */
 		if (sector + (size>>9) > capacity)
 			size = (capacity-sector)<<9;
-		if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
-		    first_peer_device(device)->connection->csums_tfm) {
-			switch (read_for_csum(first_peer_device(device), sector, size)) {
+		if (connection->agreed_pro_version >= 89 &&
+		    connection->csums_tfm) {
+			switch (read_for_csum(peer_device, sector, size)) {
 			case -EIO: /* Disk failure */
 				put_ldev(device);
 				return -EIO;
@@ -717,7 +719,7 @@ next_sector:
 			int err;
 
 			inc_rs_pending(device);
-			err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+			err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
 						 sector, size, ID_SYNCER);
 			if (err) {
 				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -1351,7 +1353,8 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
@@ -1365,7 +1368,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 	 * No more barriers will be sent, until we leave AHEAD mode again. */
 	maybe_send_barrier(connection, req->epoch);
 
-	err = drbd_send_out_of_sync(first_peer_device(device), req);
+	err = drbd_send_out_of_sync(peer_device, req);
 	req_mod(req, OOS_HANDED_TO_NETWORK);
 
 	return err;
@@ -1380,7 +1383,8 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
@@ -1392,7 +1396,7 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 	maybe_send_barrier(connection, req->epoch);
 	connection->send.current_epoch_writes++;
 
-	err = drbd_send_dblock(first_peer_device(device), req);
+	err = drbd_send_dblock(peer_device, req);
 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
 
 	return err;
@@ -1407,7 +1411,8 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
@@ -1419,7 +1424,7 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 	 * if there was any yet. */
 	maybe_send_barrier(connection, req->epoch);
 
-	err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
 				 (unsigned long)req);
 
 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
@@ -1633,6 +1638,8 @@ int w_start_resync(struct drbd_work *w, int cancel)
  */
 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 {
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	union drbd_state ns;
 	int r;
 
@@ -1651,7 +1658,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 			if (r > 0) {
 				drbd_info(device, "before-resync-target handler returned %d, "
 					 "dropping connection.\n", r);
-				conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 				return;
 			}
 		} else /* C_SYNC_SOURCE */ {
@@ -1664,7 +1671,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 				} else {
 					drbd_info(device, "before-resync-source handler returned %d, "
 						 "dropping connection.\n", r);
-					conn_request_state(first_peer_device(device)->connection,
+					conn_request_state(connection,
 							   NS(conn, C_DISCONNECTING), CS_HARD);
 					return;
 				}
@@ -1672,7 +1679,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		}
 	}
 
-	if (current == first_peer_device(device)->connection->worker.task) {
+	if (current == connection->worker.task) {
 		/* The worker should not sleep waiting for state_mutex,
 		   that can take long */
 		if (!mutex_trylock(device->state_mutex)) {
@@ -1756,12 +1763,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		 * drbd_resync_finished from here in that case.
 		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
 		 * and from after_state_ch otherwise. */
-		if (side == C_SYNC_SOURCE &&
-		    first_peer_device(device)->connection->agreed_pro_version < 96)
-			drbd_gen_and_send_sync_uuid(first_peer_device(device));
+		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(peer_device);
 
-		if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
-		    device->rs_total == 0) {
+		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
 			/* This still has a race (about when exactly the peers
 			 * detect connection loss) that can lead to a full sync
 			 * on next handshake. In 8.3.9 we fixed this with explicit
@@ -1777,7 +1782,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 				int timeo;
 
 				rcu_read_lock();
-				nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+				nc = rcu_dereference(connection->net_conf);
 				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
 				rcu_read_unlock();
 				schedule_timeout_interruptible(timeo);
-- 
cgit v1.2.3


From 70df70927b75eb86f12b14167c398b99dc3a56e4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 20 Dec 2013 11:17:02 +0100
Subject: drbd: allow write-ordering policy to be bumped up again

Previously, once you disabled flushes as a means of enforcing
write-ordering, you'd need to detach/re-attach to enable them again.

Allow drbdsetup disk-options to re-enable previously disabled
write-ordering policy options at runtime.

While at it fix RCU in drbd_bump_write_ordering()
max_allowed_wo() uses rcu_dereference, therefore it must
be called within rcu_read_lock()/rcu_read_unlock()

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c       | 10 +++++++++-
 drivers/block/drbd/drbd_receiver.c |  6 ++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0bf8a6082bb8..66065e60fdbc 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1294,6 +1294,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
 	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
 
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+	return	a->disk_barrier != b->disk_barrier ||
+		a->disk_flushes != b->disk_flushes ||
+		a->disk_drain != b->disk_drain;
+}
+
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
@@ -1400,7 +1407,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	else
 		set_bit(MD_NO_FUA, &device->flags);
 
-	drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
+	if (write_ordering_changed(old_disk_conf, new_disk_conf))
+		drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
 
 	drbd_md_sync(device);
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index bb1434dfec8a..8d8a6b7f8f5d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1290,7 +1290,8 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 	};
 
 	pwo = resource->write_ordering;
-	wo = min(pwo, wo);
+	if (wo != WO_bdev_flush)
+		wo = min(pwo, wo);
 	rcu_read_lock();
 	idr_for_each_entry(&resource->devices, device, vnr) {
 		if (get_ldev(device)) {
@@ -1300,11 +1301,12 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 			put_ldev(device);
 		}
 	}
-	rcu_read_unlock();
 
 	if (bdev)
 		wo = max_allowed_wo(bdev, wo);
 
+	rcu_read_unlock();
+
 	resource->write_ordering = wo;
 	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
 		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
-- 
cgit v1.2.3


From c7a58db4e9dc523b18bbfbc3aa311d8308acc293 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 20 Dec 2013 11:39:48 +0100
Subject: drbd: get rid of atomic update on disk bitmap works

Just trigger the occasional lazy bitmap write-out during resync
from the central wait_for_work() helper.

Previously, during resync, bitmap pages would be written out separately,
synchronously, one at a time, at least 8 times each (every 512 bytes
worth of bitmap cleared).

Now we trigger "merge friendly" bulk write out of all cleared pages
every two seconds during resync, and once the resync is finished.
Most pages will be written out only once.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 63 +-------------------------------------
 drivers/block/drbd/drbd_bitmap.c | 65 +++++++---------------------------------
 drivers/block/drbd/drbd_int.h    |  2 +-
 drivers/block/drbd/drbd_nl.c     |  7 -----
 drivers/block/drbd/drbd_worker.c | 54 +++++++++++++++++++++++++++++++++
 5 files changed, 66 insertions(+), 125 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 05a1780ffa85..9c42edf4871b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,12 +92,6 @@ struct __packed al_transaction_on_disk {
 	__be32	context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_odbm_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	unsigned int enr;
-};
-
 struct update_al_work {
 	struct drbd_work w;
 	struct drbd_device *device;
@@ -452,15 +446,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr)
 		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
 }
 
-static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
-{
-	return rs_enr >>
-		/* bit to page */
-		((PAGE_SHIFT + 3) -
-		/* resync extent number to bit */
-		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
-}
-
 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 {
 	const unsigned int stripes = device->ldev->md.al_stripes;
@@ -682,40 +667,6 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
 	return 0;
 }
 
-static int w_update_odbm(struct drbd_work *w, int unused)
-{
-	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
-	struct drbd_device *device = udw->device;
-	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
-
-	if (!get_ldev(device)) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
-		kfree(udw);
-		return 0;
-	}
-
-	drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
-	put_ldev(device);
-
-	kfree(udw);
-
-	if (drbd_bm_total_weight(device) <= device->rs_failed) {
-		switch (device->state.conn) {
-		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
-		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
-			drbd_resync_finished(device);
-		default:
-			/* nothing to do */
-			break;
-		}
-	}
-	drbd_bcast_event(device, &sib);
-
-	return 0;
-}
-
-
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the
  * resync LRU-cache are 16MB each.
  * The caller of this function has to hold an get_ldev() reference.
@@ -726,8 +677,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
 				      int count, int success)
 {
 	struct lc_element *e;
-	struct update_odbm_work *udw;
-
 	unsigned int enr;
 
 	D_ASSERT(device, atomic_read(&device->local_cnt));
@@ -791,17 +740,7 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
 
 		if (ext->rs_left == ext->rs_failed) {
 			ext->rs_failed = 0;
-
-			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
-			if (udw) {
-				udw->enr = ext->lce.lc_number;
-				udw->w.cb = w_update_odbm;
-				udw->device = device;
-				drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-						      &udw->w);
-			} else {
-				drbd_warn(device, "Could not kmalloc an udw\n");
-			}
+			wake_up(&first_peer_device(device)->connection->sender_work.q_wait);
 		}
 	} else {
 		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ed310415020b..424ebf6bdad0 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1202,6 +1202,16 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
 	return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
 }
 
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device:	DRBD device.
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
+}
+
 /**
  * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
  * @device:	DRBD device.
@@ -1227,61 +1237,6 @@ int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
 	return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 }
 
-/**
- * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
- * @device:	DRBD device.
- * @idx:	bitmap page index
- *
- * We don't want to special case on logical_block_size of the backend device,
- * so we submit PAGE_SIZE aligned pieces.
- * Note that on "most" systems, PAGE_SIZE is 4k.
- *
- * In case this becomes an issue on systems with larger PAGE_SIZE,
- * we may want to change this again to write 4k aligned 4k pieces.
- */
-int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
-{
-	struct bm_aio_ctx *ctx;
-	int err;
-
-	if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
-		dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
-		return 0;
-	}
-
-	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
-	if (!ctx)
-		return -ENOMEM;
-
-	*ctx = (struct bm_aio_ctx) {
-		.device = device,
-		.in_flight = ATOMIC_INIT(1),
-		.done = 0,
-		.flags = BM_AIO_COPY_PAGES,
-		.error = 0,
-		.kref = { ATOMIC_INIT(2) },
-	};
-
-	if (!get_ldev(device)) {  /* put is in bm_aio_ctx_destroy() */
-		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
-		kfree(ctx);
-		return -ENODEV;
-	}
-
-	bm_page_io_async(ctx, idx, WRITE_SYNC);
-	wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
-
-	if (ctx->error)
-		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-		/* that causes us to detach, so the in memory bitmap will be
-		 * gone in a moment as well. */
-
-	device->bm_writ_cnt++;
-	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
-	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
-	return err;
-}
-
 /* NOTE
  * find_first_bit returns int, we return unsigned long.
  * For this to work on 32bit arch with bitnumbers > (1<<32),
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 82ece1b1a701..eb002a7656af 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1196,11 +1196,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device,
 		const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int  drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
 extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
 extern size_t	     drbd_bm_words(struct drbd_device *device);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 66065e60fdbc..52221f67e395 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -3641,13 +3641,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
 	unsigned seq;
 	int err = -ENOMEM;
 
-	if (sib->sib_reason == SIB_SYNC_PROGRESS) {
-		if (time_after(jiffies, device->rs_last_bcast + HZ))
-			device->rs_last_bcast = jiffies;
-		else
-			return;
-	}
-
 	seq = atomic_inc_return(&drbd_genl_seq);
 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
 	if (!msg)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 595ab57aea96..59158858b17a 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1804,6 +1804,58 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	mutex_unlock(device->state_mutex);
 }
 
+static void update_on_disk_bitmap(struct drbd_device *device)
+{
+	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+	device->rs_last_bcast = jiffies;
+
+	if (!get_ldev(device))
+		return;
+
+	drbd_bm_write_lazy(device, 0);
+	if (drbd_bm_total_weight(device) <= device->rs_failed)
+		drbd_resync_finished(device);
+	drbd_bcast_event(device, &sib);
+	/* update timestamp, in case it took a while to write out stuff */
+	device->rs_last_bcast = jiffies;
+	put_ldev(device);
+}
+
+bool wants_lazy_bitmap_update(struct drbd_device *device)
+{
+	enum drbd_conns connection_state = device->state.conn;
+	return
+	/* only do a lazy writeout, if device is in some resync state */
+	   (connection_state == C_SYNC_SOURCE
+	||  connection_state == C_SYNC_TARGET
+	||  connection_state == C_PAUSED_SYNC_S
+	||  connection_state == C_PAUSED_SYNC_T) &&
+	/* AND
+	 * either we just finished, or the last lazy update
+	 * was some time ago already. */
+	   (drbd_bm_total_weight(device) <= device->rs_failed
+	||  time_after(jiffies, device->rs_last_bcast + 2*HZ));
+}
+
+static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (!wants_lazy_bitmap_update(device))
+			continue;
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		update_on_disk_bitmap(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
 	spin_lock_irq(&queue->q_lock);
@@ -1882,6 +1934,8 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.
 		 * In which case we send the barrier above. */
+
+		try_update_all_on_disk_bitmaps(connection);
 	}
 	finish_wait(&connection->sender_work.q_wait, &wait);
 
-- 
cgit v1.2.3


From a80ca1ae81fc52e304e753f6de4ef248df364f9e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 27 Dec 2013 17:17:25 +0100
Subject: drbd: fix a race stopping the worker thread

We may implicitly call drbd_send() from inside wait_for_work(),
via maybe_send_barrier().

If the "stop" signal was send just before that, drbd_send() would call
flush_signals(), and we would run an unbounded schedule() afterwards.

Fix: check for thread_state == RUNNING before we schedule()

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 59158858b17a..47bc84017b5b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1930,6 +1930,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		if (send_barrier)
 			maybe_send_barrier(connection,
 					connection->send.current_epoch_nr + 1);
+		/* drbd_send() may have called flush_signals() */
+		if (get_t_state(&connection->worker) != RUNNING)
+			break;
 		schedule();
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.
-- 
cgit v1.2.3


From 5ab7d2c005135849cf0bb1485d954c98f2cca57c Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 27 Jan 2014 15:58:22 +0100
Subject: drbd: fix resync finished detection

This fixes one recent regresion,
and one long existing bug.

The bug:
drbd_try_clear_on_disk_bm() assumed that all "count" bits have to be
accounted in the resync extent corresponding to the start sector.

Since we allow application requests to cross our "extent" boundaries,
this assumption is no longer true, resulting in possible misaccounting,
scary messages
("BAD! sector=12345s enr=6 rs_left=-7 rs_failed=0 count=58 cstate=..."),
and potentially, if the last bit to be cleared during resync would
reside in previously misaccounted resync extent, the resync would never
be recognized as finished, but would be "stalled" forever, even though
all blocks are in sync again and all bits have been cleared...

The regression was introduced by
    drbd: get rid of atomic update on disk bitmap works

For an "empty" resync (rs_total == 0), we must not "finish" the
resync on the SyncSource before the SyncTarget knows all relevant
information (sync uuid).  We need to wait for the full round-trip,
the SyncTarget will then explicitly notify us.

Also for normal, non-empty resyncs (rs_total > 0), the resync-finished
condition needs to be tested before the schedule() in wait_for_work, or
it is likely to be missed.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 315 ++++++++++++++++++---------------------
 drivers/block/drbd/drbd_int.h    |  42 ++++--
 drivers/block/drbd/drbd_state.c  |   3 +
 drivers/block/drbd/drbd_worker.c |  42 +++---
 4 files changed, 197 insertions(+), 205 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9c42edf4871b..278c31f24639 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -667,36 +667,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
 	return 0;
 }
 
+static const char *drbd_change_sync_fname[] = {
+	[RECORD_RS_FAILED] = "drbd_rs_failed_io",
+	[SET_IN_SYNC] = "drbd_set_in_sync",
+	[SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
+
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the
  * resync LRU-cache are 16MB each.
  * The caller of this function has to hold an get_ldev() reference.
  *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling in (and recounting the corresponding bits)
+ * this resync extent into the resync extent lru cache.
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
  * TODO will be obsoleted once we have a caching lru of the on disk bitmap
  */
-static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
-				      int count, int success)
+static bool update_rs_extent(struct drbd_device *device,
+		unsigned int enr, int count,
+		enum update_sync_bits_mode mode)
 {
 	struct lc_element *e;
-	unsigned int enr;
 
 	D_ASSERT(device, atomic_read(&device->local_cnt));
 
-	/* I simply assume that a sector/size pair never crosses
-	 * a 16 MB extent border. (Currently this is true...) */
-	enr = BM_SECT_TO_EXT(sector);
-
-	e = lc_get(device->resync, enr);
+	/* When setting out-of-sync bits,
+	 * we don't need it cached (lc_find).
+	 * But if it is present in the cache,
+	 * we should update the cached bit count.
+	 * Otherwise, that extent should be in the resync extent lru cache
+	 * already -- or we want to pull it in if necessary -- (lc_get),
+	 * then update and check rs_left and rs_failed. */
+	if (mode == SET_OUT_OF_SYNC)
+		e = lc_find(device->resync, enr);
+	else
+		e = lc_get(device->resync, enr);
 	if (e) {
 		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
 		if (ext->lce.lc_number == enr) {
-			if (success)
+			if (mode == SET_IN_SYNC)
 				ext->rs_left -= count;
+			else if (mode == SET_OUT_OF_SYNC)
+				ext->rs_left += count;
 			else
 				ext->rs_failed += count;
 			if (ext->rs_left < ext->rs_failed) {
-				drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+				drbd_warn(device, "BAD! enr=%u rs_left=%d "
 				    "rs_failed=%d count=%d cstate=%s\n",
-				     (unsigned long long)sector,
 				     ext->lce.lc_number, ext->rs_left,
 				     ext->rs_failed, count,
 				     drbd_conn_str(device->state.conn));
@@ -730,24 +750,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
 				     ext->lce.lc_number, ext->rs_failed);
 			}
 			ext->rs_left = rs_left;
-			ext->rs_failed = success ? 0 : count;
+			ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
 			/* we don't keep a persistent log of the resync lru,
 			 * we can commit any change right away. */
 			lc_committed(device->resync);
 		}
-		lc_put(device->resync, &ext->lce);
+		if (mode != SET_OUT_OF_SYNC)
+			lc_put(device->resync, &ext->lce);
 		/* no race, we are within the al_lock! */
 
-		if (ext->rs_left == ext->rs_failed) {
+		if (ext->rs_left <= ext->rs_failed) {
 			ext->rs_failed = 0;
-			wake_up(&first_peer_device(device)->connection->sender_work.q_wait);
+			return true;
 		}
-	} else {
+	} else if (mode != SET_OUT_OF_SYNC) {
+		/* be quiet if lc_find() did not find it. */
 		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
 		    device->resync_locked,
 		    device->resync->nr_elements,
 		    device->resync->flags);
 	}
+	return false;
 }
 
 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -766,105 +789,112 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
 	}
 }
 
-/* clear the bit corresponding to the piece of storage in question:
- * size byte of data starting from sector.  Only clear a bits of the affected
- * one ore more _aligned_ BM_BLOCK_SIZE blocks.
- *
- * called by worker on C_SYNC_TARGET and receiver on SyncSource.
- *
- */
-void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
-		       const char *file, const unsigned int line)
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
 {
-	/* Is called from worker and receiver context _only_ */
-	unsigned long sbnr, ebnr, lbnr;
-	unsigned long count = 0;
-	sector_t esector, nr_sectors;
-	int wake_up = 0;
-	unsigned long flags;
+	return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
-				(unsigned long long)sector, size);
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+	struct drbd_connection *connection;
+	if (rs_done)
+		set_bit(RS_DONE, &device->flags);
+		/* and also set RS_PROGRESS below */
+	else if (!lazy_bitmap_update_due(device))
 		return;
-	}
 
-	if (!get_ldev(device))
-		return; /* no disk, no metadata, no bitmap to clear bits in */
-
-	nr_sectors = drbd_get_capacity(device->this_bdev);
-	esector = sector + (size >> 9) - 1;
-
-	if (!expect(sector < nr_sectors))
-		goto out;
-	if (!expect(esector < nr_sectors))
-		esector = nr_sectors - 1;
-
-	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-	/* we clear it (in sync).
-	 * round up start sector, round down end sector.  we make sure we only
-	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
-	if (unlikely(esector < BM_SECT_PER_BIT-1))
-		goto out;
-	if (unlikely(esector == (nr_sectors-1)))
-		ebnr = lbnr;
-	else
-		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
-
-	if (sbnr > ebnr)
-		goto out;
+	/* compare with test_and_clear_bit() calls in and above
+	 * try_update_all_on_disk_bitmaps() from the drbd_worker(). */
+	if (test_and_set_bit(RS_PROGRESS, &device->flags))
+		return;
+	connection = first_peer_device(device)->connection;
+	if (!test_and_set_bit(CONN_RS_PROGRESS, &connection->flags))
+		wake_up(&connection->sender_work.q_wait);
+}
 
+static int update_sync_bits(struct drbd_device *device,
+		unsigned long sbnr, unsigned long ebnr,
+		enum update_sync_bits_mode mode)
+{
 	/*
-	 * ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.
+	 * We keep a count of set bits per resync-extent in the ->rs_left
+	 * caching member, so we need to loop and work within the resync extent
+	 * alignment. Typically this loop will execute exactly once.
 	 */
-	count = drbd_bm_clear_bits(device, sbnr, ebnr);
-	if (count) {
-		drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
-		spin_lock_irqsave(&device->al_lock, flags);
-		drbd_try_clear_on_disk_bm(device, sector, count, true);
-		spin_unlock_irqrestore(&device->al_lock, flags);
-
-		/* just wake_up unconditional now, various lc_chaged(),
-		 * lc_put() in drbd_try_clear_on_disk_bm(). */
-		wake_up = 1;
+	unsigned long flags;
+	unsigned long count = 0;
+	unsigned int cleared = 0;
+	while (sbnr <= ebnr) {
+		/* set temporary boundary bit number to last bit number within
+		 * the resync extent of the current start bit number,
+		 * but cap at provided end bit number */
+		unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+		unsigned long c;
+
+		if (mode == RECORD_RS_FAILED)
+			/* Only called from drbd_rs_failed_io(), bits
+			 * supposedly still set.  Recount, maybe some
+			 * of the bits have been successfully cleared
+			 * by application IO meanwhile.
+			 */
+			c = drbd_bm_count_bits(device, sbnr, tbnr);
+		else if (mode == SET_IN_SYNC)
+			c = drbd_bm_clear_bits(device, sbnr, tbnr);
+		else /* if (mode == SET_OUT_OF_SYNC) */
+			c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+		if (c) {
+			spin_lock_irqsave(&device->al_lock, flags);
+			cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+			spin_unlock_irqrestore(&device->al_lock, flags);
+			count += c;
+		}
+		sbnr = tbnr + 1;
 	}
-out:
-	put_ldev(device);
-	if (wake_up)
+	if (count) {
+		if (mode == SET_IN_SYNC) {
+			unsigned long still_to_go = drbd_bm_total_weight(device);
+			bool rs_is_done = (still_to_go <= device->rs_failed);
+			drbd_advance_rs_marks(device, still_to_go);
+			if (cleared || rs_is_done)
+				maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+		} else if (mode == RECORD_RS_FAILED)
+			device->rs_failed += count;
 		wake_up(&device->al_wait);
+	}
+	return count;
 }
 
-/*
- * this is intended to set one request worth of data out of sync.
- * affects at least 1 bit,
- * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector.  Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
  *
- * called by tl_clear and drbd_send_dblock (==drbd_make_request).
- * so this can be _any_ process.
  */
-int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
-			    const char *file, const unsigned int line)
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode,
+		const char *file, const unsigned int line)
 {
-	unsigned long sbnr, ebnr, flags;
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
 	sector_t esector, nr_sectors;
-	unsigned int enr, count = 0;
-	struct lc_element *e;
 
-	/* this should be an empty REQ_FLUSH */
-	if (size == 0)
+	/* This would be an empty REQ_FLUSH, be silent. */
+	if ((mode == SET_OUT_OF_SYNC) && size == 0)
 		return 0;
 
-	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "sector: %llus, size: %d\n",
-			(unsigned long long)sector, size);
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+				drbd_change_sync_fname[mode],
+				(unsigned long long)sector, size);
 		return 0;
 	}
 
 	if (!get_ldev(device))
-		return 0; /* no disk, no metadata, no bitmap to set bits in */
+		return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
 
 	nr_sectors = drbd_get_capacity(device->this_bdev);
 	esector = sector + (size >> 9) - 1;
@@ -874,25 +904,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
 	if (!expect(esector < nr_sectors))
 		esector = nr_sectors - 1;
 
-	/* we set it out of sync,
-	 * we do not need to round anything here */
-	sbnr = BM_SECT_TO_BIT(sector);
-	ebnr = BM_SECT_TO_BIT(esector);
-
-	/* ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.  */
-	spin_lock_irqsave(&device->al_lock, flags);
-	count = drbd_bm_set_bits(device, sbnr, ebnr);
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
 
-	enr = BM_SECT_TO_EXT(sector);
-	e = lc_find(device->resync, enr);
-	if (e)
-		lc_entry(e, struct bm_extent, lce)->rs_left += count;
-	spin_unlock_irqrestore(&device->al_lock, flags);
+	if (mode == SET_IN_SYNC) {
+		/* Round up start sector, round down end sector.  We make sure
+		 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+		if (unlikely(esector < BM_SECT_PER_BIT-1))
+			goto out;
+		if (unlikely(esector == (nr_sectors-1)))
+			ebnr = lbnr;
+		else
+			ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+		sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+	} else {
+		/* We set it out of sync, or record resync failure.
+		 * Should not round anything here. */
+		sbnr = BM_SECT_TO_BIT(sector);
+		ebnr = BM_SECT_TO_BIT(esector);
+	}
 
+	count = update_sync_bits(device, sbnr, ebnr, mode);
 out:
 	put_ldev(device);
-
 	return count;
 }
 
@@ -1209,69 +1242,3 @@ int drbd_rs_del_all(struct drbd_device *device)
 
 	return 0;
 }
-
-/**
- * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
- * @device:	DRBD device.
- * @sector:	The sector number.
- * @size:	Size of failed IO operation, in byte.
- */
-void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
-{
-	/* Is called from worker and receiver context _only_ */
-	unsigned long sbnr, ebnr, lbnr;
-	unsigned long count;
-	sector_t esector, nr_sectors;
-	int wake_up = 0;
-
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
-				(unsigned long long)sector, size);
-		return;
-	}
-	nr_sectors = drbd_get_capacity(device->this_bdev);
-	esector = sector + (size >> 9) - 1;
-
-	if (!expect(sector < nr_sectors))
-		return;
-	if (!expect(esector < nr_sectors))
-		esector = nr_sectors - 1;
-
-	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-	/*
-	 * round up start sector, round down end sector.  we make sure we only
-	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
-	if (unlikely(esector < BM_SECT_PER_BIT-1))
-		return;
-	if (unlikely(esector == (nr_sectors-1)))
-		ebnr = lbnr;
-	else
-		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
-
-	if (sbnr > ebnr)
-		return;
-
-	/*
-	 * ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.
-	 */
-	spin_lock_irq(&device->al_lock);
-	count = drbd_bm_count_bits(device, sbnr, ebnr);
-	if (count) {
-		device->rs_failed += count;
-
-		if (get_ldev(device)) {
-			drbd_try_clear_on_disk_bm(device, sector, count, false);
-			put_ldev(device);
-		}
-
-		/* just wake_up unconditional now, various lc_chaged(),
-		 * lc_put() in drbd_try_clear_on_disk_bm(). */
-		wake_up = 1;
-	}
-	spin_unlock_irq(&device->al_lock);
-	if (wake_up)
-		wake_up(&device->al_wait);
-}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index eb002a7656af..a16f9ae3c98a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -432,7 +432,11 @@ enum {
 				 * goes into C_CONNECTED state. */
 	CONSIDER_RESYNC,
 
+	RS_PROGRESS,		/* tell worker that resync made significant progress */
+	RS_DONE,		/* tell worker that resync is done */
+
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
+
 	SUSPEND_IO,		/* suspend application io */
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
@@ -577,6 +581,7 @@ enum {
 				 * and potentially deadlock on, this drbd worker.
 				 */
 	DISCONNECT_SENT,
+	CONN_RS_PROGRESS,	/* tell worker that resync made significant progress */
 };
 
 struct drbd_resource {
@@ -1106,17 +1111,21 @@ struct bm_extent {
 /* in which _bitmap_ extent (resp. sector) the bit for a certain
  * _storage_ sector is located in */
 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
 
-/* how much _storage_ sectors we have per bitmap sector */
+/* first storage sector a bitmap extent corresponds to */
 #define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
 #define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
+
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
 
-#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
-#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
-
 /* the extent in "PER_EXTENT" below is an activity log extent
  * we need that many (long words/bytes) to store the bitmap
  *		     of one AL_EXTENT_SIZE chunk of storage.
@@ -1214,7 +1223,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
 extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
 extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
 extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
-extern int drbd_bm_rs_done(struct drbd_device *device);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
 		size_t number, unsigned long *buffer);
@@ -1503,14 +1511,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
 extern void drbd_rs_failed_io(struct drbd_device *device,
 		sector_t sector, int size);
 extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
-extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
-		int size, const char *file, const unsigned int line);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode,
+		const char *file, const unsigned int line);
 #define drbd_set_in_sync(device, sector, size) \
-	__drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
-extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
-		int size, const char *file, const unsigned int line);
+	__drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
 #define drbd_set_out_of_sync(device, sector, size) \
-	__drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+	__drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
+#define drbd_rs_failed_io(device, sector, size) \
+	__drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
 extern void drbd_al_shrink(struct drbd_device *device);
 extern int drbd_initialize_al(struct drbd_device *, void *);
 
@@ -1915,6 +1926,15 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+	return
+	   (connection_state == C_SYNC_SOURCE
+	||  connection_state == C_SYNC_TARGET
+	||  connection_state == C_PAUSED_SYNC_S
+	||  connection_state == C_PAUSED_SYNC_T);
+}
+
 /**
  * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
  * @M:		DRBD device.
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 19da7c7590cd..1bddd6cf8ac7 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1011,6 +1011,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 		atomic_inc(&device->local_cnt);
 
 	did_remote = drbd_should_do_remote(device->state);
+	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+		clear_bit(RS_DONE, &device->flags);
+
 	device->state.i = ns.i;
 	should_do_remote = drbd_should_do_remote(device->state);
 	device->resource->susp = ns.susp;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 47bc84017b5b..bafb62eb22c9 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1740,11 +1740,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 			device->rs_mark_time[i] = now;
 		}
 		_drbd_pause_after(device);
+		/* Forget potentially stale cached per resync extent bit-counts.
+		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
+		 * disabled, and know the disk state is ok. */
+		spin_lock(&device->al_lock);
+		lc_reset(device->resync);
+		device->resync_locked = 0;
+		device->resync_wenr = LC_FREE;
+		spin_unlock(&device->al_lock);
 	}
 	write_unlock(&global_state_lock);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (r == SS_SUCCESS) {
+		wake_up(&device->al_wait); /* for lc_reset() above */
 		/* reset rs_last_bcast when a resync or verify is started,
 		 * to deal with potential jiffies wrap. */
 		device->rs_last_bcast = jiffies - HZ;
@@ -1807,36 +1816,22 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 static void update_on_disk_bitmap(struct drbd_device *device)
 {
 	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+	bool resync_done = test_and_clear_bit(RS_DONE, &device->flags);
 	device->rs_last_bcast = jiffies;
 
 	if (!get_ldev(device))
 		return;
 
 	drbd_bm_write_lazy(device, 0);
-	if (drbd_bm_total_weight(device) <= device->rs_failed)
+	if (resync_done && is_sync_state(device->state.conn))
 		drbd_resync_finished(device);
+
 	drbd_bcast_event(device, &sib);
 	/* update timestamp, in case it took a while to write out stuff */
 	device->rs_last_bcast = jiffies;
 	put_ldev(device);
 }
 
-bool wants_lazy_bitmap_update(struct drbd_device *device)
-{
-	enum drbd_conns connection_state = device->state.conn;
-	return
-	/* only do a lazy writeout, if device is in some resync state */
-	   (connection_state == C_SYNC_SOURCE
-	||  connection_state == C_SYNC_TARGET
-	||  connection_state == C_PAUSED_SYNC_S
-	||  connection_state == C_PAUSED_SYNC_T) &&
-	/* AND
-	 * either we just finished, or the last lazy update
-	 * was some time ago already. */
-	   (drbd_bm_total_weight(device) <= device->rs_failed
-	||  time_after(jiffies, device->rs_last_bcast + 2*HZ));
-}
-
 static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
 {
 	struct drbd_peer_device *peer_device;
@@ -1845,8 +1840,9 @@ static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
-		if (!wants_lazy_bitmap_update(device))
+		if (!test_and_clear_bit(RS_PROGRESS, &device->flags))
 			continue;
+
 		kref_get(&device->kref);
 		rcu_read_unlock();
 		update_on_disk_bitmap(device);
@@ -1930,15 +1926,18 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		if (send_barrier)
 			maybe_send_barrier(connection,
 					connection->send.current_epoch_nr + 1);
+
+		if (test_bit(CONN_RS_PROGRESS, &connection->flags))
+			break;
+
 		/* drbd_send() may have called flush_signals() */
 		if (get_t_state(&connection->worker) != RUNNING)
 			break;
+
 		schedule();
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.
 		 * In which case we send the barrier above. */
-
-		try_update_all_on_disk_bitmaps(connection);
 	}
 	finish_wait(&connection->sender_work.q_wait, &wait);
 
@@ -1973,6 +1972,9 @@ int drbd_worker(struct drbd_thread *thi)
 		if (list_empty(&work_list))
 			wait_for_work(connection, &work_list);
 
+		if (test_and_clear_bit(CONN_RS_PROGRESS, &connection->flags))
+			try_update_all_on_disk_bitmaps(connection);
+
 		if (signal_pending(current)) {
 			flush_signals(current);
 			if (get_t_state(thi) == RUNNING) {
-- 
cgit v1.2.3


From fcb096740a13c839e16e6735ba486879b54bc861 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 27 Jan 2014 16:04:14 +0100
Subject: drbd: stop the meta data sync timer before open coded meta data sync

If we re-write all meta data due to resize, we have open-coded write-out
of our meta data super block. Stop the md_sync_timer, it would just
trigger scary but in this case spurious "timer expired" messages.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 52221f67e395..133e9b37c6c2 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -967,6 +967,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	if (la_size_changed || md_moved || rs) {
 		u32 prev_flags;
 
+		/* We do some synchronous IO below, which may take some time.
+		 * Clear the timer, to avoid scary "timer expired!" messages,
+		 * "Superblock" is written out at least twice below, anyways. */
+		del_timer(&device->md_sync_timer);
 		drbd_al_shrink(device); /* All extents inactive. */
 
 		prev_flags = md->flags;
-- 
cgit v1.2.3


From 66ce6dbce2efa31d038a3148fa99bd86abd2203f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 5 Feb 2014 06:17:01 +0100
Subject: drbd: re-add lost conf_mutex protection in drbd_set_role

The conf_update mutex used to be held while clearing the
net_conf->discard_my_data flag inside drbd_set_role.

It was moved into drbd_adm_set_role with
    drbd: allow parallel promote/demote actions
but then replaced at that location by the newly introduced adm_mutex with
    drbd: Fix a potential deadlock in drbdsetup, introduce resource->adm_mutex

And I simply forgot to put it back in at the original location.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 133e9b37c6c2..23670d81ec32 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -661,11 +661,11 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
 			put_ldev(device);
 		}
 	} else {
-		/* Called from drbd_adm_set_role only.
-		 * We are still holding the conf_update mutex. */
+		mutex_lock(&device->resource->conf_update);
 		nc = connection->net_conf;
 		if (nc)
 			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+		mutex_unlock(&device->resource->conf_update);
 
 		set_disk_ro(device->vdisk, false);
 		if (get_ldev(device)) {
-- 
cgit v1.2.3


From 123ff122ade4b22961f2c244c44966d52c2a7ca5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 5 Feb 2014 06:13:53 +0100
Subject: drbd: trigger tcp_push_pending_frames() for PING and PING_ACK

This should reduce latency for such in-DRBD-protocol "pings",
and may help reduce spurious disconnect/reconnect cycles due to
 "PingAck did not arrive in time."

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8e0813ddf72a..77d34a61bb58 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -662,6 +662,11 @@ static int __send_command(struct drbd_connection *connection, int vnr,
 			    msg_flags);
 	if (data && !err)
 		err = drbd_send_all(connection, sock->socket, data, size, 0);
+	/* DRBD protocol "pings" are latency critical.
+	 * This is supposed to trigger tcp_push_pending_frames() */
+	if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+		drbd_tcp_nodelay(sock->socket);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 720979fb9048bc6b4460b39864c64d7fc920a10f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 5 Feb 2014 06:28:08 +0100
Subject: drbd: move set_disk_ro() to after we persisted the new role

This probably does not have any real life impact,
but we should first persist any potentially new UUID
and other meta data flags, as well as our new role,
before we allow/disallow write access.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 23670d81ec32..ceebd31eddb9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -655,7 +655,6 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
 	/* FIXME also wait for all pending P_BARRIER_ACK? */
 
 	if (new_role == R_SECONDARY) {
-		set_disk_ro(device->vdisk, true);
 		if (get_ldev(device)) {
 			device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 			put_ldev(device);
@@ -667,7 +666,6 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
 			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
 		mutex_unlock(&device->resource->conf_update);
 
-		set_disk_ro(device->vdisk, false);
 		if (get_ldev(device)) {
 			if (((device->state.conn < C_CONNECTED ||
 			       device->state.pdsk <= D_FAILED)
@@ -690,7 +688,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
 	}
 
 	drbd_md_sync(device);
-
+	set_disk_ro(device->vdisk, new_role == R_SECONDARY);
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 out:
 	mutex_unlock(device->state_mutex);
-- 
cgit v1.2.3


From 2ed912e9d35b4d7d766a530f5dd14513a3ab9f64 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Feb 2014 08:56:53 +0100
Subject: drbd: explicitly submit meta data requests with REQ_NOIDLE

For some reason we have assumed NOIDLE was implied
by one of the other flags we set. It is not (anymore?).
Explicitly set REQ_NOIDLE for synchronous meta data updates,
or we can seriously starve random writes when using CFQ.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 278c31f24639..14b2f27291aa 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -150,7 +150,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
-	rw |= REQ_SYNC;
+	rw |= REQ_SYNC | REQ_NOIDLE;
 
 	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
-- 
cgit v1.2.3


From ba3c6fb87d2df008eed8faaf01bb198e512fa72f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Feb 2014 08:57:18 +0100
Subject: drbd: close race when detaching from disk

BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
IP: bd_release+0x21/0x70
Process drbd_w_t7146
Call Trace:
 close_bdev_exclusive
 drbd_free_ldev		[drbd]
 drbd_ldev_destroy	[drbd]
 w_after_state_ch	[drbd]

Race probably went like this:
  state.disk = D_FAILED

... first one to hit zero during D_FAILED:
   put_ldev() /* ----------------> 0 */
     i = atomic_dec_return()
     if (i == 0)
       if (state.disk == D_FAILED)
         schedule_work(go_diskless)
                                /* 1 <------ */ get_ldev_if_state()
   go_diskless()
      do_some_pre_cleanup()                     corresponding put_ldev():
      force_state(D_DISKLESS)   /* 0 <------ */ i = atomic_dec_return()
                                                if (i == 0)
        atomic_inc() /* ---------> 1 */
        state.disk = D_DISKLESS
        schedule_work(after_state_ch)           /* execution pre-empted by IRQ ? */

   after_state_ch()
     put_ldev()
       i = atomic_dec_return()  /* 0 */
       if (i == 0)
         if (state.disk == D_DISKLESS)            if (state.disk == D_DISKLESS)
           drbd_ldev_destroy()                      drbd_ldev_destroy();

Trying to fix this by checking the disk state *before* the
atomic_dec_return(), which implies memory barriers, and by inserting
extra memory barriers around the state assignment in __drbd_set_state().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h   |  9 +++++++--
 drivers/block/drbd/drbd_state.c | 11 +++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a16f9ae3c98a..a0ffc19ccf0e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1946,6 +1946,11 @@ static inline bool is_sync_state(enum drbd_conns connection_state)
 
 static inline void put_ldev(struct drbd_device *device)
 {
+	enum drbd_disk_state ds = device->state.disk;
+	/* We must check the state *before* the atomic_dec becomes visible,
+	 * or we have a theoretical race where someone hitting zero,
+	 * while state still D_FAILED, will then see D_DISKLESS in the
+	 * condition below and calling into destroy, where he must not, yet. */
 	int i = atomic_dec_return(&device->local_cnt);
 
 	/* This may be called from some endio handler,
@@ -1954,10 +1959,10 @@ static inline void put_ldev(struct drbd_device *device)
 	__release(local);
 	D_ASSERT(device, i >= 0);
 	if (i == 0) {
-		if (device->state.disk == D_DISKLESS)
+		if (ds == D_DISKLESS)
 			/* even internal references gone, safe to destroy */
 			drbd_ldev_destroy(device);
-		if (device->state.disk == D_FAILED) {
+		if (ds == D_FAILED) {
 			/* all application IO references gone. */
 			if (!test_and_set_bit(GO_DISKLESS, &device->flags))
 				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 1bddd6cf8ac7..6629f4668102 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -958,7 +958,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	enum drbd_state_rv rv = SS_SUCCESS;
 	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
-	bool did_remote, should_do_remote;
 
 	os = drbd_read_state(device);
 
@@ -1010,18 +1009,22 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
 		atomic_inc(&device->local_cnt);
 
-	did_remote = drbd_should_do_remote(device->state);
 	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
 		clear_bit(RS_DONE, &device->flags);
 
+	/* changes to local_cnt and device flags should be visible before
+	 * changes to state, which again should be visible before anything else
+	 * depending on that change happens. */
+	smp_wmb();
 	device->state.i = ns.i;
-	should_do_remote = drbd_should_do_remote(device->state);
 	device->resource->susp = ns.susp;
 	device->resource->susp_nod = ns.susp_nod;
 	device->resource->susp_fen = ns.susp_fen;
+	smp_wmb();
 
 	/* put replicated vs not-replicated requests in seperate epochs */
-	if (did_remote != should_do_remote)
+	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+	    drbd_should_do_remote((union drbd_dev_state)ns.i))
 		start_new_tl_epoch(connection);
 
 	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
-- 
cgit v1.2.3


From e334f55095b908f12c8bad991433f5d609e919d1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Feb 2014 09:30:49 +0100
Subject: drbd: make sure disk cleanup happens in worker context

The recent fix to put_ldev() (correct ordering of access to local_cnt
and state.disk; memory barrier in __drbd_set_state) guarantees
that the cleanup happens exactly once.

However it does not yet guarantee that the cleanup happens from worker
context, the last put_ldev() may still happen from atomic context,
which must not happen: blkdev_put() may sleep.

Fix this by scheduling the cleanup to the worker instead,
using a couple more bits in device->flags and a new helper,
drbd_device_post_work().

Generalized the "resync progress" work to cover these new work bits.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c |   9 +---
 drivers/block/drbd/drbd_int.h    |  40 +++++++++++-----
 drivers/block/drbd/drbd_main.c   |  59 -----------------------
 drivers/block/drbd/drbd_nl.c     |   2 +-
 drivers/block/drbd/drbd_worker.c | 100 +++++++++++++++++++++++++++++++++++----
 5 files changed, 120 insertions(+), 90 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 14b2f27291aa..5add2016216c 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -797,20 +797,13 @@ static bool lazy_bitmap_update_due(struct drbd_device *device)
 
 static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
 {
-	struct drbd_connection *connection;
 	if (rs_done)
 		set_bit(RS_DONE, &device->flags);
 		/* and also set RS_PROGRESS below */
 	else if (!lazy_bitmap_update_due(device))
 		return;
 
-	/* compare with test_and_clear_bit() calls in and above
-	 * try_update_all_on_disk_bitmaps() from the drbd_worker(). */
-	if (test_and_set_bit(RS_PROGRESS, &device->flags))
-		return;
-	connection = first_peer_device(device)->connection;
-	if (!test_and_set_bit(CONN_RS_PROGRESS, &connection->flags))
-		wake_up(&connection->sender_work.q_wait);
+	drbd_device_post_work(device, RS_PROGRESS);
 }
 
 static int update_sync_bits(struct drbd_device *device,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a0ffc19ccf0e..5768260feef6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -432,16 +432,12 @@ enum {
 				 * goes into C_CONNECTED state. */
 	CONSIDER_RESYNC,
 
-	RS_PROGRESS,		/* tell worker that resync made significant progress */
-	RS_DONE,		/* tell worker that resync is done */
-
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
 
 	SUSPEND_IO,		/* suspend application io */
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
-	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
 	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
 	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
 	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
@@ -454,6 +450,15 @@ enum {
 	B_RS_H_DONE,		/* Before resync handler done (already executed) */
 	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
 	READ_BALANCE_RR,
+
+	/* cleared only after backing device related structures have been destroyed. */
+	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */
+
+	/* to be used in drbd_device_post_work() */
+	GO_DISKLESS,		/* tell worker to schedule cleanup before detach */
+	DESTROY_DISK,		/* tell worker to close backing devices and destroy related structures. */
+	RS_PROGRESS,		/* tell worker that resync made significant progress */
+	RS_DONE,		/* tell worker that resync is done */
 };
 
 struct drbd_bitmap; /* opaque for drbd_device */
@@ -581,7 +586,8 @@ enum {
 				 * and potentially deadlock on, this drbd worker.
 				 */
 	DISCONNECT_SENT,
-	CONN_RS_PROGRESS,	/* tell worker that resync made significant progress */
+
+	DEVICE_WORK_PENDING,	/* tell worker that some device has pending work */
 };
 
 struct drbd_resource {
@@ -703,7 +709,6 @@ struct drbd_device {
 	unsigned long last_reattach_jif;
 	struct drbd_work resync_work;
 	struct drbd_work unplug_work;
-	struct drbd_work go_diskless;
 	struct drbd_work md_sync_work;
 	struct drbd_work start_resync_work;
 	struct timer_list resync_timer;
@@ -991,7 +996,6 @@ extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
 		char *why, enum bm_flag flags);
 extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
 extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
-extern void drbd_ldev_destroy(struct drbd_device *device);
 
 /* Meta data layout
  *
@@ -1796,6 +1800,18 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 	wake_up(&q->q_wait);
 }
 
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+	if (!test_and_set_bit(work_bit, &device->flags)) {
+		struct drbd_connection *connection =
+			first_peer_device(device)->connection;
+		struct drbd_work_queue *q = &connection->sender_work;
+		if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+			wake_up(&q->q_wait);
+	}
+}
+
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
 static inline void wake_asender(struct drbd_connection *connection)
@@ -1961,13 +1977,11 @@ static inline void put_ldev(struct drbd_device *device)
 	if (i == 0) {
 		if (ds == D_DISKLESS)
 			/* even internal references gone, safe to destroy */
-			drbd_ldev_destroy(device);
-		if (ds == D_FAILED) {
+			drbd_device_post_work(device, DESTROY_DISK);
+		if (ds == D_FAILED)
 			/* all application IO references gone. */
-			if (!test_and_set_bit(GO_DISKLESS, &device->flags))
-				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-						&device->go_diskless);
-		}
+			if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+				drbd_device_post_work(device, GO_DISKLESS);
 		wake_up(&device->misc_wait);
 	}
 }
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 77d34a61bb58..0cf609464ecf 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -63,7 +63,6 @@ static void drbd_release(struct gendisk *gd, fmode_t mode);
 static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_work *w, int unused);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -1929,14 +1928,12 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	INIT_LIST_HEAD(&device->resync_reads);
 	INIT_LIST_HEAD(&device->resync_work.list);
 	INIT_LIST_HEAD(&device->unplug_work.list);
-	INIT_LIST_HEAD(&device->go_diskless.list);
 	INIT_LIST_HEAD(&device->md_sync_work.list);
 	INIT_LIST_HEAD(&device->start_resync_work.list);
 	INIT_LIST_HEAD(&device->bm_io_work.w.list);
 
 	device->resync_work.cb  = w_resync_timer;
 	device->unplug_work.cb  = w_send_write_hint;
-	device->go_diskless.cb  = w_go_diskless;
 	device->md_sync_work.cb = w_md_sync;
 	device->bm_io_work.w.cb = w_bitmap_io;
 	device->start_resync_work.cb = w_start_resync;
@@ -2011,7 +2008,6 @@ void drbd_device_cleanup(struct drbd_device *device)
 	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
 	D_ASSERT(device, list_empty(&device->resync_work.list));
 	D_ASSERT(device, list_empty(&device->unplug_work.list));
-	D_ASSERT(device, list_empty(&device->go_diskless.list));
 
 	drbd_set_defaults(device);
 }
@@ -3531,61 +3527,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
 	return 0;
 }
 
-void drbd_ldev_destroy(struct drbd_device *device)
-{
-	lc_destroy(device->resync);
-	device->resync = NULL;
-	lc_destroy(device->act_log);
-	device->act_log = NULL;
-	__no_warn(local,
-		drbd_free_ldev(device->ldev);
-		device->ldev = NULL;);
-
-	clear_bit(GO_DISKLESS, &device->flags);
-}
-
-static int w_go_diskless(struct drbd_work *w, int unused)
-{
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, go_diskless);
-
-	D_ASSERT(device, device->state.disk == D_FAILED);
-	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
-	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
-	 * the protected members anymore, though, so once put_ldev reaches zero
-	 * again, it will be safe to free them. */
-
-	/* Try to write changed bitmap pages, read errors may have just
-	 * set some bits outside the area covered by the activity log.
-	 *
-	 * If we have an IO error during the bitmap writeout,
-	 * we will want a full sync next time, just in case.
-	 * (Do we want a specific meta data flag for this?)
-	 *
-	 * If that does not make it to stable storage either,
-	 * we cannot do anything about that anymore.
-	 *
-	 * We still need to check if both bitmap and ldev are present, we may
-	 * end up here after a failed attach, before ldev was even assigned.
-	 */
-	if (device->bitmap && device->ldev) {
-		/* An interrupted resync or similar is allowed to recounts bits
-		 * while we detach.
-		 * Any modifications would not be expected anymore, though.
-		 */
-		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
-					"detach", BM_LOCKED_TEST_ALLOWED)) {
-			if (test_bit(WAS_READ_ERROR, &device->flags)) {
-				drbd_md_set_flag(device, MDF_FULL_SYNC);
-				drbd_md_sync(device);
-			}
-		}
-	}
-
-	drbd_force_state(device, NS(disk, D_DISKLESS));
-	return 0;
-}
-
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @device:	DRBD device.
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index ceebd31eddb9..628167db673b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1482,7 +1482,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	 * drbd_ldev_destroy is done already, we may end up here very fast,
 	 * e.g. if someone calls attach from the on-io-error handler,
 	 * to realize a "hot spare" feature (not that I'd recommend that) */
-	wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
+	wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
 
 	/* make sure there is no leftover from previous force-detach attempts */
 	clear_bit(FORCE_DETACH, &device->flags);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index bafb62eb22c9..00bf4900a609 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1813,10 +1813,9 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	mutex_unlock(device->state_mutex);
 }
 
-static void update_on_disk_bitmap(struct drbd_device *device)
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
 {
 	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
-	bool resync_done = test_and_clear_bit(RS_DONE, &device->flags);
 	device->rs_last_bcast = jiffies;
 
 	if (!get_ldev(device))
@@ -1832,7 +1831,87 @@ static void update_on_disk_bitmap(struct drbd_device *device)
 	put_ldev(device);
 }
 
-static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+	lc_destroy(device->resync);
+	device->resync = NULL;
+	lc_destroy(device->act_log);
+	device->act_log = NULL;
+	__no_warn(local,
+		drbd_free_ldev(device->ldev);
+		device->ldev = NULL;);
+	clear_bit(GOING_DISKLESS, &device->flags);
+	wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+	D_ASSERT(device, device->state.disk == D_FAILED);
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
+
+	/* Try to write changed bitmap pages, read errors may have just
+	 * set some bits outside the area covered by the activity log.
+	 *
+	 * If we have an IO error during the bitmap writeout,
+	 * we will want a full sync next time, just in case.
+	 * (Do we want a specific meta data flag for this?)
+	 *
+	 * If that does not make it to stable storage either,
+	 * we cannot do anything about that anymore.
+	 *
+	 * We still need to check if both bitmap and ldev are present, we may
+	 * end up here after a failed attach, before ldev was even assigned.
+	 */
+	if (device->bitmap && device->ldev) {
+		/* An interrupted resync or similar is allowed to recounts bits
+		 * while we detach.
+		 * Any modifications would not be expected anymore, though.
+		 */
+		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+					"detach", BM_LOCKED_TEST_ALLOWED)) {
+			if (test_bit(WAS_READ_ERROR, &device->flags)) {
+				drbd_md_set_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			}
+		}
+	}
+
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+#define WORK_PENDING(work_bit, todo)	(todo & (1UL << work_bit))
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+	if (WORK_PENDING(RS_DONE, todo) ||
+	    WORK_PENDING(RS_PROGRESS, todo))
+		update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
+	if (WORK_PENDING(GO_DISKLESS, todo))
+		go_diskless(device);
+	if (WORK_PENDING(DESTROY_DISK, todo))
+		drbd_ldev_destroy(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK	\
+	((1UL << GO_DISKLESS)	\
+	|(1UL << DESTROY_DISK)	\
+	|(1UL << RS_PROGRESS)	\
+	|(1UL << RS_DONE)	\
+	)
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+	unsigned long old, new;
+	do {
+		old = *flags;
+		new = old & ~DRBD_DEVICE_WORK_MASK;
+	} while (cmpxchg(flags, old, new) != old);
+	return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
 {
 	struct drbd_peer_device *peer_device;
 	int vnr;
@@ -1840,12 +1919,13 @@ static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
-		if (!test_and_clear_bit(RS_PROGRESS, &device->flags))
+		unsigned long todo = get_work_bits(&device->flags);
+		if (!todo)
 			continue;
 
 		kref_get(&device->kref);
 		rcu_read_unlock();
-		update_on_disk_bitmap(device);
+		do_device_work(device, todo);
 		kref_put(&device->kref, drbd_destroy_device);
 		rcu_read_lock();
 	}
@@ -1927,7 +2007,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 			maybe_send_barrier(connection,
 					connection->send.current_epoch_nr + 1);
 
-		if (test_bit(CONN_RS_PROGRESS, &connection->flags))
+		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
 			break;
 
 		/* drbd_send() may have called flush_signals() */
@@ -1972,8 +2052,8 @@ int drbd_worker(struct drbd_thread *thi)
 		if (list_empty(&work_list))
 			wait_for_work(connection, &work_list);
 
-		if (test_and_clear_bit(CONN_RS_PROGRESS, &connection->flags))
-			try_update_all_on_disk_bitmaps(connection);
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+			do_unqueued_work(connection);
 
 		if (signal_pending(current)) {
 			flush_signals(current);
@@ -1998,13 +2078,15 @@ int drbd_worker(struct drbd_thread *thi)
 	}
 
 	do {
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+			do_unqueued_work(connection);
 		while (!list_empty(&work_list)) {
 			w = list_first_entry(&work_list, struct drbd_work, list);
 			list_del_init(&w->list);
 			w->cb(w, 1);
 		}
 		dequeue_work_batch(&connection->sender_work, &work_list);
-	} while (!list_empty(&work_list));
+	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
 
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-- 
cgit v1.2.3


From ac0acb9e39ac41575cc6a344d04295436fd4eb4e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Feb 2014 09:47:58 +0100
Subject: drbd: use drbd_device_post_work() in more places

This replaces the md_sync_work member of struct drbd_device
by a new MD_SYNC "work bit" in device->flags.

This replaces the resync_start_work member of struct drbd_device
by a new RS_START "work bit" in device->flags.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h    | 16 ++--------------
 drivers/block/drbd/drbd_main.c   | 25 +------------------------
 drivers/block/drbd/drbd_worker.c | 27 +++++++++++++++++----------
 3 files changed, 20 insertions(+), 48 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 5768260feef6..3c701b0e2956 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -457,6 +457,8 @@ enum {
 	/* to be used in drbd_device_post_work() */
 	GO_DISKLESS,		/* tell worker to schedule cleanup before detach */
 	DESTROY_DISK,		/* tell worker to close backing devices and destroy related structures. */
+	MD_SYNC,		/* tell worker to call drbd_md_sync() */
+	RS_START,		/* tell worker to start resync/OV */
 	RS_PROGRESS,		/* tell worker that resync made significant progress */
 	RS_DONE,		/* tell worker that resync is done */
 };
@@ -709,18 +711,10 @@ struct drbd_device {
 	unsigned long last_reattach_jif;
 	struct drbd_work resync_work;
 	struct drbd_work unplug_work;
-	struct drbd_work md_sync_work;
-	struct drbd_work start_resync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
 	struct timer_list start_resync_timer;
 	struct timer_list request_timer;
-#ifdef DRBD_DEBUG_MD_SYNC
-	struct {
-		unsigned int line;
-		const char* func;
-	} last_md_mark_dirty;
-#endif
 
 	/* Used after attach while negotiating new disk state. */
 	union drbd_state new_state_tmp;
@@ -977,13 +971,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must
 extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
 extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
-#ifndef DRBD_DEBUG_MD_SYNC
 extern void drbd_md_mark_dirty(struct drbd_device *device);
-#else
-#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__ , __func__ )
-extern void drbd_md_mark_dirty_(struct drbd_device *device,
-		unsigned int line, const char *func);
-#endif
 extern void drbd_queue_bitmap_io(struct drbd_device *device,
 				 int (*io_fn)(struct drbd_device *),
 				 void (*done)(struct drbd_device *, int),
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0cf609464ecf..ed35d52b4763 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -60,7 +60,6 @@
 static DEFINE_MUTEX(drbd_main_mutex);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
 static void drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
 
@@ -1928,15 +1927,11 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	INIT_LIST_HEAD(&device->resync_reads);
 	INIT_LIST_HEAD(&device->resync_work.list);
 	INIT_LIST_HEAD(&device->unplug_work.list);
-	INIT_LIST_HEAD(&device->md_sync_work.list);
-	INIT_LIST_HEAD(&device->start_resync_work.list);
 	INIT_LIST_HEAD(&device->bm_io_work.w.list);
 
 	device->resync_work.cb  = w_resync_timer;
 	device->unplug_work.cb  = w_send_write_hint;
-	device->md_sync_work.cb = w_md_sync;
 	device->bm_io_work.w.cb = w_bitmap_io;
-	device->start_resync_work.cb = w_start_resync;
 
 	init_timer(&device->resync_timer);
 	init_timer(&device->md_sync_timer);
@@ -3623,25 +3618,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
 static void md_sync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
-
-	/* must not double-queue! */
-	if (list_empty(&device->md_sync_work.list))
-		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-				      &device->md_sync_work);
-}
-
-static int w_md_sync(struct drbd_work *w, int unused)
-{
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, md_sync_work);
-
-	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
-#ifdef DEBUG
-	drbd_warn(device, "last md_mark_dirty: %s:%u\n",
-		device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
-#endif
-	drbd_md_sync(device);
-	return 0;
+	drbd_device_post_work(device, MD_SYNC);
 }
 
 const char *cmdname(enum drbd_packet cmd)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 00bf4900a609..a4310fd99ffc 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1606,26 +1606,20 @@ void drbd_rs_controller_reset(struct drbd_device *device)
 void start_resync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
-
-	drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-			&device->start_resync_work);
+	drbd_device_post_work(device, RS_START);
 }
 
-int w_start_resync(struct drbd_work *w, int cancel)
+static void do_start_resync(struct drbd_device *device)
 {
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, start_resync_work);
-
 	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
-		drbd_warn(device, "w_start_resync later...\n");
+		drbd_warn(device, "postponing start_resync ...\n");
 		device->start_resync_timer.expires = jiffies + HZ/10;
 		add_timer(&device->start_resync_timer);
-		return 0;
+		return;
 	}
 
 	drbd_start_resync(device, C_SYNC_SOURCE);
 	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
-	return 0;
 }
 
 /**
@@ -1882,9 +1876,18 @@ static void go_diskless(struct drbd_device *device)
 	drbd_force_state(device, NS(disk, D_DISKLESS));
 }
 
+static int do_md_sync(struct drbd_device *device)
+{
+	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(device);
+	return 0;
+}
+
 #define WORK_PENDING(work_bit, todo)	(todo & (1UL << work_bit))
 static void do_device_work(struct drbd_device *device, const unsigned long todo)
 {
+	if (WORK_PENDING(MD_SYNC, todo))
+		do_md_sync(device);
 	if (WORK_PENDING(RS_DONE, todo) ||
 	    WORK_PENDING(RS_PROGRESS, todo))
 		update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
@@ -1892,11 +1895,15 @@ static void do_device_work(struct drbd_device *device, const unsigned long todo)
 		go_diskless(device);
 	if (WORK_PENDING(DESTROY_DISK, todo))
 		drbd_ldev_destroy(device);
+	if (WORK_PENDING(RS_START, todo))
+		do_start_resync(device);
 }
 
 #define DRBD_DEVICE_WORK_MASK	\
 	((1UL << GO_DISKLESS)	\
 	|(1UL << DESTROY_DISK)	\
+	|(1UL << MD_SYNC)	\
+	|(1UL << RS_START)	\
 	|(1UL << RS_PROGRESS)	\
 	|(1UL << RS_DONE)	\
 	)
-- 
cgit v1.2.3


From 4dd726f02928ded116f6c9aaf6392a400ef0d9f7 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Feb 2014 11:15:36 +0100
Subject: drbd: get rid of drbd_queue_work_front

The last user was al_write_transaction, if called with "delegate",
and the last user to call it with "delegate = true" was the receiver
thread, which has no need to delegate, but can call it himself.

Finally drop the delegate parameter, drop the extra
w_al_write_transaction callback, and drop drbd_queue_work_front.

Do not (yet) change dequeue_work_item to dequeue_work_batch, though.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c   | 69 ++++----------------------------------
 drivers/block/drbd/drbd_int.h      | 14 ++------
 drivers/block/drbd/drbd_receiver.c |  2 +-
 drivers/block/drbd/drbd_req.c      |  2 +-
 drivers/block/drbd/drbd_worker.c   |  6 ++--
 5 files changed, 12 insertions(+), 81 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 5add2016216c..5f82a36f799d 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,14 +92,6 @@ struct __packed al_transaction_on_disk {
 	__be32	context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_al_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	struct completion event;
-	int err;
-};
-
-
 void *drbd_md_get_buffer(struct drbd_device *device)
 {
 	int r;
@@ -291,26 +283,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
 	return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device, bool delegate);
+static int al_write_transaction(struct drbd_device *device);
 
-/* When called through generic_make_request(), we must delegate
- * activity log I/O to the worker thread: a further request
- * submitted via generic_make_request() within the same task
- * would be queued on current->bio_list, and would only start
- * after this function returns (see generic_make_request()).
- *
- * However, if we *are* the worker, we must not delegate to ourselves.
- */
-
-/*
- * @delegate:   delegate activity log I/O to the worker thread
- */
-void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+void drbd_al_begin_io_commit(struct drbd_device *device)
 {
 	bool locked = false;
 
-	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
 	/* Serialize multiple transactions.
 	 * This uses test_and_set_bit, memory barrier is implicit.
 	 */
@@ -329,7 +307,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
 			rcu_read_unlock();
 
 			if (write_al_updates)
-				al_write_transaction(device, delegate);
+				al_write_transaction(device);
 			spin_lock_irq(&device->al_lock);
 			/* FIXME
 			if (err)
@@ -346,12 +324,10 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
 /*
  * @delegate:   delegate activity log I/O to the worker thread
  */
-void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
 {
-	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
 	if (drbd_al_begin_io_prepare(device, i))
-		drbd_al_begin_io_commit(device, delegate);
+		drbd_al_begin_io_commit(device);
 }
 
 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
@@ -464,8 +440,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
 }
 
-static int
-_al_write_transaction(struct drbd_device *device)
+int al_write_transaction(struct drbd_device *device)
 {
 	struct al_transaction_on_disk *buffer;
 	struct lc_element *e;
@@ -575,38 +550,6 @@ _al_write_transaction(struct drbd_device *device)
 	return err;
 }
 
-
-static int w_al_write_transaction(struct drbd_work *w, int unused)
-{
-	struct update_al_work *aw = container_of(w, struct update_al_work, w);
-	struct drbd_device *device = aw->device;
-	int err;
-
-	err = _al_write_transaction(device);
-	aw->err = err;
-	complete(&aw->event);
-
-	return err != -EIO ? err : 0;
-}
-
-/* Calls from worker context (see w_restart_disk_io()) need to write the
-   transaction directly. Others came through generic_make_request(),
-   those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_device *device, bool delegate)
-{
-	if (delegate) {
-		struct update_al_work al_work;
-		init_completion(&al_work.event);
-		al_work.w.cb = w_al_write_transaction;
-		al_work.device = device;
-		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-				      &al_work.w);
-		wait_for_completion(&al_work.event);
-		return al_work.err;
-	} else
-		return _al_write_transaction(device);
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
 	int rv;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3c701b0e2956..72b500176083 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1491,9 +1491,9 @@ extern const char *drbd_role_str(enum drbd_role s);
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
@@ -1768,16 +1768,6 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
 	return MD_128MB_SECT * bdev->md.meta_dev_idx;
 }
 
-static inline void
-drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&q->q_lock, flags);
-	list_add(&w->list, &q->q);
-	spin_unlock_irqrestore(&q->q_lock, flags);
-	wake_up(&q->q_wait);
-}
-
 static inline void
 drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8d8a6b7f8f5d..cfc3d43ca43f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2351,7 +2351,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
 		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
 		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-		drbd_al_begin_io(device, &peer_req->i, true);
+		drbd_al_begin_io(device, &peer_req->i);
 	}
 
 	err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 042bbc689f5e..1ee735590b61 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1286,7 +1286,7 @@ skip_fast_path:
 			if (!made_progress)
 				break;
 		}
-		drbd_al_begin_io_commit(device, false);
+		drbd_al_begin_io_commit(device);
 
 		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
 			list_del_init(&req->tl_requests);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index a4310fd99ffc..2ff5fd49a3b1 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1438,7 +1438,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
 	struct drbd_device *device = req->device;
 
 	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
-		drbd_al_begin_io(device, &req->i, false);
+		drbd_al_begin_io(device, &req->i);
 
 	drbd_req_make_private_bio(req, req->master_bio);
 	req->private_bio->bi_bdev = device->ldev->backing_bdev;
@@ -1991,7 +1991,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		/* dequeue single item only,
 		 * we still use drbd_queue_work_front() in some places */
 		if (!list_empty(&connection->sender_work.q))
-			list_move(connection->sender_work.q.next, work_list);
+			list_splice_tail_init(&connection->sender_work.q, work_list);
 		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
 		if (!list_empty(work_list) || signal_pending(current)) {
 			spin_unlock_irq(&connection->resource->req_lock);
@@ -2054,8 +2054,6 @@ int drbd_worker(struct drbd_thread *thi)
 	while (get_t_state(thi) == RUNNING) {
 		drbd_thread_current_set_cpu(thi);
 
-		/* as long as we use drbd_queue_work_front(),
-		 * we may only dequeue single work items here, not batches. */
 		if (list_empty(&work_list))
 			wait_for_work(connection, &work_list);
 
-- 
cgit v1.2.3


From 3e0c78d3466483ee1d6fdb4ce081e00f217db79e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 27 Feb 2014 00:02:21 +0100
Subject: drbd: drop wrong debugging aid

The textual representation of resync extents in /proc/drbd presented
with proc_details >= 3 was wrong, it used bitnumbers as bitmasks.

It was not particularly useful either, and I doubt anyone has even tried
to look at it in the last few years. Drop it.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_proc.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index f11e57308104..46bb8dd7890d 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -188,16 +188,6 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	}
 }
 
-static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
-{
-	struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
-
-	seq_printf(seq, "%5d %s %s\n", bme->rs_left,
-		   bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
-		   bme->flags & BME_LOCKED ? "LOCKED" : "------"
-		   );
-}
-
 static int drbd_seq_show(struct seq_file *seq, void *v)
 {
 	int i, prev_i = -1;
@@ -298,13 +288,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			lc_seq_printf_stats(seq, device->act_log);
 			put_ldev(device);
 		}
-
-		if (proc_details >= 2) {
-			if (device->resync) {
-				lc_seq_dump_details(seq, device->resync, "rs_left",
-					resync_dump_detail);
-			}
-		}
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 8ce953aa39e2bfd66036a27abdf761c2cb93f02c Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 27 Feb 2014 09:46:18 +0100
Subject: drbd: silence -Wmissing-prototypes warnings

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c     | 2 +-
 drivers/block/drbd/drbd_nl.c       | 2 +-
 drivers/block/drbd/drbd_receiver.c | 3 +--
 drivers/block/drbd/drbd_req.h      | 1 +
 drivers/block/drbd/drbd_state.c    | 6 +++---
 lib/lru_cache.c                    | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ed35d52b4763..68ec774d52ea 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2877,7 +2877,7 @@ void drbd_delete_device(struct drbd_device *device)
 	kref_sub(&device->kref, refs, drbd_destroy_device);
 }
 
-int __init drbd_init(void)
+static int __init drbd_init(void)
 {
 	int err;
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 628167db673b..81949d65fad0 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2913,7 +2913,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc
 	return list_first_entry(&resource->connections, struct drbd_connection, connections);
 }
 
-int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 		const struct sib_info *sib)
 {
 	struct drbd_resource *resource = device->resource;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cfc3d43ca43f..2fcc3af03bd8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3371,8 +3371,7 @@ disconnect:
  * return: NULL (alg name was "")
  *         ERR_PTR(error) if something goes wrong
  *         or the crypto hash ptr, if it worked out ok. */
-static
-struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
 		const char *alg, const char *name)
 {
 	struct crypto_hash *tfm;
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 8566cd5866b4..9f6a04080e9f 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device,
 extern void request_timer_fn(unsigned long data);
 extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
 extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
 
 /* this is in drbd_main.c */
 extern void drbd_restart_request(struct drbd_request *req);
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 6629f4668102..a7772e2aba7e 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
 	return rv;
 }
 
-static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
 {
 	drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
 	    name,
@@ -1606,7 +1606,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 	return 0;
 }
 
-void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
 {
 	enum chg_state_flags flags = ~0;
 	struct drbd_peer_device *peer_device;
@@ -1695,7 +1695,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
 	return rv;
 }
 
-void
+static void
 conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
 	       union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
 {
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 4a83ecd03650..6111cd19762d 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -169,7 +169,7 @@ out_fail:
 	return NULL;
 }
 
-void lc_free_by_index(struct lru_cache *lc, unsigned i)
+static void lc_free_by_index(struct lru_cache *lc, unsigned i)
 {
 	void *p = lc->lc_element[i];
 	WARN_ON(!p);
-- 
cgit v1.2.3


From caa3db0e14cc301f07e758f4cadc36d4dead145a Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@linbit.com>
Date: Thu, 27 Feb 2014 20:49:54 +0100
Subject: drbd: Remove unnecessary/unused code

Get rid of dump_stack() debug statements.

There is no point whatsoever in registering and unregistering a reboot
notifier that doesn't do anything.

The intention was to switch to an "emergency read-only" mode,
so we won't have to resync the full activity log just because
we had been Primary before the reboot.

Once we have that implemented, we may re-introduce the reboot notifier.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  |  1 -
 drivers/block/drbd/drbd_main.c | 18 ------------------
 drivers/block/drbd/drbd_proc.c |  2 +-
 3 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 72b500176083..e306a22a60f1 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1468,7 +1468,6 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
 				"bio->bi_bdev == NULL\n",
 		       device_to_minor(device));
-		dump_stack();
 		bio_endio(bio, -ENODEV);
 		return;
 	}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 68ec774d52ea..7c060243ae46 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2125,20 +2125,6 @@ Enomem:
 	return -ENOMEM;
 }
 
-static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
-	void *unused)
-{
-	/* just so we have it.  you never know what interesting things we
-	 * might want to do here some day...
-	 */
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block drbd_notifier = {
-	.notifier_call = drbd_notify_sys,
-};
-
 static void drbd_release_all_peer_reqs(struct drbd_device *device)
 {
 	int rr;
@@ -2314,8 +2300,6 @@ static void drbd_cleanup(void)
 	struct drbd_device *device;
 	struct drbd_resource *resource, *tmp;
 
-	unregister_reboot_notifier(&drbd_notifier);
-
 	/* first remove proc,
 	 * drbdsetup uses it's presence to detect
 	 * whether DRBD is loaded.
@@ -2899,8 +2883,6 @@ static int __init drbd_init(void)
 		return err;
 	}
 
-	register_reboot_notifier(&drbd_notifier);
-
 	/*
 	 * allocate all necessary structs
 	 */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 46bb8dd7890d..886f6bef70dc 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -299,7 +299,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file)
 	int err;
 
 	if (try_module_get(THIS_MODULE)) {
-		err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+		err = single_open(file, drbd_seq_show, NULL);
 		if (err)
 			module_put(THIS_MODULE);
 		return err;
-- 
cgit v1.2.3


From a5655dac75b6c572e1ef430b61ad55245fffd523 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 11 Mar 2014 13:47:55 +0100
Subject: drbd: fix bogus resync stats in /proc/drbd

We intentionally do not serialize /proc/drbd access with
internal state changes or statistic updates.

Because of that, cat /proc/drbd  may race with resync just being
finished, still see the sync state, and find information about
number of blocks still to go, but then find the total number
of blocks within this resync has just been reset to 0
when accessing it.

This now produces bogus numbers in the resync speed estimates.

Fix by accessing all relevant data only once,
and fixing it up if "still to go" happens to be more than "total".

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  |  48 -------------------
 drivers/block/drbd/drbd_proc.c | 103 ++++++++++++++++++++++++++++++-----------
 2 files changed, 75 insertions(+), 76 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e306a22a60f1..abf5aefd9790 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1982,54 +1982,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_
 extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
 #endif
 
-/* you must have an "get_ldev" reference */
-static inline void drbd_get_syncer_progress(struct drbd_device *device,
-		unsigned long *bits_left, unsigned int *per_mil_done)
-{
-	/* this is to break it at compile time when we change that, in case we
-	 * want to support more than (1<<32) bits on a 32bit arch. */
-	typecheck(unsigned long, device->rs_total);
-
-	/* note: both rs_total and rs_left are in bits, i.e. in
-	 * units of BM_BLOCK_SIZE.
-	 * for the percentage, we don't care. */
-
-	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
-		*bits_left = device->ov_left;
-	else
-		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
-	/* >> 10 to prevent overflow,
-	 * +1 to prevent division by zero */
-	if (*bits_left > device->rs_total) {
-		/* doh. maybe a logic bug somewhere.
-		 * may also be just a race condition
-		 * between this and a disconnect during sync.
-		 * for now, just prevent in-kernel buffer overflow.
-		 */
-		smp_rmb();
-		drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
-				drbd_conn_str(device->state.conn),
-				*bits_left, device->rs_total, device->rs_failed);
-		*per_mil_done = 0;
-	} else {
-		/* Make sure the division happens in long context.
-		 * We allow up to one petabyte storage right now,
-		 * at a granularity of 4k per bit that is 2**38 bits.
-		 * After shift right and multiplication by 1000,
-		 * this should still fit easily into a 32bit long,
-		 * so we don't need a 64bit division on 32bit arch.
-		 * Note: currently we don't support such large bitmaps on 32bit
-		 * arch anyways, but no harm done to be prepared for it here.
-		 */
-		unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
-		unsigned long left = *bits_left >> shift;
-		unsigned long total = 1UL + (device->rs_total >> shift);
-		unsigned long tmp = 1000UL - left * 1000UL/total;
-		*per_mil_done = tmp;
-	}
-}
-
-
 /* this throttles on-the-fly application requests
  * according to max_buffers settings;
  * maybe re-implement using semaphores? */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 886f6bef70dc..9059d7bf8a36 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 		seq_printf(seq, "%ld", v);
 }
 
+static void drbd_get_syncer_progress(struct drbd_device *device,
+		union drbd_dev_state state, unsigned long *rs_total,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
+	typecheck(unsigned long, device->rs_total);
+	*rs_total = device->rs_total;
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+		*bits_left = device->ov_left;
+	else
+		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > *rs_total) {
+		/* D'oh. Maybe a logic bug somewhere.  More likely just a race
+		 * between state change and reset of rs_total.
+		 */
+		*bits_left = *rs_total;
+		*per_mil_done = *rs_total ? 0 : 1000;
+	} else {
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (*rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;
+		*per_mil_done = tmp;
+	}
+}
+
+
 /*lge
  * progress bars shamelessly adapted from driver/md/md.c
  * output looks like
  *	[=====>..............] 33.5% (23456/123456)
  *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
  */
-static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+		union drbd_dev_state state)
 {
-	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned long db, dt, dbdt, rt, rs_total, rs_left;
 	unsigned int res;
 	int i, x, y;
 	int stalled = 0;
 
-	drbd_get_syncer_progress(device, &rs_left, &res);
+	drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
 
 	x = res/50;
 	y = 20-x;
@@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 		seq_printf(seq, ".");
 	seq_printf(seq, "] ");
 
-	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
 		seq_printf(seq, "verified:");
 	else
 		seq_printf(seq, "sync'ed:");
 	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
 
 	/* if more than a few GB, display in MB */
-	if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+	if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
 		seq_printf(seq, "(%lu/%lu)M",
 			    (unsigned long) Bit2KB(rs_left >> 10),
-			    (unsigned long) Bit2KB(device->rs_total >> 10));
+			    (unsigned long) Bit2KB(rs_total >> 10));
 	else
 		seq_printf(seq, "(%lu/%lu)K\n\t",
 			    (unsigned long) Bit2KB(rs_left),
-			    (unsigned long) Bit2KB(device->rs_total));
+			    (unsigned long) Bit2KB(rs_total));
 
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
@@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
 	if (dt == 0)
 		dt = 1;
-	db = device->rs_total - rs_left;
+	db = rs_total - rs_left;
 	dbdt = Bit2KB(db/dt);
 	seq_printf_with_thousands_grouping(seq, dbdt);
 	seq_printf(seq, ")");
 
-	if (device->state.conn == C_SYNC_TARGET ||
-	    device->state.conn == C_VERIFY_S) {
+	if (state.conn == C_SYNC_TARGET ||
+	    state.conn == C_VERIFY_S) {
 		seq_printf(seq, " want: ");
 		seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
 	}
@@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 		unsigned long bm_bits = drbd_bm_bits(device);
 		unsigned long bit_pos;
 		unsigned long long stop_sector = 0;
-		if (device->state.conn == C_VERIFY_S ||
-		    device->state.conn == C_VERIFY_T) {
+		if (state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T) {
 			bit_pos = bm_bits - device->ov_left;
 			if (verify_can_do_stop_sector(device))
 				stop_sector = device->ov_stop_sector;
@@ -194,6 +239,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 	const char *sn;
 	struct drbd_device *device;
 	struct net_conf *nc;
+	union drbd_dev_state state;
 	char wp;
 
 	static char write_ordering_chars[] = {
@@ -231,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, "\n");
 		prev_i = i;
 
-		sn = drbd_conn_str(device->state.conn);
+		state = device->state;
+		sn = drbd_conn_str(state.conn);
 
-		if (device->state.conn == C_STANDALONE &&
-		    device->state.disk == D_DISKLESS &&
-		    device->state.role == R_SECONDARY) {
+		if (state.conn == C_STANDALONE &&
+		    state.disk == D_DISKLESS &&
+		    state.role == R_SECONDARY) {
 			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
 			/* reset device->congestion_reason */
@@ -248,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
 			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
 			   i, sn,
-			   drbd_role_str(device->state.role),
-			   drbd_role_str(device->state.peer),
-			   drbd_disk_str(device->state.disk),
-			   drbd_disk_str(device->state.pdsk),
+			   drbd_role_str(state.role),
+			   drbd_role_str(state.peer),
+			   drbd_disk_str(state.disk),
+			   drbd_disk_str(state.pdsk),
 			   wp,
 			   drbd_suspended(device) ? 's' : 'r',
-			   device->state.aftr_isp ? 'a' : '-',
-			   device->state.peer_isp ? 'p' : '-',
-			   device->state.user_isp ? 'u' : '-',
+			   state.aftr_isp ? 'a' : '-',
+			   state.peer_isp ? 'p' : '-',
+			   state.user_isp ? 'u' : '-',
 			   device->congestion_reason ?: '-',
 			   test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
 			   device->send_cnt/2,
@@ -277,11 +324,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 				   Bit2KB((unsigned long long)
 					   drbd_bm_total_weight(device)));
 		}
-		if (device->state.conn == C_SYNC_SOURCE ||
-		    device->state.conn == C_SYNC_TARGET ||
-		    device->state.conn == C_VERIFY_S ||
-		    device->state.conn == C_VERIFY_T)
-			drbd_syncer_progress(device, seq);
+		if (state.conn == C_SYNC_SOURCE ||
+		    state.conn == C_SYNC_TARGET ||
+		    state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T)
+			drbd_syncer_progress(device, seq, state);
 
 		if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
 			lc_seq_printf_stats(seq, device->resync);
-- 
cgit v1.2.3


From 6a8d68b1878e8839b4927150da236c8248703c70 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 18 Mar 2014 12:22:14 +0100
Subject: drbd: don't implicitly resize Diskless node beyond end of device

During handshake, we compare backend sizes, and user set limits,
and agree on what device size we are going to expose.

We remember that last-agreed-size in our meta data.

But if we come up diskless, we have to accept what the peer
presents us with. We used to accept the peers maximum potential
capacity (backend size), which is wrong, and could lead to IO errors
due to access beyond end of device.

Instead, we need to accept the peer's current size.
Unless that is communicated as 0, in which case we
accept the backend size, or the user set limit, if set.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 2fcc3af03bd8..5626c5babc3f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3657,7 +3657,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 	struct drbd_device *device;
 	struct p_sizes *p = pi->data;
 	enum determine_dev_size dd = DS_UNCHANGED;
-	sector_t p_size, p_usize, my_usize;
+	sector_t p_size, p_usize, p_csize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	enum dds_flags ddsf;
 
@@ -3668,6 +3668,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
 	p_size = be64_to_cpu(p->d_size);
 	p_usize = be64_to_cpu(p->u_size);
+	p_csize = be64_to_cpu(p->c_size);
 
 	/* just store the peer's disk size for now.
 	 * we still need to figure out whether we accept that. */
@@ -3742,9 +3743,21 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 			return -EIO;
 		drbd_md_sync(device);
 	} else {
-		/* I am diskless, need to accept the peer's size. */
+		/*
+		 * I am diskless, need to accept the peer's *current* size.
+		 * I must NOT accept the peers backing disk size,
+		 * it may have been larger than mine all along...
+		 *
+		 * At this point, the peer knows more about my disk, or at
+		 * least about what we last agreed upon, than myself.
+		 * So if his c_size is less than his d_size, the most likely
+		 * reason is that *my* d_size was smaller last time we checked.
+		 *
+		 * However, if he sends a zero current size,
+		 * take his (user-capped or) backing disk size anyways.
+		 */
 		drbd_reconsider_max_bio_size(device, NULL);
-		drbd_set_my_capacity(device, p_size);
+		drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
 	}
 
 	if (get_ldev(device)) {
-- 
cgit v1.2.3


From aaaba34576407857f6146ff6c330f06e63fb2bf2 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 18 Mar 2014 12:30:09 +0100
Subject: drbd: implement csums-after-crash-only

Checksum based resync trades CPU cycles for network bandwidth,
in situations where we expect much of the to-be-resynced blocks
to be actually identical on both sides already.

In a "network hickup" scenario, it won't help:
all to-be-resynced blocks will typically be different.

The use case is for the resync of *potentially* different blocks
after crash recovery -- the crash recovery had marked larger areas
(those covered by the activity log) as need-to-be-resynced,
just in case. Most of those blocks will be identical.

This option makes it possible to configure checksum based resync,
but only actually use it for the first resync after primary crash.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  2 ++
 drivers/block/drbd/drbd_receiver.c |  2 ++
 drivers/block/drbd/drbd_worker.c   | 24 ++++++++++++++++++++----
 include/linux/drbd_genl.h          |  3 +++
 include/linux/drbd_limits.h        |  1 +
 5 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index abf5aefd9790..fe6595a96a9a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -738,6 +738,8 @@ struct drbd_device {
 	struct rb_root read_requests;
 	struct rb_root write_requests;
 
+	/* use checksums for *this* resync */
+	bool use_csums;
 	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
 	unsigned long rs_total;
 	/* number of resync blocks that failed in this run */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5626c5babc3f..d326af67c27e 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2555,6 +2555,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 			peer_req->w.cb = w_e_end_csum_rs_req;
 			/* used in the sector offset progress display */
 			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+			/* remember to report stats in drbd_resync_finished */
+			device->use_csums = true;
 		} else if (pi->cmd == P_OV_REPLY) {
 			/* track progress, we may need to throttle */
 			atomic_add(size >> 9, &device->rs_sect_in);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 2ff5fd49a3b1..6532a697cf49 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -698,8 +698,8 @@ next_sector:
 		/* adjust very last sectors, in case we are oddly sized */
 		if (sector + (size>>9) > capacity)
 			size = (capacity-sector)<<9;
-		if (connection->agreed_pro_version >= 89 &&
-		    connection->csums_tfm) {
+
+		if (device->use_csums) {
 			switch (read_for_csum(peer_device, sector, size)) {
 			case -EIO: /* Disk failure */
 				put_ldev(device);
@@ -913,7 +913,7 @@ int drbd_resync_finished(struct drbd_device *device)
 		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
 			khelper_cmd = "after-resync-target";
 
-		if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+		if (device->use_csums && device->rs_total) {
 			const unsigned long s = device->rs_same_csum;
 			const unsigned long t = device->rs_total;
 			const int ratio =
@@ -1622,6 +1622,18 @@ static void do_start_resync(struct drbd_device *device)
 	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
 }
 
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+	bool csums_after_crash_only;
+	rcu_read_lock();
+	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+	rcu_read_unlock();
+	return connection->agreed_pro_version >= 89 &&		/* supported? */
+		connection->csums_tfm &&			/* configured? */
+		(csums_after_crash_only == 0			/* use for each resync? */
+		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
+}
+
 /**
  * drbd_start_resync() - Start the resync process
  * @device:	DRBD device.
@@ -1756,8 +1768,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		     drbd_conn_str(ns.conn),
 		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
 		     (unsigned long) device->rs_total);
-		if (side == C_SYNC_TARGET)
+		if (side == C_SYNC_TARGET) {
 			device->bm_resync_fo = 0;
+			device->use_csums = use_checksum_based_resync(connection, device);
+		} else {
+			device->use_csums = 0;
+		}
 
 		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
 		 * with w_send_oos, or the sync target will get confused as to
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 4193f5f2636c..71fc924c53fa 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -171,6 +171,9 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
 	__flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT,	tentative)
 	__flg_field_def(29,	DRBD_GENLA_F_MANDATORY,	use_rle, DRBD_USE_RLE_DEF)
 	/* 9: __u32_field_def(30,	DRBD_GENLA_F_MANDATORY,	fencing_policy, DRBD_FENCING_DEF) */
+	/* 9: __str_field_def(31,     DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
+	/* 9: __u32_field(32,         DRBD_F_REQUIRED | DRBD_F_INVARIANT,     peer_node_id) */
+	__flg_field_def(33, 0 /* OPTIONAL */,	csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
 )
 
 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 17e50bb00521..9d2df1d51414 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -214,6 +214,7 @@
 #define DRBD_ALLOW_TWO_PRIMARIES_DEF	0
 #define DRBD_ALWAYS_ASBP_DEF	0
 #define DRBD_USE_RLE_DEF	1
+#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0
 
 #define DRBD_AL_STRIPES_MIN     1
 #define DRBD_AL_STRIPES_MAX     1024
-- 
cgit v1.2.3


From 4920e37a9e851172158ecc0fd7e060712dc1b5e8 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 18 Mar 2014 14:40:13 +0100
Subject: drbd: Limit the time we are waiting for the first packet on an
 accepted socket

Before the patch
'drbd: Keep the listening socket open while trying to connect to the peer'

the newly created socket inherited the receive timeout from the listen
socket. The listen socket had a receive timeout of connect-intervall
+- 30% random jitter.

The real issue is that after the mentioned patch we had no timeout at all.
Now use 4 times the ping-timeout.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d326af67c27e..7da83f3a61eb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -791,8 +791,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
 {
 	unsigned int header_size = drbd_header_size(connection);
 	struct packet_info pi;
+	struct net_conf *nc;
 	int err;
 
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -EIO;
+	}
+	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+	rcu_read_unlock();
+
 	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 	if (err != header_size) {
 		if (err >= 0)
-- 
cgit v1.2.3


From 5d0b17f1a29e8189d04aef447a3a53cfd05529b2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 18 Mar 2014 14:24:35 +0100
Subject: drbd: New net configuration option socket-check-timeout

In setups involving a DRBD-proxy and connections that experience a lot of
buffer-bloat it might be necessary to set ping-timeout to an
unusual high value. By default DRBD uses the same value to wait if a newly
established TCP-connection is stable. Since the DRBD-proxy is usually located
in the same data center such a long wait time may hinder DRBD's connect process.

In such setups socket-check-timeout should be set to
at least to the round trip time between DRBD and DRBD-proxy. I.e. in most
cases to 1.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 46 +++++++++++++++++++++++++-------------
 include/linux/drbd_genl.h          |  1 +
 include/linux/drbd_limits.h        |  5 +++++
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 7da83f3a61eb..b89e6fb468c6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -819,7 +819,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
  * drbd_socket_okay() - Free the socket if its connection is not okay
  * @sock:	pointer to the pointer to the socket.
  */
-static int drbd_socket_okay(struct socket **sock)
+static bool drbd_socket_okay(struct socket **sock)
 {
 	int rr;
 	char tb[4];
@@ -837,6 +837,30 @@ static int drbd_socket_okay(struct socket **sock)
 		return false;
 	}
 }
+
+static bool connection_established(struct drbd_connection *connection,
+				   struct socket **sock1,
+				   struct socket **sock2)
+{
+	struct net_conf *nc;
+	int timeout;
+	bool ok;
+
+	if (!*sock1 || !*sock2)
+		return false;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+	rcu_read_unlock();
+	schedule_timeout_interruptible(timeout);
+
+	ok = drbd_socket_okay(sock1);
+	ok = drbd_socket_okay(sock2) && ok;
+
+	return ok;
+}
+
 /* Gets called if a connection is established, or if a new minor gets created
    in a connection */
 int drbd_connected(struct drbd_peer_device *peer_device)
@@ -878,8 +902,8 @@ static int conn_connect(struct drbd_connection *connection)
 	struct drbd_socket sock, msock;
 	struct drbd_peer_device *peer_device;
 	struct net_conf *nc;
-	int vnr, timeout, h, ok;
-	bool discard_my_data;
+	int vnr, timeout, h;
+	bool discard_my_data, ok;
 	enum drbd_state_rv rv;
 	struct accept_wait_data ad = {
 		.connection = connection,
@@ -923,17 +947,8 @@ static int conn_connect(struct drbd_connection *connection)
 			}
 		}
 
-		if (sock.socket && msock.socket) {
-			rcu_read_lock();
-			nc = rcu_dereference(connection->net_conf);
-			timeout = nc->ping_timeo * HZ / 10;
-			rcu_read_unlock();
-			schedule_timeout_interruptible(timeout);
-			ok = drbd_socket_okay(&sock.socket);
-			ok = drbd_socket_okay(&msock.socket) && ok;
-			if (ok)
-				break;
-		}
+		if (connection_established(connection, &sock.socket, &msock.socket))
+			break;
 
 retry:
 		s = drbd_wait_for_connect(connection, &ad);
@@ -979,8 +994,7 @@ randomize:
 				goto out_release_sockets;
 		}
 
-		ok = drbd_socket_okay(&sock.socket);
-		ok = drbd_socket_okay(&msock.socket) && ok;
+		ok = connection_established(connection, &sock.socket, &msock.socket);
 	} while (!ok);
 
 	if (ad.s_listen)
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 71fc924c53fa..7b131ed8f9c6 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -174,6 +174,7 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
 	/* 9: __str_field_def(31,     DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
 	/* 9: __u32_field(32,         DRBD_F_REQUIRED | DRBD_F_INVARIANT,     peer_node_id) */
 	__flg_field_def(33, 0 /* OPTIONAL */,	csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
+	__u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF)
 )
 
 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 9d2df1d51414..8ac8c5d9a3ad 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -225,4 +225,9 @@
 #define DRBD_AL_STRIPE_SIZE_MAX   16777216
 #define DRBD_AL_STRIPE_SIZE_DEF   32
 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
+
+#define DRBD_SOCKET_CHECK_TIMEO_MIN 0
+#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
+#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
+#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
 #endif
-- 
cgit v1.2.3


From 08d0dabf48ccf55e310b8ae9381858b477cabe2e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 20 Mar 2014 11:19:22 +0100
Subject: drbd: application writes may set-in-sync in protocol != C

If "dirty" blocks are written to during resync,
that brings them in-sync.

By explicitly requesting write-acks during resync even in protocol != C,
we now can actually respect this.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_interval.h |  4 ++-
 drivers/block/drbd/drbd_main.c     |  5 ++-
 drivers/block/drbd/drbd_receiver.c |  3 ++
 drivers/block/drbd/drbd_req.c      | 68 ++++++++++++++++++++++----------------
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f38fcb00c10d..f210543f05f4 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -10,7 +10,9 @@ struct drbd_interval {
 	unsigned int size;	/* size in bytes */
 	sector_t end;		/* highest interval end in subtree */
 	int local:1		/* local or remote request? */;
-	int waiting:1;
+	int waiting:1;		/* someone is waiting for this to complete */
+	int completed:1;	/* this has been completed already;
+				 * ignore for conflict detection */
 };
 
 static inline void drbd_clear_interval(struct drbd_interval *i)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 7c060243ae46..7ada5d363064 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1639,7 +1639,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 	if (peer_device->connection->agreed_pro_version >= 100) {
 		if (req->rq_state & RQ_EXP_RECEIVE_ACK)
 			dp_flags |= DP_SEND_RECEIVE_ACK;
-		if (req->rq_state & RQ_EXP_WRITE_ACK)
+		/* During resync, request an explicit write ack,
+		 * even in protocol != C */
+		if (req->rq_state & RQ_EXP_WRITE_ACK
+		|| (dp_flags & DP_MAY_SET_IN_SYNC))
 			dp_flags |= DP_SEND_WRITE_ACK;
 	}
 	p->dp_flags = cpu_to_be32(dp_flags);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b89e6fb468c6..3a3c4893ea26 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1930,6 +1930,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
 		}
 		dec_unacked(device);
 	}
+
 	/* we delete from the conflict detection hash _after_ we sent out the
 	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
 	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
@@ -2156,6 +2157,8 @@ static int handle_write_conflicts(struct drbd_device *device,
 	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
 		if (i == &peer_req->i)
 			continue;
+		if (i->completed)
+			continue;
 
 		if (!i->local) {
 			/*
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 1ee735590b61..f07a724998ea 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -92,6 +92,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 	return req;
 }
 
+static void drbd_remove_request_interval(struct rb_root *root,
+					 struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	struct drbd_interval *i = &req->i;
+
+	drbd_remove_interval(root, i);
+
+	/* Wake up any processes waiting for this request to complete.  */
+	if (i->waiting)
+		wake_up(&device->misc_wait);
+}
+
 void drbd_req_destroy(struct kref *kref)
 {
 	struct drbd_request *req = container_of(kref, struct drbd_request, kref);
@@ -115,6 +128,20 @@ void drbd_req_destroy(struct kref *kref)
 	 * here unconditionally */
 	list_del_init(&req->tl_requests);
 
+	/* finally remove the request from the conflict detection
+	 * respective block_id verification interval tree. */
+	if (!drbd_interval_empty(&req->i)) {
+		struct rb_root *root;
+
+		if (s & RQ_WRITE)
+			root = &device->write_requests;
+		else
+			root = &device->read_requests;
+		drbd_remove_request_interval(root, req);
+	} else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+		drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+			s, (unsigned long long)req->i.sector, req->i.size);
+
 	/* if it was a write, we may have to set the corresponding
 	 * bit(s) out-of-sync first. If it had a local part, we need to
 	 * release the reference to the activity log. */
@@ -188,19 +215,6 @@ void complete_master_bio(struct drbd_device *device,
 }
 
 
-static void drbd_remove_request_interval(struct rb_root *root,
-					 struct drbd_request *req)
-{
-	struct drbd_device *device = req->device;
-	struct drbd_interval *i = &req->i;
-
-	drbd_remove_interval(root, i);
-
-	/* Wake up any processes waiting for this request to complete.  */
-	if (i->waiting)
-		wake_up(&device->misc_wait);
-}
-
 /* Helper for __req_mod().
  * Set m->bio to the master bio, if it is fit to be completed,
  * or leave it alone (it is initialized to NULL in __req_mod),
@@ -254,18 +268,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 	ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
 	error = PTR_ERR(req->private_bio);
 
-	/* remove the request from the conflict detection
-	 * respective block_id verification hash */
-	if (!drbd_interval_empty(&req->i)) {
-		struct rb_root *root;
-
-		if (rw == WRITE)
-			root = &device->write_requests;
-		else
-			root = &device->read_requests;
-		drbd_remove_request_interval(root, req);
-	}
-
 	/* Before we can signal completion to the upper layers,
 	 * we may need to close the current transfer log epoch.
 	 * We are within the request lock, so we can simply compare
@@ -301,7 +303,15 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 		m->error = ok ? 0 : (error ?: -EIO);
 		m->bio = req->master_bio;
 		req->master_bio = NULL;
+		/* We leave it in the tree, to be able to verify later
+		 * write-acks in protocol != C during resync.
+		 * But we mark it as "complete", so it won't be counted as
+		 * conflict in a multi-primary setup. */
+		req->i.completed = true;
 	}
+
+	if (req->i.waiting)
+		wake_up(&device->misc_wait);
 }
 
 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
@@ -660,12 +670,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 	case WRITE_ACKED_BY_PEER_AND_SIS:
 		req->rq_state |= RQ_NET_SIS;
 	case WRITE_ACKED_BY_PEER:
-		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
-		/* protocol C; successfully written on peer.
+		/* Normal operation protocol C: successfully written on peer.
+		 * During resync, even in protocol != C,
+		 * we requested an explicit write ack anyways.
+		 * Which means we cannot even assert anything here.
 		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
 		 * for volatile write-back caches on lower level devices. */
-
 		goto ack_common;
 	case RECV_ACKED_BY_PEER:
 		D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
@@ -673,7 +684,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * see also notes above in HANDED_OVER_TO_NETWORK about
 		 * protocol != C */
 	ack_common:
-		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
 		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
 		break;
 
-- 
cgit v1.2.3


From 0c066bc39e56a60785b32d6e43aa6659fb3793ab Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 20 Mar 2014 14:04:35 +0100
Subject: drbd: short-circuit in maybe_pull_ahead

If we already "pulled ahead", we can short-circuit,
and avoid logging the same messages over and over again.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f07a724998ea..3824d5c737e6 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -899,6 +899,9 @@ static void maybe_pull_ahead(struct drbd_device *device)
 	    connection->agreed_pro_version < 96)
 		return;
 
+	if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+		return; /* nothing to do ... */
+
 	/* If I don't even have good local storage, we can not reasonably try
 	 * to pull ahead of the peer. We also need the local reference to make
 	 * sure device->act_log is there.
-- 
cgit v1.2.3


From ccdd6a93ee0c4a68cfaeddc80a87fc092ff35b96 Mon Sep 17 00:00:00 2001
From: Monam Agarwal <monamagarwal123@gmail.com>
Date: Sun, 23 Mar 2014 02:02:29 +0530
Subject: drivers/block: Use RCU_INIT_POINTER(x, NULL) in drbd/drbd_state.c

This patch replaces rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL)

The rcu_assign_pointer() ensures that the initialization of a structure
is carried out before storing a pointer to that structure.
And in the case of the NULL pointer, there is no structure to initialize.
So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL)

Signed-off-by: Monam Agarwal <monamagarwal123@gmail.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index a7772e2aba7e..4a75b7a03b9d 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1570,7 +1570,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 		old_conf = connection->net_conf;
 		connection->my_addr_len = 0;
 		connection->peer_addr_len = 0;
-		rcu_assign_pointer(connection->net_conf, NULL);
+		RCU_INIT_POINTER(connection->net_conf, NULL);
 		conn_free_crypto(connection);
 		mutex_unlock(&connection->resource->conf_update);
 
-- 
cgit v1.2.3


From 659b2e3bb8b149f5f7e2f8551599044b715bcc21 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 25 Mar 2014 12:35:05 -0700
Subject: block: Convert last uses of __FUNCTION__ to __func__

Just about all of these have been converted to __func__,
so convert the last uses.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index fe6595a96a9a..b0ab358c39e6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1867,7 +1867,7 @@ static inline void inc_ap_pending(struct drbd_device *device)
 			func, line,					\
 			atomic_read(&device->which))
 
-#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
 static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
 {
 	if (atomic_dec_and_test(&device->ap_pending_cnt))
@@ -1886,7 +1886,7 @@ static inline void inc_rs_pending(struct drbd_device *device)
 	atomic_inc(&device->rs_pending_cnt);
 }
 
-#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
 static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
 {
 	atomic_dec(&device->rs_pending_cnt);
@@ -1907,14 +1907,14 @@ static inline void inc_unacked(struct drbd_device *device)
 	atomic_inc(&device->unacked_cnt);
 }
 
-#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
 static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
 {
 	atomic_dec(&device->unacked_cnt);
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
-#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
 static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
 {
 	atomic_sub(n, &device->unacked_cnt);
-- 
cgit v1.2.3


From 506afb6248af577eb702c73f3da52a12f4c56a38 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 31 Jan 2014 14:55:12 +0100
Subject: drbd: improve resync request throttling due to sendbuf size

If we throttle resync because the socket sendbuffer is filling up,
tell TCP about it, so it may expand the sendbuffer for us.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 6532a697cf49..0b5e4294acf9 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -592,7 +592,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
 	int max_bio_size;
 	int number, rollback_i, size;
-	int align, queued, sndbuf;
+	int align, requeue = 0;
 	int i = 0;
 
 	if (unlikely(cancel))
@@ -619,17 +619,22 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
 		goto requeue;
 
 	for (i = 0; i < number; i++) {
-		/* Stop generating RS requests, when half of the send buffer is filled */
+		/* Stop generating RS requests when half of the send buffer is filled,
+		 * but notify TCP that we'd like to have more space. */
 		mutex_lock(&connection->data.mutex);
 		if (connection->data.socket) {
-			queued = connection->data.socket->sk->sk_wmem_queued;
-			sndbuf = connection->data.socket->sk->sk_sndbuf;
-		} else {
-			queued = 1;
-			sndbuf = 0;
-		}
+			struct sock *sk = connection->data.socket->sk;
+			int queued = sk->sk_wmem_queued;
+			int sndbuf = sk->sk_sndbuf;
+			if (queued > sndbuf / 2) {
+				requeue = 1;
+				if (sk->sk_socket)
+					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			}
+		} else
+			requeue = 1;
 		mutex_unlock(&connection->data.mutex);
-		if (queued > sndbuf / 2)
+		if (requeue)
 			goto requeue;
 
 next_sector:
-- 
cgit v1.2.3


From 2f2abeae3cd139fb3581249388bf2018612a87e5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 27 Mar 2014 10:34:13 +0100
Subject: drbd: clear CRASHED_PRIMARY only after successful resync

If we lost a disk during the first resync after primary crash,
we could have prematurely cleared the CRASHED_PRIMARY flag.
Testing on C_CONNECTED is not what we meant there,
but testing for both peers to become D_UP_TO_DATE.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 4a75b7a03b9d..c35c0f001bb7 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1238,7 +1238,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	sib.os = os;
 	sib.ns = ns;
 
-	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+	if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+	&&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
 		clear_bit(CRASHED_PRIMARY, &device->flags);
 		if (device->p_uuid)
 			device->p_uuid[UI_FLAGS] &= ~((u64)2);
-- 
cgit v1.2.3


From f88c5d90ccca70841af88ba9456ba4aac6d10da8 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 27 Mar 2014 14:10:55 +0100
Subject: drbd: cosmetic: change all printk(level, ...) to pr_<level>(...)

Cosmetic change only.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c | 11 ++++++-----
 drivers/block/drbd/drbd_int.h    |  4 +---
 drivers/block/drbd/drbd_main.c   | 28 +++++++++++++---------------
 drivers/block/drbd/drbd_nl.c     |  4 +++-
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 424ebf6bdad0..dab6923493cb 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -22,6 +22,8 @@
    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
@@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 
 	for (i = 0; i < number; i++) {
 		if (!pages[i]) {
-			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
-					  "a NULL pointer; i=%lu n=%lu\n",
-					  i, number);
+			pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+				 i, number);
 			continue;
 		}
 		__free_page(pages[i]);
@@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 	end = offset + len;
 
 	if (end > b->bm_words) {
-		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		pr_alert("bm_memset end > bm_words\n");
 		return;
 	}
 
@@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
-			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
 			       p_addr, bm, (int)do_now);
 		} else
 			memset(bm, c, do_now * sizeof(long));
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b0ab358c39e6..c88d6c6be70b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1467,9 +1467,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 {
 	__release(local);
 	if (!bio->bi_bdev) {
-		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-				"bio->bi_bdev == NULL\n",
-		       device_to_minor(device));
+		drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
 		bio_endio(bio, -ENODEV);
 		return;
 	}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 7ada5d363064..686c5a59edeb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -26,6 +26,8 @@
 
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
@@ -2333,7 +2335,7 @@ static void drbd_cleanup(void)
 
 	idr_destroy(&drbd_devices);
 
-	printk(KERN_INFO "drbd: module cleanup done.\n");
+	pr_info("module cleanup done.\n");
 }
 
 /**
@@ -2869,8 +2871,7 @@ static int __init drbd_init(void)
 	int err;
 
 	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
-		printk(KERN_ERR
-		       "drbd: invalid minor_count (%d)\n", minor_count);
+		pr_err("invalid minor_count (%d)\n", minor_count);
 #ifdef MODULE
 		return -EINVAL;
 #else
@@ -2880,8 +2881,7 @@ static int __init drbd_init(void)
 
 	err = register_blkdev(DRBD_MAJOR, "drbd");
 	if (err) {
-		printk(KERN_ERR
-		       "drbd: unable to register block device major %d\n",
+		pr_err("unable to register block device major %d\n",
 		       DRBD_MAJOR);
 		return err;
 	}
@@ -2899,7 +2899,7 @@ static int __init drbd_init(void)
 
 	err = drbd_genl_register();
 	if (err) {
-		printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+		pr_err("unable to register generic netlink family\n");
 		goto fail;
 	}
 
@@ -2910,34 +2910,32 @@ static int __init drbd_init(void)
 	err = -ENOMEM;
 	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
 	if (!drbd_proc)	{
-		printk(KERN_ERR "drbd: unable to register proc file\n");
+		pr_err("unable to register proc file\n");
 		goto fail;
 	}
 
 	retry.wq = create_singlethread_workqueue("drbd-reissue");
 	if (!retry.wq) {
-		printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+		pr_err("unable to create retry workqueue\n");
 		goto fail;
 	}
 	INIT_WORK(&retry.worker, do_retry);
 	spin_lock_init(&retry.lock);
 	INIT_LIST_HEAD(&retry.writes);
 
-	printk(KERN_INFO "drbd: initialized. "
+	pr_info("initialized. "
 	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
 	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
-	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
-	printk(KERN_INFO "drbd: registered as block device major %d\n",
-		DRBD_MAJOR);
-
+	pr_info("%s\n", drbd_buildtag());
+	pr_info("registered as block device major %d\n", DRBD_MAJOR);
 	return 0; /* Success! */
 
 fail:
 	drbd_cleanup();
 	if (err == -ENOMEM)
-		printk(KERN_ERR "drbd: ran out of memory\n");
+		pr_err("ran out of memory\n");
 	else
-		printk(KERN_ERR "drbd: initialization failure\n");
+		pr_err("initialization failure\n");
 	return err;
 }
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 81949d65fad0..c2e047f89bcf 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -23,6 +23,8 @@
 
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/drbd.h>
 #include <linux/in.h>
@@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 	if (genlmsg_reply(skb, info))
-		printk(KERN_ERR "drbd: error sending genl reply\n");
+		pr_err("error sending genl reply\n");
 }
 
 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
-- 
cgit v1.2.3


From 7f34f61490ee87a470cf229069d59b0987f42a59 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 22 Apr 2014 16:37:16 +0200
Subject: drbd: drbd_rs_number_requests: fix unit mismatch in comparison

We try to limit the number of "in-flight" resync requests.
One condition for that is the amount of requested data should not exceed
half of what can be covered by our "max-buffers" setting.

However we compared number of 4k pages with number of in-flight 512 Byte
sectors, and this extra throttle triggered much earlier than intended.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0b5e4294acf9..9f0acb011f30 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -504,9 +504,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
 	struct disk_conf *dc;
-	unsigned int want;     /* The number of sectors we want in the proxy */
+	unsigned int want;     /* The number of sectors we want in-flight */
 	int req_sect; /* Number of sectors to request in this turn */
-	int correction; /* Number of sectors more we need in the proxy*/
+	int correction; /* Number of sectors more we need in-flight */
 	int cps; /* correction per invocation of drbd_rs_controller() */
 	int steps; /* Number of time steps to plan ahead */
 	int curr_corr;
@@ -577,8 +577,13 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	 * potentially causing a distributed deadlock on congestion during
 	 * online-verify or (checksum-based) resync, if max-buffers,
 	 * socket buffer sizes and resync rate settings are mis-configured. */
-	if (mxb - device->rs_in_flight < number)
-		number = mxb - device->rs_in_flight;
+
+	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
+	 * "number of pages" (typically also 4k),
+	 * but "rs_in_flight" is in "sectors" (512 Byte). */
+	if (mxb - device->rs_in_flight/8 < number)
+		number = mxb - device->rs_in_flight/8;
 
 	return number;
 }
-- 
cgit v1.2.3


From 15e26f6a3c6de2c665b4a30b9a70a902111f281f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 11:43:21 +0200
Subject: drbd: add drbd_queue_work_if_unqueued helper

We sometimes do
    if (list_empty(&w.list))
	drbd_queue_work(&q, &w.list);

Removal (list_del_init) may happen outside all locks, after all
pending work entries have been moved to an on-stack local work list.

For not dynamically allocated, but embeded, work structs,
we must avoid to re-add until it really was removed.

Move that list_empty check inside the spin_lock(&q->q_lock)
within the helper function, and change to list_empty_careful().

This may have been the reason for a list_add corruption
inside drbd_queue_work().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h    | 11 +++++++++++
 drivers/block/drbd/drbd_worker.c |  8 ++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c88d6c6be70b..a71f8bb9dc80 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1777,6 +1777,17 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 	wake_up(&q->q_wait);
 }
 
+static inline void
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	if (list_empty_careful(&w->list))
+		list_add_tail(&w->list, &q->q);
+	spin_unlock_irqrestore(&q->q_lock, flags);
+	wake_up(&q->q_wait);
+}
+
 static inline void
 drbd_device_post_work(struct drbd_device *device, int work_bit)
 {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 9f0acb011f30..49b88731b349 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -452,9 +452,9 @@ void resync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 
-	if (list_empty(&device->resync_work.list))
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-				&device->resync_work);
+	drbd_queue_work_if_unqueued(
+		&first_peer_device(device)->connection->sender_work,
+		&device->resync_work);
 }
 
 static void fifo_set(struct fifo_buffer *fb, int value)
@@ -1968,7 +1968,7 @@ static void do_unqueued_work(struct drbd_connection *connection)
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
 	spin_lock_irq(&queue->q_lock);
-	list_splice_init(&queue->q, work_list);
+	list_splice_tail_init(&queue->q, work_list);
 	spin_unlock_irq(&queue->q_lock);
 	return !list_empty(work_list);
 }
-- 
cgit v1.2.3


From 41d9f7cd5ba8a488fbc96350141c70c5c01bf8e6 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 25 Apr 2014 13:27:50 +0200
Subject: drbd: drop drbd_md_flush

The only user of drbd_md_flush was bm_rw(),
and it is always followed by either a drbd_md_sync(),
or an al_write_transaction(), which, if so configured,
both end up submiting a FLUSH|FUA request anyways.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c |  4 +---
 drivers/block/drbd/drbd_int.h    | 19 -------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index dab6923493cb..29af96038e69 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1155,9 +1155,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		err = -EIO; /* Disk timeout/force-detach during IO... */
 
 	now = jiffies;
-	if (rw == WRITE) {
-		drbd_md_flush(device);
-	} else /* rw == READ */ {
+	if (rw == READ) {
 		b->bm_set = bm_count_bits(b);
 		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
 		     jiffies - now);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a71f8bb9dc80..d85f43c9ef56 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2182,25 +2182,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device)
 	return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_md_flush(struct drbd_device *device)
-{
-	int r;
-
-	if (device->ldev == NULL) {
-		drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
-		return;
-	}
-
-	if (test_bit(MD_NO_FUA, &device->flags))
-		return;
-
-	r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
-	if (r) {
-		set_bit(MD_NO_FUA, &device->flags);
-		drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
-	}
-}
-
 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
 	return list_first_entry_or_null(&resource->connections,
-- 
cgit v1.2.3


From b9ed7080d7d29112c898c64bad778b84eec0ed2d Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 23 Apr 2014 12:15:35 +0200
Subject: drbd: consistently use list_add_tail for peer_request tracking

Keep the epoch entry lists (active_ee, read_ee, sync_ee, ...)
consistently "oldest first".  That way finding the oldest not yet
successfully processed request is simply list_first_entry_or_null.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 4 ++--
 drivers/block/drbd/drbd_worker.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3a3c4893ea26..db5c58042734 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1777,7 +1777,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	peer_req->w.cb = e_end_resync_block;
 
 	spin_lock_irq(&device->resource->req_lock);
-	list_add(&peer_req->w.list, &device->sync_ee);
+	list_add_tail(&peer_req->w.list, &device->sync_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	atomic_add(pi->size >> 9, &device->rs_sect_ev);
@@ -2341,7 +2341,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 * active_ee to become empty in drbd_submit_peer_request();
 	 * better not add ourselves here. */
 	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
-		list_add(&peer_req->w.list, &device->active_ee);
+		list_add_tail(&peer_req->w.list, &device->active_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (device->state.conn == C_SYNC_TARGET)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 49b88731b349..ad57129289b5 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -410,7 +410,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 
 	peer_req->w.cb = w_e_send_csum;
 	spin_lock_irq(&device->resource->req_lock);
-	list_add(&peer_req->w.list, &device->read_ee);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	atomic_add(size >> 9, &device->rs_sect_ev);
-- 
cgit v1.2.3


From 45d2933c921c51ddd2780a806193ab00e03fe215 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 23 Apr 2014 12:25:23 +0200
Subject: drbd: also keep track of trim -> zero-out fallback peer_requests

To be able to find and present such zero-out fallback peer_requests
in debugfs, we add those to "active_ee", once that list drained.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index db5c58042734..7a1078d285dd 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1370,6 +1370,11 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		/* wait for all pending IO completions, before we start
 		 * zeroing things out. */
 		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		/* add it to the active list now,
+		 * so we can find it to present it in debugfs */
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->active_ee);
+		spin_unlock_irq(&device->resource->req_lock);
 		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
 			sector, ds >> 9, GFP_NOIO))
 			peer_req->flags |= EE_WAS_ERROR;
-- 
cgit v1.2.3


From c2258ffc56f2b34573c2917937190c1491620334 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 14 May 2014 21:35:21 +0200
Subject: drbd: poison free'd device, resource and connection structs

Now that we have additional asynchronous kref_get/kref_put
via debugfs, make sure we catch access after free.

Poison struct drbd_device, drbd_connection and drbd_resource
before kfree() with 0xfd, 0xfc, and 0xf2, respectively.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 686c5a59edeb..92547d16b2c7 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2192,6 +2192,7 @@ void drbd_destroy_device(struct kref *kref)
 	blk_cleanup_queue(device->rq_queue);
 	kfree(device->rs_plan_s);
 	kfree(first_peer_device(device));
+	memset(device, 0xfd, sizeof(*device));
 	kfree(device);
 
 	for_each_connection(connection, resource)
@@ -2285,6 +2286,7 @@ void drbd_destroy_resource(struct kref *kref)
 	idr_destroy(&resource->devices);
 	free_cpumask_var(resource->cpu_mask);
 	kfree(resource->name);
+	memset(resource, 0xf2, sizeof(*resource));
 	kfree(resource);
 }
 
@@ -2665,6 +2667,7 @@ void drbd_destroy_connection(struct kref *kref)
 	drbd_free_socket(&connection->data);
 	kfree(connection->int_dig_in);
 	kfree(connection->int_dig_vv);
+	memset(connection, 0xfc, sizeof(*connection));
 	kfree(connection);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
-- 
cgit v1.2.3


From a8ba0d606933c34c13ea971491a7e0dfa50208ef Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 14 May 2014 21:34:47 +0200
Subject: drbd: fix drbd_destroy_device reference count updates

drbd_destroy_device means to give up reference counts
on the connection(s) reachable via the peer_device(s).

It must not do that by iterating via device->resource->connections,
resource and connections may have already been disassociated
by drbd_free_resource, and we'd leak connection refs.

Instead, iterate via device->peer_devices->connection.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92547d16b2c7..56cf11b60cb8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2160,7 +2160,7 @@ void drbd_destroy_device(struct kref *kref)
 {
 	struct drbd_device *device = container_of(kref, struct drbd_device, kref);
 	struct drbd_resource *resource = device->resource;
-	struct drbd_connection *connection;
+	struct drbd_peer_device *peer_device, *tmp_peer_device;
 
 	del_timer_sync(&device->request_timer);
 
@@ -2191,12 +2191,16 @@ void drbd_destroy_device(struct kref *kref)
 	put_disk(device->vdisk);
 	blk_cleanup_queue(device->rq_queue);
 	kfree(device->rs_plan_s);
-	kfree(first_peer_device(device));
+
+	/* not for_each_connection(connection, resource):
+	 * those may have been cleaned up and disassociated already.
+	 */
+	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+		kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+		kfree(peer_device);
+	}
 	memset(device, 0xfd, sizeof(*device));
 	kfree(device);
-
-	for_each_connection(connection, resource)
-		kref_put(&connection->kref, drbd_destroy_connection);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
 
-- 
cgit v1.2.3


From e37d2438d8e5e4c1225cf94d45347fa207835447 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 1 Apr 2014 23:53:30 +0200
Subject: drbd: track meta data IO intent, start and submit time

For diagnostic purposes, track intent, start time
and latest submit time of meta data IO.

Move separate members from struct drbd_device
into the embeded struct drbd_md_io.
s/md_io_(page|in_use)/md_io.\1/

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 26 +++++++++++++++++---------
 drivers/block/drbd/drbd_int.h    |  9 ++++++---
 drivers/block/drbd/drbd_main.c   | 14 +++++++-------
 drivers/block/drbd/drbd_nl.c     |  4 ++--
 drivers/block/drbd/drbd_worker.c |  9 +++------
 5 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 5f82a36f799d..d7e80663187c 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,20 +92,26 @@ struct __packed al_transaction_on_disk {
 	__be32	context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
 {
 	int r;
 
 	wait_event(device->misc_wait,
-		   (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+		   (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
 		   device->state.disk <= D_FAILED);
 
-	return r ? NULL : page_address(device->md_io_page);
+	if (r)
+		return NULL;
+
+	device->md_io.current_use = intent;
+	device->md_io.start_jif = jiffies;
+	device->md_io.submit_jif = device->md_io.start_jif - 1;
+	return page_address(device->md_io.page);
 }
 
 void drbd_md_put_buffer(struct drbd_device *device)
 {
-	if (atomic_dec_and_test(&device->md_io_in_use))
+	if (atomic_dec_and_test(&device->md_io.in_use))
 		wake_up(&device->misc_wait);
 }
 
@@ -150,7 +156,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	err = -EIO;
 	if (bio_add_page(bio, page, size, 0) != size)
 		goto out;
-	bio->bi_private = &device->md_io;
+	bio->bi_private = device;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
@@ -165,7 +171,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	}
 
 	bio_get(bio); /* one bio_put() is in the completion handler */
-	atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+	device->md_io.submit_jif = jiffies;
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
@@ -183,9 +190,9 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 			 sector_t sector, int rw)
 {
 	int err;
-	struct page *iop = device->md_io_page;
+	struct page *iop = device->md_io.page;
 
-	D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -465,7 +472,8 @@ int al_write_transaction(struct drbd_device *device)
 		return -EIO;
 	}
 
-	buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+	/* protects md_io_buffer, al_tr_cycle, ... */
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer) {
 		drbd_err(device, "disk failed while waiting for md_io buffer\n");
 		put_ldev(device);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d85f43c9ef56..3f8281bbea53 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -542,6 +542,11 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
+	struct page *page;
+	unsigned long start_jif;	/* last call to drbd_md_get_buffer */
+	unsigned long submit_jif;	/* last _drbd_md_sync_page_io() submit */
+	const char *current_use;
+	atomic_t in_use;
 	unsigned int done;
 	int error;
 };
@@ -795,9 +800,7 @@ struct drbd_device {
 	atomic_t pp_in_use;		/* allocated from page pool */
 	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
 	wait_queue_head_t ee_wait;
-	struct page *md_io_page;	/* one page buffer for md_io */
 	struct drbd_md_io md_io;
-	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
 	spinlock_t al_lock;
 	wait_queue_head_t al_wait;
 	struct lru_cache *act_log;	/* activity log */
@@ -1336,7 +1339,7 @@ extern void resume_next_sg(struct drbd_device *device);
 extern void suspend_other_sg(struct drbd_device *device);
 extern int drbd_resync_finished(struct drbd_device *device);
 /* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
 extern void drbd_md_put_buffer(struct drbd_device *device);
 extern int drbd_md_sync_page_io(struct drbd_device *device,
 		struct drbd_backing_dev *bdev, sector_t sector, int rw);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 56cf11b60cb8..3ab74619c8eb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1916,7 +1916,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	atomic_set(&device->rs_sect_in, 0);
 	atomic_set(&device->rs_sect_ev, 0);
 	atomic_set(&device->ap_in_flight, 0);
-	atomic_set(&device->md_io_in_use, 0);
+	atomic_set(&device->md_io.in_use, 0);
 
 	mutex_init(&device->own_state_mutex);
 	device->state_mutex = &device->own_state_mutex;
@@ -2187,7 +2187,7 @@ void drbd_destroy_device(struct kref *kref)
 
 	if (device->bitmap) /* should no longer be there. */
 		drbd_bm_cleanup(device);
-	__free_page(device->md_io_page);
+	__free_page(device->md_io.page);
 	put_disk(device->vdisk);
 	blk_cleanup_queue(device->rq_queue);
 	kfree(device->rs_plan_s);
@@ -2756,8 +2756,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
 	q->queue_lock = &resource->req_lock;
 
-	device->md_io_page = alloc_page(GFP_KERNEL);
-	if (!device->md_io_page)
+	device->md_io.page = alloc_page(GFP_KERNEL);
+	if (!device->md_io.page)
 		goto out_no_io_page;
 
 	if (drbd_bm_init(device))
@@ -2845,7 +2845,7 @@ out_idr_remove_minor:
 out_no_minor_idr:
 	drbd_bm_cleanup(device);
 out_no_bitmap:
-	__free_page(device->md_io_page);
+	__free_page(device->md_io.page);
 out_no_io_page:
 	put_disk(disk);
 out_no_disk:
@@ -3079,7 +3079,7 @@ void drbd_md_sync(struct drbd_device *device)
 	if (!get_ldev_if_state(device, D_FAILED))
 		return;
 
-	buffer = drbd_md_get_buffer(device);
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer)
 		goto out;
 
@@ -3239,7 +3239,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
 	if (device->state.disk != D_DISKLESS)
 		return ERR_DISK_CONFIGURED;
 
-	buffer = drbd_md_get_buffer(device);
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer)
 		return ERR_NOMEM;
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c2e047f89bcf..7fcdc54bf65a 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -887,7 +887,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	 * still lock the act_log to not trigger ASSERTs there.
 	 */
 	drbd_suspend_io(device);
-	buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+	buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
 	if (!buffer) {
 		drbd_resume_io(device);
 		return DS_ERROR;
@@ -1899,7 +1899,7 @@ static int adm_detach(struct drbd_device *device, int force)
 	}
 
 	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-	drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+	drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
 	retcode = drbd_request_state(device, NS(disk, D_FAILED));
 	drbd_md_put_buffer(device);
 	/* D_FAILED will transition to DISKLESS. */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ad57129289b5..3978d9ec6f00 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
  */
 void drbd_md_io_complete(struct bio *bio, int error)
 {
-	struct drbd_md_io *md_io;
 	struct drbd_device *device;
 
-	md_io = (struct drbd_md_io *)bio->bi_private;
-	device = container_of(md_io, struct drbd_device, md_io);
-
-	md_io->error = error;
+	device = bio->bi_private;
+	device->md_io.error = error;
 
 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
 	 * to timeout on the lower level device, and eventually detach from it.
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
 	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
 	 */
 	drbd_md_put_buffer(device);
-	md_io->done = 1;
+	device->md_io.done = 1;
 	wake_up(&device->misc_wait);
 	bio_put(bio);
 	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
-- 
cgit v1.2.3


From e5f891b2234dbab8c8797111a61519d0728ef855 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 22 Nov 2013 12:32:01 +0100
Subject: drbd: gather detailed timing statistics for drbd_requests

Record (in jiffies) how much time a request spends in which stages.
Followup commits will use and present this additional timing information
so we can better locate and tackle the root causes of latency spikes,
or present the backlog for asynchronous replication.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h    | 54 ++++++++++++++++++++++-
 drivers/block/drbd/drbd_main.c   |  7 +--
 drivers/block/drbd/drbd_req.c    | 93 +++++++++++++++++++++++++---------------
 drivers/block/drbd/drbd_worker.c |  3 ++
 4 files changed, 119 insertions(+), 38 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3f8281bbea53..08fa2dc8cdba 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -317,7 +317,59 @@ struct drbd_request {
 
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
-	unsigned long start_time;
+
+	/* for generic IO accounting */
+	unsigned long start_jif;
+
+	/* for DRBD internal statistics */
+
+	/* Minimal set of time stamps to determine if we wait for activity log
+	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+	 * we don't expect a DRBD request to be stalled for several month.
+	 */
+
+	/* before actual request processing */
+	unsigned long in_actlog_jif;
+
+	/* local disk */
+	unsigned long pre_submit_jif;
+
+	/* per connection */
+	unsigned long pre_send_jif;
+	unsigned long acked_jif;
+	unsigned long net_done_jif;
+
+	/* Possibly even more detail to track each phase:
+	 *  master_completion_jif
+	 *      how long did it take to complete the master bio
+	 *      (application visible latency)
+	 *  allocated_jif
+	 *      how long the master bio was blocked until we finally allocated
+	 *      a tracking struct
+	 *  in_actlog_jif
+	 *      how long did we wait for activity log transactions
+	 *
+	 *  net_queued_jif
+	 *      when did we finally queue it for sending
+	 *  pre_send_jif
+	 *      when did we start sending it
+	 *  post_send_jif
+	 *      how long did we block in the network stack trying to send it
+	 *  acked_jif
+	 *      when did we receive (or fake, in protocol A) a remote ACK
+	 *  net_done_jif
+	 *      when did we receive final acknowledgement (P_BARRIER_ACK),
+	 *      or decide, e.g. on connection loss, that we do no longer expect
+	 *      anything from this peer for this request.
+	 *
+	 *  pre_submit_jif
+	 *  post_sub_jif
+	 *      when did we start submiting to the lower level device,
+	 *      and how long did we block in that submit function
+	 *  local_completion_jif
+	 *      how long did it take the lower level device to complete this request
+	 */
+
 
 	/* once it hits 0, we may complete the master_bio */
 	atomic_t completion_ref;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 3ab74619c8eb..0baec7a3fa81 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -29,6 +29,7 @@
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
 #include <linux/module.h>
+#include <linux/jiffies.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
 #include <asm/types.h>
@@ -264,7 +265,7 @@ bail:
 
 /**
  * _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @device:	DRBD device.
+ * @connection:	DRBD connection to operate on.
  * @what:       The action/event to perform with all request objects
  *
  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -2228,7 +2229,7 @@ static void do_retry(struct work_struct *ws)
 	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
 		struct drbd_device *device = req->device;
 		struct bio *bio = req->master_bio;
-		unsigned long start_time = req->start_time;
+		unsigned long start_jif = req->start_jif;
 		bool expected;
 
 		expected =
@@ -2263,7 +2264,7 @@ static void do_retry(struct work_struct *ws)
 		/* We are not just doing generic_make_request(),
 		 * as we want to keep the start_time information. */
 		inc_ap_bio(device);
-		__drbd_make_request(device, bio, start_time);
+		__drbd_make_request(device, bio, start_jif);
 	}
 }
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3824d5c737e6..1319beab1b37 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
 {
 	int rw = bio_data_dir(req->master_bio);
-	unsigned long duration = jiffies - req->start_time;
+	unsigned long duration = jiffies - req->start_jif;
 	int cpu;
 	cpu = part_stat_lock();
 	part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 {
 	struct drbd_request *req;
 
-	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
 	if (!req)
 		return NULL;
 
@@ -366,14 +366,18 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		atomic_inc(&req->completion_ref);
 	}
 
-	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
 		atomic_inc(&req->completion_ref);
+	}
 
 	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
 		kref_get(&req->kref); /* wait for the DONE */
 
-	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
-		atomic_add(req->i.size >> 9, &device->ap_in_flight);
+	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+		/* potentially already completed in the asender thread */
+		if (!(s & RQ_NET_DONE))
+			atomic_add(req->i.size >> 9, &device->ap_in_flight);
+	}
 
 	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
 		atomic_inc(&req->completion_ref);
@@ -401,15 +405,18 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
 		dec_ap_pending(device);
 		++c_put;
+		req->acked_jif = jiffies;
 	}
 
 	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
 		++c_put;
 
-	if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
-		if (req->rq_state & RQ_NET_SENT)
+	if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+		if (s & RQ_NET_SENT)
 			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
-		++k_put;
+		if (s & RQ_EXP_BARR_ACK)
+			++k_put;
+		req->net_done_jif = jiffies;
 	}
 
 	/* potentially complete and destroy */
@@ -449,6 +456,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 			bdevname(device->ldev->backing_bdev, b));
 }
 
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+	return (req->rq_state &
+		   (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+		== (RQ_WRITE|RQ_NET_PENDING);
+}
+
 /* obviously this could be coded as many single functions
  * instead of one huge switch,
  * or by putting the code directly in the respective locations
@@ -627,18 +647,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 	case HANDED_OVER_TO_NETWORK:
 		/* assert something? */
-		if (bio_data_dir(req->master_bio) == WRITE &&
-		    !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+		if (is_pending_write_protocol_A(req))
 			/* this is what is dangerous about protocol A:
 			 * pretend it was successfully written on the peer. */
-			if (req->rq_state & RQ_NET_PENDING)
-				mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
-			/* else: neg-ack was faster... */
-			/* it is still not yet RQ_NET_DONE until the
-			 * corresponding epoch barrier got acked as well,
-			 * so we know what to dirty on connection loss */
-		}
-		mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+			mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+						RQ_NET_SENT|RQ_NET_OK);
+		else
+			mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+		/* It is still not yet RQ_NET_DONE until the
+		 * corresponding epoch barrier got acked as well,
+		 * so we know what to dirty on connection loss. */
 		break;
 
 	case OOS_HANDED_TO_NETWORK:
@@ -1037,6 +1055,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	 * stable storage, and this is a WRITE, we may not even submit
 	 * this bio. */
 	if (get_ldev(device)) {
+		req->pre_submit_jif = jiffies;
 		if (drbd_insert_fault(device,
 				      rw == WRITE ? DRBD_FAULT_DT_WR
 				    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1063,7 +1082,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
  * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
  */
 static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
 	const int rw = bio_data_dir(bio);
 	struct drbd_request *req;
@@ -1078,7 +1097,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 		bio_endio(bio, -ENOMEM);
 		return ERR_PTR(-ENOMEM);
 	}
-	req->start_time = start_time;
+	req->start_jif = start_jif;
 
 	if (!get_ldev(device)) {
 		bio_put(req->private_bio);
@@ -1095,6 +1114,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 			return NULL;
 		}
 		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
 	}
 
 	return req;
@@ -1197,9 +1217,9 @@ out:
 		complete_master_bio(device, &m);
 }
 
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
-	struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+	struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
 	if (IS_ERR_OR_NULL(req))
 		return;
 	drbd_send_and_submit(device, req);
@@ -1218,6 +1238,7 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 				continue;
 
 			req->rq_state |= RQ_IN_ACT_LOG;
+			req->in_actlog_jif = jiffies;
 		}
 
 		list_del_init(&req->tl_requests);
@@ -1240,7 +1261,6 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
 			wake = 1;
 		if (err)
 			continue;
-		req->rq_state |= RQ_IN_ACT_LOG;
 		list_move_tail(&req->tl_requests, pending);
 	}
 	spin_unlock_irq(&device->al_lock);
@@ -1302,6 +1322,8 @@ skip_fast_path:
 		drbd_al_begin_io_commit(device);
 
 		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+			req->rq_state |= RQ_IN_ACT_LOG;
+			req->in_actlog_jif = jiffies;
 			list_del_init(&req->tl_requests);
 			drbd_send_and_submit(device, req);
 		}
@@ -1311,9 +1333,12 @@ skip_fast_path:
 		 * requests to cold extents. In that case, prepare one request
 		 * in blocking mode. */
 		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+			bool was_cold;
 			list_del_init(&req->tl_requests);
-			req->rq_state |= RQ_IN_ACT_LOG;
-			if (!drbd_al_begin_io_prepare(device, &req->i)) {
+			was_cold = drbd_al_begin_io_prepare(device, &req->i);
+			if (!was_cold) {
+				req->rq_state |= RQ_IN_ACT_LOG;
+				req->in_actlog_jif = jiffies;
 				/* Corresponding extent was hot after all? */
 				drbd_send_and_submit(device, req);
 			} else {
@@ -1330,9 +1355,9 @@ skip_fast_path:
 void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
-	unsigned long start_time;
+	unsigned long start_jif;
 
-	start_time = jiffies;
+	start_jif = jiffies;
 
 	/*
 	 * what we "blindly" assume:
@@ -1340,7 +1365,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 
 	inc_ap_bio(device);
-	__drbd_make_request(device, bio, start_time);
+	__drbd_make_request(device, bio, start_jif);
 }
 
 /* This is called by bio_add_page().
@@ -1453,13 +1478,13 @@ void request_timer_fn(unsigned long data)
 	 * to expire twice (worst case) to become effective. Good enough.
 	 */
 	if (ent && req_peer &&
-		 time_after(now, req_peer->start_time + ent) &&
+		 time_after(now, req_peer->start_jif + ent) &&
 		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
 		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
 		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
 	}
 	if (dt && req_disk &&
-		 time_after(now, req_disk->start_time + dt) &&
+		 time_after(now, req_disk->start_jif + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
 		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
 		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1467,10 +1492,10 @@ void request_timer_fn(unsigned long data)
 
 	/* Reschedule timer for the nearest not already expired timeout.
 	 * Fallback to now + min(effective network timeout, disk timeout). */
-	ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
-		? req_peer->start_time + ent : now + et;
-	dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
-		? req_disk->start_time + dt : now + et;
+	ent = (ent && req_peer && time_before(now, req_peer->start_jif + ent))
+		? req_peer->start_jif + ent : now + et;
+	dt = (dt && req_disk && time_before(now, req_disk->start_jif + dt))
+		? req_disk->start_jif + dt : now + et;
 	nt = time_before(ent, dt) ? ent : dt;
 	spin_unlock_irq(&connection->resource->req_lock);
 	mod_timer(&device->request_timer, nt);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3978d9ec6f00..0ff8f4637741 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1368,6 +1368,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	/* this time, no connection->send.current_epoch_writes++;
 	 * If it was sent, it was the closing barrier for the last
@@ -1398,6 +1399,7 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	re_init_if_first_write(connection, req->epoch);
 	maybe_send_barrier(connection, req->epoch);
@@ -1426,6 +1428,7 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	/* Even read requests may close a write epoch,
 	 * if there was any yet. */
-- 
cgit v1.2.3


From 844a6ae7358df3261daec25e0d3a510f3d4152f2 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 22 Nov 2013 12:52:03 +0100
Subject: drbd: add lists to find oldest pending requests

Adding requests to per-device fifo lists as soon as possible after
allocating them leaves a simple list_first_entry_or_null() to find the
oldest request, regardless what it is still waiting for.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  | 11 ++++++++++-
 drivers/block/drbd/drbd_main.c |  7 ++++++-
 drivers/block/drbd/drbd_req.c  | 45 +++++++++++++++++++++++++++++++-----------
 3 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 08fa2dc8cdba..f29f107be9b8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -318,6 +318,10 @@ struct drbd_request {
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
 
+	/* see struct drbd_device */
+	struct list_head req_pending_master_completion;
+	struct list_head req_pending_local;
+
 	/* for generic IO accounting */
 	unsigned long start_jif;
 
@@ -738,7 +742,7 @@ struct submit_worker {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 
-	spinlock_t lock;
+	/* protected by ..->resource->req_lock */
 	struct list_head writes;
 };
 
@@ -795,6 +799,11 @@ struct drbd_device {
 	struct rb_root read_requests;
 	struct rb_root write_requests;
 
+	/* for statistics and timeouts */
+	/* [0] read, [1] write */
+	struct list_head pending_master_completion[2];
+	struct list_head pending_completion[2];
+
 	/* use checksums for *this* resync */
 	bool use_csums;
 	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0baec7a3fa81..58865969c9f4 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1934,6 +1934,10 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	INIT_LIST_HEAD(&device->resync_work.list);
 	INIT_LIST_HEAD(&device->unplug_work.list);
 	INIT_LIST_HEAD(&device->bm_io_work.w.list);
+	INIT_LIST_HEAD(&device->pending_master_completion[0]);
+	INIT_LIST_HEAD(&device->pending_master_completion[1]);
+	INIT_LIST_HEAD(&device->pending_completion[0]);
+	INIT_LIST_HEAD(&device->pending_completion[1]);
 
 	device->resync_work.cb  = w_resync_timer;
 	device->unplug_work.cb  = w_send_write_hint;
@@ -2268,6 +2272,8 @@ static void do_retry(struct work_struct *ws)
 	}
 }
 
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
 void drbd_restart_request(struct drbd_request *req)
 {
 	unsigned long flags;
@@ -2687,7 +2693,6 @@ static int init_submitter(struct drbd_device *device)
 		return -ENOMEM;
 
 	INIT_WORK(&device->submit.worker, do_submit);
-	spin_lock_init(&device->submit.lock);
 	INIT_LIST_HEAD(&device->submit.writes);
 	return 0;
 }
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 1319beab1b37..23cd909dc7f1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 
 	INIT_LIST_HEAD(&req->tl_requests);
 	INIT_LIST_HEAD(&req->w.list);
+	INIT_LIST_HEAD(&req->req_pending_master_completion);
+	INIT_LIST_HEAD(&req->req_pending_local);
 
 	/* one reference to be put by __drbd_make_request */
 	atomic_set(&req->completion_ref, 1);
@@ -120,12 +122,14 @@ void drbd_req_destroy(struct kref *kref)
 		return;
 	}
 
-	/* remove it from the transfer log.
-	 * well, only if it had been there in the first
-	 * place... if it had not (local only or conflicting
-	 * and never sent), it should still be "empty" as
-	 * initialized in drbd_req_new(), so we can list_del() it
-	 * here unconditionally */
+	/* If called from mod_rq_state (expected normal case) or
+	 * drbd_send_and_submit (the less likely normal path), this holds the
+	 * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+	 * though it may be still empty (never added to the transfer log).
+	 *
+	 * If called from do_retry(), we do NOT hold the req_lock, but we are
+	 * still allowed to unconditionally list_del(&req->tl_requests),
+	 * because it will be on a local on-stack list only. */
 	list_del_init(&req->tl_requests);
 
 	/* finally remove the request from the conflict detection
@@ -312,8 +316,15 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 
 	if (req->i.waiting)
 		wake_up(&device->misc_wait);
+
+	/* Either we are about to complete to upper layers,
+	 * or we will restart this request.
+	 * In either case, the request object will be destroyed soon,
+	 * so better remove it from all lists. */
+	list_del_init(&req->req_pending_master_completion);
 }
 
+/* still holds resource->req_lock */
 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
 {
 	struct drbd_device *device = req->device;
@@ -400,6 +411,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 			++k_put;
 		else
 			++c_put;
+		list_del_init(&req->req_pending_local);
 	}
 
 	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
@@ -1070,9 +1082,11 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 
 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
 {
-	spin_lock(&device->submit.lock);
+	spin_lock_irq(&device->resource->req_lock);
 	list_add_tail(&req->tl_requests, &device->submit.writes);
-	spin_unlock(&device->submit.lock);
+	list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[1 /* WRITE */]);
+	spin_unlock_irq(&device->resource->req_lock);
 	queue_work(device->submit.wq, &device->submit.worker);
 }
 
@@ -1186,8 +1200,15 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 			no_remote = true;
 	}
 
+	/* If it took the fast path in drbd_request_prepare, add it here.
+	 * The slow path has added it already. */
+	if (list_empty(&req->req_pending_master_completion))
+		list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[rw == WRITE]);
 	if (req->private_bio) {
 		/* needs to be marked within the same spinlock */
+		list_add_tail(&req->req_pending_local,
+			&device->pending_completion[rw == WRITE]);
 		_req_mod(req, TO_BE_SUBMITTED);
 		/* but we need to give up the spinlock to submit */
 		submit_private_bio = true;
@@ -1278,9 +1299,9 @@ void do_submit(struct work_struct *ws)
 	struct drbd_request *req, *tmp;
 
 	for (;;) {
-		spin_lock(&device->submit.lock);
+		spin_lock_irq(&device->resource->req_lock);
 		list_splice_tail_init(&device->submit.writes, &incoming);
-		spin_unlock(&device->submit.lock);
+		spin_unlock_irq(&device->resource->req_lock);
 
 		submit_fast_path(device, &incoming);
 		if (list_empty(&incoming))
@@ -1304,9 +1325,9 @@ skip_fast_path:
 			if (list_empty(&device->submit.writes))
 				break;
 
-			spin_lock(&device->submit.lock);
+			spin_lock_irq(&device->resource->req_lock);
 			list_splice_tail_init(&device->submit.writes, &more_incoming);
-			spin_unlock(&device->submit.lock);
+			spin_unlock_irq(&device->resource->req_lock);
 
 			if (list_empty(&more_incoming))
 				break;
-- 
cgit v1.2.3


From 7753a4c17f9e305ed19d8851e1a3154c8c9abaaf Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 22 Nov 2013 13:00:12 +0100
Subject: drbd: add caching oldest request pointers for replication stages

A request that is to be shipped to the peer goes through a few stages:
- queued
- sent, waiting for ack
- ack received, waiting for "barrier ack", which is re-order epoch being
  closed on the peer by acknowledging a "cache flush" equivalent
  on the lower level device.

In the later two stages, depending on protocol, we may have already
completed this request to the upper layers, so it won't be found anymore
on device->pending_master_completion[] lists.

Track the oldest request yet to be sent (req_next), the oldest not yet
acknowledged (req_ack_pending) and the oldest "still waiting for
something from the peer" (req_not_net_done), doing short list walks on
the transfer log to find the next pending one whenever such a request
makes progress.

Now we have a fast way to look up the oldest requests,
don't do a transfer log walk every time.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h |   7 ++
 drivers/block/drbd/drbd_req.c | 169 ++++++++++++++++++++++++++++++++----------
 2 files changed, 136 insertions(+), 40 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f29f107be9b8..fa010ea3a4bf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -720,6 +720,13 @@ struct drbd_connection {
 	struct drbd_thread worker;
 	struct drbd_thread asender;
 
+	/* cached pointers,
+	 * so we can look up the oldest pending requests more quickly.
+	 * protected by resource->req_lock */
+	struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+	struct drbd_request *req_ack_pending;
+	struct drbd_request *req_not_net_done;
+
 	/* sender side */
 	struct drbd_work_queue sender_work;
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 23cd909dc7f1..3f6a6ed2fd03 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -345,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
 	return 1;
 }
 
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_next == NULL)
+		connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_next != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if (s & RQ_NET_QUEUED)
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending == NULL)
+		connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done == NULL)
+		connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_not_net_done = req;
+}
+
 /* I'd like this to be the only place that manipulates
  * req->completion_ref and req->kref. */
 static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		int clear, int set)
 {
 	struct drbd_device *device = req->device;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
 	unsigned s = req->rq_state;
 	int c_put = 0;
 	int k_put = 0;
@@ -379,6 +458,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
 		atomic_inc(&req->completion_ref);
+		set_if_null_req_next(peer_device, req);
 	}
 
 	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
@@ -386,8 +466,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
 		/* potentially already completed in the asender thread */
-		if (!(s & RQ_NET_DONE))
+		if (!(s & RQ_NET_DONE)) {
 			atomic_add(req->i.size >> 9, &device->ap_in_flight);
+			set_if_null_req_not_net_done(peer_device, req);
+		}
+		if (s & RQ_NET_PENDING)
+			set_if_null_req_ack_pending(peer_device, req);
 	}
 
 	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
@@ -418,10 +502,13 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		dec_ap_pending(device);
 		++c_put;
 		req->acked_jif = jiffies;
+		advance_conn_req_ack_pending(peer_device, req);
 	}
 
-	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
 		++c_put;
+		advance_conn_req_next(peer_device, req);
+	}
 
 	if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
 		if (s & RQ_NET_SENT)
@@ -429,6 +516,13 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		if (s & RQ_EXP_BARR_ACK)
 			++k_put;
 		req->net_done_jif = jiffies;
+
+		/* in ahead/behind mode, or just in case,
+		 * before we finally destroy this request,
+		 * the caching pointers must not reference it anymore */
+		advance_conn_req_next(peer_device, req);
+		advance_conn_req_ack_pending(peer_device, req);
+		advance_conn_req_not_net_done(peer_device, req);
 	}
 
 	/* potentially complete and destroy */
@@ -1423,36 +1517,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	return limit;
 }
 
-static void find_oldest_requests(
-		struct drbd_connection *connection,
-		struct drbd_device *device,
-		struct drbd_request **oldest_req_waiting_for_peer,
-		struct drbd_request **oldest_req_waiting_for_disk)
-{
-	struct drbd_request *r;
-	*oldest_req_waiting_for_peer = NULL;
-	*oldest_req_waiting_for_disk = NULL;
-	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-		const unsigned s = r->rq_state;
-		if (!*oldest_req_waiting_for_peer
-		&& ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
-			*oldest_req_waiting_for_peer = r;
-
-		if (!*oldest_req_waiting_for_disk
-		&& (s & RQ_LOCAL_PENDING) && r->device == device)
-			*oldest_req_waiting_for_disk = r;
-
-		if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
-			break;
-	}
-}
-
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 	struct drbd_connection *connection = first_peer_device(device)->connection;
-	struct drbd_request *req_disk, *req_peer; /* oldest request */
+	struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
 	struct net_conf *nc;
+	unsigned long oldest_submit_jif;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
 
@@ -1473,14 +1544,31 @@ void request_timer_fn(unsigned long data)
 		return; /* Recurring timer stopped */
 
 	now = jiffies;
+	nt = now + et;
 
 	spin_lock_irq(&device->resource->req_lock);
-	find_oldest_requests(connection, device, &req_peer, &req_disk);
-	if (req_peer == NULL && req_disk == NULL) {
-		spin_unlock_irq(&device->resource->req_lock);
-		mod_timer(&device->request_timer, now + et);
-		return;
-	}
+	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+	req_peer = connection->req_not_net_done;
+	/* maybe the oldest request waiting for the peer is in fact still
+	 * blocking in tcp sendmsg */
+	if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+		req_peer = connection->req_next;
+
+	/* evaluate the oldest peer request only in one timer! */
+	if (req_peer && req_peer->device != device)
+		req_peer = NULL;
+
+	/* do we have something to evaluate? */
+	if (req_peer == NULL && req_write == NULL && req_read == NULL)
+		goto out;
+
+	oldest_submit_jif =
+		(req_write && req_read)
+		? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+		  ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+		: req_write ? req_write->pre_submit_jif
+		: req_read ? req_read->pre_submit_jif : now;
 
 	/* The request is considered timed out, if
 	 * - we have some effective timeout from the configuration,
@@ -1499,13 +1587,13 @@ void request_timer_fn(unsigned long data)
 	 * to expire twice (worst case) to become effective. Good enough.
 	 */
 	if (ent && req_peer &&
-		 time_after(now, req_peer->start_jif + ent) &&
+		 time_after(now, req_peer->pre_send_jif + ent) &&
 		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
 		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
 		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
 	}
-	if (dt && req_disk &&
-		 time_after(now, req_disk->start_jif + dt) &&
+	if (dt && oldest_submit_jif != now &&
+		 time_after(now, oldest_submit_jif + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
 		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
 		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1513,11 +1601,12 @@ void request_timer_fn(unsigned long data)
 
 	/* Reschedule timer for the nearest not already expired timeout.
 	 * Fallback to now + min(effective network timeout, disk timeout). */
-	ent = (ent && req_peer && time_before(now, req_peer->start_jif + ent))
-		? req_peer->start_jif + ent : now + et;
-	dt = (dt && req_disk && time_before(now, req_disk->start_jif + dt))
-		? req_disk->start_jif + dt : now + et;
+	ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+		? req_peer->pre_send_jif + ent : now + et;
+	dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+		? oldest_submit_jif + dt : now + et;
 	nt = time_before(ent, dt) ? ent : dt;
+out:
 	spin_unlock_irq(&connection->resource->req_lock);
 	mod_timer(&device->request_timer, nt);
 }
-- 
cgit v1.2.3


From ad3fee790088d36ad862e31535b5b99c25adeef4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 20 Dec 2013 11:22:13 +0100
Subject: drbd: improve throttling decisions of background resynchronisation

Background resynchronisation does some "side-stepping", or throttles
itself, if it detects application IO activity, and the current resync
rate estimate is above the configured "cmin-rate".

What was not detected: if there is no application IO,
because it blocks on activity log transactions.

Introduce a new atomic_t ap_actlog_cnt, tracking such blocked requests,
and count non-zero as application IO activity.
This counter is exposed at proc_details level 2 and above.

Also make sure to release the currently locked resync extent
if we side-step due to such voluntary throttling.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c   | 29 ++++++++++++++++++++++++++---
 drivers/block/drbd/drbd_int.h      |  4 +++-
 drivers/block/drbd/drbd_main.c     |  1 +
 drivers/block/drbd/drbd_proc.c     |  3 +++
 drivers/block/drbd/drbd_receiver.c | 19 ++++++++++++-------
 drivers/block/drbd/drbd_req.c      |  4 ++++
 drivers/block/drbd/drbd_worker.c   |  9 ++-------
 7 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d7e80663187c..6ce5c76d642b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -991,6 +991,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
 	struct lc_element *e;
 	struct bm_extent *bm_ext;
 	int i;
+	bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+	/* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+	 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+	 * need to throttle. There is at most one such half-locked extent,
+	 * which is remembered in resync_wenr. */
+
+	if (throttle && device->resync_wenr != enr)
+		return -EAGAIN;
 
 	spin_lock_irq(&device->al_lock);
 	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1014,8 +1023,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
 			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
 			clear_bit(BME_NO_WRITES, &bm_ext->flags);
 			device->resync_wenr = LC_FREE;
-			if (lc_put(device->resync, &bm_ext->lce) == 0)
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
 				device->resync_locked--;
+			}
 			wake_up(&device->al_wait);
 		} else {
 			drbd_alert(device, "LOGIC BUG\n");
@@ -1077,8 +1088,20 @@ proceed:
 	return 0;
 
 try_again:
-	if (bm_ext)
-		device->resync_wenr = enr;
+	if (bm_ext) {
+		if (throttle) {
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			device->resync_wenr = LC_FREE;
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
+				device->resync_locked--;
+			}
+			wake_up(&device->al_wait);
+		} else
+			device->resync_wenr = enr;
+	}
 	spin_unlock_irq(&device->al_lock);
 	return -EAGAIN;
 }
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index fa010ea3a4bf..81f4af49b8ac 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -797,6 +797,7 @@ struct drbd_device {
 	unsigned int al_writ_cnt;
 	unsigned int bm_writ_cnt;
 	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
 	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
 	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
 	atomic_t unacked_cnt;	 /* Need to send replies for */
@@ -1454,7 +1455,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting);
 extern int drbd_submit_peer_request(struct drbd_device *,
 				    struct drbd_peer_request *, const unsigned,
 				    const int);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 58865969c9f4..ad7c0e8843c4 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1909,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	drbd_set_defaults(device);
 
 	atomic_set(&device->ap_bio_cnt, 0);
+	atomic_set(&device->ap_actlog_cnt, 0);
 	atomic_set(&device->ap_pending_cnt, 0);
 	atomic_set(&device->rs_pending_cnt, 0);
 	atomic_set(&device->unacked_cnt, 0);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 9059d7bf8a36..06e6147c7601 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -335,6 +335,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			lc_seq_printf_stats(seq, device->act_log);
 			put_ldev(device);
 		}
+
+		if (proc_details >= 2)
+			seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
 	}
 	rcu_read_unlock();
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 7a1078d285dd..0d3cbd8e4b9c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2417,13 +2417,14 @@ out_interrupted:
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting)
 {
 	struct lc_element *tmp;
-	bool throttle = true;
+	bool throttle = drbd_rs_c_min_rate_throttle(device);
 
-	if (!drbd_rs_c_min_rate_throttle(device))
-		return false;
+	if (!throttle || throttle_if_app_is_waiting)
+		return throttle;
 
 	spin_lock_irq(&device->al_lock);
 	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2431,7 +2432,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
 		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
 		if (test_bit(BME_PRIORITY, &bm_ext->flags))
 			throttle = false;
-		/* Do not slow down if app IO is already waiting for this extent */
+		/* Do not slow down if app IO is already waiting for this extent,
+		 * and our progress is necessary for application IO to complete. */
 	}
 	spin_unlock_irq(&device->al_lock);
 
@@ -2456,7 +2458,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&device->rs_sect_ev);
-	if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
+
+	if (atomic_read(&device->ap_actlog_cnt)
+	    || !device->rs_last_events || curr_events - device->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
 
@@ -2646,7 +2650,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	 * we would also throttle its application reads.
 	 * In that case, throttling is done on the SyncTarget only.
 	 */
-	if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+	if (device->state.peer != R_PRIMARY
+	&& drbd_rs_should_slow_down(device, sector, false))
 		schedule_timeout_uninterruptible(HZ/10);
 	if (drbd_rs_begin_io(device, sector))
 		goto out_free_e;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3f6a6ed2fd03..74ebef101dc7 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1218,6 +1218,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 	if (rw == WRITE && req->private_bio && req->i.size
 	&& !test_bit(AL_SUSPENDED, &device->flags)) {
 		if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+			atomic_inc(&device->ap_actlog_cnt);
 			drbd_queue_write(device, req);
 			return NULL;
 		}
@@ -1354,6 +1355,7 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 
 			req->rq_state |= RQ_IN_ACT_LOG;
 			req->in_actlog_jif = jiffies;
+			atomic_dec(&device->ap_actlog_cnt);
 		}
 
 		list_del_init(&req->tl_requests);
@@ -1439,6 +1441,7 @@ skip_fast_path:
 		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
 			req->rq_state |= RQ_IN_ACT_LOG;
 			req->in_actlog_jif = jiffies;
+			atomic_dec(&device->ap_actlog_cnt);
 			list_del_init(&req->tl_requests);
 			drbd_send_and_submit(device, req);
 		}
@@ -1454,6 +1457,7 @@ skip_fast_path:
 			if (!was_cold) {
 				req->rq_state |= RQ_IN_ACT_LOG;
 				req->in_actlog_jif = jiffies;
+				atomic_dec(&device->ap_actlog_cnt);
 				/* Corresponding extent was hot after all? */
 				drbd_send_and_submit(device, req);
 			} else {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0ff8f4637741..48975a264985 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -395,9 +395,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 	if (!get_ldev(device))
 		return -EIO;
 
-	if (drbd_rs_should_slow_down(device, sector))
-		goto defer;
-
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -651,8 +648,7 @@ next_sector:
 
 		sector = BM_BIT_TO_SECT(bit);
 
-		if (drbd_rs_should_slow_down(device, sector) ||
-		    drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(device, sector)) {
 			device->bm_resync_fo = bit;
 			goto requeue;
 		}
@@ -783,8 +779,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 
 		size = BM_BLOCK_SIZE;
 
-		if (drbd_rs_should_slow_down(device, sector) ||
-		    drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(device, sector)) {
 			device->ov_position = sector;
 			goto requeue;
 		}
-- 
cgit v1.2.3


From 21ae5d7f95aa1a64f35b03c91f8714ced3ab61a9 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 5 May 2014 23:42:24 +0200
Subject: drbd: track timing details of peer_requests

To be able to present timing details in debugfs,
we need to track preparation/submit times of peer requests.

Track peer request flags early,
before they are put on the epoch_entry lists.

Waiting for activity log transactions may be a major latency factor.
We want to be able to present the peer_request state accurately in
debugfs, and what it is waiting for.

Consistently mark/unmark peer requests with EE_CALL_AL_COMPLETE_IO.
Set it only *after* calling drbd_al_begin_io(),
clear it as soon as we call drbd_al_complete_io().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 15 ++++++++
 drivers/block/drbd/drbd_receiver.c | 78 +++++++++++++++++++++++---------------
 drivers/block/drbd/drbd_worker.c   |  1 +
 3 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 81f4af49b8ac..c7a409f3aaf8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -422,6 +422,7 @@ struct drbd_peer_request {
 	struct drbd_interval i;
 	/* see comments on ee flag bits below */
 	unsigned long flags;
+	unsigned long submit_jif;
 	union {
 		u64 block_id;
 		struct digest_info *digest;
@@ -464,6 +465,17 @@ enum {
 
 	/* Is set when net_conf had two_primaries set while creating this peer_req */
 	__EE_IN_INTERVAL_TREE,
+
+	/* for debugfs: */
+	/* has this been submitted, or does it still wait for something else? */
+	__EE_SUBMITTED,
+
+	/* this is/was a write request */
+	__EE_WRITE,
+
+	/* this originates from application on peer
+	 * (not some resync or verify or other DRBD internal request) */
+	__EE_APPLICATION,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -475,6 +487,9 @@ enum {
 #define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
 #define EE_SEND_WRITE_ACK	(1<<__EE_SEND_WRITE_ACK)
 #define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED		(1<<__EE_SUBMITTED)
+#define EE_WRITE		(1<<__EE_WRITE)
+#define EE_APPLICATION		(1<<__EE_APPLICATION)
 
 /* flag bits per device */
 enum {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 0d3cbd8e4b9c..2f67dc03d403 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -389,11 +389,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 		       int is_net)
 {
+	might_sleep();
 	if (peer_req->flags & EE_HAS_DIGEST)
 		kfree(peer_req->digest);
 	drbd_free_pages(device, peer_req->pages, is_net);
 	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+	if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+		drbd_al_complete_io(device, &peer_req->i);
+	}
 	mempool_free(peer_req, drbd_ee_mempool);
 }
 
@@ -1372,6 +1377,8 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		conn_wait_active_ee_empty(first_peer_device(device)->connection);
 		/* add it to the active list now,
 		 * so we can find it to present it in debugfs */
+		peer_req->submit_jif = jiffies;
+		peer_req->flags |= EE_SUBMITTED;
 		spin_lock_irq(&device->resource->req_lock);
 		list_add_tail(&peer_req->w.list, &device->active_ee);
 		spin_unlock_irq(&device->resource->req_lock);
@@ -1443,6 +1450,9 @@ submit:
 	D_ASSERT(device, page == NULL);
 
 	atomic_set(&peer_req->pending_bios, n_bios);
+	/* for debugfs: update timestamp, mark as submitted */
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_SUBMITTED;
 	do {
 		bio = bios;
 		bios = bios->bi_next;
@@ -1624,6 +1634,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	if (!peer_req)
 		return NULL;
 
+	peer_req->flags |= EE_WRITE;
 	if (trim)
 		return peer_req;
 
@@ -1780,6 +1791,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	 * respective _drbd_clear_done_ee */
 
 	peer_req->w.cb = e_end_resync_block;
+	peer_req->submit_jif = jiffies;
 
 	spin_lock_irq(&device->resource->req_lock);
 	list_add_tail(&peer_req->w.list, &device->sync_ee);
@@ -2196,7 +2208,6 @@ static int handle_write_conflicts(struct drbd_device *device,
 					  (unsigned long long)sector, size,
 					  superseded ? "local" : "remote");
 
-			inc_unacked(device);
 			peer_req->w.cb = superseded ? e_send_superseded :
 						   e_send_retry_write;
 			list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2255,6 +2266,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 {
 	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
+	struct net_conf *nc;
 	sector_t sector;
 	struct drbd_peer_request *peer_req;
 	struct p_data *p = pi->data;
@@ -2294,6 +2306,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	}
 
 	peer_req->w.cb = e_end_block;
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_APPLICATION;
 
 	dp_flags = be32_to_cpu(p->dp_flags);
 	rw |= wire_flags_to_bio(dp_flags);
@@ -2320,9 +2334,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	spin_unlock(&connection->epoch_lock);
 
 	rcu_read_lock();
-	tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	tp = nc->two_primaries;
+	if (peer_device->connection->agreed_pro_version < 100) {
+		switch (nc->wire_protocol) {
+		case DRBD_PROT_C:
+			dp_flags |= DP_SEND_WRITE_ACK;
+			break;
+		case DRBD_PROT_B:
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+			break;
+		}
+	}
 	rcu_read_unlock();
+
+	if (dp_flags & DP_SEND_WRITE_ACK) {
+		peer_req->flags |= EE_SEND_WRITE_ACK;
+		inc_unacked(device);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+	}
+
+	if (dp_flags & DP_SEND_RECEIVE_ACK) {
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+	}
+
 	if (tp) {
+		/* two primaries implies protocol C */
+		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
 		peer_req->flags |= EE_IN_INTERVAL_TREE;
 		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
 		if (err)
@@ -2352,38 +2393,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	if (device->state.conn == C_SYNC_TARGET)
 		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
 
-	if (peer_device->connection->agreed_pro_version < 100) {
-		rcu_read_lock();
-		switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
-		case DRBD_PROT_C:
-			dp_flags |= DP_SEND_WRITE_ACK;
-			break;
-		case DRBD_PROT_B:
-			dp_flags |= DP_SEND_RECEIVE_ACK;
-			break;
-		}
-		rcu_read_unlock();
-	}
-
-	if (dp_flags & DP_SEND_WRITE_ACK) {
-		peer_req->flags |= EE_SEND_WRITE_ACK;
-		inc_unacked(device);
-		/* corresponding dec_unacked() in e_end_block()
-		 * respective _drbd_clear_done_ee */
-	}
-
-	if (dp_flags & DP_SEND_RECEIVE_ACK) {
-		/* I really don't like it that the receiver thread
-		 * sends on the msock, but anyways */
-		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
-	}
-
 	if (device->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
 		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
-		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
 		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
 		drbd_al_begin_io(device, &peer_req->i);
+		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
 	}
 
 	err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2396,8 +2411,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	list_del(&peer_req->w.list);
 	drbd_remove_epoch_entry_interval(device, peer_req);
 	spin_unlock_irq(&device->resource->req_lock);
-	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 		drbd_al_complete_io(device, &peer_req->i);
+	}
 
 out_interrupted:
 	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2561,6 +2578,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		peer_req->w.cb = w_e_end_data_req;
 		fault_type = DRBD_FAULT_DT_RD;
 		/* application IO, don't drbd_rs_begin_io */
+		peer_req->flags |= EE_APPLICATION;
 		goto submit;
 
 	case P_RS_DATA_REQUEST:
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 48975a264985..b908e9b3f63e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -132,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 	i = peer_req->i;
 	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
 	block_id = peer_req->block_id;
+	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	device->writ_cnt += peer_req->i.size >> 9;
-- 
cgit v1.2.3


From c5a2c1509ea55d074bf0cd8ff0679b247f70cbe3 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 8 May 2014 10:08:05 +0200
Subject: drbd: register peer requests on read_ee early

Initialize peer_request with timestamp and proper empty list head.
Add peer_request to list early, so debugfs can find this request and
report it as "preparing", even if we sleep before we actually submit it.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 2f67dc03d403..42e383513bfc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 			goto fail;
 	}
 
+	memset(peer_req, 0, sizeof(*peer_req));
+	INIT_LIST_HEAD(&peer_req->w.list);
 	drbd_clear_interval(&peer_req->i);
 	peer_req->i.size = data_size;
 	peer_req->i.sector = sector;
-	peer_req->i.local = false;
-	peer_req->i.waiting = false;
-
-	peer_req->epoch = NULL;
+	peer_req->submit_jif = jiffies;
 	peer_req->peer_device = peer_device;
 	peer_req->pages = page;
-	atomic_set(&peer_req->pending_bios, 0);
-	peer_req->flags = 0;
 	/*
 	 * The block_id is opaque to the receiver.  It is not endianness
 	 * converted, and sent back to the sender unchanged.
@@ -2668,6 +2665,15 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	 * we would also throttle its application reads.
 	 * In that case, throttling is done on the SyncTarget only.
 	 */
+
+	/* Even though this may be a resync request, we do add to "read_ee";
+	 * "sync_ee" is only used for resync WRITEs.
+	 * Add to list early, so debugfs can find this request
+	 * even if we have to sleep below. */
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
 	if (device->state.peer != R_PRIMARY
 	&& drbd_rs_should_slow_down(device, sector, false))
 		schedule_timeout_uninterruptible(HZ/10);
@@ -2679,21 +2685,18 @@ submit_for_resync:
 
 submit:
 	inc_unacked(device);
-	spin_lock_irq(&device->resource->req_lock);
-	list_add_tail(&peer_req->w.list, &device->read_ee);
-	spin_unlock_irq(&device->resource->req_lock);
-
 	if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
 		return 0;
 
 	/* don't care for the reason here */
 	drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
 	spin_lock_irq(&device->resource->req_lock);
 	list_del(&peer_req->w.list);
 	spin_unlock_irq(&device->resource->req_lock);
 	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
 
-out_free_e:
 	put_ldev(device);
 	drbd_free_peer_req(device, peer_req);
 	return -EIO;
-- 
cgit v1.2.3


From 4ce4926683b820c5c85b8033891dbfb53cc8754f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 6 May 2014 00:44:59 +0200
Subject: drbd: track details of bitmap IO

Track start and submit time of bitmap operations, and
add pending bitmap IO contexts to a new pending_bitmap_io list.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c | 70 +++++++++++++++++++---------------------
 drivers/block/drbd/drbd_int.h    | 16 +++++++++
 drivers/block/drbd/drbd_main.c   |  1 +
 3 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 29af96038e69..426c97aef900 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -928,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
 	spin_unlock_irq(&b->bm_lock);
 }
 
-struct bm_aio_ctx {
-	struct drbd_device *device;
-	atomic_t in_flight;
-	unsigned int done;
-	unsigned flags;
-#define BM_AIO_COPY_PAGES	1
-#define BM_AIO_WRITE_HINTED	2
-#define BM_WRITE_ALL_PAGES	4
-	int error;
-	struct kref kref;
-};
-
-static void bm_aio_ctx_destroy(struct kref *kref)
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 {
-	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+	struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+	unsigned long flags;
 
+	spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+	list_del(&ctx->list);
+	spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
 	put_ldev(ctx->device);
 	kfree(ctx);
 }
@@ -951,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -994,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
 	if (atomic_dec_and_test(&ctx->in_flight)) {
 		ctx->done = 1;
 		wake_up(&device->misc_wait);
-		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 	}
 }
 
-static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
 	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	struct page *page;
 	unsigned int len;
+	unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
 
 	sector_t on_disk_sector =
 		device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1050,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-	struct bm_aio_ctx *ctx;
+	struct drbd_bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = device->bitmap;
 	int num_pages, i, count = 0;
 	unsigned long now;
@@ -1068,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 	 * as we submit copies of pages anyways.
 	 */
 
-	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
 	if (!ctx)
 		return -ENOMEM;
 
-	*ctx = (struct bm_aio_ctx) {
+	*ctx = (struct drbd_bm_aio_ctx) {
 		.device = device,
+		.start_jif = jiffies,
 		.in_flight = ATOMIC_INIT(1),
 		.done = 0,
 		.flags = flags,
@@ -1081,7 +1075,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
 		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
 		kfree(ctx);
 		return -ENODEV;
@@ -1089,9 +1083,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
 	   drbd_adm_attach(), after device->ldev was assigned. */
 
-	if (!ctx->flags)
+	if (0 == (ctx->flags & ~BM_AIO_READ))
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&ctx->list, &device->pending_bitmap_io);
+	spin_unlock_irq(&device->resource->req_lock);
+
 	num_pages = b->bm_number_of_pages;
 
 	now = jiffies;
@@ -1101,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		/* ignore completely unchanged pages */
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
-		if (rw & WRITE) {
+		if (!(flags & BM_AIO_READ)) {
 			if ((flags & BM_AIO_WRITE_HINTED) &&
 			    !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
 				    &page_private(b->bm_pages[i])))
 				continue;
 
-			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
 			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
 				continue;
@@ -1121,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 			}
 		}
 		atomic_inc(&ctx->in_flight);
-		bm_page_io_async(ctx, i, rw);
+		bm_page_io_async(ctx, i);
 		++count;
 		cond_resched();
 	}
@@ -1137,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 	if (!atomic_dec_and_test(&ctx->in_flight))
 		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
 	else
-		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 
 	/* summary for global bitmap IO */
 	if (flags == 0)
 		drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
-			 rw == WRITE ? "WRITE" : "READ",
+			 (flags & BM_AIO_READ) ? "READ" : "WRITE",
 			 count, jiffies - now);
 
 	if (ctx->error) {
@@ -1155,18 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		err = -EIO; /* Disk timeout/force-detach during IO... */
 
 	now = jiffies;
-	if (rw == READ) {
+	if (flags & BM_AIO_READ) {
 		b->bm_set = bm_count_bits(b);
 		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
 		     jiffies - now);
 	}
 	now = b->bm_set;
 
-	if (flags == 0)
+	if ((flags & ~BM_AIO_READ) == 0)
 		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
-	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 	return err;
 }
 
@@ -1176,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
  */
 int drbd_bm_read(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, READ, 0, 0);
+	return bm_rw(device, BM_AIO_READ, 0);
 }
 
 /**
@@ -1187,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, 0, 0);
+	return bm_rw(device, 0, 0);
 }
 
 /**
@@ -1198,7 +1196,7 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
 }
 
 /**
@@ -1224,7 +1222,7 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho
  */
 int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
 
 /**
@@ -1233,7 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+	return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 }
 
 /* NOTE
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c7a409f3aaf8..40c816ce8d75 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -777,6 +777,7 @@ struct drbd_peer_device {
 struct drbd_device {
 	struct drbd_resource *resource;
 	struct list_head peer_devices;
+	struct list_head pending_bitmap_io;
 	int vnr;			/* volume number within the connection */
 	struct kref kref;
 
@@ -918,6 +919,21 @@ struct drbd_device {
 	struct submit_worker submit;
 };
 
+struct drbd_bm_aio_ctx {
+	struct drbd_device *device;
+	struct list_head list; /* on device->pending_bitmap_io */;
+	unsigned long start_jif;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_AIO_WRITE_ALL_PAGES	4
+#define BM_AIO_READ		8
+	int error;
+	struct kref kref;
+};
+
 struct drbd_config_context {
 	/* assigned from drbd_genlmsghdr */
 	unsigned int minor;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ad7c0e8843c4..922d631c10b3 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2793,6 +2793,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	kref_get(&device->kref);
 
 	INIT_LIST_HEAD(&device->peer_devices);
+	INIT_LIST_HEAD(&device->pending_bitmap_io);
 	for_each_connection(connection, resource) {
 		peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
 		if (!peer_device)
-- 
cgit v1.2.3


From 4d3d5aa83aa45f1c7c9644b30e3a67e42c26695f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 2 May 2014 13:19:51 +0200
Subject: drbd: debugfs: add basic hierarchy

Add new debugfs hierarchy /sys/kernel/debug/
  drbd/
    resources/
      $resource_name/connections/peer/$volume_number/
      $resource_name/volumes/$volume_number/
    minors/$minor_number -> ../resources/$resource_name/volumes/$volume_number/

Followup commits will populate this hierarchy with files containing
statistics, diagnostic information and some attribute data.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/Makefile       |   1 +
 drivers/block/drbd/drbd_debugfs.c | 191 ++++++++++++++++++++++++++++++++++++++
 drivers/block/drbd/drbd_debugfs.h |  39 ++++++++
 drivers/block/drbd/drbd_int.h     |  30 +++++-
 drivers/block/drbd/drbd_main.c    |  22 ++++-
 5 files changed, 278 insertions(+), 5 deletions(-)
 create mode 100644 drivers/block/drbd/drbd_debugfs.c
 create mode 100644 drivers/block/drbd/drbd_debugfs.h

diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 8b450338075e..4464e353c1e8 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000000..9b120c3d9703
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,191 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+	struct dentry *dentry;
+	if (!drbd_debugfs_resources)
+		return;
+
+	dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res = dentry;
+
+	dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_volumes = dentry;
+
+	dentry = debugfs_create_dir("connections", resource->debugfs_res);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_connections = dentry;
+
+	return;
+
+fail:
+	drbd_debugfs_resource_cleanup(resource);
+	drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+	debugfs_remove(*dp);
+	*dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+	/* it is ok to call debugfs_remove(NULL) */
+	drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+	drbd_debugfs_remove(&resource->debugfs_res_connections);
+	drbd_debugfs_remove(&resource->debugfs_res_volumes);
+	drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+	struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+	struct dentry *dentry;
+	if (!conns_dir)
+		return;
+
+	/* Once we enable mutliple peers,
+	 * these connections will have descriptive names.
+	 * For now, it is just the one connection to the (only) "peer". */
+	dentry = debugfs_create_dir("peer", conns_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn = dentry;
+	return;
+
+fail:
+	drbd_debugfs_connection_cleanup(connection);
+	drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+	drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+	drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+	drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+	struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+	char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+	char vnr_buf[8];   /* volume number vnr is even 16 bit only; */
+	char *slink_name = NULL;
+
+	struct dentry *dentry;
+	if (!vols_dir || !drbd_debugfs_minors)
+		return;
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, vols_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	device->debugfs_vol = dentry;
+
+	snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+	slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+			device->resource->name, device->vnr);
+	if (!slink_name)
+		goto fail;
+	dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	device->debugfs_minor = dentry;
+	kfree(slink_name);
+
+fail:
+	drbd_debugfs_device_cleanup(device);
+	drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+	drbd_debugfs_remove(&device->debugfs_minor);
+	drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+	drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+	drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+	struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+	struct dentry *dentry;
+	char vnr_buf[8];
+
+	if (!conn_dir)
+		return;
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, conn_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	peer_device->debugfs_peer_dev = dentry;
+	return;
+
+fail:
+	drbd_debugfs_peer_device_cleanup(peer_device);
+	drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+	drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+	drbd_debugfs_remove(&drbd_debugfs_resources);
+	drbd_debugfs_remove(&drbd_debugfs_minors);
+	drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+	struct dentry *dentry;
+
+	dentry = debugfs_create_dir("drbd", NULL);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_root = dentry;
+
+	dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_resources = dentry;
+
+	dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_minors = dentry;
+	return 0;
+
+fail:
+	drbd_debugfs_cleanup();
+	if (dentry)
+		return PTR_ERR(dentry);
+	else
+		return -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000000..8bee21340dce
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 40c816ce8d75..20f2b38e97b9 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -670,6 +670,12 @@ enum {
 
 struct drbd_resource {
 	char *name;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_res;
+	struct dentry *debugfs_res_volumes;
+	struct dentry *debugfs_res_connections;
+	struct dentry *debugfs_res_in_flight_summary;
+#endif
 	struct kref kref;
 	struct idr devices;		/* volume number to device mapping */
 	struct list_head connections;
@@ -691,6 +697,11 @@ struct drbd_resource {
 struct drbd_connection {
 	struct list_head connections;
 	struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_conn;
+	struct dentry *debugfs_conn_callback_history;
+	struct dentry *debugfs_conn_oldest_requests;
+#endif
 	struct kref kref;
 	struct idr peer_devices;	/* volume number to peer device mapping */
 	enum drbd_conns cstate;		/* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -772,13 +783,29 @@ struct drbd_peer_device {
 	struct list_head peer_devices;
 	struct drbd_device *device;
 	struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_peer_dev;
+#endif
 };
 
 struct drbd_device {
 	struct drbd_resource *resource;
 	struct list_head peer_devices;
 	struct list_head pending_bitmap_io;
-	int vnr;			/* volume number within the connection */
+
+	unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_minor;
+	struct dentry *debugfs_vol;
+	struct dentry *debugfs_vol_oldest_requests;
+	struct dentry *debugfs_vol_act_log_extents;
+	struct dentry *debugfs_vol_resync_extents;
+	struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+	unsigned int vnr;	/* volume number within the connection */
+	unsigned int minor;	/* device minor number */
+
 	struct kref kref;
 
 	/* things that are stored as / read from meta data on disk */
@@ -895,7 +922,6 @@ struct drbd_device {
 	atomic_t packet_seq;
 	unsigned int peer_seq;
 	spinlock_t peer_seq_lock;
-	unsigned int minor;
 	unsigned long comm_bm_set; /* communicated number of set bits. */
 	struct bm_io_work bm_io_work;
 	u64 ed_uuid; /* UUID of the exposed data */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 922d631c10b3..01de57ec0110 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -57,8 +57,8 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
-
 #include "drbd_vli.h"
+#include "drbd_debugfs.h"
 
 static DEFINE_MUTEX(drbd_main_mutex);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
@@ -2308,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)
 
 	for_each_connection_safe(connection, tmp, resource) {
 		list_del(&connection->connections);
+		drbd_debugfs_connection_cleanup(connection);
 		kref_put(&connection->kref, drbd_destroy_connection);
 	}
+	drbd_debugfs_resource_cleanup(resource);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2334,6 +2336,7 @@ static void drbd_cleanup(void)
 		destroy_workqueue(retry.wq);
 
 	drbd_genl_unregister();
+	drbd_debugfs_cleanup();
 
 	idr_for_each_entry(&drbd_devices, device, i)
 		drbd_delete_device(device);
@@ -2583,6 +2586,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	mutex_init(&resource->conf_update);
 	mutex_init(&resource->adm_mutex);
 	spin_lock_init(&resource->req_lock);
+	drbd_debugfs_resource_add(resource);
 	return resource;
 
 fail_free_name:
@@ -2593,7 +2597,7 @@ fail:
 	return NULL;
 }
 
-/* caller must be under genl_lock() */
+/* caller must be under adm_mutex */
 struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 {
 	struct drbd_resource *resource;
@@ -2651,6 +2655,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 
 	kref_get(&resource->kref);
 	list_add_tail_rcu(&connection->connections, &resource->connections);
+	drbd_debugfs_connection_add(connection);
 	return connection;
 
 fail_resource:
@@ -2829,7 +2834,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 		for_each_peer_device(peer_device, device)
 			drbd_connected(peer_device);
 	}
-
+	/* move to create_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_add(peer_device);
+	drbd_debugfs_device_add(device);
 	return NO_ERROR;
 
 out_idr_remove_vol:
@@ -2868,8 +2876,13 @@ void drbd_delete_device(struct drbd_device *device)
 {
 	struct drbd_resource *resource = device->resource;
 	struct drbd_connection *connection;
+	struct drbd_peer_device *peer_device;
 	int refs = 3;
 
+	/* move to free_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_cleanup(peer_device);
+	drbd_debugfs_device_cleanup(device);
 	for_each_connection(connection, resource) {
 		idr_remove(&connection->peer_devices, device->vnr);
 		refs++;
@@ -2938,6 +2951,9 @@ static int __init drbd_init(void)
 	spin_lock_init(&retry.lock);
 	INIT_LIST_HEAD(&retry.writes);
 
+	if (drbd_debugfs_init())
+		pr_notice("failed to initialize debugfs -- will not be available\n");
+
 	pr_info("initialized. "
 	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
 	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
-- 
cgit v1.2.3


From db1866ffeed2e142208a801f7598326b92ebf7c5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 2 May 2014 13:20:05 +0200
Subject: drbd: debugfs: add in_flight_summary data

To help diagnosing "high latency" or "hung" IO situations on DRBD,
present per drbd resource group a summary of operations currently in progress.

First item is a list of oldest drbd_request objects
waiting for various things:
 * still being prepared
 * waiting for activity log transaction
 * waiting for local disk
 * waiting to be sent
 * waiting for peer acknowledgement ("receive ack", "write ack")
 * waiting for peer epoch acknowledgement ("barrier ack")

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 196 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 196 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 9b120c3d9703..d393aee37da6 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -2,7 +2,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/stat.h>
+#include <linux/jiffies.h>
 #include <linux/list.h>
 
 #include "drbd_int.h"
@@ -13,6 +15,194 @@ static struct dentry *drbd_debugfs_root;
 static struct dentry *drbd_debugfs_resources;
 static struct dentry *drbd_debugfs_minors;
 
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+	if (valid)
+		seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+	else
+		seq_printf(m, "\t-");
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *name)
+{
+	if (!is_set)
+		return;
+	seq_putc(m, *sep);
+	seq_puts(m, name);
+	*sep = '|';
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+	unsigned int s = req->rq_state;
+	char sep = ' ';
+	seq_printf(m, "\t0x%08x", s);
+	seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+	/* RQ_WRITE ignored, already reported */
+	seq_puts(m, "\tlocal:");
+	seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+	seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+	seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	/* for_each_connection ... */
+	seq_printf(m, "\tnet:");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+	seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+	seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+	seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+	seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	seq_printf(m, " :");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+	seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+	if (sep == ' ')
+		seq_puts(m, " -");
+	seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	/* change anything here, fixup header below! */
+	unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+	seq_printf(m, "0x%x\t%llu\t%u\t%s",
+		req->epoch,
+		(unsigned long long)req->i.sector, req->i.size >> 9,
+		(s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+	seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+	seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+	seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+	seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+	seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+	seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+	seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+	seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+	struct drbd_resource *resource,
+	struct drbd_connection *connection,
+	unsigned long now)
+{
+	struct drbd_request *req;
+	unsigned int count = 0;
+	unsigned int show_state = 0;
+
+	seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+		unsigned int tmp = 0;
+		unsigned int s;
+		++count;
+
+		/* don't disable irq "forever" */
+		if (!(count & 0x1ff)) {
+			struct drbd_request *req_next;
+			kref_get(&req->kref);
+			spin_unlock_irq(&resource->req_lock);
+			cond_resched();
+			spin_lock_irq(&resource->req_lock);
+			req_next = list_next_entry(req, tl_requests);
+			if (kref_put(&req->kref, drbd_req_destroy))
+				req = req_next;
+			if (&req->tl_requests == &connection->transfer_log)
+				break;
+		}
+
+		s = req->rq_state;
+
+		/* This is meant to summarize timing issues, to be able to tell
+		 * local disk problems from network problems.
+		 * Skip requests, if we have shown an even older request with
+		 * similar aspects already.  */
+		if (req->master_bio == NULL)
+			tmp |= 1;
+		if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+			tmp |= 2;
+		if (s & RQ_NET_MASK) {
+			if (!(s & RQ_NET_SENT))
+				tmp |= 4;
+			if (s & RQ_NET_PENDING)
+				tmp |= 8;
+			if (!(s & RQ_NET_DONE))
+				tmp |= 16;
+		}
+		if ((tmp & show_state) == tmp)
+			continue;
+		show_state |= tmp;
+		seq_printf(m, "%u\t", count);
+		seq_print_minor_vnr_req(m, req, now);
+		if (show_state == 0x1f)
+			break;
+	}
+	spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+	struct drbd_resource *resource = m->private;
+	struct drbd_connection *connection;
+	unsigned long jif = jiffies;
+
+	connection = first_connection(resource);
+	/* This does not happen, actually.
+	 * But be robust and prepare for future code changes. */
+	if (!connection)
+		return 0;
+
+	seq_puts(m, "oldest application requests\n");
+	seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+	seq_putc(m, '\n');
+
+	jif = jiffies - jif;
+	if (jif)
+		seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+	return 0;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, in_flight_summary_show, inode->i_private);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+	.owner		= THIS_MODULE,
+	.open		= in_flight_summary_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 void drbd_debugfs_resource_add(struct drbd_resource *resource)
 {
 	struct dentry *dentry;
@@ -34,6 +224,12 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource)
 		goto fail;
 	resource->debugfs_res_connections = dentry;
 
+	dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+			resource->debugfs_res, resource,
+			&in_flight_summary_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_in_flight_summary = dentry;
 	return;
 
 fail:
-- 
cgit v1.2.3


From 4a521cca9b9b2e943efb86645540aecccfeab0fc Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 5 May 2014 12:05:54 +0000
Subject: drbd: debugfs: deal with destructor racing with open of debugfs file

Try to close the race between open() and debugfs_remove_recursive()
from inside an object destructor.
Once open succeeds, the object should stay around.
Open should not succeed if the object has already reached its destructor.

This may be overkill, but to make that happen, we check for existence of
a parent directory, "stale-ness" of "this" dentry, and serialize
kref_get_unless_zero() on the outermost object relevant for this file
with d_delete() on this dentry (using the parent's i_mutex).

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 58 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index d393aee37da6..51c64ec7bbc1 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -177,8 +177,8 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	connection = first_connection(resource);
 	/* This does not happen, actually.
 	 * But be robust and prepare for future code changes. */
-	if (!connection)
-		return 0;
+	if (!connection || !kref_get_unless_zero(&connection->kref))
+		return -ESTALE;
 
 	seq_puts(m, "oldest application requests\n");
 	seq_print_resource_transfer_log_summary(m, resource, connection, jif);
@@ -187,12 +187,62 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	jif = jiffies - jif;
 	if (jif)
 		seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+	kref_put(&connection->kref, drbd_destroy_connection);
 	return 0;
 }
 
+/* simple_positive(file->f_dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it above.  :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+        return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+		                void *data, struct kref *kref,
+				void (*release)(struct kref *))
+{
+	struct dentry *parent;
+	int ret = -ESTALE;
+
+	/* Are we still linked,
+	 * or has debugfs_remove() already been called? */
+	parent = file->f_dentry->d_parent;
+	/* not sure if this can happen: */
+	if (!parent || !parent->d_inode)
+		goto out;
+	/* serialize with d_delete() */
+	mutex_lock(&parent->d_inode->i_mutex);
+	if (!debugfs_positive(file->f_dentry))
+		goto out_unlock;
+	/* Make sure the object is still alive */
+	if (kref_get_unless_zero(kref))
+		ret = 0;
+out_unlock:
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (!ret) {
+		ret = single_open(file, show, data);
+		if (ret)
+			kref_put(kref, release);
+	}
+out:
+	return ret;
+}
+
 static int in_flight_summary_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, in_flight_summary_show, inode->i_private);
+	struct drbd_resource *resource = inode->i_private;
+	return drbd_single_open(file, in_flight_summary_show, resource,
+				&resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+	struct drbd_resource *resource = inode->i_private;
+	kref_put(&resource->kref, drbd_destroy_resource);
+	return single_release(inode, file);
 }
 
 static const struct file_operations in_flight_summary_fops = {
@@ -200,7 +250,7 @@ static const struct file_operations in_flight_summary_fops = {
 	.open		= in_flight_summary_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= in_flight_summary_release,
 };
 
 void drbd_debugfs_resource_add(struct drbd_resource *resource)
-- 
cgit v1.2.3


From f418815f7adad4917e92e9d11fdc1ca21cd616a1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 5 May 2014 23:05:47 +0200
Subject: drbd: debugfs: Add in_flight_summary

* Add details about pending meta data operations to in_flight_summary.

* Report number of requests waiting for activity log transactions.

* timing details of peer_requests to in_flight_summary.

* FLUSH details
  DRBD devides the incoming request stream into "epochs",
  in which peers are allowed to re-order writes independendly.

  These epochs are separated by P_BARRIER on the replication link.
  Such barrier packets, depending on configuration, may cause
  the receiving side to drain the lower level device request queues
  and call blkdev_issue_flush().

  This is known to be an other major source of latency in DRBD.

  Track timing details of calls to blkdev_issue_flush(),
  and add them to in_flight_summary.

* data socket stats
  To be able to diagnose bottlenecks and root causes of "slow" IO on DRBD,
  it is useful to see network buffer stats along with the timing details of
  requests, peer requests, and meta data IO.

* pending bitmap IO timing details to in_flight_summary.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c  | 231 ++++++++++++++++++++++++++++++++++++-
 drivers/block/drbd/drbd_int.h      |   3 +
 drivers/block/drbd/drbd_main.c     |  33 +++---
 drivers/block/drbd/drbd_receiver.c |   8 ++
 4 files changed, 255 insertions(+), 20 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 51c64ec7bbc1..12d072d9751f 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -23,14 +23,24 @@ static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long
 		seq_printf(m, "\t-");
 }
 
+static void __seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+	if (is_set && set_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, set_name);
+		*sep = '|';
+	} else if (!is_set && unset_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, unset_name);
+		*sep = '|';
+	}
+}
+
 static void seq_print_rq_state_bit(struct seq_file *m,
-	bool is_set, char *sep, const char *name)
+	bool is_set, char *sep, const char *set_name)
 {
-	if (!is_set)
-		return;
-	seq_putc(m, *sep);
-	seq_puts(m, name);
-	*sep = '|';
+	__seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
 }
 
 /* pretty print enum drbd_req_state_bits req->rq_state */
@@ -108,6 +118,184 @@ static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req
 	seq_print_one_request(m, req, now);
 }
 
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		struct drbd_md_io tmp;
+		/* In theory this is racy,
+		 * in the sense that there could have been a
+		 * drbd_md_put_buffer(); drbd_md_get_buffer();
+		 * between accessing these members here.  */
+		tmp = device->md_io;
+		if (atomic_read(&tmp.in_use)) {
+			seq_printf(m, "%u\t%u\t%d\t",
+				device->minor, device->vnr,
+				jiffies_to_msecs(now - tmp.start_jif));
+			if (time_before(tmp.submit_jif, tmp.start_jif))
+				seq_puts(m, "-\t");
+			else
+				seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+			seq_printf(m, "%s\n", tmp.current_use);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		unsigned long jif;
+		struct drbd_request *req;
+		int n = atomic_read(&device->ap_actlog_cnt);
+		if (n) {
+			spin_lock_irq(&device->resource->req_lock);
+			req = list_first_entry_or_null(&device->pending_master_completion[1],
+				struct drbd_request, req_pending_master_completion);
+			/* if the oldest request does not wait for the activity log
+			 * it is not interesting for us here */
+			if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+				jif = req->start_jif;
+			else
+				req = NULL;
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		if (n) {
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+			if (req)
+				seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+			else
+				seq_puts(m, "-\t");
+			seq_printf(m, "%u\n", n);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+	struct drbd_bm_aio_ctx *ctx;
+	unsigned long start_jif;
+	unsigned int in_flight;
+	unsigned int flags;
+	spin_lock_irq(&device->resource->req_lock);
+	ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+	if (ctx && ctx->done)
+		ctx = NULL;
+	if (ctx) {
+		start_jif = ctx->start_jif;
+		in_flight = atomic_read(&ctx->in_flight);
+		flags = ctx->flags;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+	if (ctx) {
+		seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+			device->minor, device->vnr,
+			(flags & BM_AIO_READ) ? 'R' : 'W',
+			jiffies_to_msecs(now - start_jif),
+			in_flight);
+	}
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_bitmap_io(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+	unsigned long f = peer_req->flags;
+	char sep = ' ';
+
+	__seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+	__seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+	seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+	if (f & EE_IS_TRIM) {
+		seq_putc(m, sep);
+		sep = '|';
+		if (f & EE_IS_TRIM_USE_ZEROOUT)
+			seq_puts(m, "zero-out");
+		else
+			seq_puts(m, "trim");
+	}
+	seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+	struct drbd_device *device, struct list_head *lh,
+	unsigned long now)
+{
+	bool reported_preparing = false;
+	struct drbd_peer_request *peer_req;
+	list_for_each_entry(peer_req, lh, w.list) {
+		if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+			continue;
+
+		if (device)
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+		seq_printf(m, "%llu\t%u\t%c\t%u\t",
+			(unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+			(peer_req->flags & EE_WRITE) ? 'W' : 'R',
+			jiffies_to_msecs(now - peer_req->submit_jif));
+		seq_print_peer_request_flags(m, peer_req);
+		if (peer_req->flags & EE_SUBMITTED)
+			break;
+		else
+			reported_preparing = true;
+	}
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+	struct drbd_device *device, unsigned long now)
+{
+	seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+	spin_lock_irq(&device->resource->req_lock);
+	seq_print_peer_request(m, device, &device->active_ee, now);
+	seq_print_peer_request(m, device, &device->read_ee, now);
+	seq_print_peer_request(m, device, &device->sync_ee, now);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (test_bit(FLUSH_PENDING, &device->flags)) {
+		seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+			device->minor, device->vnr,
+			jiffies_to_msecs(now - device->flush_jif));
+	}
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+	struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_peer_requests(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
 static void seq_print_resource_transfer_log_summary(struct seq_file *m,
 	struct drbd_resource *resource,
 	struct drbd_connection *connection,
@@ -180,6 +368,37 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	if (!connection || !kref_get_unless_zero(&connection->kref))
 		return -ESTALE;
 
+	seq_puts(m, "oldest bitmap IO\n");
+	seq_print_resource_pending_bitmap_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "meta data IO\n");
+	seq_print_resource_pending_meta_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "socket buffer stats\n");
+	/* for each connection ... once we have more than one */
+	rcu_read_lock();
+	if (connection->data.socket) {
+		/* open coded SIOCINQ, the "relevant" part */
+		struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+		int answ = tp->rcv_nxt - tp->copied_seq;
+		seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+		/* open coded SIOCOUTQ, the "relevant" part */
+		answ = tp->write_seq - tp->snd_una;
+		seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+	}
+	rcu_read_unlock();
+	seq_putc(m, '\n');
+
+	seq_puts(m, "oldest peer requests\n");
+	seq_print_resource_pending_peer_requests(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "application requests waiting for activity log\n");
+	seq_print_waiting_for_AL(m, resource, jif);
+	seq_putc(m, '\n');
+
 	seq_puts(m, "oldest application requests\n");
 	seq_print_resource_transfer_log_summary(m, resource, connection, jif);
 	seq_putc(m, '\n');
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 20f2b38e97b9..30ed430f57bf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -522,6 +522,9 @@ enum {
 	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
 	READ_BALANCE_RR,
 
+	FLUSH_PENDING,		/* if set, device->flush_jif is when we submitted that flush
+				 * from drbd_flush_after_epoch() */
+
 	/* cleared only after backing device related structures have been destroyed. */
 	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 01de57ec0110..9712bcc8e069 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2982,24 +2982,29 @@ void drbd_free_ldev(struct drbd_backing_dev *ldev)
 	kfree(ldev);
 }
 
-void drbd_free_sock(struct drbd_connection *connection)
+static void drbd_free_one_sock(struct drbd_socket *ds)
 {
-	if (connection->data.socket) {
-		mutex_lock(&connection->data.mutex);
-		kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
-		sock_release(connection->data.socket);
-		connection->data.socket = NULL;
-		mutex_unlock(&connection->data.mutex);
-	}
-	if (connection->meta.socket) {
-		mutex_lock(&connection->meta.mutex);
-		kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
-		sock_release(connection->meta.socket);
-		connection->meta.socket = NULL;
-		mutex_unlock(&connection->meta.mutex);
+	struct socket *s;
+	mutex_lock(&ds->mutex);
+	s = ds->socket;
+	ds->socket = NULL;
+	mutex_unlock(&ds->mutex);
+	if (s) {
+		/* so debugfs does not need to mutex_lock() */
+		synchronize_rcu();
+		kernel_sock_shutdown(s, SHUT_RDWR);
+		sock_release(s);
 	}
 }
 
+void drbd_free_sock(struct drbd_connection *connection)
+{
+	if (connection->data.socket)
+		drbd_free_one_sock(&connection->data);
+	if (connection->meta.socket)
+		drbd_free_one_sock(&connection->meta);
+}
+
 /* meta data management */
 
 void conn_md_sync(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 42e383513bfc..9f6ed240e075 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1187,8 +1187,16 @@ static void drbd_flush(struct drbd_connection *connection)
 			kref_get(&device->kref);
 			rcu_read_unlock();
 
+			/* Right now, we have only this one synchronous code path
+			 * for flushes between request epochs.
+			 * We may want to make those asynchronous,
+			 * or at least parallelize the flushes to the volume devices.
+			 */
+			device->flush_jif = jiffies;
+			set_bit(FLUSH_PENDING, &device->flags);
 			rv = blkdev_issue_flush(device->ldev->backing_bdev,
 					GFP_NOIO, NULL);
+			clear_bit(FLUSH_PENDING, &device->flags);
 			if (rv) {
 				drbd_info(device, "local disk flush failed with status %d\n", rv);
 				/* would rather check on EOPNOTSUPP, but that is not reliable.
-- 
cgit v1.2.3


From 944410e97cfcec38369eeb5f77d0e8da91d68afb Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 6 May 2014 15:02:05 +0200
Subject: drbd: debugfs: add callback_history

Add a per-connection worker thread callback_history
with timing details, call site and callback function.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c  | 78 ++++++++++++++++++++++++++++++++++++++
 drivers/block/drbd/drbd_int.h      | 26 +++++++++++++
 drivers/block/drbd/drbd_receiver.c |  6 +++
 drivers/block/drbd/drbd_worker.c   | 37 ++++++++++++++++--
 4 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 12d072d9751f..230d9e1fc85c 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -521,6 +521,77 @@ void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
 	drbd_debugfs_remove(&resource->debugfs_res);
 }
 
+static void seq_print_one_timing_detail(struct seq_file *m,
+	const struct drbd_thread_timing_details *tdp,
+	unsigned long now)
+{
+	struct drbd_thread_timing_details td;
+	/* No locking...
+	 * use temporary assignment to get at consistent data. */
+	do {
+		td = *tdp;
+	} while (td.cb_nr != tdp->cb_nr);
+	if (!td.cb_addr)
+		return;
+	seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+			td.cb_nr,
+			jiffies_to_msecs(now - td.start_jif),
+			td.caller_fn, td.line,
+			td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+		const char *title,
+		unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+	unsigned int start_idx;
+	unsigned int i;
+
+	seq_printf(m, "%s\n", title);
+	/* If not much is going on, this will result in natural ordering.
+	 * If it is very busy, we will possibly skip events, or even see wrap
+	 * arounds, which could only be avoided with locking.
+	 */
+	start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+	for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+	for (i = 0; i < start_idx; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long jif = jiffies;
+
+	seq_puts(m, "n\tage\tcallsite\tfn\n");
+	seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+	seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+	return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, callback_history_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+	.owner		= THIS_MODULE,
+	.open		= callback_history_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= callback_history_release,
+};
+
 void drbd_debugfs_connection_add(struct drbd_connection *connection)
 {
 	struct dentry *conns_dir = connection->resource->debugfs_res_connections;
@@ -535,6 +606,13 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn = dentry;
+
+	dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+			connection->debugfs_conn, connection,
+			&connection_callback_history_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn_callback_history = dentry;
 	return;
 
 fail:
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 30ed430f57bf..1a000016ccdf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -697,6 +697,15 @@ struct drbd_resource {
 	cpumask_var_t cpu_mask;
 };
 
+struct drbd_thread_timing_details
+{
+	unsigned long start_jif;
+	void *cb_addr;
+	const char *caller_fn;
+	unsigned int line;
+	unsigned int cb_nr;
+};
+
 struct drbd_connection {
 	struct list_head connections;
 	struct drbd_resource *resource;
@@ -759,6 +768,12 @@ struct drbd_connection {
 	/* sender side */
 	struct drbd_work_queue sender_work;
 
+#define DRBD_THREAD_DETAILS_HIST	16
+	unsigned int w_cb_nr; /* keeps counting up */
+	unsigned int r_cb_nr; /* keeps counting up */
+	struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+	struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
 	struct {
 		/* whether this sender thread
 		 * has processed a single write yet. */
@@ -774,6 +789,17 @@ struct drbd_connection {
 	} send;
 };
 
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+	__update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+	__update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
 struct submit_worker {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9f6ed240e075..f972988291c5 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2682,9 +2682,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	list_add_tail(&peer_req->w.list, &device->read_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
+	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
 	if (device->state.peer != R_PRIMARY
 	&& drbd_rs_should_slow_down(device, sector, false))
 		schedule_timeout_uninterruptible(HZ/10);
+	update_receiver_timing_details(connection, drbd_rs_begin_io);
 	if (drbd_rs_begin_io(device, sector))
 		goto out_free_e;
 
@@ -2692,6 +2694,7 @@ submit_for_resync:
 	atomic_add(size >> 9, &device->rs_sect_ev);
 
 submit:
+	update_receiver_timing_details(connection, drbd_submit_peer_request);
 	inc_unacked(device);
 	if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
 		return 0;
@@ -4601,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
 		struct data_cmd *cmd;
 
 		drbd_thread_current_set_cpu(&connection->receiver);
+		update_receiver_timing_details(connection, drbd_recv_header);
 		if (drbd_recv_header(connection, &pi))
 			goto err_out;
 
@@ -4619,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
 		}
 
 		if (shs) {
+			update_receiver_timing_details(connection, drbd_recv_all_warn);
 			err = drbd_recv_all_warn(connection, pi.data, shs);
 			if (err)
 				goto err_out;
 			pi.size -= shs;
 		}
 
+		update_receiver_timing_details(connection, cmd->fn);
 		err = cmd->fn(connection, &pi);
 		if (err) {
 			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index b908e9b3f63e..50776b362828 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1905,6 +1905,29 @@ static int do_md_sync(struct drbd_device *device)
 	return 0;
 }
 
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line)
+{
+	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+	struct drbd_thread_timing_details *td = tdp + i;
+
+	td->start_jif = jiffies;
+	td->cb_addr = cb;
+	td->caller_fn = fn;
+	td->line = line;
+	td->cb_nr = *cb_nr;
+
+	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+	td = tdp + i;
+	memset(td, 0, sizeof(*td));
+
+	++(*cb_nr);
+}
+
 #define WORK_PENDING(work_bit, todo)	(todo & (1UL << work_bit))
 static void do_device_work(struct drbd_device *device, const unsigned long todo)
 {
@@ -2076,11 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
 	while (get_t_state(thi) == RUNNING) {
 		drbd_thread_current_set_cpu(thi);
 
-		if (list_empty(&work_list))
+		if (list_empty(&work_list)) {
+			update_worker_timing_details(connection, wait_for_work);
 			wait_for_work(connection, &work_list);
+		}
 
-		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
 			do_unqueued_work(connection);
+		}
 
 		if (signal_pending(current)) {
 			flush_signals(current);
@@ -2097,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
 		while (!list_empty(&work_list)) {
 			w = list_first_entry(&work_list, struct drbd_work, list);
 			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
 			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
 				continue;
 			if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -2105,11 +2133,14 @@ int drbd_worker(struct drbd_thread *thi)
 	}
 
 	do {
-		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
 			do_unqueued_work(connection);
+		}
 		while (!list_empty(&work_list)) {
 			w = list_first_entry(&work_list, struct drbd_work, list);
 			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
 			w->cb(w, 1);
 		}
 		dequeue_work_batch(&connection->sender_work, &work_list);
-- 
cgit v1.2.3


From 54e6fc38e888a54b016e1e04e1eceea78ddf7ace Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 8 May 2014 13:39:35 +0200
Subject: drbd: debugfs: add per volume oldest_requests

Show oldest requests
 * pending master bio completion and,
 * if different, local disk bio completion.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 153 ++++++++++++++++++++++++++++++++++++--
 lib/lru_cache.c                   |  21 +++---
 2 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 230d9e1fc85c..45b52056f88d 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -434,12 +434,10 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
 		goto out;
 	/* serialize with d_delete() */
 	mutex_lock(&parent->d_inode->i_mutex);
-	if (!debugfs_positive(file->f_dentry))
-		goto out_unlock;
 	/* Make sure the object is still alive */
-	if (kref_get_unless_zero(kref))
+	if (debugfs_positive(file->f_dentry)
+	&& kref_get_unless_zero(kref))
 		ret = 0;
-out_unlock:
 	mutex_unlock(&parent->d_inode->i_mutex);
 	if (!ret) {
 		ret = single_open(file, show, data);
@@ -592,6 +590,53 @@ static const struct file_operations connection_callback_history_fops = {
 	.release	= callback_history_release,
 };
 
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	spin_lock_irq(&connection->resource->req_lock);
+	r1 = connection->req_next;
+	if (r1)
+		seq_print_minor_vnr_req(m, r1, now);
+	r2 = connection->req_ack_pending;
+	if (r2 && r2 != r1) {
+		r1 = r2;
+		seq_print_minor_vnr_req(m, r1, now);
+	}
+	r2 = connection->req_not_net_done;
+	if (r2 && r2 != r1)
+		seq_print_minor_vnr_req(m, r2, now);
+	spin_unlock_irq(&connection->resource->req_lock);
+	return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, connection_oldest_requests_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+	.owner		= THIS_MODULE,
+	.open		= connection_oldest_requests_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= connection_oldest_requests_release,
+};
+
 void drbd_debugfs_connection_add(struct drbd_connection *connection)
 {
 	struct dentry *conns_dir = connection->resource->debugfs_res_connections;
@@ -627,6 +672,89 @@ void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
 	drbd_debugfs_remove(&connection->debugfs_conn);
 }
 
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+       seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
+		  test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+		  test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+		  test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+		  );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->resync);
+		lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->act_log);
+		lc_seq_dump_details(m, device->act_log, "", NULL);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_resource *resource = device->resource;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+	int i;
+
+	seq_puts(m, RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	/* WRITE, then READ */
+	for (i = 1; i >= 0; --i) {
+		r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+			struct drbd_request, req_pending_master_completion);
+		r2 = list_first_entry_or_null(&device->pending_completion[i],
+			struct drbd_request, req_pending_local);
+		if (r1)
+			seq_print_one_request(m, r1, now);
+		if (r2 && r2 != r1)
+			seq_print_one_request(m, r2, now);
+	}
+	spin_unlock_irq(&resource->req_lock);
+	return 0;
+}
+
+#define drbd_debugfs_device_attr(name)						\
+static int device_ ## name ## _open(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	return drbd_single_open(file, device_ ## name ## _show, device,		\
+				&device->kref, drbd_destroy_device);		\
+}										\
+static int device_ ## name ## _release(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	kref_put(&device->kref, drbd_destroy_device);				\
+	return single_release(inode, file);					\
+}										\
+static const struct file_operations device_ ## name ## _fops = {		\
+	.owner		= THIS_MODULE,						\
+	.open		= device_ ## name ## _open,				\
+	.read		= seq_read,						\
+	.llseek		= seq_lseek,						\
+	.release	= device_ ## name ## _release,				\
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+
 void drbd_debugfs_device_add(struct drbd_device *device)
 {
 	struct dentry *vols_dir = device->resource->debugfs_res_volumes;
@@ -650,10 +778,25 @@ void drbd_debugfs_device_add(struct drbd_device *device)
 	if (!slink_name)
 		goto fail;
 	dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+	kfree(slink_name);
+	slink_name = NULL;
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	device->debugfs_minor = dentry;
-	kfree(slink_name);
+
+#define DCF(name)	do {					\
+	dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,	\
+			device->debugfs_vol, device,		\
+			&device_ ## name ## _fops);		\
+	if (IS_ERR_OR_NULL(dentry))				\
+		goto fail;					\
+	device->debugfs_vol_ ## name = dentry;			\
+	} while (0)
+
+	DCF(oldest_requests);
+	DCF(act_log_extents);
+	DCF(resync_extents);
+	return;
 
 fail:
 	drbd_debugfs_device_cleanup(device);
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 6111cd19762d..852c81e3ba9a 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
  * lc_dump - Dump a complete LRU cache to seq in textual form.
  * @lc: the lru cache to operate on
  * @seq: the &struct seq_file pointer to seq_printf into
- * @utext: user supplied "heading" or other info
+ * @utext: user supplied additional "heading" or other info
  * @detail: function pointer the user may provide to dump further details
- * of the object the lc_element is embedded in.
+ * of the object the lc_element is embedded in. May be NULL.
+ * Note: a leading space ' ' and trailing newline '\n' is implied.
  */
 void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
 	     void (*detail) (struct seq_file *, struct lc_element *))
@@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
 	struct lc_element *e;
 	int i;
 
-	seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+	seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext);
 	for (i = 0; i < nr_elements; i++) {
 		e = lc_element_by_index(lc, i);
-		if (e->lc_number == LC_FREE) {
-			seq_printf(seq, "\t%2d: FREE\n", i);
-		} else {
-			seq_printf(seq, "\t%2d: %4u %4u    ", i,
-				   e->lc_number, e->refcnt);
+		if (e->lc_number != e->lc_new_number)
+			seq_printf(seq, "\t%5d: %6d %8d %6d ",
+				i, e->lc_number, e->lc_new_number, e->refcnt);
+		else
+			seq_printf(seq, "\t%5d: %6d %-8s %6d ",
+				i, e->lc_number, "-\"-", e->refcnt);
+		if (detail)
 			detail(seq, e);
-		}
+		seq_putc(seq, '\n');
 	}
 }
 
-- 
cgit v1.2.3


From b44e1184fc3158821a74c870f777a39e4aac2ce5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 6 May 2014 15:05:23 +0200
Subject: drbd: debugfs: add version tag to debugfs files

Make the first line of debugfs files a version number,
starting now with "v: 0".

If we change content of presentation, we will bump that.
Monitoring or diagnostic scritps that may parse these files
can then easily know when they need to be reviewed.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 52 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 45b52056f88d..a4d0666a5931 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -11,7 +11,13 @@
 #include "drbd_req.h"
 #include "drbd_debugfs.h"
 
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
 static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
 static struct dentry *drbd_debugfs_resources;
 static struct dentry *drbd_debugfs_minors;
 
@@ -368,6 +374,9 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	if (!connection || !kref_get_unless_zero(&connection->kref))
 		return -ESTALE;
 
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
 	seq_puts(m, "oldest bitmap IO\n");
 	seq_print_resource_pending_bitmap_io(m, resource, jif);
 	seq_putc(m, '\n');
@@ -562,6 +571,9 @@ static int callback_history_show(struct seq_file *m, void *ignored)
 	struct drbd_connection *connection = m->private;
 	unsigned long jif = jiffies;
 
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
 	seq_puts(m, "n\tage\tcallsite\tfn\n");
 	seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
 	seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
@@ -686,6 +698,10 @@ static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
 static int device_resync_extents_show(struct seq_file *m, void *ignored)
 {
 	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
 	if (get_ldev_if_state(device, D_FAILED)) {
 		lc_seq_printf_stats(m, device->resync);
 		lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
@@ -697,6 +713,10 @@ static int device_resync_extents_show(struct seq_file *m, void *ignored)
 static int device_act_log_extents_show(struct seq_file *m, void *ignored)
 {
 	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
 	if (get_ldev_if_state(device, D_FAILED)) {
 		lc_seq_printf_stats(m, device->act_log);
 		lc_seq_dump_details(m, device->act_log, "", NULL);
@@ -713,6 +733,9 @@ static int device_oldest_requests_show(struct seq_file *m, void *ignored)
 	struct drbd_request *r1, *r2;
 	int i;
 
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
 	seq_puts(m, RQ_HDR);
 	spin_lock_irq(&resource->req_lock);
 	/* WRITE, then READ */
@@ -839,12 +862,36 @@ void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
 	drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
 }
 
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+	seq_printf(m, "# %s\n", drbd_buildtag());
+	seq_printf(m, "VERSION=%s\n", REL_VERSION);
+	seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+	seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+	seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+	return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, drbd_version_show, NULL);
+}
+
+static struct file_operations drbd_version_fops = {
+	.owner = THIS_MODULE,
+	.open = drbd_version_open,
+	.llseek = seq_lseek,
+	.read = seq_read,
+	.release = single_release,
+};
+
 /* not __exit, may be indirectly called
  * from the module-load-failure path as well. */
 void drbd_debugfs_cleanup(void)
 {
 	drbd_debugfs_remove(&drbd_debugfs_resources);
 	drbd_debugfs_remove(&drbd_debugfs_minors);
+	drbd_debugfs_remove(&drbd_debugfs_version);
 	drbd_debugfs_remove(&drbd_debugfs_root);
 }
 
@@ -857,6 +904,11 @@ int __init drbd_debugfs_init(void)
 		goto fail;
 	drbd_debugfs_root = dentry;
 
+	dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_version = dentry;
+
 	dentry = debugfs_create_dir("resources", drbd_debugfs_root);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
-- 
cgit v1.2.3


From 3d299f48a9ff82d29093578e937e83e41dc93325 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 14 May 2014 20:13:36 +0200
Subject: drbd: debugfs: add per connection oldest requests

Information of former /sys/block/drbdX/drbd/oldest_requests
is already with higher detail in these files:
 debugfs/drbd/resource/$name/in_flight_summary,
 debugfs/drbd/resource/$name/volumes/$vnr/oldest_requests

This patch adds
 debugfs/drbd/resource/$name/connections/peer/oldest_requests

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a4d0666a5931..db70d6caf220 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -670,6 +670,13 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn_callback_history = dentry;
+
+	dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+			connection->debugfs_conn, connection,
+			&connection_oldest_requests_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn_oldest_requests = dentry;
 	return;
 
 fail:
-- 
cgit v1.2.3


From cc356f85bb57a3587e4b1d944e669fe39b37cdf2 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 14 May 2014 20:33:05 +0200
Subject: drbd: debugfs: add per device data_gen_id

The data generation identifiers used to be exposed via sysfs
at /sys/block/drbdX/drbd/meta_data/data_gen_id (out-of-tree),
for advanced policy scripting.
Bring that information over to debugfs.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_debugfs.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index db70d6caf220..5c20b18540b8 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -760,6 +760,25 @@ static int device_oldest_requests_show(struct seq_file *m, void *ignored)
 	return 0;
 }
 
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_md *md;
+	enum drbd_uuid_index idx;
+
+	if (!get_ldev_if_state(device, D_FAILED))
+		return -ENODEV;
+
+	md = &device->ldev->md;
+	spin_lock_irq(&md->uuid_lock);
+	for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+		seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+	}
+	spin_unlock_irq(&md->uuid_lock);
+	put_ldev(device);
+	return 0;
+}
+
 #define drbd_debugfs_device_attr(name)						\
 static int device_ ## name ## _open(struct inode *inode, struct file *file)	\
 {										\
@@ -784,6 +803,7 @@ static const struct file_operations device_ ## name ## _fops = {		\
 drbd_debugfs_device_attr(oldest_requests)
 drbd_debugfs_device_attr(act_log_extents)
 drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
 
 void drbd_debugfs_device_add(struct drbd_device *device)
 {
@@ -826,6 +846,8 @@ void drbd_debugfs_device_add(struct drbd_device *device)
 	DCF(oldest_requests);
 	DCF(act_log_extents);
 	DCF(resync_extents);
+	DCF(data_gen_id);
+#undef DCF
 	return;
 
 fail:
-- 
cgit v1.2.3


From f5b90b6bf0cf29a85ceaa8ce334b17814cd5d39b Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 7 May 2014 22:41:28 +0200
Subject: drbd: resync should only lock out specific ranges

During resync, if we need to block some specific incoming write because
of active resync requests to that same range, we potentially caused
*all* new application writes (to "cold" activity log extents) to block
until this one request has been processed.

Improve the do_submit() logic to
 * grab all incoming requests to some "incoming" list
 * process this list
   - move aside requests that are blocked by resync
   - prepare activity log transactions,
   - commit transactions and submit corresponding requests
   - if there are remaining requests that only wait for
     activity log extents to become free, stop the fast path
     (mark activity log as "starving")
   - iterate until no more requests are waiting for the activity log,
     but all potentially remaining requests are only blocked by resync
 * only then grab new incoming requests

That way, very busy IO on currently "hot" activity log extents cannot
starve scattered IO to "cold" extents. And blocked-by-resync requests
are processed once resync traffic on the affected region has ceased,
without blocking anything else.

The only blocking mode left is when we cannot start requests to "cold"
extents because all currently "hot" extents are actually used.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c |  15 ++++-
 drivers/block/drbd/drbd_req.c    | 136 ++++++++++++++++++++++++---------------
 2 files changed, 98 insertions(+), 53 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6ce5c76d642b..e9fbcafaccdc 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -357,8 +357,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
 	/* We want all necessary updates for a given request within the same transaction
 	 * We could first check how many updates are *actually* needed,
 	 * and use that instead of the worst-case nr_al_extents */
-	if (available_update_slots < nr_al_extents)
-		return -EWOULDBLOCK;
+	if (available_update_slots < nr_al_extents) {
+		/* Too many activity log extents are currently "hot".
+		 *
+		 * If we have accumulated pending changes already,
+		 * we made progress.
+		 *
+		 * If we cannot get even a single pending change through,
+		 * stop the fast path until we made some progress,
+		 * or requests to "cold" extents could be starved. */
+		if (!al->pending_changes)
+			__set_bit(__LC_STARVING, &device->act_log->flags);
+		return -ENOBUFS;
+	}
 
 	/* Is resync active in this area? */
 	for (enr = first; enr <= last; enr++) {
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 74ebef101dc7..c67717d572d1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1182,6 +1182,8 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
 			&device->pending_master_completion[1 /* WRITE */]);
 	spin_unlock_irq(&device->resource->req_lock);
 	queue_work(device->submit.wq, &device->submit.worker);
+	/* do_submit() may sleep internally on al_wait, too */
+	wake_up(&device->al_wait);
 }
 
 /* returns the new drbd_request pointer, if the caller is expected to
@@ -1365,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 
 static bool prepare_al_transaction_nonblock(struct drbd_device *device,
 					    struct list_head *incoming,
-					    struct list_head *pending)
+					    struct list_head *pending,
+					    struct list_head *later)
 {
 	struct drbd_request *req, *tmp;
 	int wake = 0;
@@ -1374,44 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
 	spin_lock_irq(&device->al_lock);
 	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
 		err = drbd_al_begin_io_nonblock(device, &req->i);
+		if (err == -ENOBUFS)
+			break;
 		if (err == -EBUSY)
 			wake = 1;
 		if (err)
-			continue;
-		list_move_tail(&req->tl_requests, pending);
+			list_move_tail(&req->tl_requests, later);
+		else
+			list_move_tail(&req->tl_requests, pending);
 	}
 	spin_unlock_irq(&device->al_lock);
 	if (wake)
 		wake_up(&device->al_wait);
-
 	return !list_empty(pending);
 }
 
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+	struct drbd_request *req, *tmp;
+
+	list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
+		atomic_dec(&device->ap_actlog_cnt);
+		list_del_init(&req->tl_requests);
+		drbd_send_and_submit(device, req);
+	}
+}
+
 void do_submit(struct work_struct *ws)
 {
 	struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
-	LIST_HEAD(incoming);
-	LIST_HEAD(pending);
-	struct drbd_request *req, *tmp;
+	LIST_HEAD(incoming);	/* from drbd_make_request() */
+	LIST_HEAD(pending);	/* to be submitted after next AL-transaction commit */
+	LIST_HEAD(busy);	/* blocked by resync requests */
+
+	/* grab new incoming requests */
+	spin_lock_irq(&device->resource->req_lock);
+	list_splice_tail_init(&device->submit.writes, &incoming);
+	spin_unlock_irq(&device->resource->req_lock);
 
 	for (;;) {
-		spin_lock_irq(&device->resource->req_lock);
-		list_splice_tail_init(&device->submit.writes, &incoming);
-		spin_unlock_irq(&device->resource->req_lock);
+		DEFINE_WAIT(wait);
 
+		/* move used-to-be-busy back to front of incoming */
+		list_splice_init(&busy, &incoming);
 		submit_fast_path(device, &incoming);
 		if (list_empty(&incoming))
 			break;
 
-skip_fast_path:
-		wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
-		/* Maybe more was queued, while we prepared the transaction?
-		 * Try to stuff them into this transaction as well.
-		 * Be strictly non-blocking here, no wait_event, we already
-		 * have something to commit.
-		 * Stop if we don't make any more progres.
-		 */
 		for (;;) {
+			prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+			list_splice_init(&busy, &incoming);
+			prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+			if (!list_empty(&pending))
+				break;
+
+			schedule();
+
+			/* If all currently "hot" activity log extents are kept busy by
+			 * incoming requests, we still must not totally starve new
+			 * requests to "cold" extents.
+			 * Something left on &incoming means there had not been
+			 * enough update slots available, and the activity log
+			 * has been marked as "starving".
+			 *
+			 * Try again now, without looking for new requests,
+			 * effectively blocking all new requests until we made
+			 * at least _some_ progress with what we currently have.
+			 */
+			if (!list_empty(&incoming))
+				continue;
+
+			/* Nothing moved to pending, but nothing left
+			 * on incoming: all moved to busy!
+			 * Grab new and iterate. */
+			spin_lock_irq(&device->resource->req_lock);
+			list_splice_tail_init(&device->submit.writes, &incoming);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		finish_wait(&device->al_wait, &wait);
+
+		/* If the transaction was full, before all incoming requests
+		 * had been processed, skip ahead to commit, and iterate
+		 * without splicing in more incoming requests from upper layers.
+		 *
+		 * Else, if all incoming have been processed,
+		 * they have become either "pending" (to be submitted after
+		 * next transaction commit) or "busy" (blocked by resync).
+		 *
+		 * Maybe more was queued, while we prepared the transaction?
+		 * Try to stuff those into this transaction as well.
+		 * Be strictly non-blocking here,
+		 * we already have something to commit.
+		 *
+		 * Commit if we don't make any more progres.
+		 */
+
+		while (list_empty(&incoming)) {
 			LIST_HEAD(more_pending);
 			LIST_HEAD(more_incoming);
 			bool made_progress;
@@ -1428,46 +1492,16 @@ skip_fast_path:
 			if (list_empty(&more_incoming))
 				break;
 
-			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
 
 			list_splice_tail_init(&more_pending, &pending);
 			list_splice_tail_init(&more_incoming, &incoming);
-
 			if (!made_progress)
 				break;
 		}
-		drbd_al_begin_io_commit(device);
 
-		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
-			req->rq_state |= RQ_IN_ACT_LOG;
-			req->in_actlog_jif = jiffies;
-			atomic_dec(&device->ap_actlog_cnt);
-			list_del_init(&req->tl_requests);
-			drbd_send_and_submit(device, req);
-		}
-
-		/* If all currently hot activity log extents are kept busy by
-		 * incoming requests, we still must not totally starve new
-		 * requests to cold extents. In that case, prepare one request
-		 * in blocking mode. */
-		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
-			bool was_cold;
-			list_del_init(&req->tl_requests);
-			was_cold = drbd_al_begin_io_prepare(device, &req->i);
-			if (!was_cold) {
-				req->rq_state |= RQ_IN_ACT_LOG;
-				req->in_actlog_jif = jiffies;
-				atomic_dec(&device->ap_actlog_cnt);
-				/* Corresponding extent was hot after all? */
-				drbd_send_and_submit(device, req);
-			} else {
-				/* Found a request to a cold extent.
-				 * Put on "pending" list,
-				 * and try to cumulate with more. */
-				list_add(&req->tl_requests, &pending);
-				goto skip_fast_path;
-			}
-		}
+		drbd_al_begin_io_commit(device);
+		send_and_submit_pending(device, &pending);
 	}
 }
 
-- 
cgit v1.2.3


From 193cb00ce377fd672526b5ad065c4de52f429ea1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 2 Apr 2014 00:24:18 +0200
Subject: drbd: drop spurious parameters from _drbd_md_sync_page_io

size is always 4096,
page is always device->md_io.page.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index e9fbcafaccdc..d26a3fa63688 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -137,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
 
 static int _drbd_md_sync_page_io(struct drbd_device *device,
 				 struct drbd_backing_dev *bdev,
-				 struct page *page, sector_t sector,
-				 int rw, int size)
+				 sector_t sector, int rw)
 {
 	struct bio *bio;
+	/* we do all our meta data IO in aligned 4k blocks. */
+	const int size = 4096;
 	int err;
 
 	device->md_io.done = 0;
@@ -154,7 +155,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_iter.bi_sector = sector;
 	err = -EIO;
-	if (bio_add_page(bio, page, size, 0) != size)
+	if (bio_add_page(bio, device->md_io.page, size, 0) != size)
 		goto out;
 	bio->bi_private = device;
 	bio->bi_end_io = drbd_md_io_complete;
@@ -190,8 +191,6 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 			 sector_t sector, int rw)
 {
 	int err;
-	struct page *iop = device->md_io.page;
-
 	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
@@ -207,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 		     current->comm, current->pid, __func__,
 		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-	/* we do all our meta data IO in aligned 4k blocks. */
-	err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+	err = _drbd_md_sync_page_io(device, bdev, sector, rw);
 	if (err) {
 		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
 		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
-- 
cgit v1.2.3


From 1e39152fea2dccd1af6479aa1a036201886f0743 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 19 May 2014 09:52:02 +0200
Subject: drbd: implicitly truncate cpu-mask

Don't error out with misleading "out of memory"
if the cpu-mask has more bits set than there are CPUs.
Just truncate to nr_cpu_ids implicitly.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 9712bcc8e069..9b465bb68487 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2541,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
 	if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
 		err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
 				   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err == -EOVERFLOW) {
+			/* So what. mask it out. */
+			cpumask_var_t tmp_cpu_mask;
+			if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+				cpumask_setall(tmp_cpu_mask);
+				cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+				drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+					res_opts->cpu_mask,
+					strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+					nr_cpu_ids);
+				free_cpumask_var(tmp_cpu_mask);
+				err = 0;
+			}
+		}
 		if (err) {
 			drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
 			/* retcode = ERR_CPU_MASK_PARSE; */
-- 
cgit v1.2.3


From bf0d6e4a1138e71cafdbbb99cde430eee50c4ff1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 6 May 2014 14:28:32 +0300
Subject: drbd: silence underflow warning in read_in_block()

My static checker warns that "data_size" could be negative and underflow
the limit check.  The code looks suspicious but I don't know if it is a
real bug.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 2 +-
 include/linux/drbd.h               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index f972988291c5..9342b8da73ab 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1592,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	struct drbd_peer_request *peer_req;
 	struct page *page;
 	int dgs, ds, err;
-	int data_size = pi->size;
+	unsigned int data_size = pi->size;
 	void *dig_in = peer_device->connection->int_dig_in;
 	void *dig_vv = peer_device->connection->int_dig_vv;
 	unsigned long *data;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 20ec8903b1e4..debb70d40547 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.5"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
-- 
cgit v1.2.3


From 5b1016e62f74c53e0330403025954c8d95384c03 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 19 Mar 2014 17:49:37 -0700
Subject: bcache: Fix a bug when detaching

After detaching a backing device from a cache set, a bit wasn't getting
reset meaning the second detach wouldn't work correctly.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 926ded8ccbf5..1ea9fa27ee3c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -927,6 +927,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
 	list_move(&dc->list, &uncached_devices);
 
 	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
 	mutex_unlock(&bch_register_lock);
 
@@ -1405,9 +1406,11 @@ static void cache_set_flush(struct closure *cl)
 		if (ca->alloc_thread)
 			kthread_stop(ca->alloc_thread);
 
-	cancel_delayed_work_sync(&c->journal.work);
-	/* flush last journal entry if needed */
-	c->journal.work.work.func(&c->journal.work.work);
+	if (c->journal.cur) {
+		cancel_delayed_work_sync(&c->journal.work);
+		/* flush last journal entry if needed */
+		c->journal.work.work.func(&c->journal.work.work);
+	}
 
 	closure_return(cl);
 }
-- 
cgit v1.2.3


From 9aa61a992acceeec0d1de2cd99938421498659d5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 10 Apr 2014 17:58:49 -0700
Subject: bcache: Fix a journal replay bug

journal replay wansn't validating pointers with bch_extent_invalid() before
derefing, fixed

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/extents.c | 13 +++++++++----
 drivers/md/bcache/extents.h |  1 +
 drivers/md/bcache/journal.c | 16 +++++++++-------
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 3a0de4cf9771..243de0bf15cd 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -474,9 +474,8 @@ out:
 	return false;
 }
 
-static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
 {
-	struct btree *b = container_of(bk, struct btree, keys);
 	char buf[80];
 
 	if (!KEY_SIZE(k))
@@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
 	if (KEY_SIZE(k) > KEY_OFFSET(k))
 		goto bad;
 
-	if (__ptr_invalid(b->c, k))
+	if (__ptr_invalid(c, k))
 		goto bad;
 
 	return false;
 bad:
 	bch_extent_to_text(buf, sizeof(buf), k);
-	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+	cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }
 
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	return __bch_extent_invalid(b->c, k);
+}
+
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 				     unsigned ptr)
 {
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
index e4e23409782d..e2ed54054e7a 100644
--- a/drivers/md/bcache/extents.h
+++ b/drivers/md/bcache/extents.h
@@ -9,5 +9,6 @@ struct cache_set;
 
 void bch_extent_to_text(char *, size_t, const struct bkey *);
 bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
 
 #endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 59e82021b5bb..363b88131f01 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,6 +7,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 
 #include <trace/events/bcache.h>
 
@@ -291,15 +292,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 
 		for (k = i->j.start;
 		     k < bset_bkey_last(&i->j);
-		     k = bkey_next(k)) {
-			unsigned j;
+		     k = bkey_next(k))
+			if (!__bch_extent_invalid(c, k)) {
+				unsigned j;
 
-			for (j = 0; j < KEY_PTRS(k); j++)
-				if (ptr_available(c, k, j))
-					atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+				for (j = 0; j < KEY_PTRS(k); j++)
+					if (ptr_available(c, k, j))
+						atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-			bch_initial_mark_key(c, 0, k);
-		}
+				bch_initial_mark_key(c, 0, k);
+			}
 	}
 }
 
-- 
cgit v1.2.3


From dbd810ab678d262d3772d29b65844d7b20dc47bc Mon Sep 17 00:00:00 2001
From: Surbhi Palande <sap@daterainc.com>
Date: Thu, 10 Apr 2014 16:09:51 -0700
Subject: bcache: Fix to remove the rcu_sched stalls.

while loop was executing infinitely.
This fix ends the while loop gracefully.

Signed-off-by: Surbhi Palande <sap@daterainc.com>
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/journal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 363b88131f01..ead001c9bed8 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -194,7 +194,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 			continue;
 bsearch:
 		/* Binary search */
-		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+		m = l;
+		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
 		pr_debug("starting binary search, l %u r %u", l, r);
 
 		while (l + 1 < r) {
-- 
cgit v1.2.3


From 8b326d3a2a76912dfed2f0ab937d59fae9512ca2 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Mon, 21 Apr 2014 18:22:35 -0700
Subject: bcache allocator: send discards with correct size

---
 drivers/md/bcache/alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 443d03fbac47..8eeab72b93e2 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg)
 				mutex_unlock(&ca->set->bucket_lock);
 				blkdev_issue_discard(ca->bdev,
 					bucket_to_sector(ca->set, bucket),
-					ca->sb.block_size, GFP_KERNEL, 0);
+					ca->sb.bucket_size, GFP_KERNEL, 0);
 				mutex_lock(&ca->set->bucket_lock);
 			}
 
-- 
cgit v1.2.3


From e5112201c1285841f8b565ece5d6ae7e0d7947a2 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Tue, 29 Apr 2014 15:39:27 -0700
Subject: bcache: fix lockdep warnings on shutdown

---
 drivers/md/bcache/super.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1ea9fa27ee3c..09573c2bcb5d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1214,7 +1214,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 static void flash_dev_free(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	mutex_lock(&bch_register_lock);
 	bcache_device_free(d);
+	mutex_unlock(&bch_register_lock);
 	kobject_put(&d->kobj);
 }
 
@@ -1222,7 +1224,9 @@ static void flash_dev_flush(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
 
+	mutex_lock(&bch_register_lock);
 	bcache_device_unlink(d);
+	mutex_unlock(&bch_register_lock);
 	kobject_del(&d->kobj);
 	continue_at(cl, flash_dev_free, system_wq);
 }
-- 
cgit v1.2.3


From a664d0f05a2ec02c8f042db536d84d15d6e19e81 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Tue, 20 May 2014 12:20:28 -0700
Subject: bcache: fix crash on shutdown in passthrough mode

We never started the writeback thread in this case, so don't stop it.
---
 drivers/md/bcache/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 09573c2bcb5d..6ceaec35b871 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1071,7 +1071,8 @@ static void cached_dev_free(struct closure *cl)
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
-	kthread_stop(dc->writeback_thread);
+	if (!IS_ERR_OR_NULL(dc->writeback_thread))
+		kthread_stop(dc->writeback_thread);
 
 	mutex_lock(&bch_register_lock);
 
-- 
cgit v1.2.3


From c5aa4a3157b55bdca18dd2a9d9f43314470b6d32 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Mon, 21 Apr 2014 18:23:12 -0700
Subject: bcache: wait for buckets when allocating new btree root

Tested:
- sometimes bcache_tier test would hang on startup with a failure
  to allocate the btree root -- no longer seeing this

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 12 +++++++++---
 drivers/md/bcache/btree.h |  3 ++-
 drivers/md/bcache/super.c |  2 +-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7347b6100961..9dd9f1c4d0d9 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1060,15 +1060,15 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
-				   int level)
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				     int level, bool wait)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
 		goto err;
 
 	bkey_put(c, &k.key);
@@ -1100,6 +1100,12 @@ err:
 	return b;
 }
 
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+					  struct btree_op *op, int level)
+{
+	return __bch_btree_node_alloc(c, op, level, op != NULL);
+}
+
 static struct btree *btree_node_alloc_replacement(struct btree *b,
 						  struct btree_op *op)
 {
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 91dfa5e69685..00441821f5ad 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,7 +242,8 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+				     int, bool);
 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
 				 struct bkey *, int, bool);
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 6ceaec35b871..3b043a04d9bd 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1669,7 +1669,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, NULL, 0);
+		c->root = __bch_btree_node_alloc(c, NULL, 0, true);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
-- 
cgit v1.2.3


From 9e5c353510b26500bd6b8309823ac9ef2837b761 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Thu, 1 May 2014 13:48:57 -0700
Subject: bcache: fix uninterruptible sleep in writeback thread

There were two issues here:

- writeback thread did not start until the device first became dirty
- writeback thread used uninterruptible sleep once running

Without this patch I see kernel warnings printed and a load average of
1.52 after booting my test VM. With this patch the warnings are gone and
the load average is near 0.00 as expected.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/super.c     |  3 +++
 drivers/md/bcache/writeback.c | 14 ++++++++++----
 drivers/md/bcache/writeback.h |  3 ++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3b043a04d9bd..00cc42550f34 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1042,6 +1042,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	 */
 	atomic_set(&dc->count, 1);
 
+	if (bch_cached_dev_writeback_start(dc))
+		return -ENOMEM;
+
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
 		bch_sectors_dirty_init(dc);
 		atomic_set(&dc->has_dirty, 1);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f4300e4c0114..f1986bcd1bf0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
 		if (KEY_START(&w->key) != dc->last_read ||
 		    jiffies_to_msecs(delay) > 50)
 			while (!kthread_should_stop() && delay)
-				delay = schedule_timeout_uninterruptible(delay);
+				delay = schedule_timeout_interruptible(delay);
 
 		dc->last_read	= KEY_OFFSET(&w->key);
 
@@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg)
 			while (delay &&
 			       !kthread_should_stop() &&
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-				delay = schedule_timeout_uninterruptible(delay);
+				delay = schedule_timeout_interruptible(delay);
 		}
 	}
 
@@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
 	sema_init(&dc->in_flight, 64);
 	init_rwsem(&dc->writeback_lock);
@@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
 		return PTR_ERR(dc->writeback_thread);
 
-	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 
+	bch_writeback_queue(dc);
+
 	return 0;
 }
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index e2f8598937ac..0a9dab187b79 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
 void bch_sectors_dirty_init(struct cached_dev *dc);
-int bch_cached_dev_writeback_init(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
 
 #endif
-- 
cgit v1.2.3


From bcf090e0040e30f8409e6a535a01e6473afb096f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Mon, 19 May 2014 08:57:55 -0700
Subject: bcache: Make sure to pass GFP_WAIT to mempool_alloc()

this was very wrong - mempool_alloc() only guarantees success with GFP_WAIT.
bcache uses GFP_NOWAIT in various other places where we have a fallback,
circuits must've gotten crossed when writing this code or something.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 9dd9f1c4d0d9..e538d45fa65a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -202,7 +202,7 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
-	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
-- 
cgit v1.2.3


From 501d52a90cbe652b41336c206ff0e95799d5a9b5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Mon, 19 May 2014 08:55:40 -0700
Subject: bcache: Allocate bounce buffers with GFP_NOWAIT

There's no point in blocking on these allocations, since our fallback paths will
probably go faster than blocking.

Change-Id: I733ca202c25cb36bde02607a0a60552229a4241c
---
 drivers/md/bcache/bset.c  | 2 +-
 drivers/md/bcache/btree.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 545416415305..646fe85261c1 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 {
 	uint64_t start_time;
 	bool used_mempool = false;
-	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
 						     order);
 	if (!out) {
 		struct page *outp;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index e538d45fa65a..39c7f5b73724 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -421,7 +421,7 @@ static void do_btree_node_write(struct btree *b)
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
 		       bset_sector_offset(&b->keys, i));
 
-	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
-- 
cgit v1.2.3


From 8e0948080670f6330229718b15a6a1a011d441ce Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Mon, 30 Jun 2014 22:31:20 -0700
Subject: bcache: fix typo in bch_bkey_equal_header

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 5f6728d5d4dd..ae964624efb2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l,
 {
 	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
 		KEY_PTRS(l) == KEY_PTRS(r) &&
-		KEY_CSUM(l) == KEY_CSUM(l));
+		KEY_CSUM(l) == KEY_CSUM(r));
 }
 
 /* Keylists */
-- 
cgit v1.2.3


From 60ae81eee86dd7a520db8c1e3d702b49fc0418b5 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Thu, 22 May 2014 12:14:24 -0700
Subject: bcache: bcache_write tracepoint was crashing

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/request.c   |  3 ++-
 include/trace/events/bcache.h | 15 +++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 15fff4f68a7c..62e6e98186b5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl)
 {
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	trace_bcache_write(op->bio, op->writeback, op->bypass);
+	trace_bcache_write(op->c, op->inode, op->bio,
+			   op->writeback, op->bypass);
 
 	bch_keylist_init(&op->insert_keys);
 	bio_get(op->bio);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index c9c3c044b32f..6778e4135a8e 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -148,11 +148,13 @@ TRACE_EVENT(bcache_read,
 );
 
 TRACE_EVENT(bcache_write,
-	TP_PROTO(struct bio *bio, bool writeback, bool bypass),
-	TP_ARGS(bio, writeback, bypass),
+	TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio,
+		bool writeback, bool bypass),
+	TP_ARGS(c, inode, bio, writeback, bypass),
 
 	TP_STRUCT__entry(
-		__field(dev_t,		dev			)
+		__array(char,		uuid,	16		)
+		__field(u64,		inode			)
 		__field(sector_t,	sector			)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
@@ -161,7 +163,8 @@ TRACE_EVENT(bcache_write,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= bio->bi_bdev->bd_dev;
+		memcpy(__entry->uuid, c->sb.set_uuid, 16);
+		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
@@ -169,8 +172,8 @@ TRACE_EVENT(bcache_write,
 		__entry->bypass = bypass;
 	),
 
-	TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
+	TP_printk("%pU inode %llu  %s %llu + %u hit %u bypass %u",
+		  __entry->uuid, __entry->inode,
 		  __entry->rwbs, (unsigned long long)__entry->sector,
 		  __entry->nr_sector, __entry->writeback, __entry->bypass)
 );
-- 
cgit v1.2.3


From 913dc33fb2720fb5f979011664294137ddd8b13b Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Fri, 23 May 2014 11:18:35 -0700
Subject: bcache: fix crash in bcache_btree_node_alloc_fail tracepoint

'b' was NULL.

Change-Id: Icac0fd04afa2d23f213d96d51afd53374e6dd0c0
---
 drivers/md/bcache/btree.c     | 2 +-
 include/trace/events/bcache.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 39c7f5b73724..f8237856a61b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1096,7 +1096,7 @@ err_free:
 err:
 	mutex_unlock(&c->bucket_lock);
 
-	trace_bcache_btree_node_alloc_fail(b);
+	trace_bcache_btree_node_alloc_fail(c);
 	return b;
 }
 
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 6778e4135a8e..981acf74b14f 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -261,9 +261,9 @@ DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
 	TP_ARGS(b)
 );
 
-DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
-	TP_PROTO(struct btree *b),
-	TP_ARGS(b)
+DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
 );
 
 DEFINE_EVENT(btree_node, bcache_btree_node_free,
-- 
cgit v1.2.3


From 6b708de64adb6dc8319e7aeac922b46904fbeeec Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Mon, 2 Jun 2014 15:39:44 -0700
Subject: bcache: Fix an infinite loop in journal replay

When running with multiple cache devices, if one of the devices has a completely
empty journal but we'd already found some journal entries on a previosu device
we'd go into an infinite loop.

Change-Id: I1dcdc0d738192746de28f40e8b08825b0dea5e2b
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/journal.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ead001c9bed8..fe080ad0e558 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -190,9 +190,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 			if (read_bucket(l))
 				goto bsearch;
 
-		if (list_empty(list))
+		/* no journal entries on this device? */
+		if (l == ca->sb.njournal_buckets)
 			continue;
 bsearch:
+		BUG_ON(list_empty(list));
+
 		/* Binary search */
 		m = l;
 		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
-- 
cgit v1.2.3


From 400ffaa2acd72274e2c7293a9724382383bebf3e Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Sat, 12 Jul 2014 21:53:11 -0700
Subject: bcache: fix use-after-free in btree_gc_coalesce()

If we goto out_nocoalesce after we free new_nodes[0], we end up freeing
new_nodes[0] again. This was generating a lockdep warning. The fix is
to set new_nodes[0] to NULL, since the out_nocoalesce path safely
ignores NULL entries in the new_nodes array.

This regression was introduced in 2d7f9531.

Change-Id: I76564d7257800583214376b4bacf236cda90c89c
---
 drivers/md/bcache/btree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f8237856a61b..776583f7247d 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1409,6 +1409,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	BUG_ON(btree_bset_first(new_nodes[0])->keys);
 	btree_node_free(new_nodes[0]);
 	rw_unlock(true, new_nodes[0]);
+	new_nodes[0] = NULL;
 
 	for (i = 0; i < nodes; i++) {
 		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
-- 
cgit v1.2.3


From d83353b319d47ef8cce82467da6a25c2d558253f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 11 Jun 2014 19:44:49 -0700
Subject: bcache: Fix more early shutdown bugs

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/super.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 00cc42550f34..29dd1e8ae19f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d)
 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
 				 unsigned id)
 {
-	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
-
 	d->id = id;
 	d->c = c;
 	c->devices[id] = d;
@@ -1771,6 +1769,7 @@ found:
 		pr_debug("set version = %llu", c->sb.version);
 	}
 
+	kobject_get(&ca->kobj);
 	ca->set = c;
 	ca->set->cache[ca->sb.nr_this_dev] = ca;
 	c->cache_by_alloc[c->caches_loaded++] = ca;
@@ -1888,10 +1887,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
 		goto err;
 
 	pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+	kobject_put(&ca->kobj);
 	return;
 err:
 	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
-	kobject_put(&ca->kobj);
+	goto out;
 }
 
 /* Global interfaces/init */
-- 
cgit v1.2.3


From bf0c55c986540483c34ca640f2eef4c3314388b1 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Fri, 11 Jul 2014 12:17:41 -0700
Subject: bcache: fix crash with incomplete cache set

Change-Id: I6abde52afe917633480caaf4e2518f42a816d886
---
 drivers/md/bcache/bcache.h | 4 ++++
 drivers/md/bcache/super.c  | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d2ebcf323094..04f7bc28ef83 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -477,9 +477,13 @@ struct gc_stat {
  * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
  * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
  * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
  */
 #define CACHE_SET_UNREGISTERING		0
 #define	CACHE_SET_STOPPING		1
+#define	CACHE_SET_RUNNING		2
 
 struct cache_set {
 	struct closure		cl;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 29dd1e8ae19f..72fbaf79b0c8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1284,6 +1284,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 	if (test_bit(CACHE_SET_STOPPING, &c->flags))
 		return -EINTR;
 
+	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+		return -EPERM;
+
 	u = uuid_find_empty(c);
 	if (!u) {
 		pr_err("Can't create volume, no room for UUID");
@@ -1706,6 +1709,7 @@ static void run_cache_set(struct cache_set *c)
 
 	flash_devs_run(c);
 
+	set_bit(CACHE_SET_RUNNING, &c->flags);
 	return;
 err:
 	closure_sync(&cl);
-- 
cgit v1.2.3


From c9a78332b42cbdcdd386a95192a716b67d1711a4 Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Thu, 19 Jun 2014 15:05:59 -0700
Subject: bcache: fix memory corruption in init error path

If register_cache_set() failed, we would touch ca->set after
it had already been freed. Also, fix an assertion to catch
this.

Change-Id: I748e5f5b223e2d9b2602075dec2f997cced2394d
---
 drivers/md/bcache/super.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 72fbaf79b0c8..12ad381bdbab 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1356,8 +1356,11 @@ static void cache_set_free(struct closure *cl)
 	bch_journal_free(c);
 
 	for_each_cache(ca, c, i)
-		if (ca)
+		if (ca) {
+			ca->set = NULL;
+			c->cache[ca->sb.nr_this_dev] = NULL;
 			kobject_put(&ca->kobj);
+		}
 
 	bch_bset_sort_state_free(&c->sort);
 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
@@ -1794,8 +1797,10 @@ void bch_cache_release(struct kobject *kobj)
 	struct cache *ca = container_of(kobj, struct cache, kobj);
 	unsigned i;
 
-	if (ca->set)
+	if (ca->set) {
+		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
 		ca->set->cache[ca->sb.nr_this_dev] = NULL;
+	}
 
 	bio_split_pool_free(&ca->bio_split_hook);
 
@@ -1858,7 +1863,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
-				  struct block_device *bdev, struct cache *ca)
+				struct block_device *bdev, struct cache *ca)
 {
 	char name[BDEVNAME_SIZE];
 	const char *err = "cannot allocate memory";
-- 
cgit v1.2.3


From 2452cc89063a2a6890368f185c4b6d7d8802179e Mon Sep 17 00:00:00 2001
From: Slava Pestov <sp@daterainc.com>
Date: Sat, 12 Jul 2014 00:22:53 -0700
Subject: bcache: try to set b->parent properly

bcache_flash_dev.ktest would reliably crash with 8k and 16k bucket size
before; now it passes.

Change-Id: Ib542232235e39298c3a7548fe52b645cabb823d1
---
 drivers/md/bcache/btree.c | 37 +++++++++++++++++++++----------------
 drivers/md/bcache/btree.h |  4 ++--
 drivers/md/bcache/super.c |  4 ++--
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 776583f7247d..00cde40db572 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,9 +117,9 @@
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
+						  _w, b);		\
 	if (!IS_ERR(_child)) {						\
-		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
 		rw_unlock(_w, _child);					\
 	} else								\
@@ -142,7 +142,6 @@
 		rw_lock(_w, _b, _b->level);				\
 		if (_b == (c)->root &&					\
 		    _w == insert_lock(op, _b)) {			\
-			_b->parent = NULL;				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
@@ -967,7 +966,8 @@ err:
  * level and op->lock.
  */
 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
-				 struct bkey *k, int level, bool write)
+				 struct bkey *k, int level, bool write,
+				 struct btree *parent)
 {
 	int i = 0;
 	struct btree *b;
@@ -1002,6 +1002,7 @@ retry:
 		BUG_ON(b->level != level);
 	}
 
+	b->parent = parent;
 	b->accessed = 1;
 
 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
@@ -1022,15 +1023,16 @@ retry:
 	return b;
 }
 
-static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
 {
 	struct btree *b;
 
-	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, NULL, k, level);
-	mutex_unlock(&c->bucket_lock);
+	mutex_lock(&parent->c->bucket_lock);
+	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+	mutex_unlock(&parent->c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
+		b->parent = parent;
 		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
@@ -1061,7 +1063,8 @@ static void btree_node_free(struct btree *b)
 }
 
 struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
-				     int level, bool wait)
+				     int level, bool wait,
+				     struct btree *parent)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
@@ -1085,6 +1088,7 @@ retry:
 	}
 
 	b->accessed = 1;
+	b->parent = parent;
 	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
 	mutex_unlock(&c->bucket_lock);
@@ -1101,15 +1105,16 @@ err:
 }
 
 static struct btree *bch_btree_node_alloc(struct cache_set *c,
-					  struct btree_op *op, int level)
+					  struct btree_op *op, int level,
+					  struct btree *parent)
 {
-	return __bch_btree_node_alloc(c, op, level, op != NULL);
+	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
 }
 
 static struct btree *btree_node_alloc_replacement(struct btree *b,
 						  struct btree_op *op)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
 	if (!IS_ERR_OR_NULL(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1523,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
 		if (k) {
 			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
-						  true);
+						  true, b);
 			if (IS_ERR(r->b)) {
 				ret = PTR_ERR(r->b);
 				break;
@@ -1818,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 			k = bch_btree_iter_next_filter(&iter, &b->keys,
 						       bch_ptr_bad);
 			if (k)
-				btree_node_prefetch(b->c, k, b->level - 1);
+				btree_node_prefetch(b, k);
 
 			if (p)
 				ret = btree(check_recurse, p, b, op);
@@ -1983,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-		n2 = bch_btree_node_alloc(b->c, op, b->level);
+		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 00441821f5ad..5c391fa01bed 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -243,9 +243,9 @@ void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
 struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
-				     int, bool);
+				     int, bool, struct btree *);
 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
-				 struct bkey *, int, bool);
+				 struct bkey *, int, bool, struct btree *);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 12ad381bdbab..b6114d672413 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1601,7 +1601,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "error reading btree root";
-		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1676,7 +1676,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = __bch_btree_node_alloc(c, NULL, 0, true);
+		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
-- 
cgit v1.2.3


From 5b25abade29616d42d60f9bd5e6a5ad07f7314e3 Mon Sep 17 00:00:00 2001
From: Surbhi Palande <sap@daterainc.com>
Date: Thu, 17 Apr 2014 12:07:04 -0700
Subject: bcache: Correct printing of btree_gc_max_duration_ms

time_stats::btree_gc_max_duration_mc is not bit shifted by 8

Fixes BUG #138

Change-Id: I44fc6e1d0579674016acc533f1a546b080e5371a
Signed-off-by: Surbhi Palande <sap@daterainc.com>
---
 drivers/md/bcache/util.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ac7d0d1f70d7..98df7572b5f7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -416,8 +416,8 @@ do {									\
 			  average_frequency,	frequency_units);	\
 	__print_time_stat(stats, name,					\
 			  average_duration,	duration_units);	\
-	__print_time_stat(stats, name,					\
-			  max_duration,		duration_units);	\
+	sysfs_print(name ## _ ##max_duration ## _ ## duration_units,	\
+			div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
 									\
 	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\
 		    ? div_s64(local_clock() - (stats)->last,		\
-- 
cgit v1.2.3


From 789d21dbd9d8889e62c79ec19585fcc97e42ef07 Mon Sep 17 00:00:00 2001
From: Jianjian Huo <samuel.huo@gmail.com>
Date: Sun, 13 Jul 2014 09:08:59 -0700
Subject: bcache: add mutex lock for bch_is_open

Since bch_is_open will iterate linked list bch_cache_sets and
uncached_devices, it needs bch_register_lock.

Signed-off-by: Jianjian Huo <samuel.huo@gmail.com>
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b6114d672413..60e75130a44c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1966,10 +1966,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (IS_ERR(bdev)) {
 		if (bdev == ERR_PTR(-EBUSY)) {
 			bdev = lookup_bdev(strim(path));
+			mutex_lock(&bch_register_lock);
 			if (!IS_ERR(bdev) && bch_is_open(bdev))
 				err = "device already registered";
 			else
 				err = "device busy";
+			mutex_unlock(&bch_register_lock);
 		}
 		goto err;
 	}
-- 
cgit v1.2.3


From 0781c8748cf1ea2b0dcd966571103909528c4efa Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Mon, 7 Jul 2014 13:03:36 -0700
Subject: bcache: Drop unneeded blk_sync_queue() calls

this is needed for the queue/block device we created (it's done by
blk_cleanup_queue() which we do call) - but calling it for the block devices we
only opened is pointless.

Change-Id: I53dfded14ed15b9581d10ca8399d5e1b3abbf9f2
---
 drivers/md/bcache/super.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 60e75130a44c..d4713d098a39 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1084,12 +1084,8 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_unlock(&bch_register_lock);
 
-	if (!IS_ERR_OR_NULL(dc->bdev)) {
-		if (dc->bdev->bd_disk)
-			blk_sync_queue(bdev_get_queue(dc->bdev));
-
+	if (!IS_ERR_OR_NULL(dc->bdev))
 		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-	}
 
 	wake_up(&unregister_wait);
 
@@ -1817,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj)
 	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
 		put_page(ca->sb_bio.bi_io_vec[0].bv_page);
 
-	if (!IS_ERR_OR_NULL(ca->bdev)) {
-		blk_sync_queue(bdev_get_queue(ca->bdev));
+	if (!IS_ERR_OR_NULL(ca->bdev))
 		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-	}
 
 	kfree(ca);
 	module_put(THIS_MODULE);
-- 
cgit v1.2.3