From dd4f699da674010c58d7a2534215b4ca1ff13b13 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Jun 2016 00:26:20 +0200 Subject: drbd: when receiving P_TRIM, zero-out partial unaligned chunks We can avoid spurious data divergence caused by partially-ignored discards on certain backends with discard_zeroes_data=0, if we translate partial unaligned discard requests into explicit zero-out. The relevant use case is LVM/DM thin. If on different nodes, DRBD is backed by devices with differing discard characteristics, discards may lead to data divergence (old data or garbage left over on one backend, zeroes due to unmapped areas on the other backend). Online verify would now potentially report tons of spurious differences. While probably harmless for most use cases (fstrim on a file system), DRBD cannot have that, it would violate our promise to upper layers that our data instances on the nodes are identical. To be correct and play safe (make sure data is identical on both copies), we would have to disable discard support, if our local backend (on a Primary) does not support "discard_zeroes_data=true". We'd also have to translate discards to explicit zero-out on the receiving (typically: Secondary) side, unless the receiving side supports "discard_zeroes_data=true". Which both would allocate those blocks, instead of unmapping them, in contrast with expectations. LVM/DM thin does set discard_zeroes_data=0, because it silently ignores discards to partial chunks. We can work around this by checking the alignment first. For unaligned (wrt. alignment and granularity) or too small discards, we zero-out the initial (and/or) trailing unaligned partial chunks, but discard all the aligned full chunks. At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1". Arguably it should behave this way internally, by default, and we'll try to make that happen. But our workaround is still valid for already deployed setups, and for other devices that may behave this way. Setting discard-zeroes-if-aligned=yes will allow DRBD to use discards, and to announce discard_zeroes_data=true, even on backends that announce discard_zeroes_data=false. Setting discard-zeroes-if-aligned=no will cause DRBD to always fall-back to zero-out on the receiving side, and to not even announce discard capabilities on the Primary, if the respective backend announces discard_zeroes_data=false. We used to ignore the discard_zeroes_data setting completely. To not break established and expected behaviour, and suddenly cause fstrim on thin-provisioned LVs to run out-of-space, instead of freeing up space, the default value is "yes". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 3643f9ca5a38..8d757d6f21e7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1161,13 +1161,17 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; struct request_queue *b = NULL; + struct disk_conf *dc; + bool discard_zeroes_if_aligned = true; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); - max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + dc = rcu_dereference(device->ldev->disk_conf); + max_segments = dc->max_bio_bvecs; + discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); @@ -1185,7 +1189,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - if (blk_queue_discard(b) && + if (blk_queue_discard(b) && (b->limits.discard_zeroes_data || discard_zeroes_if_aligned) && (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { /* We don't care, stacking below should fix it for the local device. * Whether or not it is a suitable granularity on the remote device @@ -1216,7 +1220,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi } } -void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev) { unsigned int now, new, local, peer; @@ -1488,6 +1492,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); + if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned) + drbd_reconsider_queue_parameters(device, device->ldev); + drbd_md_sync(device); if (device->state.conn >= C_CONNECTED) { @@ -1866,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device->read_cnt = 0; device->writ_cnt = 0; - drbd_reconsider_max_bio_size(device, device->ldev); + drbd_reconsider_queue_parameters(device, device->ldev); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, -- cgit v1.2.3