Skip to content

Commit

Permalink
dm delay: for short delays, use kthread instead of timers and wq
Browse files Browse the repository at this point in the history
DM delay's current design of using timers and wq to realize the delays
is insufficient for delays below ~50ms.

This commit enhances the design to use a kthread to flush the expired
delays, trading some CPU time (in some cases) for better delay
accuracy and delays closer to what the user requested for smaller
delays. The new design is chosen as long as all the delays are below
50ms.

Since bios can't be completed in interrupt context using a kthread
is probably the most reasonable way to approach this.

Testing with
echo "0 2097152 zero" | dmsetup create dm-zeros
for i in $(seq 0 20);
do
  echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms;
done

Some performance numbers for comparison, on beaglebone black (single
core) CONFIG_HZ_1000=y:

fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based \
    --filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 250 IOPS
Kthread: 500 IOPS

fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based \
    --filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 45 IOPS
Kthread: 50 IOPS

fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \
    --time_based --filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 498 IOPS
Kthread: 1000 IOPS

fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \
    --time_based --filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 90 IOPS
Kthread: 100 IOPS

(This one is just to prove the new design isn't impacting throughput,
not really about delays):
fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k \
    --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms \
    --numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting
Previous: 13.3k IOPS
Kthread: 13.3k IOPS

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
[Harshit: kthread_create error handling fix in delay_ctr]
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
  • Loading branch information
cloehle authored and Mike Snitzer committed Oct 31, 2023
1 parent 8388cba commit 70bbeb2
Showing 1 changed file with 88 additions and 15 deletions.
103 changes: 88 additions & 15 deletions drivers/md/dm-delay.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>

#include <linux/device-mapper.h>

Expand All @@ -31,6 +32,7 @@ struct delay_c {
struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
struct task_struct *worker;
atomic_t may_delay;

struct delay_class read;
Expand Down Expand Up @@ -66,6 +68,44 @@ static void queue_timeout(struct delay_c *dc, unsigned long expires)
mutex_unlock(&dc->timer_lock);
}

static inline bool delay_is_fast(struct delay_c *dc)
{
return !!dc->worker;
}

static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all)
{
struct dm_delay_info *delayed, *next;

mutex_lock(&delayed_bios_lock);
list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
struct bio *bio = dm_bio_from_per_bio_data(delayed,
sizeof(struct dm_delay_info));
list_del(&delayed->list);
dm_submit_bio_remap(bio, NULL);
delayed->class->ops--;
}
}
mutex_unlock(&delayed_bios_lock);
}

static int flush_worker_fn(void *data)
{
struct delay_c *dc = data;

while (1) {
flush_delayed_bios_fast(dc, false);
if (unlikely(list_empty(&dc->delayed_bios))) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
} else
cond_resched();
}

return 0;
}

static void flush_bios(struct bio *bio)
{
struct bio *n;
Expand All @@ -78,7 +118,7 @@ static void flush_bios(struct bio *bio)
}
}

static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
struct dm_delay_info *delayed, *next;
unsigned long next_expires = 0;
Expand Down Expand Up @@ -115,7 +155,10 @@ static void flush_expired_bios(struct work_struct *work)
struct delay_c *dc;

dc = container_of(work, struct delay_c, flush_expired_bios);
flush_bios(flush_delayed_bios(dc, 0));
if (delay_is_fast(dc))
flush_delayed_bios_fast(dc, false);
else
flush_bios(flush_delayed_bios(dc, false));
}

static void delay_dtr(struct dm_target *ti)
Expand All @@ -131,8 +174,11 @@ static void delay_dtr(struct dm_target *ti)
dm_put_device(ti, dc->write.dev);
if (dc->flush.dev)
dm_put_device(ti, dc->flush.dev);
if (dc->worker)
kthread_stop(dc->worker);

mutex_destroy(&dc->timer_lock);
if (!delay_is_fast(dc))
mutex_destroy(&dc->timer_lock);

kfree(dc);
}
Expand Down Expand Up @@ -175,6 +221,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
int ret;
unsigned int max_delay;

if (argc != 3 && argc != 6 && argc != 9) {
ti->error = "Requires exactly 3, 6 or 9 arguments";
Expand All @@ -188,16 +235,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}

ti->private = dc;
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
dc->argc = argc;

ret = delay_class_ctr(ti, &dc->read, argv);
if (ret)
goto bad;
max_delay = dc->read.delay;

if (argc == 3) {
ret = delay_class_ctr(ti, &dc->write, argv);
Expand All @@ -206,6 +251,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->flush, argv);
if (ret)
goto bad;
max_delay = max(max_delay, dc->write.delay);
max_delay = max(max_delay, dc->flush.delay);
goto out;
}

Expand All @@ -216,19 +263,37 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
if (ret)
goto bad;
max_delay = max(max_delay, dc->flush.delay);
goto out;
}

ret = delay_class_ctr(ti, &dc->flush, argv + 6);
if (ret)
goto bad;
max_delay = max(max_delay, dc->flush.delay);

out:
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
ret = -EINVAL;
DMERR("Couldn't start kdelayd");
goto bad;
if (max_delay < 50) {
/*
* In case of small requested delays, use kthread instead of
* timers and workqueue to achieve better latency.
*/
dc->worker = kthread_create(&flush_worker_fn, dc,
"dm-delay-flush-worker");
if (IS_ERR(dc->worker)) {
ret = PTR_ERR(dc->worker);
goto bad;
}
} else {
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
mutex_init(&dc->timer_lock);
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
ret = -EINVAL;
DMERR("Couldn't start kdelayd");
goto bad;
}
}

ti->num_flush_bios = 1;
Expand Down Expand Up @@ -260,7 +325,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
list_add_tail(&delayed->list, &dc->delayed_bios);
mutex_unlock(&delayed_bios_lock);

queue_timeout(dc, expires);
if (delay_is_fast(dc))
wake_up_process(dc->worker);
else
queue_timeout(dc, expires);

return DM_MAPIO_SUBMITTED;
}
Expand All @@ -270,8 +338,13 @@ static void delay_presuspend(struct dm_target *ti)
struct delay_c *dc = ti->private;

atomic_set(&dc->may_delay, 0);
del_timer_sync(&dc->delay_timer);
flush_bios(flush_delayed_bios(dc, 1));

if (delay_is_fast(dc))
flush_delayed_bios_fast(dc, true);
else {
del_timer_sync(&dc->delay_timer);
flush_bios(flush_delayed_bios(dc, true));
}
}

static void delay_resume(struct dm_target *ti)
Expand Down Expand Up @@ -356,7 +429,7 @@ static int delay_iterate_devices(struct dm_target *ti,

static struct target_type delay_target = {
.name = "delay",
.version = {1, 3, 0},
.version = {1, 4, 0},
.features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = delay_ctr,
Expand Down

0 comments on commit 70bbeb2

Please sign in to comment.