Skip to content

Commit

Permalink
Merge tag 'for-5.17-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Browse files Browse the repository at this point in the history

Pull btrfs fixes from David Sterba:
 "Several fixes for defragmentation, which got broken in 5.16 after the
  refactoring and the addition of subpage support. The observed bugs are
  excessive IO or an uninterruptible ioctl.

  All stable material"

* tag 'for-5.17-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: update writeback index when starting defrag
  btrfs: add back missing dirty page rate limiting to defrag
  btrfs: fix deadlock when reserving space during defrag
  btrfs: defrag: properly update range->start for autodefrag
  btrfs: defrag: fix wrong number of defragged sectors
  btrfs: allow defrag to be interruptible
  btrfs: fix too long loop when defragging a 1 byte file
  • Loading branch information
torvalds committed Jan 25, 2022
2 parents a08b41a + 27cdfde commit 49d766f
Showing 1 changed file with 75 additions and 9 deletions.
84 changes: 75 additions & 9 deletions fs/btrfs/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->generation < newer_than)
goto next;

/*
* Our start offset might be in the middle of an existing extent
* map, so take that into account.
*/
range_len = em->len - (cur - em->start);
/*
* If this range of the extent map is already flagged for delalloc,
* skip it, because:
*
* 1) We could deadlock later, when trying to reserve space for
* delalloc, because in case we can't immediately reserve space
* the flusher can start delalloc and wait for the respective
* ordered extents to complete. The deadlock would happen
* because we do the space reservation while holding the range
* locked, and starting writeback, or finishing an ordered
* extent, requires locking the range;
*
* 2) If there's delalloc there, it means there are dirty pages for
* which writeback has not started yet (we clear the delalloc
* flag when starting writeback and after creating an ordered
* extent). If we mark pages in an adjacent range for defrag,
* then we will have a larger contiguous range for delalloc,
* very likely resulting in a larger extent after writeback is
* triggered (except in the case of free space fragmentation).
*/
if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
EXTENT_DELALLOC, 0, NULL))
goto next;

/*
* For do_compress case, we want to compress all valid file
* extents, thus no @extent_thresh or mergeable check.
Expand All @@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;

/* Skip too large extent */
if (em->len >= extent_thresh)
if (range_len >= extent_thresh)
goto next;

next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
Expand Down Expand Up @@ -1442,9 +1471,11 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
list_for_each_entry(entry, &target_list, list) {
u32 range_len = entry->len;

/* Reached the limit */
if (max_sectors && max_sectors == *sectors_defragged)
/* Reached or beyond the limit */
if (max_sectors && *sectors_defragged >= max_sectors) {
ret = 1;
break;
}

if (max_sectors)
range_len = min_t(u32, range_len,
Expand All @@ -1465,7 +1496,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
extent_thresh, newer_than, do_compress);
if (ret < 0)
break;
*sectors_defragged += range_len;
*sectors_defragged += range_len >>
inode->root->fs_info->sectorsize_bits;
}
out:
list_for_each_entry_safe(entry, tmp, &target_list, list) {
Expand All @@ -1484,6 +1516,12 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
* @newer_than: minimum transid to defrag
* @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
* will be defragged.
*
* Return <0 for error.
* Return >=0 for the number of sectors defragged; range->start is updated
* to indicate the file offset where the next defrag should start.
* (Mostly for autodefrag, which sets @max_to_defrag, so we may exit early
* without defragging the whole range.)
*/
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
Expand All @@ -1499,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
int compress_type = BTRFS_COMPRESS_ZLIB;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;

if (isize == 0)
return 0;
Expand All @@ -1518,12 +1557,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,

if (range->start + range->len > range->start) {
/* Got a specific range */
last_byte = min(isize, range->start + range->len) - 1;
last_byte = min(isize, range->start + range->len);
} else {
/* Defrag until file end */
last_byte = isize - 1;
last_byte = isize;
}

/* Align the range */
cur = round_down(range->start, fs_info->sectorsize);
last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

/*
* If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
Expand All @@ -1536,16 +1579,26 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
file_ra_state_init(ra, inode->i_mapping);
}

/* Align the range */
cur = round_down(range->start, fs_info->sectorsize);
last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
/*
* Make writeback start from the beginning of the range, so that the
* defrag range can be written sequentially.
*/
start_index = cur >> PAGE_SHIFT;
if (start_index < inode->i_mapping->writeback_index)
inode->i_mapping->writeback_index = start_index;

while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
u64 cluster_end;

/* The cluster size 256K should always be page aligned */
BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

if (btrfs_defrag_cancelled(fs_info)) {
ret = -EAGAIN;
break;
}

/* We want the cluster end at page boundary when possible */
cluster_end = (((cur >> PAGE_SHIFT) +
(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
Expand All @@ -1567,14 +1620,27 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
cluster_end + 1 - cur, extent_thresh,
newer_than, do_compress,
&sectors_defragged, max_to_defrag);

if (sectors_defragged > prev_sectors_defragged)
balance_dirty_pages_ratelimited(inode->i_mapping);

btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
cur = cluster_end + 1;
if (ret > 0) {
ret = 0;
break;
}
}

if (ra_allocated)
kfree(ra);
/*
* Update range->start for autodefrag; this indicates where to start
* in the next run.
*/
range->start = cur;
if (sectors_defragged) {
/*
* We have defragged some sectors, for compression case they
Expand Down

0 comments on commit 49d766f

Please sign in to comment.