The following commit has been merged in the master branch: commit 400dd456bda8be0b566f2690c51609ea02f85766 Merge: dc189b8e6adbe113a6d4b3a7c5d0c9cd7febb3bb ef1e68236b9153c27cb7cf29ead0c532870d4215 Author: Linus Torvalds torvalds@linux-foundation.org Date: Wed Mar 27 13:56:41 2024 -0700
Merge tag 'for-6.9-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:
- fix race when reading extent buffer and 'uptodate' status is missed by one thread (introduced in 6.5)
- do additional validation of devices using major:minor numbers
- zoned mode fixes: - use zone-aware super block access during scrub - fix use-after-free during device replace (found by KASAN) - also delete zones that are 100% unusable to reclaim space
- extent unpinning fixes: - fix extent map leak after error handling - print correct range in error message
- error code and message updates
* tag 'for-6.9-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: fix race in read_extent_buffer_pages() btrfs: return accurate error code on open failure in open_fs_devices() btrfs: zoned: don't skip block groups with 100% zone unusable btrfs: use btrfs_warn() to log message at btrfs_add_extent_mapping() btrfs: fix message not properly printing interval when adding extent map btrfs: fix warning messages not printing interval at unpin_extent_range() btrfs: fix extent map leak in unexpected scenario at unpin_extent_cache() btrfs: validate device maj:min during open btrfs: zoned: fix use-after-free in do_zone_finish() btrfs: zoned: use zone aware sb location for scrub
diff --combined fs/btrfs/volumes.c index 1dc1f1946ae0e,dedec3d9b1117..f15591f3e54fa --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@@ -466,39 -466,39 +466,39 @@@ static noinline struct btrfs_fs_device
static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file);
if (flush) sync_blockdev(bdev); ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; }
return 0;
error: - *bdev_handle = NULL; + *bdev_file = NULL; return ret; }
@@@ -641,7 -641,7 +641,7 @@@ static int btrfs_open_one_device(struc struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@@ -652,7 -652,7 +652,7 @@@ return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret;
@@@ -676,22 -676,32 +676,32 @@@ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); }
- if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true;
- device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@@ -704,7 -714,7 +714,7 @@@
error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file);
return -EINVAL; } @@@ -1017,10 -1027,10 +1027,10 @@@ static void __btrfs_free_extra_devids(s if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue;
- if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@@ -1065,7 -1075,7 +1075,7 @@@ static void btrfs_close_bdev(struct btr invalidate_bdev(device->bdev); }
- bdev_release(device->bdev_handle); + fput(device->bdev_file); }
static void btrfs_close_one_device(struct btrfs_device *device) @@@ -1174,23 -1184,30 +1184,30 @@@ static int open_fs_devices(struct btrfs struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + int ret = 0;
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int ret; + int ret2;
- ret = btrfs_open_one_device(fs_devices, device, flags, holder); - if (ret == 0 && + ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; - } else if (ret == -ENODATA) { + } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } + if (ret == 0 && ret2 != 0) + ret = ret2; } - if (fs_devices->open_devices == 0) + + if (fs_devices->open_devices == 0) { + if (ret) + return ret; return -EINVAL; + }
fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; @@@ -1303,47 -1320,6 +1320,47 @@@ int btrfs_forget_devices(dev_t devt return ret; }
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, + const char *path, dev_t devt, + bool mount_arg_dev) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Do not skip device registration for mounted devices with matching + * maj:min but different paths. Booting without initrd relies on + * /dev/root initially, later replaced with the actual root device. + * A successful scan ensures grub2-probe selects the correct device. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + + if (!fs_devices->opened) { + mutex_unlock(&fs_devices->device_list_mutex); + continue; + } + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->bdev && (device->bdev->bd_dev == devt) && + strcmp(device->name->str, path) != 0) { + mutex_unlock(&fs_devices->device_list_mutex); + + /* Do not skip registration. */ + return false; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + } + + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) + return true; + + return false; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@@ -1359,9 -1335,8 +1376,9 @@@ struct btrfs_device *btrfs_scan_one_dev struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 bytenr, bytenr_orig; + dev_t devt; int ret;
lockdep_assert_held(&uuid_mutex); @@@ -1383,31 -1358,37 +1400,31 @@@ * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; }
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; }
- if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && - !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { - dev_t devt; + devt = file_bdev(bdev_file)->bd_dev; + if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { + pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + path, MAJOR(devt), MINOR(devt));
- ret = lookup_bdev(path, &devt); - if (ret) - btrfs_warn(NULL, "lookup bdev failed for path %s: %d", - path, ret); - else - btrfs_free_stale_devices(devt, NULL); + btrfs_free_stale_devices(devt, NULL);
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", - path, MAJOR(devt), MINOR(devt)); device = NULL; goto free_disk_super; } @@@ -1420,7 -1401,7 +1437,7 @@@ free_disk_super btrfs_release_disk_super(disk_super);
error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file);
return device; } @@@ -2095,7 -2076,7 +2112,7 @@@ void btrfs_scratch_superblocks(struct b
int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@@ -2204,7 -2185,7 +2221,7 @@@
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@@ -2220,9 -2201,9 +2237,9 @@@ * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and it's dependencies. Instead + * just flush the device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device); @@@ -2232,7 -2213,7 +2249,7 @@@ } }
- *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device);
@@@ -2368,7 -2349,7 +2385,7 @@@ int btrfs_get_dev_args_from_path(struc const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret;
if (!path || !path[0]) @@@ -2386,7 -2367,7 +2403,7 @@@ }
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@@ -2399,7 -2380,7 +2416,7 @@@ else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; }
@@@ -2619,7 -2600,7 +2636,7 @@@ int btrfs_init_new_device(struct btrfs_ struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@@ -2632,12 -2613,12 +2649,12 @@@ if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@@ -2649,11 -2630,11 +2666,11 @@@ locked = true; }
- sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file));
rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@@ -2669,8 -2650,8 +2686,8 @@@ }
device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@@ -2853,7 -2834,7 +2870,7 @@@ error_free_zone error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --combined fs/btrfs/zoned.c index 5a3d5ec75c5a9,459d1af02c3ce..4cba80b34387c --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@@ -822,14 -822,11 +822,14 @@@ static int sb_log_location(struct block reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret;
@@@ -975,14 -972,11 +975,14 @@@ int btrfs_advance_sb_log(struct btrfs_d * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret;
+ nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@@ -1000,13 -994,11 +1000,13 @@@
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret;
zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@@ -1017,12 -1009,9 +1017,12 @@@ if (sb_zone + 1 >= nr_zones) return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; }
/* @@@ -1133,14 -1122,12 +1133,14 @@@ static void btrfs_dev_clear_active_zone int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret;
*bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret;
@@@ -1574,11 -1561,7 +1574,7 @@@ int btrfs_load_block_group_zone_info(st if (!map) return -EINVAL;
- cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map;
zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@@ -1690,7 -1673,6 +1686,6 @@@ out } bitmap_free(active); kfree(zone_info); - btrfs_free_chunk_map(map);
return ret; } @@@ -2175,6 -2157,7 +2170,7 @@@ static int do_zone_finish(struct btrfs_ struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i;
@@@ -2250,29 -2233,31 +2246,33 @@@ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock);
+ down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags;
if (zinfo->max_active_zones == 0) continue;
+ nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags);
- if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + }
if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem);
if (!fully_written) btrfs_dec_block_group_ro(block_group);