#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
+#include "check-integrity.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
sync_pending = 0;
}
- submit_bio(cur->bi_rw, cur);
+ btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
if (need_resched())
/*
* find_free_dev_extent - find free space in the specified device
- * @trans: transaction handler
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
/*
* does all the dirty work required for changing file system's UUID.
*/
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
- ret = btrfs_prepare_sprout(trans, root);
+ ret = btrfs_prepare_sprout(root);
BUG_ON(ret);
}
return ret;
}
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
/*
* Should be called with both balance and volume mutexes held to
* serialize other volume operations (add_dev/rm_dev/resize) with
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_root *dev_root = fs_info->dev_root;
struct list_head *devices;
int slot;
int ret;
int enospc_errors = 0;
+ bool counting = true;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
ret = -ENOMEM;
goto error;
}
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
if (!ret)
goto loop;
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
goto error;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
loop:
key.offset = found_key.offset - 1;
}
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
error:
btrfs_free_path(path);
if (enospc_errors) {
return ret;
}
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
int ret;
BUG_ON(ret);
}
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
/*
u64 allowed;
int ret;
- if (btrfs_fs_closing(fs_info)) {
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
ret = -EINVAL;
goto out;
}
do_balance:
ret = insert_balance_item(fs_info->tree_root, bctl);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
- set_balance_control(bctl);
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+ atomic_inc(&fs_info->balance_running);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
- update_ioctl_balance_args(fs_info, bargs);
+ update_ioctl_balance_args(fs_info, 0, bargs);
}
- __cancel_balance(fs_info);
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
return ret;
out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else
+ kfree(bctl);
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_balance_control *bctl =
+ (struct btrfs_balance_control *)data;
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ } else {
+ printk(KERN_INFO "btrfs: continuing balance\n");
+ ret = btrfs_balance(bctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+ struct task_struct *tsk;
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_bctl;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out_bctl;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = tree_root->fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+ if (IS_ERR(tsk))
+ ret = PTR_ERR(tsk);
+ else
+ goto out;
+
+out_bctl:
kfree(bctl);
+out:
+ btrfs_free_path(path);
return ret;
}
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
return ret;
}
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_stripe_size = 256 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 8 * 1024 * 1024;
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(trans, device,
+ ret = find_free_dev_extent(device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
BUG_ON(ret);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
BUG_ON(ret);
}
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
- int stripes_allocated = 8;
- int stripes_required = 1;
int stripe_index;
int i;
+ int ret = 0;
int num_stripes;
int max_errors = 0;
struct btrfs_bio *bbio = NULL;
- if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
- stripes_allocated = 1;
-again:
- if (bbio_ret) {
- bbio = kzalloc(btrfs_bio_size(stripes_allocated),
- GFP_NOFS);
- if (!bbio)
- return -ENOMEM;
-
- atomic_set(&bbio->error, 0);
- }
-
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
if (mirror_num > map->num_stripes)
mirror_num = 0;
- /* if our btrfs_bio struct is too small, back off and try again */
- if (rw & REQ_WRITE) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP)) {
- stripes_required = map->num_stripes;
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripes_required = map->sub_stripes;
- max_errors = 1;
- }
- }
- if (rw & REQ_DISCARD) {
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
- stripes_required = map->num_stripes;
- }
- if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
- free_extent_map(em);
- kfree(bbio);
- goto again;
- }
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
}
BUG_ON(stripe_index >= map->num_stripes);
+ bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ atomic_set(&bbio->error, 0);
+
if (rw & REQ_DISCARD) {
+ int factor = 0;
+ int sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_nr_end -
+ stripe_nr_orig,
+ factor,
+ &remaining_stripes);
+ }
+
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- u64 stripes;
- u32 last_stripe = 0;
- int j;
-
- div_u64_rem(stripe_nr_end - 1,
- map->num_stripes,
- &last_stripe);
-
- for (j = 0; j < map->num_stripes; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- map->num_stripes, &test);
- if (test == stripe_index)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, map->num_stripes);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i == 0) {
- bbio->stripes[i].length -=
- stripe_offset;
- stripe_offset = 0;
- }
- if (stripe_index == last_stripe)
- bbio->stripes[i].length -=
- stripe_end_offset;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u64 stripes;
- int j;
- int factor = map->num_stripes /
- map->sub_stripes;
- u32 last_stripe = 0;
-
- div_u64_rem(stripe_nr_end - 1,
- factor, &last_stripe);
- last_stripe *= map->sub_stripes;
-
- for (j = 0; j < factor; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- factor, &test);
-
- if (test ==
- stripe_index / map->sub_stripes)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, factor);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i < map->sub_stripes) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+ if (i < sub_stripes)
bbio->stripes[i].length -=
stripe_offset;
- if (i == map->sub_stripes - 1)
- stripe_offset = 0;
- }
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- map->sub_stripes - 1)) {
+ if ((i / sub_stripes + 1) %
+ sub_stripes == remaining_stripes)
bbio->stripes[i].length -=
stripe_end_offset;
- }
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
} else
bbio->stripes[i].length = *length;
stripe_index++;
}
}
- if (bbio_ret) {
- *bbio_ret = bbio;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+
+ if (rw & REQ_WRITE) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ }
}
+
+ *bbio_ret = bbio;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
out:
free_extent_map(em);
- return 0;
+ return ret;
}
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
bio_put(bio);
return 0;
}
if (async_submit)
schedule_bio(root, dev, rw, bio);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
struct btrfs_fs_devices *fs_devices;
int ret;
- mutex_lock(&uuid_mutex);
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- mutex_unlock(&uuid_mutex);
return ret;
}
if (!path)
return -ENOMEM;
+ mutex_lock(&uuid_mutex);
+ lock_chunks(root);
+
/* first we search for all of the device items, and then we
* read in all of the chunk items. This way we can create chunk
* mappings that reference all of the devices that are afound
}
ret = 0;
error:
+ unlock_chunks(root);
+ mutex_unlock(&uuid_mutex);
+
btrfs_free_path(path);
return ret;
}