Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 4 Apr 2014 22:31:36 +0000 (15:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 4 Apr 2014 22:31:36 +0000 (15:31 -0700)
Pull btrfs changes from Chris Mason:
 "This is a pretty long stream of bug fixes and performance fixes.

  Qu Wenruo has replaced the btrfs async threads with regular kernel
  workqueues.  We'll keep an eye out for performance differences, but
  it's nice to be using more generic code for this.

  We still have some corruption fixes and other patches coming in for
  the merge window, but this batch is tested and ready to go"
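
For context, the conversion replaces the hand-rolled btrfs_workers pools with
thin wrappers over the kernel's generic workqueues. The sketch below shows the
new calling convention as it appears in the hunks that follow; the four
btrfs_workqueue calls (alloc, init work, queue, destroy) are taken from the
diff, while struct my_endio and the example_* helpers are illustrative
scaffolding only, not part of the merge.

/* A minimal sketch of the btrfs_workqueue API introduced by this merge. */
struct my_endio {
	struct btrfs_work work;		/* embedded work item, as in end_io_wq */
	int error;			/* example payload */
};

static void my_endio_fn(struct btrfs_work *work)
{
	struct my_endio *e = container_of(work, struct my_endio, work);
	/* ... process e->error ... */
}

static int example_setup(struct btrfs_fs_info *fs_info, int max_active)
{
	/* name, WQ_* flags, max_active, high/normal split threshold */
	fs_info->workers = btrfs_alloc_workqueue("worker",
			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
			max_active, 16);
	return fs_info->workers ? 0 : -ENOMEM;
}

static void example_queue(struct btrfs_fs_info *fs_info, struct my_endio *e)
{
	/* replaces the old work.func / work.ordered_* / work.flags setup */
	btrfs_init_work(&e->work, my_endio_fn, NULL, NULL);
	btrfs_queue_work(fs_info->workers, &e->work);
}

static void example_teardown(struct btrfs_fs_info *fs_info)
{
	/* replaces btrfs_stop_workers(&fs_info->workers) */
	btrfs_destroy_workqueue(fs_info->workers);
}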

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (108 commits)
  Btrfs: fix a crash of clone with inline extents's split
  btrfs: fix uninit variable warning
  Btrfs: take into account total references when doing backref lookup
  Btrfs: part 2, fix incremental send's decision to delay a dir move/rename
  Btrfs: fix incremental send's decision to delay a dir move/rename
  Btrfs: remove unnecessary inode generation lookup in send
  Btrfs: fix race when updating existing ref head
  btrfs: Add trace for btrfs_workqueue alloc/destroy
  Btrfs: less fs tree lock contention when using autodefrag
  Btrfs: return EPERM when deleting a default subvolume
  Btrfs: add missing kfree in btrfs_destroy_workqueue
  Btrfs: cache extent states in defrag code path
  Btrfs: fix deadlock with nested trans handles
  Btrfs: fix possible empty list access when flushing the delalloc inodes
  Btrfs: split the global ordered extents mutex
  Btrfs: don't flush all delalloc inodes when we doesn't get s_umount lock
  Btrfs: reclaim delalloc metadata more aggressively
  Btrfs: remove unnecessary lock in may_commit_transaction()
  Btrfs: remove the unnecessary flush when preparing the pages
  Btrfs: just do dirty page flush for the inode with compression before direct IO
  ...

fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/raid56.c
fs/btrfs/scrub.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/ctree.h
@@@ -351,6 -351,7 +351,7 @@@ static inline unsigned long btrfs_chunk
  #define BTRFS_FS_STATE_ERROR          0
  #define BTRFS_FS_STATE_REMOUNTING     1
  #define BTRFS_FS_STATE_TRANS_ABORTED  2
+ #define BTRFS_FS_STATE_DEV_REPLACING  3
  
  /* Super block flags */
  /* Errors detected */
@@@ -1489,6 -1490,7 +1490,7 @@@ struct btrfs_fs_info 
         */
        struct list_head ordered_roots;
  
+       struct mutex delalloc_root_mutex;
        spinlock_t delalloc_root_lock;
        /* all fs/file tree roots that have delalloc inodes. */
        struct list_head delalloc_roots;
         * A third pool does submit_bio to avoid deadlocking with the other
         * two
         */
-       struct btrfs_workers generic_worker;
-       struct btrfs_workers workers;
-       struct btrfs_workers delalloc_workers;
-       struct btrfs_workers flush_workers;
-       struct btrfs_workers endio_workers;
-       struct btrfs_workers endio_meta_workers;
-       struct btrfs_workers endio_raid56_workers;
-       struct btrfs_workers rmw_workers;
-       struct btrfs_workers endio_meta_write_workers;
-       struct btrfs_workers endio_write_workers;
-       struct btrfs_workers endio_freespace_worker;
-       struct btrfs_workers submit_workers;
-       struct btrfs_workers caching_workers;
-       struct btrfs_workers readahead_workers;
+       struct btrfs_workqueue *workers;
+       struct btrfs_workqueue *delalloc_workers;
+       struct btrfs_workqueue *flush_workers;
+       struct btrfs_workqueue *endio_workers;
+       struct btrfs_workqueue *endio_meta_workers;
+       struct btrfs_workqueue *endio_raid56_workers;
+       struct btrfs_workqueue *rmw_workers;
+       struct btrfs_workqueue *endio_meta_write_workers;
+       struct btrfs_workqueue *endio_write_workers;
+       struct btrfs_workqueue *endio_freespace_worker;
+       struct btrfs_workqueue *submit_workers;
+       struct btrfs_workqueue *caching_workers;
+       struct btrfs_workqueue *readahead_workers;
  
        /*
         * fixup workers take dirty pages that didn't properly go through
         * the cow mechanism and make them safe to write.  It happens
         * for the sys_munmap function call path
         */
-       struct btrfs_workers fixup_workers;
-       struct btrfs_workers delayed_workers;
+       struct btrfs_workqueue *fixup_workers;
+       struct btrfs_workqueue *delayed_workers;
        struct task_struct *transaction_kthread;
        struct task_struct *cleaner_kthread;
        int thread_pool_size;
        atomic_t scrub_cancel_req;
        wait_queue_head_t scrub_pause_wait;
        int scrub_workers_refcnt;
-       struct btrfs_workers scrub_workers;
-       struct btrfs_workers scrub_wr_completion_workers;
-       struct btrfs_workers scrub_nocow_workers;
+       struct btrfs_workqueue *scrub_workers;
+       struct btrfs_workqueue *scrub_wr_completion_workers;
+       struct btrfs_workqueue *scrub_nocow_workers;
  
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        u32 check_integrity_print_mask;
        /* qgroup rescan items */
        struct mutex qgroup_rescan_lock; /* protects the progress item */
        struct btrfs_key qgroup_rescan_progress;
-       struct btrfs_workers qgroup_rescan_workers;
+       struct btrfs_workqueue *qgroup_rescan_workers;
        struct completion qgroup_rescan_completion;
        struct btrfs_work qgroup_rescan_work;
  
  
        atomic_t mutually_exclusive_operation_running;
  
+       struct percpu_counter bio_counter;
+       wait_queue_head_t replace_wait;
        struct semaphore uuid_tree_rescan_sem;
        unsigned int update_uuid_tree_gen:1;
  };
  
+ struct btrfs_subvolume_writers {
+       struct percpu_counter   counter;
+       wait_queue_head_t       wait;
+ };
  /*
   * in-RAM representation of the tree.  extent_root is used for all allocations
   * and for the extent tree's own root.
@@@ -1714,11 -1723,15 +1723,15 @@@ struct btrfs_root 
        struct mutex log_mutex;
        wait_queue_head_t log_writer_wait;
        wait_queue_head_t log_commit_wait[2];
+       struct list_head log_ctxs[2];
        atomic_t log_writers;
        atomic_t log_commit[2];
        atomic_t log_batch;
-       unsigned long log_transid;
-       unsigned long last_log_commit;
+       int log_transid;
+       /* Updated whether or not the log commit succeeds. */
+       int log_transid_committed;
+       /* Updated only when the log commit succeeds. */
+       int last_log_commit;
        pid_t log_start_pid;
        bool log_multiple_pids;
  
        spinlock_t root_item_lock;
        atomic_t refs;
  
+       struct mutex delalloc_mutex;
        spinlock_t delalloc_lock;
        /*
         * all of the inodes that have delalloc bytes.  It is possible for
        struct list_head delalloc_inodes;
        struct list_head delalloc_root;
        u64 nr_delalloc_inodes;
+       struct mutex ordered_extent_mutex;
        /*
         * this is used by the balancing code to wait for all the pending
         * ordered extents
         * manipulation with the read-only status via SUBVOL_SETFLAGS
         */
        int send_in_progress;
+       struct btrfs_subvolume_writers *subv_writers;
+       atomic_t will_be_snapshoted;
  };
  
  struct btrfs_ioctl_defrag_range_args {
@@@ -3346,6 -3364,9 +3364,9 @@@ int btrfs_init_space_info(struct btrfs_
  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
  int __get_raid_index(u64 flags);
+ int btrfs_start_nocow_write(struct btrfs_root *root);
+ void btrfs_end_nocow_write(struct btrfs_root *root);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -3723,7 -3744,8 +3744,8 @@@ int btrfs_truncate_inode_items(struct b
                               u32 min_type);
  
  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
- int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+                              int nr);
  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@@ -3963,17 -3985,20 +3985,17 @@@ do {                                                                 
  /* acl.c */
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
  struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
 +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
  int btrfs_init_acl(struct btrfs_trans_handle *trans,
                   struct inode *inode, struct inode *dir);
 -int btrfs_acl_chmod(struct inode *inode);
  #else
  #define btrfs_get_acl NULL
 +#define btrfs_set_acl NULL
  static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
                                 struct inode *inode, struct inode *dir)
  {
        return 0;
  }
 -static inline int btrfs_acl_chmod(struct inode *inode)
 -{
 -      return 0;
 -}
  #endif
  
  /* relocation.c */
@@@ -4004,6 -4029,11 +4026,11 @@@ int btrfs_scrub_cancel_dev(struct btrfs
                           struct btrfs_device *dev);
  int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
                         struct btrfs_scrub_progress *progress);
+ /* dev-replace.c */
+ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+ void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
  
  /* reada.c */
  struct reada_control {
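
The btrfs_start_nocow_write()/btrfs_end_nocow_write() pair declared above
gates nocow writers against pending snapshots via the new
btrfs_subvolume_writers percpu counter. Below is a hedged reconstruction of
how that gate can work; the real bodies land elsewhere in this merge, and
this sketch assumes only the fields shown in the hunks above (subv_writers,
will_be_snapshoted).

int btrfs_start_nocow_write(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshoted))
		return 0;			/* caller falls back to cow */

	percpu_counter_inc(&root->subv_writers->counter);
	/* make the increment visible before rechecking the snapshot flag */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_nocow_write(root);
		return 0;
	}
	return 1;				/* nocow write may proceed */
}

void btrfs_end_nocow_write(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/* wake a snapshot waiting for in-flight nocow writes to drain */
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}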
diff --combined fs/btrfs/disk-io.c
@@@ -678,32 -678,31 +678,31 @@@ static void end_workqueue_bio(struct bi
  
        fs_info = end_io_wq->info;
        end_io_wq->error = err;
-       end_io_wq->work.func = end_workqueue_fn;
-       end_io_wq->work.flags = 0;
+       btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
  
        if (bio->bi_rw & REQ_WRITE) {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
-                       btrfs_queue_worker(&fs_info->endio_meta_write_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_meta_write_workers,
+                                        &end_io_wq->work);
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
-                       btrfs_queue_worker(&fs_info->endio_freespace_worker,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_freespace_worker,
+                                        &end_io_wq->work);
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-                       btrfs_queue_worker(&fs_info->endio_raid56_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_raid56_workers,
+                                        &end_io_wq->work);
                else
-                       btrfs_queue_worker(&fs_info->endio_write_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_write_workers,
+                                        &end_io_wq->work);
        } else {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-                       btrfs_queue_worker(&fs_info->endio_raid56_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_raid56_workers,
+                                        &end_io_wq->work);
                else if (end_io_wq->metadata)
-                       btrfs_queue_worker(&fs_info->endio_meta_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_meta_workers,
+                                        &end_io_wq->work);
                else
-                       btrfs_queue_worker(&fs_info->endio_workers,
-                                          &end_io_wq->work);
+                       btrfs_queue_work(fs_info->endio_workers,
+                                        &end_io_wq->work);
        }
  }
  
@@@ -738,7 -737,7 +737,7 @@@ int btrfs_bio_wq_end_io(struct btrfs_fs
  unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
  {
        unsigned long limit = min_t(unsigned long,
-                                   info->workers.max_workers,
+                                   info->thread_pool_size,
                                    info->fs_devices->open_devices);
        return 256 * limit;
  }
@@@ -811,11 -810,9 +810,9 @@@ int btrfs_wq_submit_bio(struct btrfs_fs
        async->submit_bio_start = submit_bio_start;
        async->submit_bio_done = submit_bio_done;
  
-       async->work.func = run_one_async_start;
-       async->work.ordered_func = run_one_async_done;
-       async->work.ordered_free = run_one_async_free;
+       btrfs_init_work(&async->work, run_one_async_start,
+                       run_one_async_done, run_one_async_free);
  
-       async->work.flags = 0;
        async->bio_flags = bio_flags;
        async->bio_offset = bio_offset;
  
        atomic_inc(&fs_info->nr_async_submits);
  
        if (rw & REQ_SYNC)
-               btrfs_set_work_high_prio(&async->work);
+               btrfs_set_work_high_priority(&async->work);
  
-       btrfs_queue_worker(&fs_info->workers, &async->work);
+       btrfs_queue_work(fs_info->workers, &async->work);
  
        while (atomic_read(&fs_info->async_submit_draining) &&
              atomic_read(&fs_info->nr_async_submits)) {
  
  static int btree_csum_one_bio(struct bio *bio)
  {
 -      struct bio_vec *bvec = bio->bi_io_vec;
 -      int bio_index = 0;
 +      struct bio_vec *bvec;
        struct btrfs_root *root;
 -      int ret = 0;
 +      int i, ret = 0;
  
 -      WARN_ON(bio->bi_vcnt <= 0);
 -      while (bio_index < bio->bi_vcnt) {
 +      bio_for_each_segment_all(bvec, bio, i) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root, bvec->bv_page);
                if (ret)
                        break;
 -              bio_index++;
 -              bvec++;
        }
 +
        return ret;
  }
  
@@@ -1149,6 -1149,32 +1146,32 @@@ void clean_tree_block(struct btrfs_tran
        }
  }
  
+ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+ {
+       struct btrfs_subvolume_writers *writers;
+       int ret;
+       writers = kmalloc(sizeof(*writers), GFP_NOFS);
+       if (!writers)
+               return ERR_PTR(-ENOMEM);
+       ret = percpu_counter_init(&writers->counter, 0);
+       if (ret < 0) {
+               kfree(writers);
+               return ERR_PTR(ret);
+       }
+       init_waitqueue_head(&writers->wait);
+       return writers;
+ }
+ static void
+ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+ {
+       percpu_counter_destroy(&writers->counter);
+       kfree(writers);
+ }
  static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
                         u32 stripesize, struct btrfs_root *root,
                         struct btrfs_fs_info *fs_info,
        spin_lock_init(&root->log_extents_lock[1]);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
+       mutex_init(&root->ordered_extent_mutex);
+       mutex_init(&root->delalloc_mutex);
        init_waitqueue_head(&root->log_writer_wait);
        init_waitqueue_head(&root->log_commit_wait[0]);
        init_waitqueue_head(&root->log_commit_wait[1]);
+       INIT_LIST_HEAD(&root->log_ctxs[0]);
+       INIT_LIST_HEAD(&root->log_ctxs[1]);
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
+       atomic_set(&root->will_be_snapshoted, 0);
        root->log_transid = 0;
+       root->log_transid_committed = -1;
        root->last_log_commit = 0;
        if (fs_info)
                extent_io_tree_init(&root->dirty_log_pages,
@@@ -1417,6 -1449,7 +1446,7 @@@ int btrfs_add_log_tree(struct btrfs_tra
        WARN_ON(root->log_root);
        root->log_root = log_root;
        root->log_transid = 0;
+       root->log_transid_committed = -1;
        root->last_log_commit = 0;
        return 0;
  }
@@@ -1498,6 -1531,7 +1528,7 @@@ struct btrfs_root *btrfs_read_fs_root(s
  int btrfs_init_fs_root(struct btrfs_root *root)
  {
        int ret;
+       struct btrfs_subvolume_writers *writers;
  
        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
                goto fail;
        }
  
+       writers = btrfs_alloc_subvolume_writers();
+       if (IS_ERR(writers)) {
+               ret = PTR_ERR(writers);
+               goto fail;
+       }
+       root->subv_writers = writers;
        btrfs_init_free_ino_ctl(root);
        mutex_init(&root->fs_commit_mutex);
        spin_lock_init(&root->cache_lock);
  
        ret = get_anon_bdev(&root->anon_dev);
        if (ret)
-               goto fail;
+               goto free_writers;
        return 0;
+ free_writers:
+       btrfs_free_subvolume_writers(root->subv_writers);
  fail:
        kfree(root->free_ino_ctl);
        kfree(root->free_ino_pinned);
@@@ -1677,7 -1721,7 +1718,7 @@@ static void end_workqueue_fn(struct btr
        bio->bi_private = end_io_wq->private;
        bio->bi_end_io = end_io_wq->end_io;
        kfree(end_io_wq);
 -      bio_endio(bio, error);
 +      bio_endio_nodec(bio, error);
  }
  
  static int cleaner_kthread(void *arg)
@@@ -1990,23 -2034,22 +2031,22 @@@ static noinline int next_root_backup(st
  /* helper to cleanup workers */
  static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
  {
-       btrfs_stop_workers(&fs_info->generic_worker);
-       btrfs_stop_workers(&fs_info->fixup_workers);
-       btrfs_stop_workers(&fs_info->delalloc_workers);
-       btrfs_stop_workers(&fs_info->workers);
-       btrfs_stop_workers(&fs_info->endio_workers);
-       btrfs_stop_workers(&fs_info->endio_meta_workers);
-       btrfs_stop_workers(&fs_info->endio_raid56_workers);
-       btrfs_stop_workers(&fs_info->rmw_workers);
-       btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-       btrfs_stop_workers(&fs_info->endio_write_workers);
-       btrfs_stop_workers(&fs_info->endio_freespace_worker);
-       btrfs_stop_workers(&fs_info->submit_workers);
-       btrfs_stop_workers(&fs_info->delayed_workers);
-       btrfs_stop_workers(&fs_info->caching_workers);
-       btrfs_stop_workers(&fs_info->readahead_workers);
-       btrfs_stop_workers(&fs_info->flush_workers);
-       btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+       btrfs_destroy_workqueue(fs_info->fixup_workers);
+       btrfs_destroy_workqueue(fs_info->delalloc_workers);
+       btrfs_destroy_workqueue(fs_info->workers);
+       btrfs_destroy_workqueue(fs_info->endio_workers);
+       btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+       btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+       btrfs_destroy_workqueue(fs_info->rmw_workers);
+       btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+       btrfs_destroy_workqueue(fs_info->endio_write_workers);
+       btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+       btrfs_destroy_workqueue(fs_info->submit_workers);
+       btrfs_destroy_workqueue(fs_info->delayed_workers);
+       btrfs_destroy_workqueue(fs_info->caching_workers);
+       btrfs_destroy_workqueue(fs_info->readahead_workers);
+       btrfs_destroy_workqueue(fs_info->flush_workers);
+       btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
  }
  
  static void free_root_extent_buffers(struct btrfs_root *root)
@@@ -2097,6 -2140,8 +2137,8 @@@ int open_ctree(struct super_block *sb
        int err = -EINVAL;
        int num_backups_tried = 0;
        int backup_index = 0;
+       int max_active;
+       int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
        bool create_uuid_tree;
        bool check_uuid_tree;
  
                goto fail_dirty_metadata_bytes;
        }
  
+       ret = percpu_counter_init(&fs_info->bio_counter, 0);
+       if (ret) {
+               err = ret;
+               goto fail_delalloc_bytes;
+       }
        fs_info->btree_inode = new_inode(sb);
        if (!fs_info->btree_inode) {
                err = -ENOMEM;
-               goto fail_delalloc_bytes;
+               goto fail_bio_counter;
        }
  
        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
        spin_lock_init(&fs_info->buffer_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
+       mutex_init(&fs_info->delalloc_root_mutex);
        seqlock_init(&fs_info->profiles_lock);
  
        init_completion(&fs_info->kobj_unregister);
        atomic_set(&fs_info->scrub_pause_req, 0);
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
+       init_waitqueue_head(&fs_info->replace_wait);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        fs_info->scrub_workers_refcnt = 0;
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
                goto fail_alloc;
        }
  
-       btrfs_init_workers(&fs_info->generic_worker,
-                          "genwork", 1, NULL);
+       max_active = fs_info->thread_pool_size;
  
-       btrfs_init_workers(&fs_info->workers, "worker",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
+       fs_info->workers =
+               btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+                                     max_active, 16);
  
-       btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-                          fs_info->thread_pool_size, NULL);
+       fs_info->delalloc_workers =
+               btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
  
-       btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
-                          fs_info->thread_pool_size, NULL);
+       fs_info->flush_workers =
+               btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
  
-       btrfs_init_workers(&fs_info->submit_workers, "submit",
-                          min_t(u64, fs_devices->num_devices,
-                          fs_info->thread_pool_size), NULL);
+       fs_info->caching_workers =
+               btrfs_alloc_workqueue("cache", flags, max_active, 0);
  
-       btrfs_init_workers(&fs_info->caching_workers, "cache",
-                          fs_info->thread_pool_size, NULL);
-       /* a higher idle thresh on the submit workers makes it much more
+       /*
+        * a higher idle thresh on the submit workers makes it much more
         * likely that bios will be sent down in a sane order to the
         * devices
         */
-       fs_info->submit_workers.idle_thresh = 64;
-       fs_info->workers.idle_thresh = 16;
-       fs_info->workers.ordered = 1;
-       fs_info->delalloc_workers.idle_thresh = 2;
-       fs_info->delalloc_workers.ordered = 1;
-       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_workers, "endio",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_meta_write_workers,
-                          "endio-meta-write", fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_raid56_workers,
-                          "endio-raid56", fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->rmw_workers,
-                          "rmw", fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
-                          1, &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->readahead_workers, "readahead",
-                          fs_info->thread_pool_size,
-                          &fs_info->generic_worker);
-       btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
-                          &fs_info->generic_worker);
+       fs_info->submit_workers =
+               btrfs_alloc_workqueue("submit", flags,
+                                     min_t(u64, fs_devices->num_devices,
+                                           max_active), 64);
+       fs_info->fixup_workers =
+               btrfs_alloc_workqueue("fixup", flags, 1, 0);
  
        /*
         * endios are largely parallel and should have a very
         * low idle thresh
         */
-       fs_info->endio_workers.idle_thresh = 4;
-       fs_info->endio_meta_workers.idle_thresh = 4;
-       fs_info->endio_raid56_workers.idle_thresh = 4;
-       fs_info->rmw_workers.idle_thresh = 2;
-       fs_info->endio_write_workers.idle_thresh = 2;
-       fs_info->endio_meta_write_workers.idle_thresh = 2;
-       fs_info->readahead_workers.idle_thresh = 2;
-       /*
-        * btrfs_start_workers can really only fail because of ENOMEM so just
-        * return -ENOMEM if any of these fail.
-        */
-       ret = btrfs_start_workers(&fs_info->workers);
-       ret |= btrfs_start_workers(&fs_info->generic_worker);
-       ret |= btrfs_start_workers(&fs_info->submit_workers);
-       ret |= btrfs_start_workers(&fs_info->delalloc_workers);
-       ret |= btrfs_start_workers(&fs_info->fixup_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
-       ret |= btrfs_start_workers(&fs_info->rmw_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_write_workers);
-       ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
-       ret |= btrfs_start_workers(&fs_info->delayed_workers);
-       ret |= btrfs_start_workers(&fs_info->caching_workers);
-       ret |= btrfs_start_workers(&fs_info->readahead_workers);
-       ret |= btrfs_start_workers(&fs_info->flush_workers);
-       ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
-       if (ret) {
+       fs_info->endio_workers =
+               btrfs_alloc_workqueue("endio", flags, max_active, 4);
+       fs_info->endio_meta_workers =
+               btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+       fs_info->endio_meta_write_workers =
+               btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+       fs_info->endio_raid56_workers =
+               btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+       fs_info->rmw_workers =
+               btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+       fs_info->endio_write_workers =
+               btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+       fs_info->endio_freespace_worker =
+               btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+       fs_info->delayed_workers =
+               btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+       fs_info->readahead_workers =
+               btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+       fs_info->qgroup_rescan_workers =
+               btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+       if (!(fs_info->workers && fs_info->delalloc_workers &&
+             fs_info->submit_workers && fs_info->flush_workers &&
+             fs_info->endio_workers && fs_info->endio_meta_workers &&
+             fs_info->endio_meta_write_workers &&
+             fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+             fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+             fs_info->caching_workers && fs_info->readahead_workers &&
+             fs_info->fixup_workers && fs_info->delayed_workers &&
+             fs_info->qgroup_rescan_workers)) {
                err = -ENOMEM;
                goto fail_sb_buffer;
        }
@@@ -2963,6 -2980,8 +2977,8 @@@ fail_iput
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
  
        iput(fs_info->btree_inode);
+ fail_bio_counter:
+       percpu_counter_destroy(&fs_info->bio_counter);
  fail_delalloc_bytes:
        percpu_counter_destroy(&fs_info->delalloc_bytes);
  fail_dirty_metadata_bytes:
@@@ -3244,6 -3263,8 +3260,8 @@@ static int barrier_all_devices(struct b
        /* send down all the barriers */
        head = &info->fs_devices->devices;
        list_for_each_entry_rcu(dev, head, dev_list) {
+               if (dev->missing)
+                       continue;
                if (!dev->bdev) {
                        errors_send++;
                        continue;
  
        /* wait for all the barriers */
        list_for_each_entry_rcu(dev, head, dev_list) {
+               if (dev->missing)
+                       continue;
                if (!dev->bdev) {
                        errors_wait++;
                        continue;
@@@ -3477,6 -3500,8 +3497,8 @@@ static void free_fs_root(struct btrfs_r
        root->orphan_block_rsv = NULL;
        if (root->anon_dev)
                free_anon_bdev(root->anon_dev);
+       if (root->subv_writers)
+               btrfs_free_subvolume_writers(root->subv_writers);
        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);
        kfree(root->free_ino_ctl);
@@@ -3610,6 -3635,7 +3632,7 @@@ int close_ctree(struct btrfs_root *root
  
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        percpu_counter_destroy(&fs_info->delalloc_bytes);
+       percpu_counter_destroy(&fs_info->bio_counter);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
  
@@@ -3791,9 -3817,11 +3814,11 @@@ static void btrfs_destroy_all_ordered_e
                list_move_tail(&root->ordered_root,
                               &fs_info->ordered_roots);
  
+               spin_unlock(&fs_info->ordered_root_lock);
                btrfs_destroy_ordered_extents(root);
  
-               cond_resched_lock(&fs_info->ordered_root_lock);
+               cond_resched();
+               spin_lock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&fs_info->ordered_root_lock);
  }
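
The bio_counter and replace_wait members initialized in open_ctree() above
back the three dev-replace helpers declared in ctree.h. A hedged sketch of
their likely shape, assuming only the BTRFS_FS_STATE_DEV_REPLACING bit and
the fields added in this merge; the actual bodies belong to dev-replace.c.

void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
	percpu_counter_inc(&fs_info->bio_counter);
}

void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
{
	percpu_counter_dec(&fs_info->bio_counter);
	/* a device replace may be waiting for in-flight bios to drain */
	if (waitqueue_active(&fs_info->replace_wait))
		wake_up(&fs_info->replace_wait);
}

void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
	/* hold off new bios while a replace is switching target devices */
	while (1) {
		percpu_counter_inc(&fs_info->bio_counter);
		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
				     &fs_info->fs_state)))
			break;
		btrfs_bio_counter_dec(fs_info);
		wait_event(fs_info->replace_wait,
			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
				     &fs_info->fs_state));
	}
}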
diff --combined fs/btrfs/extent_io.c
@@@ -229,12 -229,14 +229,14 @@@ void free_extent_state(struct extent_st
        }
  }
  
- static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+ static struct rb_node *tree_insert(struct rb_root *root,
+                                  struct rb_node *search_start,
+                                  u64 offset,
                                   struct rb_node *node,
                                   struct rb_node ***p_in,
                                   struct rb_node **parent_in)
  {
-       struct rb_node **p = &root->rb_node;
+       struct rb_node **p;
        struct rb_node *parent = NULL;
        struct tree_entry *entry;
  
                goto do_insert;
        }
  
+       p = search_start ? &search_start : &root->rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct tree_entry, rb_node);
@@@ -430,7 -433,7 +433,7 @@@ static int insert_state(struct extent_i
  
        set_state_bits(tree, state, bits);
  
-       node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+       node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
                struct extent_state *found;
                found = rb_entry(node, struct extent_state, rb_node);
@@@ -477,8 -480,8 +480,8 @@@ static int split_state(struct extent_io
        prealloc->state = orig->state;
        orig->start = split;
  
-       node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
-                          NULL, NULL);
+       node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+                          &prealloc->rb_node, NULL, NULL);
        if (node) {
                free_extent_state(prealloc);
                return -EEXIST;
@@@ -2026,7 -2029,7 +2029,7 @@@ int repair_io_failure(struct btrfs_fs_i
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
        if (!bio)
                return -EIO;
 -      bio->bi_size = 0;
 +      bio->bi_iter.bi_size = 0;
        map_length = length;
  
        ret = btrfs_map_block(fs_info, WRITE, logical,
        }
        BUG_ON(mirror_num != bbio->mirror_num);
        sector = bbio->stripes[mirror_num-1].physical >> 9;
 -      bio->bi_sector = sector;
 +      bio->bi_iter.bi_sector = sector;
        dev = bbio->stripes[mirror_num-1].dev;
        kfree(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
@@@ -2311,9 -2314,9 +2314,9 @@@ static int bio_readpage_error(struct bi
                return -EIO;
        }
        bio->bi_end_io = failed_bio->bi_end_io;
 -      bio->bi_sector = failrec->logical >> 9;
 +      bio->bi_iter.bi_sector = failrec->logical >> 9;
        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 -      bio->bi_size = 0;
 +      bio->bi_iter.bi_size = 0;
  
        btrfs_failed_bio = btrfs_io_bio(failed_bio);
        if (btrfs_failed_bio->csum) {
@@@ -2375,12 -2378,11 +2378,12 @@@ int end_extent_writepage(struct page *p
   */
  static void end_bio_extent_writepage(struct bio *bio, int err)
  {
 -      struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 +      struct bio_vec *bvec;
        u64 start;
        u64 end;
 +      int i;
  
 -      do {
 +      bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;
  
                /* We always issue full-page reads, but if some block
                start = page_offset(page);
                end = start + bvec->bv_offset + bvec->bv_len - 1;
  
 -              if (--bvec >= bio->bi_io_vec)
 -                      prefetchw(&bvec->bv_page->flags);
 -
                if (end_extent_writepage(page, err, start, end))
                        continue;
  
                end_page_writeback(page);
 -      } while (bvec >= bio->bi_io_vec);
 +      }
  
        bio_put(bio);
  }
@@@ -2437,8 -2442,9 +2440,8 @@@ endio_readpage_release_extent(struct ex
   */
  static void end_bio_extent_readpage(struct bio *bio, int err)
  {
 +      struct bio_vec *bvec;
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 -      struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
 -      struct bio_vec *bvec = bio->bi_io_vec;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
        struct extent_io_tree *tree;
        u64 offset = 0;
        u64 extent_len = 0;
        int mirror;
        int ret;
 +      int i;
  
        if (err)
                uptodate = 0;
  
 -      do {
 +      bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
  
                pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
 -                       "mirror=%lu\n", (u64)bio->bi_sector, err,
 +                       "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
                         io_bio->mirror_num);
                tree = &BTRFS_I(inode)->io_tree;
  
                end = start + bvec->bv_offset + bvec->bv_len - 1;
                len = bvec->bv_len;
  
 -              if (++bvec <= bvec_end)
 -                      prefetchw(&bvec->bv_page->flags);
 -
                mirror = io_bio->mirror_num;
                if (likely(uptodate && tree->ops &&
                           tree->ops->readpage_end_io_hook)) {
@@@ -2564,7 -2572,7 +2567,7 @@@ readpage_ok
                        extent_start = start;
                        extent_len = end + 1 - start;
                }
 -      } while (bvec <= bvec_end);
 +      }
  
        if (extent_len)
                endio_readpage_release_extent(tree, extent_start, extent_len,
@@@ -2595,8 -2603,9 +2598,8 @@@ btrfs_bio_alloc(struct block_device *bd
        }
  
        if (bio) {
 -              bio->bi_size = 0;
                bio->bi_bdev = bdev;
 -              bio->bi_sector = first_sector;
 +              bio->bi_iter.bi_sector = first_sector;
                btrfs_bio = btrfs_io_bio(bio);
                btrfs_bio->csum = NULL;
                btrfs_bio->csum_allocated = NULL;
@@@ -2690,7 -2699,7 +2693,7 @@@ static int submit_extent_page(int rw, s
        if (bio_ret && *bio_ret) {
                bio = *bio_ret;
                if (old_compressed)
 -                      contig = bio->bi_sector == sector;
 +                      contig = bio->bi_iter.bi_sector == sector;
                else
                        contig = bio_end_sector(bio) == sector;
  
@@@ -2757,7 -2766,7 +2760,7 @@@ __get_extent_map(struct inode *inode, s
  
        if (em_cached && *em_cached) {
                em = *em_cached;
-               if (em->in_tree && start >= em->start &&
+               if (extent_map_in_tree(em) && start >= em->start &&
                    start < extent_map_end(em)) {
                        atomic_inc(&em->refs);
                        return em;
@@@ -3457,18 -3466,20 +3460,18 @@@ static void end_extent_buffer_writeback
  
  static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
  {
 -      int uptodate = err == 0;
 -      struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 +      struct bio_vec *bvec;
        struct extent_buffer *eb;
 -      int done;
 +      int i, done;
  
 -      do {
 +      bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;
  
 -              bvec--;
                eb = (struct extent_buffer *)page->private;
                BUG_ON(!eb);
                done = atomic_dec_and_test(&eb->io_pages);
  
 -              if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
 +              if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
                        ClearPageUptodate(page);
                        SetPageError(page);
                        continue;
  
                end_extent_buffer_writeback(eb);
 -      } while (bvec >= bio->bi_io_vec);
 +      }
  
        bio_put(bio);
 -
  }
  
  static int write_one_eb(struct extent_buffer *eb,
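
Several extent_io.c hunks above replace open-coded walks over bio->bi_io_vec
with bio_for_each_segment_all(), in line with the block layer's
immutable-biovec work (the same hunks switch bi_sector/bi_size to their
bi_iter equivalents). A before/after sketch of the pattern; handle_page() is
an illustrative stand-in for the per-page completion work, not a real helper.

static void handle_page(struct page *page, int err);	/* illustrative */

/* old style: walk the biovec array backwards by hand */
static void endio_old(struct bio *bio, int err)
{
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		handle_page(bvec->bv_page, err);
	} while (--bvec >= bio->bi_io_vec);
}

/* new style: let the iterator macro hand out each segment */
static void endio_new(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)
		handle_page(bvec->bv_page, err);
}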
diff --combined fs/btrfs/file.c
@@@ -591,7 -591,6 +591,6 @@@ void btrfs_drop_extent_cache(struct ino
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                clear_bit(EXTENT_FLAG_LOGGING, &flags);
                modified = !list_empty(&em->list);
-               remove_extent_mapping(em_tree, em);
                if (no_splits)
                        goto next;
  
                        split->bdev = em->bdev;
                        split->flags = flags;
                        split->compress_type = em->compress_type;
-                       ret = add_extent_mapping(em_tree, split, modified);
-                       BUG_ON(ret); /* Logic error */
+                       replace_extent_mapping(em_tree, em, split, modified);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                                split->orig_block_len = 0;
                        }
  
-                       ret = add_extent_mapping(em_tree, split, modified);
-                       BUG_ON(ret); /* Logic error */
+                       if (extent_map_in_tree(em)) {
+                               replace_extent_mapping(em_tree, em, split,
+                                                      modified);
+                       } else {
+                               ret = add_extent_mapping(em_tree, split,
+                                                        modified);
+                               ASSERT(ret == 0); /* Logic error */
+                       }
                        free_extent_map(split);
                        split = NULL;
                }
  next:
+               if (extent_map_in_tree(em))
+                       remove_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
  
                /* once for us */
@@@ -720,7 -726,7 +726,7 @@@ int __btrfs_drop_extents(struct btrfs_t
        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);
  
-       if (start >= BTRFS_I(inode)->disk_i_size)
+       if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
                modify_tree = 0;
  
        while (1) {
@@@ -798,7 -804,10 +804,10 @@@ next_slot
                 */
                if (start > key.offset && end < extent_end) {
                        BUG_ON(del_nr > 0);
-                       BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+                       if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                               ret = -EINVAL;
+                               break;
+                       }
  
                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = start;
                 *      | -------- extent -------- |
                 */
                if (start <= key.offset && end < extent_end) {
-                       BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+                       if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                               ret = -EINVAL;
+                               break;
+                       }
  
                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = end;
                 */
                if (start > key.offset && end >= extent_end) {
                        BUG_ON(del_nr > 0);
-                       BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+                       if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                               ret = -EINVAL;
+                               break;
+                       }
  
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                 * Set path->slots[0] to first slot, so that after the delete
                 * if items are moved off from our leaf to its immediate left or
                 * right neighbor leafs, we end up with a correct and adjusted
-                * path->slots[0] for our insertion.
+                * path->slots[0] for our insertion (if replace_extent != 0).
                 */
                path->slots[0] = del_slot;
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                if (ret)
                        btrfs_abort_transaction(trans, root, ret);
+       }
  
-               leaf = path->nodes[0];
-               /*
-                * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
-                * is, its contents got pushed to its neighbors), in which case
-                * it means path->locks[0] == 0
-                */
-               if (!ret && replace_extent && leafs_visited == 1 &&
-                   path->locks[0] &&
-                   btrfs_leaf_free_space(root, leaf) >=
-                   sizeof(struct btrfs_item) + extent_item_size) {
-                       key.objectid = ino;
-                       key.type = BTRFS_EXTENT_DATA_KEY;
-                       key.offset = start;
-                       setup_items_for_insert(root, path, &key,
-                                              &extent_item_size,
-                                              extent_item_size,
-                                              sizeof(struct btrfs_item) +
-                                              extent_item_size, 1);
-                       *key_inserted = 1;
+       leaf = path->nodes[0];
+       /*
+        * If btrfs_del_items() was called, it might have deleted a leaf, in
+        * which case it unlocked our path, so check path->locks[0] matches a
+        * write lock.
+        */
+       if (!ret && replace_extent && leafs_visited == 1 &&
+           (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+            path->locks[0] == BTRFS_WRITE_LOCK) &&
+           btrfs_leaf_free_space(root, leaf) >=
+           sizeof(struct btrfs_item) + extent_item_size) {
+               key.objectid = ino;
+               key.type = BTRFS_EXTENT_DATA_KEY;
+               key.offset = start;
+               if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+                       struct btrfs_key slot_key;
+                       btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+                       if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+                               path->slots[0]++;
                }
+               setup_items_for_insert(root, path, &key,
+                                      &extent_item_size,
+                                      extent_item_size,
+                                      sizeof(struct btrfs_item) +
+                                      extent_item_size, 1);
+               *key_inserted = 1;
        }
  
        if (!replace_extent || !(*key_inserted))
@@@ -1346,11 -1369,11 +1369,11 @@@ lock_and_cleanup_extent_if_need(struct 
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
                                 start_pos, last_pos, 0, cached_state);
-               ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+               ordered = btrfs_lookup_ordered_range(inode, start_pos,
+                                                    last_pos - start_pos + 1);
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
                    ordered->file_offset <= last_pos) {
-                       btrfs_put_ordered_extent(ordered);
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             start_pos, last_pos,
                                             cached_state, GFP_NOFS);
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
-                       ret = btrfs_wait_ordered_range(inode, start_pos,
-                                               last_pos - start_pos + 1);
-                       if (ret)
-                               return ret;
-                       else
-                               return -EAGAIN;
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       return -EAGAIN;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
@@@ -1396,8 -1416,12 +1416,12 @@@ static noinline int check_can_nocow(str
        u64 num_bytes;
        int ret;
  
+       ret = btrfs_start_nocow_write(root);
+       if (!ret)
+               return -ENOSPC;
        lockstart = round_down(pos, root->sectorsize);
-       lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+       lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
  
        while (1) {
                lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
        ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
        if (ret <= 0) {
                ret = 0;
+               btrfs_end_nocow_write(root);
        } else {
-               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
-                                NULL, GFP_NOFS);
-               *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+               *write_bytes = min_t(size_t, *write_bytes,
+                                    num_bytes - pos + lockstart);
        }
  
        unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@@ -1510,6 -1532,8 +1532,8 @@@ static noinline ssize_t __btrfs_buffere
                        if (!only_release_metadata)
                                btrfs_free_reserved_data_space(inode,
                                                               reserve_bytes);
+                       else
+                               btrfs_end_nocow_write(root);
                        break;
                }
  
@@@ -1598,6 -1622,9 +1622,9 @@@ again
                }
  
                release_bytes = 0;
+               if (only_release_metadata)
+                       btrfs_end_nocow_write(root);
                if (only_release_metadata && copied > 0) {
                        u64 lockstart = round_down(pos, root->sectorsize);
                        u64 lockend = lockstart +
        kfree(pages);
  
        if (release_bytes) {
-               if (only_release_metadata)
+               if (only_release_metadata) {
+                       btrfs_end_nocow_write(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
-               else
+               } else {
                        btrfs_delalloc_release_space(inode, release_bytes);
+               }
        }
  
        return num_written ? num_written : ret;
@@@ -1797,7 -1826,7 +1826,7 @@@ static ssize_t btrfs_file_aio_write(str
        BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0) {
                err = generic_write_sync(file, pos, num_written);
 -              if (err < 0 && num_written > 0)
 +              if (err < 0)
                        num_written = err;
        }
  
@@@ -1856,8 -1885,9 +1885,9 @@@ int btrfs_sync_file(struct file *file, 
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
        struct btrfs_trans_handle *trans;
+       struct btrfs_log_ctx ctx;
+       int ret = 0;
        bool full_sync = 0;
  
        trace_btrfs_sync_file(file, datasync);
        }
        trans->sync = true;
  
-       ret = btrfs_log_dentry_safe(trans, root, dentry);
+       btrfs_init_log_ctx(&ctx);
+       ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
        if (ret < 0) {
                /* Fallthrough and commit/free transaction. */
                ret = 1;
  
        if (ret != BTRFS_NO_LOG_SYNC) {
                if (!ret) {
-                       ret = btrfs_sync_log(trans, root);
+                       ret = btrfs_sync_log(trans, root, &ctx);
                        if (!ret) {
                                ret = btrfs_end_transaction(trans, root);
                                goto out;
@@@ -2157,6 -2189,7 +2189,7 @@@ static int btrfs_punch_hole(struct inod
        bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
                          ((offset + len - 1) >> PAGE_CACHE_SHIFT));
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+       u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
  
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-               if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+               if (offset < ino_size)
                        ret = btrfs_truncate_page(inode, offset, len, 0);
                mutex_unlock(&inode->i_mutex);
                return ret;
        }
  
        /* zero back part of the first page */
-       if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+       if (offset < ino_size) {
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
        }
  
        /* zero the front end of the last page */
-       if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+       if (offset + len < ino_size) {
                ret = btrfs_truncate_page(inode, offset + len, 0, 1);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
  
                trans->block_rsv = &root->fs_info->trans_block_rsv;
  
-               ret = fill_holes(trans, inode, path, cur_offset, drop_end);
-               if (ret) {
-                       err = ret;
-                       break;
+               if (cur_offset < ino_size) {
+                       ret = fill_holes(trans, inode, path, cur_offset,
+                                        drop_end);
+                       if (ret) {
+                               err = ret;
+                               break;
+                       }
                }
  
                cur_offset = drop_end;
        }
  
        trans->block_rsv = &root->fs_info->trans_block_rsv;
-       ret = fill_holes(trans, inode, path, cur_offset, drop_end);
-       if (ret) {
-               err = ret;
-               goto out_trans;
+       if (cur_offset < ino_size) {
+               ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+               if (ret) {
+                       err = ret;
+                       goto out_trans;
+               }
        }
  
  out_trans:
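
The file.c hunks above thread a btrfs_log_ctx through fsync so that
concurrent fsyncs can batch on a single log commit (note the log_ctxs[2]
lists added to btrfs_root in ctree.h). A condensed sketch of the resulting
flow in btrfs_sync_file(); only the three ctx calls come from the diff, while
the BTRFS_NO_LOG_SYNC case and most error handling are elided, and
example_fsync_tail() itself is illustrative.

static int example_fsync_tail(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);

	/* logs the inode and links the ctx into root->log_ctxs[] */
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret < 0)
		return btrfs_commit_transaction(trans, root);

	/* one caller commits the log; the others wait on their ctx */
	ret = btrfs_sync_log(trans, root, &ctx);
	if (!ret)
		return btrfs_end_transaction(trans, root);

	return btrfs_commit_transaction(trans, root);
}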
diff --combined fs/btrfs/inode.c
@@@ -864,7 -864,8 +864,8 @@@ static noinline int cow_file_range(stru
  
        if (btrfs_is_free_space_inode(inode)) {
                WARN_ON_ONCE(1);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto out_unlock;
        }
  
        num_bytes = ALIGN(end - start + 1, blocksize);
@@@ -1075,17 -1076,15 +1076,15 @@@ static int cow_file_range_async(struct 
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
  
-               async_cow->work.func = async_cow_start;
-               async_cow->work.ordered_func = async_cow_submit;
-               async_cow->work.ordered_free = async_cow_free;
-               async_cow->work.flags = 0;
+               btrfs_init_work(&async_cow->work, async_cow_start,
+                               async_cow_submit, async_cow_free);
  
                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
  
-               btrfs_queue_worker(&root->fs_info->delalloc_workers,
-                                  &async_cow->work);
+               btrfs_queue_work(root->fs_info->delalloc_workers,
+                                &async_cow->work);
  
                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
                        wait_event(root->fs_info->async_submit_wait,
@@@ -1597,7 -1596,7 +1596,7 @@@ int btrfs_merge_bio_hook(int rw, struc
                         unsigned long bio_flags)
  {
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 -      u64 logical = (u64)bio->bi_sector << 9;
 +      u64 logical = (u64)bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
        if (bio_flags & EXTENT_BIO_COMPRESSED)
                return 0;
  
 -      length = bio->bi_size;
 +      length = bio->bi_iter.bi_size;
        map_length = length;
        ret = btrfs_map_block(root->fs_info, rw, logical,
                              &map_length, NULL, 0);
@@@ -1843,9 -1842,9 +1842,9 @@@ static int btrfs_writepage_start_hook(s
  
        SetPageChecked(page);
        page_cache_get(page);
-       fixup->work.func = btrfs_writepage_fixup_worker;
+       btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
-       btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+       btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
        return -EBUSY;
  }
  
@@@ -2239,6 -2238,11 +2238,11 @@@ static noinline int relink_extent_backr
                return PTR_ERR(root);
        }
  
+       if (btrfs_root_readonly(root)) {
+               srcu_read_unlock(&fs_info->subvol_srcu, index);
+               return 0;
+       }
        /* step 2: get inode */
        key.objectid = backref->inum;
        key.type = BTRFS_INODE_ITEM_KEY;
@@@ -2759,7 -2763,7 +2763,7 @@@ static int btrfs_writepage_end_io_hook(
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered_extent = NULL;
-       struct btrfs_workers *workers;
+       struct btrfs_workqueue *workers;
  
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
  
                                            end - start + 1, uptodate))
                return 0;
  
-       ordered_extent->work.func = finish_ordered_fn;
-       ordered_extent->work.flags = 0;
+       btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
  
        if (btrfs_is_free_space_inode(inode))
-               workers = &root->fs_info->endio_freespace_worker;
+               workers = root->fs_info->endio_freespace_worker;
        else
-               workers = &root->fs_info->endio_write_workers;
-       btrfs_queue_worker(workers, &ordered_extent->work);
+               workers = root->fs_info->endio_write_workers;
+       btrfs_queue_work(workers, &ordered_extent->work);
  
        return 0;
  }
@@@ -4454,12 -4457,8 +4457,12 @@@ static int btrfs_setsize(struct inode *
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
 -      if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
 -              inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 +      if (newsize != oldsize) {
 +              inode_inc_iversion(inode);
 +              if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
 +                      inode->i_ctime = inode->i_mtime =
 +                              current_fs_time(inode->i_sb);
 +      }
  
        if (newsize > oldsize) {
                truncate_pagecache(inode, newsize);
@@@ -4568,7 -4567,7 +4571,7 @@@ static int btrfs_setattr(struct dentry 
                err = btrfs_dirty_inode(inode);
  
                if (!err && attr->ia_valid & ATTR_MODE)
 -                      err = btrfs_acl_chmod(inode);
 +                      err = posix_acl_chmod(inode, inode->i_mode);
        }
  
        return err;
@@@ -4593,7 -4592,7 +4596,7 @@@ static void evict_inode_truncate_pages(
        struct rb_node *node;
  
        ASSERT(inode->i_state & I_FREEING);
 -      truncate_inode_pages(&inode->i_data, 0);
 +      truncate_inode_pages_final(&inode->i_data);
  
        write_lock(&map_tree->lock);
        while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@@ -4924,7 -4923,8 +4927,8 @@@ void btrfs_invalidate_inodes(struct btr
        struct inode *inode;
        u64 objectid = 0;
  
-       WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+       if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+               WARN_ON(btrfs_root_refs(&root->root_item) != 0);
  
        spin_lock(&root->inode_lock);
  again:
@@@ -5799,6 -5799,7 +5803,7 @@@ static int btrfs_mknod(struct inode *di
        }
  out_unlock:
        btrfs_end_transaction(trans, root);
+       btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        if (drop_inode) {
                inode_dec_link_count(inode);
@@@ -5872,6 -5873,7 +5877,7 @@@ out_unlock
                inode_dec_link_count(inode);
                iput(inode);
        }
+       btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
  }
@@@ -5930,6 -5932,7 +5936,7 @@@ static int btrfs_link(struct dentry *ol
        }
  
        btrfs_end_transaction(trans, root);
+       btrfs_balance_delayed_items(root);
  fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
@@@ -5996,6 -5999,7 +6003,7 @@@ out_fail
        btrfs_end_transaction(trans, root);
        if (drop_on_err)
                iput(inode);
+       btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
  }
@@@ -6550,6 -6554,7 +6558,7 @@@ noinline int can_nocow_extent(struct in
        int ret;
        struct extent_buffer *leaf;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 disk_bytenr;
  
        if (btrfs_extent_readonly(root, disk_bytenr))
                goto out;
+       num_bytes = min(offset + *len, extent_end) - offset;
+       if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+               u64 range_end;
+               range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+               ret = test_range_bit(io_tree, offset, range_end,
+                                    EXTENT_DELALLOC, 0, NULL);
+               if (ret) {
+                       ret = -EAGAIN;
+                       goto out;
+               }
+       }
        btrfs_release_path(path);
  
        /*
         */
        disk_bytenr += backref_offset;
        disk_bytenr += offset - key.offset;
-       num_bytes = min(offset + *len, extent_end) - offset;
        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                goto out;
        /*
@@@ -6955,16 -6973,17 +6977,16 @@@ unlock_err
  static void btrfs_endio_direct_read(struct bio *bio, int err)
  {
        struct btrfs_dio_private *dip = bio->bi_private;
 -      struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
 -      struct bio_vec *bvec = bio->bi_io_vec;
 +      struct bio_vec *bvec;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct bio *dio_bio;
        u32 *csums = (u32 *)dip->csum;
 -      int index = 0;
        u64 start;
 +      int i;
  
        start = dip->logical_offset;
 -      do {
 +      bio_for_each_segment_all(bvec, bio, i) {
                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                        struct page *page = bvec->bv_page;
                        char *kaddr;
                        local_irq_restore(flags);
  
                        flush_dcache_page(bvec->bv_page);
 -                      if (csum != csums[index]) {
 +                      if (csum != csums[i]) {
                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
                                          btrfs_ino(inode), start, csum,
 -                                        csums[index]);
 +                                        csums[i]);
                                err = -EIO;
                        }
                }
  
                start += bvec->bv_len;
 -              bvec++;
 -              index++;
 -      } while (bvec <= bvec_end);
 +      }
  
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
@@@ -7024,10 -7045,9 +7046,9 @@@ again
        if (!ret)
                goto out_test;
  
-       ordered->work.func = finish_ordered_fn;
-       ordered->work.flags = 0;
-       btrfs_queue_worker(&root->fs_info->endio_write_workers,
-                          &ordered->work);
+       btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+       btrfs_queue_work(root->fs_info->endio_write_workers,
+                        &ordered->work);
  out_test:
        /*
         * our bio might span multiple ordered extents.  If we haven't
@@@ -7070,8 -7090,7 +7091,8 @@@ static void btrfs_end_dio_bio(struct bi
                btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
                      btrfs_ino(dip->inode), bio->bi_rw,
 -                    (unsigned long long)bio->bi_sector, bio->bi_size, err);
 +                    (unsigned long long)bio->bi_iter.bi_sector,
 +                    bio->bi_iter.bi_size, err);
                dip->errors = 1;
  
                /*
@@@ -7162,7 -7181,7 +7183,7 @@@ static int btrfs_submit_direct_hook(in
        struct bio *bio;
        struct bio *orig_bio = dip->orig_bio;
        struct bio_vec *bvec = orig_bio->bi_io_vec;
 -      u64 start_sector = orig_bio->bi_sector;
 +      u64 start_sector = orig_bio->bi_iter.bi_sector;
        u64 file_offset = dip->logical_offset;
        u64 submit_len = 0;
        u64 map_length;
        int ret = 0;
        int async_submit = 0;
  
 -      map_length = orig_bio->bi_size;
 +      map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
        if (ret) {
                return -EIO;
        }
  
 -      if (map_length >= orig_bio->bi_size) {
 +      if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
                goto submit;
        }
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
  
 -                      map_length = orig_bio->bi_size;
 +                      map_length = orig_bio->bi_iter.bi_size;
                        ret = btrfs_map_block(root->fs_info, rw,
                                              start_sector << 9,
                                              &map_length, NULL, 0);
@@@ -7288,8 -7307,7 +7309,8 @@@ static void btrfs_submit_direct(int rw
  
        if (!skip_sum && !write) {
                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 -              sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
 +              sum_len = dio_bio->bi_iter.bi_size >>
 +                      inode->i_sb->s_blocksize_bits;
                sum_len *= csum_size;
        } else {
                sum_len = 0;
        dip->private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
 -      dip->bytes = dio_bio->bi_size;
 -      dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
 +      dip->bytes = dio_bio->bi_iter.bi_size;
 +      dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        io_bio->bi_private = dip;
        dip->errors = 0;
        dip->orig_bio = io_bio;
@@@ -7404,15 -7422,15 +7425,15 @@@ static ssize_t btrfs_direct_IO(int rw, 
        smp_mb__after_atomic_inc();
  
        /*
-        * The generic stuff only does filemap_write_and_wait_range, which isn't
-        * enough if we've written compressed pages to this area, so we need to
-        * call btrfs_wait_ordered_range to make absolutely sure that any
-        * outstanding dirty pages are on disk.
+        * The generic stuff only does filemap_write_and_wait_range, which
+        * isn't enough if we've written compressed pages to this area, so
+        * we need to flush the dirty pages again to make absolutely sure
+        * that any outstanding dirty pages are on disk.
         */
        count = iov_length(iov, nr_segs);
-       ret = btrfs_wait_ordered_range(inode, offset, count);
-       if (ret)
-               return ret;
+       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                    &BTRFS_I(inode)->runtime_flags))
+               filemap_fdatawrite_range(inode->i_mapping, offset, count);
  
        if (rw & WRITE) {
                /*
@@@ -8404,7 -8422,7 +8425,7 @@@ struct btrfs_delalloc_work *btrfs_alloc
        work->inode = inode;
        work->wait = wait;
        work->delay_iput = delay_iput;
-       work->work.func = btrfs_run_delalloc_work;
+       btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
  
        return work;
  }
@@@ -8419,7 -8437,8 +8440,8 @@@ void btrfs_wait_and_free_delalloc_work(
   * some fairly slow code that needs optimization. This walks the list
   * of all the inodes with pending delalloc and forces them to disk.
   */
- static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+                                  int nr)
  {
        struct btrfs_inode *binode;
        struct inode *inode;
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
  
+       mutex_lock(&root->delalloc_mutex);
        spin_lock(&root->delalloc_lock);
        list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                        else
                                iput(inode);
                        ret = -ENOMEM;
-                       goto out;
+                       break;
                }
                list_add_tail(&work->list, &works);
-               btrfs_queue_worker(&root->fs_info->flush_workers,
-                                  &work->work);
+               btrfs_queue_work(root->fs_info->flush_workers,
+                                &work->work);
+               ret++;
+               if (nr != -1 && ret >= nr)
+                       break;
                cond_resched();
                spin_lock(&root->delalloc_lock);
        }
        spin_unlock(&root->delalloc_lock);
  
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
-       }
-       return 0;
- out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
                list_splice_tail(&splice, &root->delalloc_inodes);
                spin_unlock(&root->delalloc_lock);
        }
+       mutex_unlock(&root->delalloc_mutex);
        return ret;
  }
  
@@@ -8490,7 -8507,9 +8510,9 @@@ int btrfs_start_delalloc_inodes(struct 
        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                return -EROFS;
  
-       ret = __start_delalloc_inodes(root, delay_iput);
+       ret = __start_delalloc_inodes(root, delay_iput, -1);
+       if (ret > 0)
+               ret = 0;
        /*
         * the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
        return ret;
  }
  
- int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
+ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+                              int nr)
  {
        struct btrfs_root *root;
        struct list_head splice;
  
        INIT_LIST_HEAD(&splice);
  
+       mutex_lock(&fs_info->delalloc_root_mutex);
        spin_lock(&fs_info->delalloc_root_lock);
        list_splice_init(&fs_info->delalloc_roots, &splice);
-       while (!list_empty(&splice)) {
+       while (!list_empty(&splice) && nr) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        delalloc_root);
                root = btrfs_grab_fs_root(root);
                               &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
  
-               ret = __start_delalloc_inodes(root, delay_iput);
+               ret = __start_delalloc_inodes(root, delay_iput, nr);
                btrfs_put_fs_root(root);
-               if (ret)
+               if (ret < 0)
                        goto out;
  
+               if (nr != -1) {
+                       nr -= ret;
+                       WARN_ON(nr < 0);
+               }
                spin_lock(&fs_info->delalloc_root_lock);
        }
        spin_unlock(&fs_info->delalloc_root_lock);
  
+       ret = 0;
        atomic_inc(&fs_info->async_submit_draining);
        while (atomic_read(&fs_info->nr_async_submits) ||
              atomic_read(&fs_info->async_delalloc_pages)) {
                    atomic_read(&fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&fs_info->async_submit_draining);
-       return 0;
  out:
        if (!list_empty_careful(&splice)) {
                spin_lock(&fs_info->delalloc_root_lock);
                list_splice_tail(&splice, &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
        }
+       mutex_unlock(&fs_info->delalloc_root_mutex);
        return ret;
  }
  
@@@ -8850,14 -8876,12 +8879,14 @@@ static const struct inode_operations bt
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  
@@@ -8927,7 -8951,6 +8956,7 @@@ static const struct inode_operations bt
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_special_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_symlink_inode_operations = {
        .getxattr       = btrfs_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
 -      .get_acl        = btrfs_get_acl,
        .update_time    = btrfs_update_time,
  };
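
The btrfs_init_work()/btrfs_queue_work() calls introduced throughout this file replace the open-coded work.func/work.flags assignments and btrfs_queue_worker() with wrappers over the generic kernel workqueue API. A minimal sketch of the underlying pattern for readers who don't know that API: the demo_* names are hypothetical, while INIT_WORK(), queue_work() and container_of() are the real primitives.

	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct demo_work {
		struct work_struct work;	/* embedded, like struct btrfs_work */
		int payload;			/* hypothetical per-item state */
	};

	static void demo_func(struct work_struct *work)
	{
		/* recover the enclosing item from the embedded member */
		struct demo_work *dw = container_of(work, struct demo_work, work);

		pr_info("processing payload %d\n", dw->payload);
		kfree(dw);
	}

	static int demo_submit(struct workqueue_struct *wq, int payload)
	{
		struct demo_work *dw = kmalloc(sizeof(*dw), GFP_NOFS);

		if (!dw)
			return -ENOMEM;
		dw->payload = payload;
		INIT_WORK(&dw->work, demo_func);	/* cf. btrfs_init_work() */
		queue_work(wq, &dw->work);		/* cf. btrfs_queue_work() */
		return 0;
	}

The queue itself would come from alloc_workqueue() and go away via destroy_workqueue(); judging by the three-callback btrfs_init_work(func, ordered_func, ordered_free) signature used above, the btrfs wrappers add ordered-completion bookkeeping on top of exactly this.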
  
diff --combined fs/btrfs/ioctl.c
  #include "props.h"
  #include "sysfs.h"
  
+ #ifdef CONFIG_64BIT
+ /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+  * structures are incorrect, as the timespec structure from userspace
+  * is 4 bytes too small. We define these alternatives here to teach
+  * the kernel about the 32-bit struct packing.
+  */
+ struct btrfs_ioctl_timespec_32 {
+       __u64 sec;
+       __u32 nsec;
+ } __attribute__ ((__packed__));
+
+ struct btrfs_ioctl_received_subvol_args_32 {
+       char    uuid[BTRFS_UUID_SIZE];  /* in */
+       __u64   stransid;               /* in */
+       __u64   rtransid;               /* out */
+       struct btrfs_ioctl_timespec_32 stime; /* in */
+       struct btrfs_ioctl_timespec_32 rtime; /* out */
+       __u64   flags;                  /* in */
+       __u64   reserved[16];           /* in */
+ } __attribute__ ((__packed__));
+
+ #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+                               struct btrfs_ioctl_received_subvol_args_32)
+ #endif
+
static int btrfs_clone(struct inode *src, struct inode *inode,
                       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
  
@@@ -585,6 -611,23 +611,23 @@@ fail
        return ret;
  }
  
+ static void btrfs_wait_nocow_write(struct btrfs_root *root)
+ {
+       s64 writers;
+       DEFINE_WAIT(wait);
+       do {
+               prepare_to_wait(&root->subv_writers->wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+               writers = percpu_counter_sum(&root->subv_writers->counter);
+               if (writers)
+                       schedule();
+               finish_wait(&root->subv_writers->wait, &wait);
+       } while (writers);
+ }
+
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                           struct dentry *dentry, char *name, int namelen,
                           u64 *async_transid, bool readonly,
        if (!root->ref_cows)
                return -EINVAL;
  
+       atomic_inc(&root->will_be_snapshoted);
+       smp_mb__after_atomic_inc();
+       btrfs_wait_nocow_write(root);
        ret = btrfs_start_delalloc_inodes(root, 0);
        if (ret)
-               return ret;
+               goto out;
  
        btrfs_wait_ordered_extents(root, -1);
  
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-       if (!pending_snapshot)
-               return -ENOMEM;
+       if (!pending_snapshot) {
+               ret = -ENOMEM;
+               goto out;
+       }
  
        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
                                        &pending_snapshot->qgroup_reserved,
                                        false);
        if (ret)
-               goto out;
+               goto free;
  
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
@@@ -674,8 -723,10 +723,10 @@@ fail
        btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
                                         &pending_snapshot->block_rsv,
                                         pending_snapshot->qgroup_reserved);
out:
free:
        kfree(pending_snapshot);
+ out:
+       atomic_dec(&root->will_be_snapshoted);
        return ret;
  }
  
@@@ -884,12 -935,14 +935,14 @@@ static int find_new_extents(struct btrf
        min_key.type = BTRFS_EXTENT_DATA_KEY;
        min_key.offset = *off;
  
-       path->keep_locks = 1;
        while (1) {
+               path->keep_locks = 1;
                ret = btrfs_search_forward(root, &min_key, path, newer_than);
                if (ret != 0)
                        goto none;
+               path->keep_locks = 0;
+               btrfs_unlock_up_safe(path, 1);
+ process_slot:
                if (min_key.objectid != ino)
                        goto none;
                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
                        return 0;
                }
  
+               path->slots[0]++;
+               if (path->slots[0] < btrfs_header_nritems(leaf)) {
+                       btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+                       goto process_slot;
+               }
                if (min_key.offset == (u64)-1)
                        goto none;
  
@@@ -935,10 -994,13 +994,13 @@@ static struct extent_map *defrag_lookup
        read_unlock(&em_tree->lock);
  
        if (!em) {
+               struct extent_state *cached = NULL;
+               u64 end = start + len - 1;
                /* get the big lock and read metadata off disk */
-               lock_extent(io_tree, start, start + len - 1);
+               lock_extent_bits(io_tree, start, end, 0, &cached);
                em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-               unlock_extent(io_tree, start, start + len - 1);
+               unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
  
                if (IS_ERR(em))
                        return NULL;
@@@ -957,7 -1019,8 +1019,8 @@@ static bool defrag_check_next_extent(st
                return false;
  
        next = defrag_lookup_extent(inode, em->start + em->len);
-       if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+       if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+           (em->block_start + em->block_len == next->block_start))
                ret = false;
  
        free_extent_map(next);
@@@ -1076,10 -1139,12 +1139,12 @@@ again
                page_start = page_offset(page);
                page_end = page_start + PAGE_CACHE_SIZE - 1;
                while (1) {
-                       lock_extent(tree, page_start, page_end);
+                       lock_extent_bits(tree, page_start, page_end,
+                                        0, &cached_state);
                        ordered = btrfs_lookup_ordered_extent(inode,
                                                              page_start);
-                       unlock_extent(tree, page_start, page_end);
+                       unlock_extent_cached(tree, page_start, page_end,
+                                            &cached_state, GFP_NOFS);
                        if (!ordered)
                                break;
  
@@@ -1356,8 -1421,12 +1421,12 @@@ int btrfs_defrag_file(struct inode *ino
                }
        }
  
-       if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+       if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
                filemap_flush(inode->i_mapping);
+               if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                            &BTRFS_I(inode)->runtime_flags))
+                       filemap_flush(inode->i_mapping);
+       }
  
        if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
                /* the filemap_flush will queue IO into the worker threads, but
@@@ -1573,7 -1642,7 +1642,7 @@@ static noinline int btrfs_ioctl_snap_cr
                if (src_inode->i_sb != file_inode(file)->i_sb) {
                        btrfs_info(BTRFS_I(src_inode)->root->fs_info,
                                   "Snapshot src from another FS");
-                       ret = -EINVAL;
+                       ret = -EXDEV;
                } else if (!inode_owner_or_capable(src_inode)) {
                        /*
                         * Subvolume creation is not restricted, but snapshots
@@@ -1797,7 -1866,9 +1866,9 @@@ static noinline int may_destroy_subvol(
        if (di && !IS_ERR(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
                if (key.objectid == root->root_key.objectid) {
-                       ret = -ENOTEMPTY;
+                       ret = -EPERM;
+                       btrfs_err(root->fs_info, "deleting default subvolume "
+                                 "%llu is not allowed", key.objectid);
                        goto out;
                }
                btrfs_release_path(path);
@@@ -2735,11 -2806,14 +2806,11 @@@ out_unlock
  #define BTRFS_MAX_DEDUPE_LEN  (16 * 1024 * 1024)
  
  static long btrfs_ioctl_file_extent_same(struct file *file,
 -                                       void __user *argp)
 +                      struct btrfs_ioctl_same_args __user *argp)
  {
 -      struct btrfs_ioctl_same_args tmp;
        struct btrfs_ioctl_same_args *same;
        struct btrfs_ioctl_same_extent_info *info;
 -      struct inode *src = file->f_dentry->d_inode;
 -      struct file *dst_file = NULL;
 -      struct inode *dst;
 +      struct inode *src = file_inode(file);
        u64 off;
        u64 len;
        int i;
        unsigned long size;
        u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
        bool is_admin = capable(CAP_SYS_ADMIN);
 +      u16 count;
  
        if (!(file->f_mode & FMODE_READ))
                return -EINVAL;
        if (ret)
                return ret;
  
 -      if (copy_from_user(&tmp,
 -                         (struct btrfs_ioctl_same_args __user *)argp,
 -                         sizeof(tmp))) {
 +      if (get_user(count, &argp->dest_count)) {
                ret = -EFAULT;
                goto out;
        }
  
 -      size = sizeof(tmp) +
 -              tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
 +      size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
  
 -      same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size);
 +      same = memdup_user(argp, size);
  
        if (IS_ERR(same)) {
                ret = PTR_ERR(same);
                goto out;
  
        /* pre-format output fields to sane values */
 -      for (i = 0; i < same->dest_count; i++) {
 +      for (i = 0; i < count; i++) {
                same->info[i].bytes_deduped = 0ULL;
                same->info[i].status = 0;
        }
  
 -      ret = 0;
 -      for (i = 0; i < same->dest_count; i++) {
 -              info = &same->info[i];
 -
 -              dst_file = fget(info->fd);
 -              if (!dst_file) {
 +      for (i = 0, info = same->info; i < count; i++, info++) {
 +              struct inode *dst;
 +              struct fd dst_file = fdget(info->fd);
 +              if (!dst_file.file) {
                        info->status = -EBADF;
 -                      goto next;
 +                      continue;
                }
 +              dst = file_inode(dst_file.file);
  
 -              if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
 +              if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
                        info->status = -EINVAL;
 -                      goto next;
 -              }
 -
 -              info->status = -EXDEV;
 -              if (file->f_path.mnt != dst_file->f_path.mnt)
 -                      goto next;
 -
 -              dst = dst_file->f_dentry->d_inode;
 -              if (src->i_sb != dst->i_sb)
 -                      goto next;
 -
 -              if (S_ISDIR(dst->i_mode)) {
 +              } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
 +                      info->status = -EXDEV;
 +              } else if (S_ISDIR(dst->i_mode)) {
                        info->status = -EISDIR;
 -                      goto next;
 -              }
 -
 -              if (!S_ISREG(dst->i_mode)) {
 +              } else if (!S_ISREG(dst->i_mode)) {
                        info->status = -EACCES;
 -                      goto next;
 +              } else {
 +                      info->status = btrfs_extent_same(src, off, len, dst,
 +                                                      info->logical_offset);
 +                      if (info->status == 0)
 +                              info->bytes_deduped += len;
                }
 -
 -              info->status = btrfs_extent_same(src, off, len, dst,
 -                                              info->logical_offset);
 -              if (info->status == 0)
 -                      info->bytes_deduped += len;
 -
 -next:
 -              if (dst_file)
 -                      fput(dst_file);
 +              fdput(dst_file);
        }
  
        ret = copy_to_user(argp, same, size);
@@@ -2994,8 -3087,9 +3065,9 @@@ process_slot
                                                         new_key.offset + datal,
                                                         1);
                                if (ret) {
-                                       btrfs_abort_transaction(trans, root,
-                                                               ret);
+                                       if (ret != -EINVAL)
+                                               btrfs_abort_transaction(trans,
+                                                               root, ret);
                                        btrfs_end_transaction(trans, root);
                                        goto out;
                                }
@@@ -3153,8 -3247,9 +3225,9 @@@ static noinline long btrfs_ioctl_clone(
         *   decompress into destination's address_space (the file offset
         *   may change, so source mapping won't do), then recompress (or
         *   otherwise reinsert) a subrange.
-        * - allow ranges within the same file to be cloned (provided
-        *   they don't overlap)?
+        *
+        * - split destination inode's inline extents.  The inline extents can
+        *   be either compressed or non-compressed.
         */
  
        /* the destination must be opened for writing */
@@@ -4353,10 -4448,9 +4426,9 @@@ static long btrfs_ioctl_quota_rescan_wa
        return btrfs_qgroup_wait_for_completion(root->fs_info);
  }
  
- static long btrfs_ioctl_set_received_subvol(struct file *file,
-                                           void __user *arg)
+ static long _btrfs_ioctl_set_received_subvol(struct file *file,
+                                           struct btrfs_ioctl_received_subvol_args *sa)
  {
-       struct btrfs_ioctl_received_subvol_args *sa = NULL;
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root_item *root_item = &root->root_item;
                goto out;
        }
  
-       sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa)) {
-               ret = PTR_ERR(sa);
-               sa = NULL;
-               goto out;
-       }
        /*
         * 1 - root item
         * 2 - uuid items (received uuid + subvol uuid)
                goto out;
        }
  
+ out:
+       up_write(&root->fs_info->subvol_sem);
+       mnt_drop_write_file(file);
+       return ret;
+ }
+
+ #ifdef CONFIG_64BIT
+ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+                                               void __user *arg)
+ {
+       struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+       struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+       int ret = 0;
+       args32 = memdup_user(arg, sizeof(*args32));
+       if (IS_ERR(args32)) {
+               ret = PTR_ERR(args32);
+               args32 = NULL;
+               goto out;
+       }
+       args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+       if (!args64) {          /* kmalloc() returns NULL, not an ERR_PTR */
+               ret = -ENOMEM;
+               goto out;
+       }
+       memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+       args64->stransid = args32->stransid;
+       args64->rtransid = args32->rtransid;
+       args64->stime.sec = args32->stime.sec;
+       args64->stime.nsec = args32->stime.nsec;
+       args64->rtime.sec = args32->rtime.sec;
+       args64->rtime.nsec = args32->rtime.nsec;
+       args64->flags = args32->flags;
+       ret = _btrfs_ioctl_set_received_subvol(file, args64);
+       if (ret)
+               goto out;
+       memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+       args32->stransid = args64->stransid;
+       args32->rtransid = args64->rtransid;
+       args32->stime.sec = args64->stime.sec;
+       args32->stime.nsec = args64->stime.nsec;
+       args32->rtime.sec = args64->rtime.sec;
+       args32->rtime.nsec = args64->rtime.nsec;
+       args32->flags = args64->flags;
+       ret = copy_to_user(arg, args32, sizeof(*args32));
+       if (ret)
+               ret = -EFAULT;
+ out:
+       kfree(args32);
+       kfree(args64);
+       return ret;
+ }
+ #endif
+
+ static long btrfs_ioctl_set_received_subvol(struct file *file,
+                                           void __user *arg)
+ {
+       struct btrfs_ioctl_received_subvol_args *sa = NULL;
+       int ret = 0;
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               sa = NULL;
+               goto out;
+       }
+       ret = _btrfs_ioctl_set_received_subvol(file, sa);
+       if (ret)
+               goto out;
        ret = copy_to_user(arg, sa, sizeof(*sa));
        if (ret)
                ret = -EFAULT;
  
  out:
        kfree(sa);
-       up_write(&root->fs_info->subvol_sem);
-       mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -4746,7 -4910,7 +4888,7 @@@ long btrfs_ioctl(struct file *file, uns
        case BTRFS_IOC_SYNC: {
                int ret;
  
-               ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+               ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
                if (ret)
                        return ret;
                ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
                return btrfs_ioctl_balance_progress(root, argp);
        case BTRFS_IOC_SET_RECEIVED_SUBVOL:
                return btrfs_ioctl_set_received_subvol(file, argp);
+ #ifdef CONFIG_64BIT
+       case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+               return btrfs_ioctl_set_received_subvol_32(file, argp);
+ #endif
        case BTRFS_IOC_SEND:
                return btrfs_ioctl_send(file, argp);
        case BTRFS_IOC_GET_DEV_STATS:
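
The CONFIG_64BIT block added at the top of this file exists because _IOWR() bakes sizeof(arg) into the ioctl number, and the received-subvol args struct compiles to different sizes for i386 userspace and an x86_64 kernel: __u64 aligns to 4 bytes on i386 but 8 on x86_64. A standalone sketch of the mismatch, where ts_native and ts_32 are hypothetical names and the asserted sizes assume the usual x86 ABIs:

	#include <linux/types.h>

	struct ts_native {	/* as an x86_64 kernel lays it out: 8 + 4 + 4 pad */
		__u64 sec;
		__u32 nsec;
	};

	struct ts_32 {		/* as i386 userspace sends it: 8 + 4, no padding */
		__u64 sec;
		__u32 nsec;
	} __attribute__ ((__packed__));

	_Static_assert(sizeof(struct ts_native) == 16, "64-bit layout");
	_Static_assert(sizeof(struct ts_32) == 12, "packed i386-compatible layout");

Since the 12-byte and 16-byte layouts produce different _IOWR() values, the 64-bit kernel has to recognize both numbers; hence the extra BTRFS_IOC_SET_RECEIVED_SUBVOL_32 case here and the copy-in/copy-out translation in btrfs_ioctl_set_received_subvol_32().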
diff --combined fs/btrfs/raid56.c
@@@ -1032,8 -1032,8 +1032,8 @@@ static int rbio_add_io_page(struct btrf
  
        /* see if we can add this page onto our existing bio */
        if (last) {
 -              last_end = (u64)last->bi_sector << 9;
 -              last_end += last->bi_size;
 +              last_end = (u64)last->bi_iter.bi_sector << 9;
 +              last_end += last->bi_iter.bi_size;
  
                /*
                 * we can't merge these if they are from different
        if (!bio)
                return -ENOMEM;
  
 -      bio->bi_size = 0;
 +      bio->bi_iter.bi_size = 0;
        bio->bi_bdev = stripe->dev->bdev;
 -      bio->bi_sector = disk_start >> 9;
 +      bio->bi_iter.bi_sector = disk_start >> 9;
        set_bit(BIO_UPTODATE, &bio->bi_flags);
  
        bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@@ -1111,7 -1111,7 +1111,7 @@@ static void index_rbio_pages(struct btr
  
        spin_lock_irq(&rbio->bio_list_lock);
        bio_list_for_each(bio, &rbio->bio_list) {
 -              start = (u64)bio->bi_sector << 9;
 +              start = (u64)bio->bi_iter.bi_sector << 9;
                stripe_offset = start - rbio->raid_map[0];
                page_index = stripe_offset >> PAGE_CACHE_SHIFT;
  
@@@ -1272,7 -1272,7 +1272,7 @@@ cleanup
  static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                           struct bio *bio)
  {
 -      u64 physical = bio->bi_sector;
 +      u64 physical = bio->bi_iter.bi_sector;
        u64 stripe_start;
        int i;
        struct btrfs_bio_stripe *stripe;
  static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
                                   struct bio *bio)
  {
 -      u64 logical = bio->bi_sector;
 +      u64 logical = bio->bi_iter.bi_sector;
        u64 stripe_start;
        int i;
  
@@@ -1416,20 -1416,18 +1416,18 @@@ cleanup
  
  static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
  {
-       rbio->work.flags = 0;
-       rbio->work.func = rmw_work;
+       btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
  
-       btrfs_queue_worker(&rbio->fs_info->rmw_workers,
-                          &rbio->work);
+       btrfs_queue_work(rbio->fs_info->rmw_workers,
+                        &rbio->work);
  }
  
  static void async_read_rebuild(struct btrfs_raid_bio *rbio)
  {
-       rbio->work.flags = 0;
-       rbio->work.func = read_rebuild_work;
+       btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
  
-       btrfs_queue_worker(&rbio->fs_info->rmw_workers,
-                          &rbio->work);
+       btrfs_queue_work(rbio->fs_info->rmw_workers,
+                        &rbio->work);
  }
  
  /*
@@@ -1602,8 -1600,8 +1600,8 @@@ static int plug_cmp(void *priv, struct 
                                                 plug_list);
        struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
                                                 plug_list);
 -      u64 a_sector = ra->bio_list.head->bi_sector;
 -      u64 b_sector = rb->bio_list.head->bi_sector;
 +      u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
 +      u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
  
        if (a_sector < b_sector)
                return -1;
@@@ -1667,10 -1665,9 +1665,9 @@@ static void btrfs_raid_unplug(struct bl
        plug = container_of(cb, struct btrfs_plug_cb, cb);
  
        if (from_schedule) {
-               plug->work.flags = 0;
-               plug->work.func = unplug_work;
-               btrfs_queue_worker(&plug->info->rmw_workers,
-                                  &plug->work);
+               btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+               btrfs_queue_work(plug->info->rmw_workers,
+                                &plug->work);
                return;
        }
        run_plug(plug);
@@@ -1691,7 -1688,7 +1688,7 @@@ int raid56_parity_write(struct btrfs_ro
        if (IS_ERR(rbio))
                return PTR_ERR(rbio);
        bio_list_add(&rbio->bio_list, bio);
 -      rbio->bio_list_bytes = bio->bi_size;
 +      rbio->bio_list_bytes = bio->bi_iter.bi_size;
  
        /*
         * don't plug on full rbios, just get them out the door
@@@ -2044,7 -2041,7 +2041,7 @@@ int raid56_parity_recover(struct btrfs_
  
        rbio->read_rebuild = 1;
        bio_list_add(&rbio->bio_list, bio);
 -      rbio->bio_list_bytes = bio->bi_size;
 +      rbio->bio_list_bytes = bio->bi_iter.bi_size;
  
        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
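
The bi_sector/bi_size edits in this file track the block layer's immutable-biovec rework, which moved a bio's current position into bio->bi_iter. The byte arithmetic that btrfs repeats at each call site reduces to the following sketch; the helper names are hypothetical, the bvec_iter fields are real:

	#include <linux/bio.h>

	/* byte offset on disk of the bio's current position */
	static inline u64 bio_byte_offset(struct bio *bio)
	{
		return (u64)bio->bi_iter.bi_sector << 9;	/* 512-byte sectors */
	}

	/* first byte past the bio's remaining data */
	static inline u64 bio_byte_end(struct bio *bio)
	{
		return bio_byte_offset(bio) + bio->bi_iter.bi_size;
	}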
diff --combined fs/btrfs/scrub.c
@@@ -315,6 -315,16 +315,16 @@@ static void scrub_pending_trans_workers
        atomic_inc(&fs_info->scrubs_running);
        atomic_inc(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);
+       /*
+        * Checking the @scrubs_running == @scrubs_paused condition
+        * inside wait_event() is not an atomic operation, so the two
+        * counters may be incremented or decremented at any time.
+        * Wake up @scrub_pause_wait as often as we can, so that a
+        * blocked transaction commit waits as little as possible.
+        */
+       wake_up(&fs_info->scrub_pause_wait);
        atomic_inc(&sctx->workers_pending);
  }
  
@@@ -418,7 -428,8 +428,8 @@@ struct scrub_ctx *scrub_setup_ctx(struc
                sbio->index = i;
                sbio->sctx = sctx;
                sbio->page_count = 0;
-               sbio->work.func = scrub_bio_end_io_worker;
+               btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+                               NULL, NULL);
  
                if (i != SCRUB_BIOS_PER_SCTX - 1)
                        sctx->bios[i]->next_free = i + 1;
@@@ -987,9 -998,10 +998,10 @@@ nodatasum_case
                fixup_nodatasum->root = fs_info->extent_root;
                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                scrub_pending_trans_workers_inc(sctx);
-               fixup_nodatasum->work.func = scrub_fixup_nodatasum;
-               btrfs_queue_worker(&fs_info->scrub_workers,
-                                  &fixup_nodatasum->work);
+               btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+                               NULL, NULL);
+               btrfs_queue_work(fs_info->scrub_workers,
+                                &fixup_nodatasum->work);
                goto out;
        }
  
@@@ -1331,7 -1343,7 +1343,7 @@@ static void scrub_recheck_block(struct 
                        continue;
                }
                bio->bi_bdev = page->dev->bdev;
 -              bio->bi_sector = page->physical >> 9;
 +              bio->bi_iter.bi_sector = page->physical >> 9;
  
                bio_add_page(bio, page->page, PAGE_SIZE, 0);
                if (btrfsic_submit_bio_wait(READ, bio))
@@@ -1451,7 -1463,7 +1463,7 @@@ static int scrub_repair_page_from_good_
                if (!bio)
                        return -EIO;
                bio->bi_bdev = page_bad->dev->bdev;
 -              bio->bi_sector = page_bad->physical >> 9;
 +              bio->bi_iter.bi_sector = page_bad->physical >> 9;
  
                ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
                if (PAGE_SIZE != ret) {
@@@ -1544,7 -1556,7 +1556,7 @@@ again
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_wr_bio_end_io;
                bio->bi_bdev = sbio->dev->bdev;
 -              bio->bi_sector = sbio->physical >> 9;
 +              bio->bi_iter.bi_sector = sbio->physical >> 9;
                sbio->err = 0;
        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
                   spage->physical_for_dev_replace ||
@@@ -1603,8 -1615,8 +1615,8 @@@ static void scrub_wr_bio_end_io(struct 
        sbio->err = err;
        sbio->bio = bio;
  
-       sbio->work.func = scrub_wr_bio_end_io_worker;
-       btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+       btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+       btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
  }
  
  static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@@ -1950,7 -1962,7 +1962,7 @@@ again
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio->bi_bdev = sbio->dev->bdev;
 -              bio->bi_sector = sbio->physical >> 9;
 +              bio->bi_iter.bi_sector = sbio->physical >> 9;
                sbio->err = 0;
        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
                   spage->physical ||
@@@ -2072,7 -2084,7 +2084,7 @@@ static void scrub_bio_end_io(struct bi
        sbio->err = err;
        sbio->bio = bio;
  
-       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+       btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
  }
  
  static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@@ -2686,10 -2698,23 +2698,23 @@@ int scrub_enumerate_chunks(struct scrub
  
                wait_event(sctx->list_wait,
                           atomic_read(&sctx->bios_in_flight) == 0);
-               atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+               atomic_inc(&fs_info->scrubs_paused);
+               wake_up(&fs_info->scrub_pause_wait);
+               /*
+                * This must be done before we decrease @scrub_paused,
+                * to make sure we don't block transaction commit while
+                * waiting for the pending workers to finish.
+                */
                wait_event(sctx->list_wait,
                           atomic_read(&sctx->workers_pending) == 0);
-               scrub_blocked_if_needed(fs_info);
+               atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+               mutex_lock(&fs_info->scrub_lock);
+               __scrub_blocked_if_needed(fs_info);
+               atomic_dec(&fs_info->scrubs_paused);
+               mutex_unlock(&fs_info->scrub_lock);
+               wake_up(&fs_info->scrub_pause_wait);
  
                btrfs_put_block_group(cache);
                if (ret)
@@@ -2757,33 -2782,35 +2782,35 @@@ static noinline_for_stack int scrub_wor
                                                int is_dev_replace)
  {
        int ret = 0;
+       int flags = WQ_FREEZABLE | WQ_UNBOUND;
+       int max_active = fs_info->thread_pool_size;
  
        if (fs_info->scrub_workers_refcnt == 0) {
                if (is_dev_replace)
-                       btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
-                                       &fs_info->generic_worker);
+                       fs_info->scrub_workers =
+                               btrfs_alloc_workqueue("btrfs-scrub", flags,
+                                                     1, 4);
                else
-                       btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-                                       fs_info->thread_pool_size,
-                                       &fs_info->generic_worker);
-               fs_info->scrub_workers.idle_thresh = 4;
-               ret = btrfs_start_workers(&fs_info->scrub_workers);
-               if (ret)
+                       fs_info->scrub_workers =
+                               btrfs_alloc_workqueue("btrfs-scrub", flags,
+                                                     max_active, 4);
+               if (!fs_info->scrub_workers) {
+                       ret = -ENOMEM;
                        goto out;
-               btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
-                                  "scrubwrc",
-                                  fs_info->thread_pool_size,
-                                  &fs_info->generic_worker);
-               fs_info->scrub_wr_completion_workers.idle_thresh = 2;
-               ret = btrfs_start_workers(
-                               &fs_info->scrub_wr_completion_workers);
-               if (ret)
+               }
+               fs_info->scrub_wr_completion_workers =
+                       btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+                                             max_active, 2);
+               if (!fs_info->scrub_wr_completion_workers) {
+                       ret = -ENOMEM;
                        goto out;
-               btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
-                                  &fs_info->generic_worker);
-               ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
-               if (ret)
+               }
+               fs_info->scrub_nocow_workers =
+                       btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+               if (!fs_info->scrub_nocow_workers) {
+                       ret = -ENOMEM;
                        goto out;
+               }
        }
        ++fs_info->scrub_workers_refcnt;
  out:
  static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
  {
        if (--fs_info->scrub_workers_refcnt == 0) {
-               btrfs_stop_workers(&fs_info->scrub_workers);
-               btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
-               btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+               btrfs_destroy_workqueue(fs_info->scrub_workers);
+               btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+               btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
        }
        WARN_ON(fs_info->scrub_workers_refcnt < 0);
  }
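
scrub_workers_get() and scrub_workers_put() above follow a get/put shape: allocate the workqueues on the first reference, destroy them on the last, with the caller serializing on scrub_lock. A minimal sketch of that shape, with hypothetical demo_* names and the same flags the scrub conversion uses:

	#include <linux/workqueue.h>

	static struct workqueue_struct *demo_wq;	/* cf. fs_info->scrub_workers */
	static int demo_refcnt;				/* cf. scrub_workers_refcnt */

	/* caller holds the lock that protects demo_refcnt, cf. scrub_lock */
	static int demo_workers_get(void)
	{
		if (demo_refcnt == 0) {
			demo_wq = alloc_workqueue("demo-scrub",
						  WQ_FREEZABLE | WQ_UNBOUND, 1);
			if (!demo_wq)
				return -ENOMEM;
		}
		++demo_refcnt;
		return 0;
	}

	static void demo_workers_put(void)
	{
		if (--demo_refcnt == 0) {
			destroy_workqueue(demo_wq);
			demo_wq = NULL;
		}
		WARN_ON(demo_refcnt < 0);
	}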
@@@ -3106,10 -3133,10 +3133,10 @@@ static int copy_nocow_pages(struct scru
        nocow_ctx->len = len;
        nocow_ctx->mirror_num = mirror_num;
        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
-       nocow_ctx->work.func = copy_nocow_pages_worker;
+       btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
        INIT_LIST_HEAD(&nocow_ctx->inodes);
-       btrfs_queue_worker(&fs_info->scrub_nocow_workers,
-                          &nocow_ctx->work);
+       btrfs_queue_work(fs_info->scrub_nocow_workers,
+                        &nocow_ctx->work);
  
        return 0;
  }
@@@ -3373,8 -3400,8 +3400,8 @@@ static int write_page_nocow(struct scru
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }
 -      bio->bi_size = 0;
 -      bio->bi_sector = physical_for_dev_replace >> 9;
 +      bio->bi_iter.bi_size = 0;
 +      bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
        bio->bi_bdev = dev->bdev;
        ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
        if (ret != PAGE_CACHE_SIZE) {
diff --combined fs/btrfs/volumes.c
@@@ -415,7 -415,8 +415,8 @@@ loop_lock
                        device->running_pending = 1;
  
                        spin_unlock(&device->io_lock);
-                       btrfs_requeue_work(&device->work);
+                       btrfs_queue_work(fs_info->submit_workers,
+                                        &device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
@@@ -5263,6 -5264,7 +5264,7 @@@ int btrfs_rmap_block(struct btrfs_mappi
  static void btrfs_end_bio(struct bio *bio, int err)
  {
        struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_device *dev = bbio->stripes[0].dev;
        int is_orig_bio = 0;
  
        if (err) {
                if (err == -EIO || err == -EREMOTEIO) {
                        unsigned int stripe_index =
                                btrfs_io_bio(bio)->stripe_index;
-                       struct btrfs_device *dev;
  
                        BUG_ON(stripe_index >= bbio->num_stripes);
                        dev = bbio->stripes[stripe_index].dev;
        if (bio == bbio->orig_bio)
                is_orig_bio = 1;
  
+       btrfs_bio_counter_dec(bbio->fs_info);
        if (atomic_dec_and_test(&bbio->stripes_pending)) {
                if (!is_orig_bio) {
                        bio_put(bio);
                        bio = bbio->orig_bio;
                }
 +
 +              /*
 +               * We have the original bio now, so increment bi_remaining
 +               * to account for it in endio.
 +               */
 +              atomic_inc(&bio->bi_remaining);
 +
                bio->bi_private = bbio->private;
                bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
        }
  }
  
- struct async_sched {
-       struct bio *bio;
-       int rw;
-       struct btrfs_fs_info *info;
-       struct btrfs_work work;
- };
  /*
   * see run_scheduled_bios for a description of why bios are collected for
   * async submit.
@@@ -5391,8 -5380,8 +5387,8 @@@ static noinline void btrfs_schedule_bio
        spin_unlock(&device->io_lock);
  
        if (should_queue)
-               btrfs_queue_worker(&root->fs_info->submit_workers,
-                                  &device->work);
+               btrfs_queue_work(root->fs_info->submit_workers,
+                                &device->work);
  }
  
  static int bio_size_ok(struct block_device *bdev, struct bio *bio,
        if (!q->merge_bvec_fn)
                return 1;
  
 -      bvm.bi_size = bio->bi_size - prev->bv_len;
 +      bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
        if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
                return 0;
        return 1;
@@@ -5432,7 -5421,7 +5428,7 @@@ static void submit_stripe_bio(struct bt
        bio->bi_private = bbio;
        btrfs_io_bio(bio)->stripe_index = dev_nr;
        bio->bi_end_io = btrfs_end_bio;
 -      bio->bi_sector = physical >> 9;
 +      bio->bi_iter.bi_sector = physical >> 9;
  #ifdef DEBUG
        {
                struct rcu_string *name;
        }
  #endif
        bio->bi_bdev = dev->bdev;
+       btrfs_bio_counter_inc_noblocked(root->fs_info);
        if (async)
                btrfs_schedule_bio(root, dev, rw, bio);
        else
@@@ -5470,7 -5462,7 +5469,7 @@@ again
        while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
                if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
                                 bvec->bv_offset) < bvec->bv_len) {
 -                      u64 len = bio->bi_size;
 +                      u64 len = bio->bi_iter.bi_size;
  
                        atomic_inc(&bbio->stripes_pending);
                        submit_stripe_bio(root, bbio, bio, physical, dev_nr,
@@@ -5492,7 -5484,7 +5491,7 @@@ static void bbio_error(struct btrfs_bi
                bio->bi_private = bbio->private;
                bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 -              bio->bi_sector = logical >> 9;
 +              bio->bi_iter.bi_sector = logical >> 9;
                kfree(bbio);
                bio_endio(bio, -EIO);
        }
@@@ -5503,7 -5495,7 +5502,7 @@@ int btrfs_map_bio(struct btrfs_root *ro
  {
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
 -      u64 logical = (u64)bio->bi_sector << 9;
 +      u64 logical = (u64)bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        u64 *raid_map = NULL;
        int total_devs = 1;
        struct btrfs_bio *bbio = NULL;
  
 -      length = bio->bi_size;
 +      length = bio->bi_iter.bi_size;
        map_length = length;
  
+       btrfs_bio_counter_inc_blocked(root->fs_info);
        ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
                              mirror_num, &raid_map);
-       if (ret) /* -ENOMEM */
+       if (ret) {
+               btrfs_bio_counter_dec(root->fs_info);
                return ret;
+       }
  
        total_devs = bbio->num_stripes;
        bbio->orig_bio = first_bio;
        bbio->private = first_bio->bi_private;
        bbio->end_io = first_bio->bi_end_io;
+       bbio->fs_info = root->fs_info;
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
  
        if (raid_map) {
                /* In this case, map_length has been set to the length of
                   a single stripe; not the whole write */
                if (rw & WRITE) {
-                       return raid56_parity_write(root, bio, bbio,
-                                                  raid_map, map_length);
+                       ret = raid56_parity_write(root, bio, bbio,
+                                                 raid_map, map_length);
                } else {
-                       return raid56_parity_recover(root, bio, bbio,
-                                                    raid_map, map_length,
-                                                    mirror_num);
+                       ret = raid56_parity_recover(root, bio, bbio,
+                                                   raid_map, map_length,
+                                                   mirror_num);
                }
+               /*
+                * FIXME: device replace doesn't support raid56 yet;
+                * fix this in the future.
+                */
+               btrfs_bio_counter_dec(root->fs_info);
+               return ret;
        }
  
        if (map_length < length) {
                                  async_submit);
                dev_nr++;
        }
+       btrfs_bio_counter_dec(root->fs_info);
        return 0;
  }
  
@@@ -5666,7 -5669,7 +5676,7 @@@ struct btrfs_device *btrfs_alloc_device
        else
                generate_random_uuid(dev->uuid);
  
-       dev->work.func = pending_bios_fn;
+       btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
  
        return dev;
  }